X86ISelLowering.cpp
1//===-- X86ISelLowering.cpp - X86 DAG Lowering Implementation -------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// This file defines the interfaces that X86 uses to lower LLVM code into a
10// selection DAG.
11//
12//===----------------------------------------------------------------------===//
13
14#include "X86ISelLowering.h"
16#include "X86.h"
17#include "X86FrameLowering.h"
18#include "X86InstrBuilder.h"
19#include "X86IntrinsicsInfo.h"
21#include "X86TargetMachine.h"
23#include "llvm/ADT/SmallSet.h"
25#include "llvm/ADT/Statistic.h"
44#include "llvm/IR/CallingConv.h"
45#include "llvm/IR/Constants.h"
48#include "llvm/IR/Function.h"
49#include "llvm/IR/GlobalAlias.h"
51#include "llvm/IR/IRBuilder.h"
53#include "llvm/IR/Intrinsics.h"
55#include "llvm/MC/MCAsmInfo.h"
56#include "llvm/MC/MCContext.h"
57#include "llvm/MC/MCExpr.h"
58#include "llvm/MC/MCSymbol.h"
60#include "llvm/Support/Debug.h"
65#include <algorithm>
66#include <bitset>
67#include <cctype>
68#include <numeric>
69using namespace llvm;
70
71#define DEBUG_TYPE "x86-isel"
72
73static cl::opt<int> ExperimentalPrefInnermostLoopAlignment(
74    "x86-experimental-pref-innermost-loop-alignment", cl::init(4),
75    cl::desc(
76        "Sets the preferable loop alignment for experiments (as log2 bytes) "
77        "for innermost loops only. If specified, this option overrides "
78        "alignment set by x86-experimental-pref-loop-alignment."),
79    cl::Hidden);
80
81static cl::opt<int> BrMergingBaseCostThresh(
82    "x86-br-merging-base-cost", cl::init(2),
83    cl::desc(
84        "Sets the cost threshold for when multiple conditionals will be merged "
85        "into one branch versus be split in multiple branches. Merging "
86        "conditionals saves branches at the cost of additional instructions. "
87        "This value sets the instruction cost limit, below which conditionals "
88        "will be merged, and above which conditionals will be split. Set to -1 "
89        "to never merge branches."),
90    cl::Hidden);
91
92static cl::opt<int> BrMergingCcmpBias(
93    "x86-br-merging-ccmp-bias", cl::init(6),
94    cl::desc("Increases 'x86-br-merging-base-cost' in cases that the target "
95             "supports conditional compare instructions."),
96    cl::Hidden);
97
98static cl::opt<bool>
99 WidenShift("x86-widen-shift", cl::init(true),
100 cl::desc("Replace narrow shifts with wider shifts."),
101 cl::Hidden);
102
103static cl::opt<int> BrMergingLikelyBias(
104    "x86-br-merging-likely-bias", cl::init(0),
105 cl::desc("Increases 'x86-br-merging-base-cost' in cases that it is likely "
106 "that all conditionals will be executed. For example for merging "
107 "the conditionals (a == b && c > d), if its known that a == b is "
108 "likely, then it is likely that if the conditionals are split "
109 "both sides will be executed, so it may be desirable to increase "
110 "the instruction cost threshold. Set to -1 to never merge likely "
111 "branches."),
112 cl::Hidden);
113
114static cl::opt<int> BrMergingUnlikelyBias(
115    "x86-br-merging-unlikely-bias", cl::init(-1),
116 cl::desc(
117 "Decreases 'x86-br-merging-base-cost' in cases that it is unlikely "
118 "that all conditionals will be executed. For example for merging "
119 "the conditionals (a == b && c > d), if its known that a == b is "
120 "unlikely, then it is unlikely that if the conditionals are split "
121 "both sides will be executed, so it may be desirable to decrease "
122 "the instruction cost threshold. Set to -1 to never merge unlikely "
123 "branches."),
124 cl::Hidden);
125
126static cl::opt<bool> MulConstantOptimization(
127    "mul-constant-optimization", cl::init(true),
128 cl::desc("Replace 'mul x, Const' with more effective instructions like "
129 "SHIFT, LEA, etc."),
130 cl::Hidden);
131
132X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
133                                     const X86Subtarget &STI)
134 : TargetLowering(TM), Subtarget(STI) {
135 bool UseX87 = !Subtarget.useSoftFloat() && Subtarget.hasX87();
136 MVT PtrVT = MVT::getIntegerVT(TM.getPointerSizeInBits(0));
137
138 // Set up the TargetLowering object.
139
140  // X86 is weird. It always uses i8 for shift amounts and setcc results.
141  setBooleanContents(ZeroOrOneBooleanContent);
142  // X86-SSE is even stranger. It uses -1 or 0 for vector masks.
143  setBooleanVectorContents(ZeroOrNegativeOneBooleanContent);
144
145 // X86 instruction cache is coherent with its data cache so we can use the
146 // default expansion to a no-op.
148
149 // For 64-bit, since we have so many registers, use the ILP scheduler.
150 // For 32-bit, use the register pressure specific scheduling.
151 // For Atom, always use ILP scheduling.
152  if (Subtarget.isAtom())
153    setSchedulingPreference(Sched::ILP);
154  else if (Subtarget.is64Bit())
155    setSchedulingPreference(Sched::ILP);
156  else
157    setSchedulingPreference(Sched::RegPressure);
158 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
159 setStackPointerRegisterToSaveRestore(RegInfo->getStackRegister());
160
161 // Bypass expensive divides and use cheaper ones.
162 if (TM.getOptLevel() >= CodeGenOptLevel::Default) {
163 if (Subtarget.hasSlowDivide32())
164 addBypassSlowDiv(32, 8);
165 if (Subtarget.hasSlowDivide64() && Subtarget.is64Bit())
166 addBypassSlowDiv(64, 32);
167 }
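  // For illustration: addBypassSlowDiv(64, 32) lets CodeGenPrepare guard a slow
  // 64-bit divide with a cheap width test, roughly:
  //   if (((Dividend | Divisor) >> 32) == 0)
  //     Quotient = (uint32_t)Dividend / (uint32_t)Divisor;  // fast 32-bit DIV
  //   else
  //     Quotient = Dividend / Divisor;                      // full 64-bit DIV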
168
169  if (Subtarget.canUseCMPXCHG16B())
170    setMaxAtomicSizeInBitsSupported(128);
171  else if (Subtarget.canUseCMPXCHG8B())
172    setMaxAtomicSizeInBitsSupported(64);
173  else
174    setMaxAtomicSizeInBitsSupported(32);
175
176 setMaxDivRemBitWidthSupported(Subtarget.is64Bit() ? 128 : 64);
177
179
180 // Set up the register classes.
181 addRegisterClass(MVT::i8, &X86::GR8RegClass);
182 addRegisterClass(MVT::i16, &X86::GR16RegClass);
183 addRegisterClass(MVT::i32, &X86::GR32RegClass);
184 if (Subtarget.is64Bit())
185 addRegisterClass(MVT::i64, &X86::GR64RegClass);
186
187  for (MVT VT : MVT::integer_valuetypes())
188    setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i1, Promote);
189
190 // We don't accept any truncstore of integer registers.
191 setTruncStoreAction(MVT::i64, MVT::i32, Expand);
192 setTruncStoreAction(MVT::i64, MVT::i16, Expand);
193 setTruncStoreAction(MVT::i64, MVT::i8 , Expand);
194 setTruncStoreAction(MVT::i32, MVT::i16, Expand);
195 setTruncStoreAction(MVT::i32, MVT::i8 , Expand);
196 setTruncStoreAction(MVT::i16, MVT::i8, Expand);
197
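  // For illustration: marking a truncating store Expand makes the legalizer
  // rewrite it as an explicit truncate followed by a plain narrow store, e.g.
  //   (truncstore i8 x:i32, addr)  ->  (store (truncate x to i8), addr)
  // so only ordinary stores of legal widths reach instruction selection.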
198 setTruncStoreAction(MVT::f64, MVT::f32, Expand);
199
200 // SETOEQ and SETUNE require checking two conditions.
201 for (auto VT : {MVT::f32, MVT::f64, MVT::f80}) {
202    setCondCodeAction(ISD::SETOEQ, VT, Expand);
203    setCondCodeAction(ISD::SETUNE, VT, Expand);
204  }
205
206 // Integer absolute.
207 if (Subtarget.canUseCMOV()) {
208 setOperationAction(ISD::ABS , MVT::i16 , Custom);
209 setOperationAction(ISD::ABS , MVT::i32 , Custom);
210 if (Subtarget.is64Bit())
211 setOperationAction(ISD::ABS , MVT::i64 , Custom);
212 }
213
214 // Absolute difference.
215 for (auto Op : {ISD::ABDS, ISD::ABDU}) {
216 setOperationAction(Op , MVT::i8 , Custom);
217 setOperationAction(Op , MVT::i16 , Custom);
218 setOperationAction(Op , MVT::i32 , Custom);
219 if (Subtarget.is64Bit())
220 setOperationAction(Op , MVT::i64 , Custom);
221 }
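  // For illustration: the Custom ABDS/ABDU lowering relies on the scalar
  // identity
  //   abdu(a, b) == umax(a, b) - umin(a, b) == (a < b) ? b - a : a - b
  // so it can be built from CMP, SUB and CMOV rather than a libcall.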
222
223  // Signed saturation subtraction.
224  setOperationAction(ISD::SSUBSAT, MVT::i8, Custom);
225  setOperationAction(ISD::SSUBSAT, MVT::i16, Custom);
226  setOperationAction(ISD::SSUBSAT, MVT::i32, Custom);
227  if (Subtarget.is64Bit())
228    setOperationAction(ISD::SSUBSAT, MVT::i64, Custom);
229
230 // Funnel shifts.
231 for (auto ShiftOp : {ISD::FSHL, ISD::FSHR}) {
232 // For slow shld targets we only lower for code size.
233 LegalizeAction ShiftDoubleAction = Subtarget.isSHLDSlow() ? Custom : Legal;
234
235 setOperationAction(ShiftOp , MVT::i8 , Custom);
236 setOperationAction(ShiftOp , MVT::i16 , Custom);
237 setOperationAction(ShiftOp , MVT::i32 , ShiftDoubleAction);
238 if (Subtarget.is64Bit())
239 setOperationAction(ShiftOp , MVT::i64 , ShiftDoubleAction);
240 }
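  // For illustration: a Legal i32 funnel shift corresponds directly to the
  // double-precision shift instructions, e.g.
  //   fshl(a, b, c) -> SHLD a, b, c   ; high bits of (a:b) << c
  // On subtargets where SHLD/SHRD are slow, the Custom path only emits them
  // when optimizing for size and otherwise builds the result from two single
  // shifts and an OR.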
241
242 if (!Subtarget.useSoftFloat()) {
243 // Promote all UINT_TO_FP to larger SINT_TO_FP's, as X86 doesn't have this
244 // operation.
249 // We have an algorithm for SSE2, and we turn this into a 64-bit
250 // FILD or VCVTUSI2SS/SD for other targets.
253 // We have an algorithm for SSE2->double, and we turn this into a
254 // 64-bit FILD followed by conditional FADD for other targets.
257
258 // Promote i8 SINT_TO_FP to larger SINT_TO_FP's, as X86 doesn't have
259 // this operation.
262 // SSE has no i16 to fp conversion, only i32. We promote in the handler
263 // to allow f80 to use i16 and f64 to use i16 with sse1 only
266 // f32 and f64 cases are Legal with SSE1/SSE2, f80 case is not
269 // In 32-bit mode these are custom lowered. In 64-bit mode F32 and F64
270 // are Legal, f80 is custom lowered.
273
274 // Promote i8 FP_TO_SINT to larger FP_TO_SINTS's, as X86 doesn't have
275 // this operation.
277 // FIXME: This doesn't generate invalid exception when it should. PR44019.
283 // In 32-bit mode these are custom lowered. In 64-bit mode F32 and F64
284 // are Legal, f80 is custom lowered.
287
288 // Handle FP_TO_UINT by promoting the destination to a larger signed
289 // conversion.
291 // FIXME: This doesn't generate invalid exception when it should. PR44019.
294 // FIXME: This doesn't generate invalid exception when it should. PR44019.
300
301 setOperationAction(ISD::LRINT, MVT::f32, Custom);
302 setOperationAction(ISD::LRINT, MVT::f64, Custom);
303 setOperationAction(ISD::LLRINT, MVT::f32, Custom);
304 setOperationAction(ISD::LLRINT, MVT::f64, Custom);
305
306 if (!Subtarget.is64Bit()) {
307 setOperationAction(ISD::LRINT, MVT::i64, Custom);
308 setOperationAction(ISD::LLRINT, MVT::i64, Custom);
309 }
310 }
311
312 if (Subtarget.hasSSE2()) {
313 // Custom lowering for saturating float to int conversions.
314 // We handle promotion to larger result types manually.
315 for (MVT VT : { MVT::i8, MVT::i16, MVT::i32 }) {
318 }
321 if (Subtarget.is64Bit()) {
324 }
325 }
326 if (Subtarget.hasAVX10_2()) {
331 for (MVT VT : {MVT::i32, MVT::v4i32, MVT::v8i32, MVT::v16i32, MVT::v2i64,
332 MVT::v4i64}) {
335 }
336 if (Subtarget.is64Bit()) {
339 }
340 }
341
342 // Handle address space casts between mixed sized pointers.
343 setOperationAction(ISD::ADDRSPACECAST, MVT::i32, Custom);
344 setOperationAction(ISD::ADDRSPACECAST, MVT::i64, Custom);
345
346 // TODO: when we have SSE, these could be more efficient, by using movd/movq.
347 if (!Subtarget.hasSSE2()) {
348 setOperationAction(ISD::BITCAST , MVT::f32 , Expand);
349 setOperationAction(ISD::BITCAST , MVT::i32 , Expand);
352 if (Subtarget.is64Bit()) {
353 setOperationAction(ISD::BITCAST , MVT::f64 , Expand);
354 // Without SSE, i64->f64 goes through memory.
355 setOperationAction(ISD::BITCAST , MVT::i64 , Expand);
356 }
357 } else if (!Subtarget.is64Bit())
358 setOperationAction(ISD::BITCAST , MVT::i64 , Custom);
359
360 // Scalar integer divide and remainder are lowered to use operations that
361 // produce two results, to match the available instructions. This exposes
362 // the two-result form to trivial CSE, which is able to combine x/y and x%y
363 // into a single instruction.
364 //
365 // Scalar integer multiply-high is also lowered to use two-result
366 // operations, to match the available instructions. However, plain multiply
367 // (low) operations are left as Legal, as there are single-result
368 // instructions for this in x86. Using the two-result multiply instructions
369 // when both high and low results are needed must be arranged by dagcombine.
370 for (auto VT : { MVT::i8, MVT::i16, MVT::i32, MVT::i64 }) {
377 }
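  // For illustration: because SDIVREM/UDIVREM produce both results, code that
  // needs x / y and x % y ends up with a single DIV/IDIV, e.g.
  //   q = x / y;  r = x % y;   // one IDIV: quotient in *AX, remainder in *DX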
378
379 setOperationAction(ISD::BR_JT , MVT::Other, Expand);
380 setOperationAction(ISD::BRCOND , MVT::Other, Custom);
381 for (auto VT : { MVT::f32, MVT::f64, MVT::f80, MVT::f128,
382 MVT::i8, MVT::i16, MVT::i32, MVT::i64 }) {
383 setOperationAction(ISD::BR_CC, VT, Expand);
385 }
386 if (Subtarget.is64Bit())
391
392 setOperationAction(ISD::FREM , MVT::f32 , Expand);
393 setOperationAction(ISD::FREM , MVT::f64 , Expand);
394 setOperationAction(ISD::FREM , MVT::f80 , Expand);
395 setOperationAction(ISD::FREM , MVT::f128 , Expand);
396
397 if (!Subtarget.useSoftFloat() && Subtarget.hasX87()) {
399 setOperationAction(ISD::SET_ROUNDING , MVT::Other, Custom);
400 setOperationAction(ISD::GET_FPENV_MEM , MVT::Other, Custom);
401 setOperationAction(ISD::SET_FPENV_MEM , MVT::Other, Custom);
402 setOperationAction(ISD::RESET_FPENV , MVT::Other, Custom);
403 }
404
405 // Promote the i8 variants and force them on up to i32 which has a shorter
406 // encoding.
407 setOperationPromotedToType(ISD::CTTZ , MVT::i8 , MVT::i32);
409 // Promoted i16. tzcntw has a false dependency on Intel CPUs. For BSF, we emit
410 // a REP prefix to encode it as TZCNT for modern CPUs so it makes sense to
411 // promote that too.
412 setOperationPromotedToType(ISD::CTTZ , MVT::i16 , MVT::i32);
414
415 if (!Subtarget.hasBMI()) {
416    setOperationAction(ISD::CTTZ           , MVT::i32  , Custom);
417    setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i32  , Legal);
418    if (Subtarget.is64Bit()) {
419      setOperationAction(ISD::CTTZ         , MVT::i64  , Custom);
420      setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i64, Legal);
421    }
422 }
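  // For illustration: without BMI's TZCNT, the Custom CTTZ lowering must cope
  // with BSF leaving its destination undefined for a zero input, roughly
  //   cttz(x) -> BSF x, then CMOV in the bit width if the input was zero
  // while CTTZ_ZERO_UNDEF can use BSF directly, since zero input is UB for it.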
423
424 if (Subtarget.hasLZCNT()) {
425 // When promoting the i8 variants, force them to i32 for a shorter
426 // encoding.
427 setOperationPromotedToType(ISD::CTLZ , MVT::i8 , MVT::i32);
429 } else {
430 for (auto VT : {MVT::i8, MVT::i16, MVT::i32, MVT::i64}) {
431 if (VT == MVT::i64 && !Subtarget.is64Bit())
432 continue;
435 }
436 }
437
438 for (auto Op : {ISD::FP16_TO_FP, ISD::STRICT_FP16_TO_FP, ISD::FP_TO_FP16,
439 ISD::STRICT_FP_TO_FP16}) {
440 // Special handling for half-precision floating point conversions.
441 // If we don't have F16C support, then lower half float conversions
442 // into library calls.
444 Op, MVT::f32,
445 (!Subtarget.useSoftFloat() && Subtarget.hasF16C()) ? Custom : Expand);
446 // There's never any support for operations beyond MVT::f32.
447 setOperationAction(Op, MVT::f64, Expand);
448 setOperationAction(Op, MVT::f80, Expand);
449 setOperationAction(Op, MVT::f128, Expand);
450 }
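  // For illustration: the Custom f16<->f32 path can use the F16C conversions
  // (VCVTPH2PS / VCVTPS2PH), while Expand falls back to the soft-float
  // runtime (e.g. __extendhfsf2 / __truncsfhf2 in compiler-rt).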
451
452 for (auto VT : {MVT::f32, MVT::f64, MVT::f80, MVT::f128}) {
453 setOperationAction(ISD::STRICT_FP_TO_BF16, VT, Expand);
454 setOperationAction(ISD::STRICT_BF16_TO_FP, VT, Expand);
455 }
456
457 for (MVT VT : {MVT::f32, MVT::f64, MVT::f80, MVT::f128}) {
458 setLoadExtAction(ISD::EXTLOAD, VT, MVT::f16, Expand);
459 setLoadExtAction(ISD::EXTLOAD, VT, MVT::bf16, Expand);
460 setTruncStoreAction(VT, MVT::f16, Expand);
461 setTruncStoreAction(VT, MVT::bf16, Expand);
462
463 setOperationAction(ISD::BF16_TO_FP, VT, Expand);
464 setOperationAction(ISD::FP_TO_BF16, VT, Custom);
465 }
466
470 if (Subtarget.is64Bit())
472 if (Subtarget.hasPOPCNT()) {
473 setOperationPromotedToType(ISD::CTPOP, MVT::i8, MVT::i32);
474 // popcntw is longer to encode than popcntl and also has a false dependency
475 // on the dest that popcntl hasn't had since Cannon Lake.
476 setOperationPromotedToType(ISD::CTPOP, MVT::i16, MVT::i32);
477 } else {
482 }
483
484 setOperationAction(ISD::READCYCLECOUNTER , MVT::i64 , Custom);
485
486 if (!Subtarget.hasMOVBE())
488
489 // X86 wants to expand cmov itself.
490 for (auto VT : { MVT::f32, MVT::f64, MVT::f80, MVT::f128 }) {
495 }
496 for (auto VT : { MVT::i8, MVT::i16, MVT::i32, MVT::i64 }) {
497 if (VT == MVT::i64 && !Subtarget.is64Bit())
498 continue;
501 }
502
503 // Custom action for SELECT MMX and expand action for SELECT_CC MMX
506
508 // NOTE: EH_SJLJ_SETJMP/_LONGJMP are not recommended, since
509 // LLVM/Clang supports zero-cost DWARF and SEH exception handling.
513
514 // Darwin ABI issue.
515 for (auto VT : { MVT::i32, MVT::i64 }) {
516 if (VT == MVT::i64 && !Subtarget.is64Bit())
517 continue;
524 }
525
526 // 64-bit shl, sra, srl (iff 32-bit x86)
527 for (auto VT : { MVT::i32, MVT::i64 }) {
528 if (VT == MVT::i64 && !Subtarget.is64Bit())
529 continue;
533 }
534
535 if (Subtarget.hasSSEPrefetch())
536 setOperationAction(ISD::PREFETCH , MVT::Other, Custom);
537
538 setOperationAction(ISD::ATOMIC_FENCE , MVT::Other, Custom);
539
540 // Expand certain atomics
541 for (auto VT : { MVT::i8, MVT::i16, MVT::i32, MVT::i64 }) {
542 setOperationAction(ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS, VT, Custom);
543 setOperationAction(ISD::ATOMIC_LOAD_SUB, VT, Custom);
544 setOperationAction(ISD::ATOMIC_LOAD_ADD, VT, Custom);
545 setOperationAction(ISD::ATOMIC_LOAD_OR, VT, Custom);
546 setOperationAction(ISD::ATOMIC_LOAD_XOR, VT, Custom);
547 setOperationAction(ISD::ATOMIC_LOAD_AND, VT, Custom);
548 setOperationAction(ISD::ATOMIC_STORE, VT, Custom);
549 }
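  // For illustration: the Custom RMW atomics are matched to LOCK-prefixed
  // instructions; when the old value is unused this is a direct LOCK
  // ADD/SUB/AND/OR/XOR, and e.g. a used atomicrmw sub becomes
  //   negate the operand, then LOCK XADD [ptr], reg   ; reg gets the old value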
550
551 if (!Subtarget.is64Bit())
552 setOperationAction(ISD::ATOMIC_LOAD, MVT::i64, Custom);
553
554 if (Subtarget.is64Bit() && Subtarget.hasAVX()) {
555 // All CPUs supporting AVX will atomically load/store aligned 128-bit
556 // values, so we can emit [V]MOVAPS/[V]MOVDQA.
557 setOperationAction(ISD::ATOMIC_LOAD, MVT::i128, Custom);
558 setOperationAction(ISD::ATOMIC_STORE, MVT::i128, Custom);
559 }
560
561 if (Subtarget.canUseCMPXCHG16B())
562 setOperationAction(ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS, MVT::i128, Custom);
563
564 // FIXME - use subtarget debug flags
565 if (!Subtarget.isTargetDarwin() && !Subtarget.isTargetELF() &&
566 !Subtarget.isTargetCygMing() && !Subtarget.isTargetWin64() &&
567 TM.Options.ExceptionModel != ExceptionHandling::SjLj) {
568 setOperationAction(ISD::EH_LABEL, MVT::Other, Expand);
569 }
570
573
574 setOperationAction(ISD::INIT_TRAMPOLINE, MVT::Other, Custom);
575 setOperationAction(ISD::ADJUST_TRAMPOLINE, MVT::Other, Custom);
576
577 setOperationAction(ISD::TRAP, MVT::Other, Legal);
578 setOperationAction(ISD::DEBUGTRAP, MVT::Other, Legal);
579 if (Subtarget.isTargetPS())
580 setOperationAction(ISD::UBSANTRAP, MVT::Other, Expand);
581 else
582 setOperationAction(ISD::UBSANTRAP, MVT::Other, Legal);
583
584 // VASTART needs to be custom lowered to use the VarArgsFrameIndex
585 setOperationAction(ISD::VASTART , MVT::Other, Custom);
586 setOperationAction(ISD::VAEND , MVT::Other, Expand);
587 bool Is64Bit = Subtarget.is64Bit();
588 setOperationAction(ISD::VAARG, MVT::Other, Is64Bit ? Custom : Expand);
589 setOperationAction(ISD::VACOPY, MVT::Other, Is64Bit ? Custom : Expand);
590
591 setOperationAction(ISD::STACKSAVE, MVT::Other, Expand);
592 setOperationAction(ISD::STACKRESTORE, MVT::Other, Expand);
593
594 setOperationAction(ISD::DYNAMIC_STACKALLOC, PtrVT, Custom);
595
596 // GC_TRANSITION_START and GC_TRANSITION_END need custom lowering.
597 setOperationAction(ISD::GC_TRANSITION_START, MVT::Other, Custom);
598 setOperationAction(ISD::GC_TRANSITION_END, MVT::Other, Custom);
599
601
602 auto setF16Action = [&] (MVT VT, LegalizeAction Action) {
603 setOperationAction(ISD::FABS, VT, Action);
604 setOperationAction(ISD::FNEG, VT, Action);
606 setOperationAction(ISD::FREM, VT, Action);
607 setOperationAction(ISD::FMA, VT, Action);
608 setOperationAction(ISD::FMINNUM, VT, Action);
609 setOperationAction(ISD::FMAXNUM, VT, Action);
610 setOperationAction(ISD::FMINIMUM, VT, Action);
611 setOperationAction(ISD::FMAXIMUM, VT, Action);
612 setOperationAction(ISD::FMINIMUMNUM, VT, Action);
613 setOperationAction(ISD::FMAXIMUMNUM, VT, Action);
614 setOperationAction(ISD::FSIN, VT, Action);
615 setOperationAction(ISD::FCOS, VT, Action);
616 setOperationAction(ISD::FSINCOS, VT, Action);
617 setOperationAction(ISD::FTAN, VT, Action);
618 setOperationAction(ISD::FSQRT, VT, Action);
619 setOperationAction(ISD::FPOW, VT, Action);
620 setOperationAction(ISD::FPOWI, VT, Action);
621 setOperationAction(ISD::FLOG, VT, Action);
622 setOperationAction(ISD::FLOG2, VT, Action);
623 setOperationAction(ISD::FLOG10, VT, Action);
624 setOperationAction(ISD::FEXP, VT, Action);
625 setOperationAction(ISD::FEXP2, VT, Action);
626 setOperationAction(ISD::FEXP10, VT, Action);
627 setOperationAction(ISD::FCEIL, VT, Action);
628 setOperationAction(ISD::FFLOOR, VT, Action);
629 setOperationAction(ISD::FNEARBYINT, VT, Action);
630 setOperationAction(ISD::FRINT, VT, Action);
631 setOperationAction(ISD::BR_CC, VT, Action);
632 setOperationAction(ISD::SETCC, VT, Action);
635 setOperationAction(ISD::FROUND, VT, Action);
636 setOperationAction(ISD::FROUNDEVEN, VT, Action);
637 setOperationAction(ISD::FTRUNC, VT, Action);
638 setOperationAction(ISD::FLDEXP, VT, Action);
639 };
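  // For illustration: this helper is invoked below with different defaults,
  // e.g.
  //   setF16Action(MVT::f16, Promote);   // scalar f16 ops are promoted to f32
  //   setF16Action(MVT::v8f16, Expand);  // vector f16 ops are expanded
  // giving every listed FP opcode the same baseline action for that type.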
640
641 if (!Subtarget.useSoftFloat() && Subtarget.hasSSE2()) {
642 // f16, f32 and f64 use SSE.
643 // Set up the FP register classes.
644 addRegisterClass(MVT::f16, Subtarget.hasAVX512() ? &X86::FR16XRegClass
645 : &X86::FR16RegClass);
646 addRegisterClass(MVT::f32, Subtarget.hasAVX512() ? &X86::FR32XRegClass
647 : &X86::FR32RegClass);
648 addRegisterClass(MVT::f64, Subtarget.hasAVX512() ? &X86::FR64XRegClass
649 : &X86::FR64RegClass);
650
651 // Disable f32->f64 extload as we can only generate this in one instruction
652 // under optsize. So it's easier to pattern match (fpext (load)) for that
653 // case instead of needing to emit 2 instructions for extload in the
654 // non-optsize case.
655 setLoadExtAction(ISD::EXTLOAD, MVT::f64, MVT::f32, Expand);
656
657 for (auto VT : { MVT::f32, MVT::f64 }) {
658 // Use ANDPD to simulate FABS.
659 setOperationAction(ISD::FABS, VT, Custom);
660
661 // Use XORP to simulate FNEG.
662 setOperationAction(ISD::FNEG, VT, Custom);
663
664 // Use ANDPD and ORPD to simulate FCOPYSIGN.
666
667 // These might be better off as horizontal vector ops.
670
671 // We don't support sin/cos/fmod
672 setOperationAction(ISD::FSIN , VT, Expand);
673 setOperationAction(ISD::FCOS , VT, Expand);
674 setOperationAction(ISD::FSINCOS, VT, Expand);
675 }
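    // For illustration: the Custom FABS/FNEG lowering works on the bit pattern
    // with packed logic ops instead of x87 arithmetic, roughly
    //   fabs(x) -> ANDPS x, [constant with the sign bit clear]
    //   fneg(x) -> XORPS x, [constant with only the sign bit set]
    // where the masks come from the constant pool.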
676
677 // Half type will be promoted by default.
678 setF16Action(MVT::f16, Promote);
683 setOperationAction(ISD::FABS, MVT::f16, Custom);
684 setOperationAction(ISD::FNEG, MVT::f16, Custom);
687 setOperationAction(ISD::FP_EXTEND, MVT::f32, Custom);
688 setOperationAction(ISD::FP_EXTEND, MVT::f64, Custom);
689
720 setOperationAction(ISD::LRINT, MVT::f16, Expand);
721 setOperationAction(ISD::LLRINT, MVT::f16, Expand);
722
723 // Lower this to MOVMSK plus an AND.
726
727 } else if (!Subtarget.useSoftFloat() && Subtarget.hasSSE1() &&
728 (UseX87 || Is64Bit)) {
729 // Use SSE for f32, x87 for f64.
730 // Set up the FP register classes.
731 addRegisterClass(MVT::f32, &X86::FR32RegClass);
732 if (UseX87)
733 addRegisterClass(MVT::f64, &X86::RFP64RegClass);
734
735 // Use ANDPS to simulate FABS.
736 setOperationAction(ISD::FABS , MVT::f32, Custom);
737
738 // Use XORP to simulate FNEG.
739 setOperationAction(ISD::FNEG , MVT::f32, Custom);
740
741 if (UseX87)
743
744 // Use ANDPS and ORPS to simulate FCOPYSIGN.
745 if (UseX87)
748
749 // We don't support sin/cos/fmod
750 setOperationAction(ISD::FSIN , MVT::f32, Expand);
751 setOperationAction(ISD::FCOS , MVT::f32, Expand);
752 setOperationAction(ISD::FSINCOS, MVT::f32, Expand);
753
754 if (UseX87) {
755 // Always expand sin/cos functions even though x87 has an instruction.
756 setOperationAction(ISD::FSIN, MVT::f64, Expand);
757 setOperationAction(ISD::FCOS, MVT::f64, Expand);
758 setOperationAction(ISD::FSINCOS, MVT::f64, Expand);
759 }
760 } else if (UseX87) {
761 // f32 and f64 in x87.
762 // Set up the FP register classes.
763 addRegisterClass(MVT::f64, &X86::RFP64RegClass);
764 addRegisterClass(MVT::f32, &X86::RFP32RegClass);
765
766 for (auto VT : { MVT::f32, MVT::f64 }) {
769
770 // Always expand sin/cos functions even though x87 has an instruction.
771 setOperationAction(ISD::FSIN , VT, Expand);
772 setOperationAction(ISD::FCOS , VT, Expand);
773 setOperationAction(ISD::FSINCOS, VT, Expand);
774 }
775 }
776
777 // Expand FP32 immediates into loads from the stack, save special cases.
778 if (isTypeLegal(MVT::f32)) {
779 if (UseX87 && (getRegClassFor(MVT::f32) == &X86::RFP32RegClass)) {
780 addLegalFPImmediate(APFloat(+0.0f)); // FLD0
781 addLegalFPImmediate(APFloat(+1.0f)); // FLD1
782 addLegalFPImmediate(APFloat(-0.0f)); // FLD0/FCHS
783 addLegalFPImmediate(APFloat(-1.0f)); // FLD1/FCHS
784 } else // SSE immediates.
785 addLegalFPImmediate(APFloat(+0.0f)); // xorps
786 }
787 // Expand FP64 immediates into loads from the stack, save special cases.
788 if (isTypeLegal(MVT::f64)) {
789 if (UseX87 && getRegClassFor(MVT::f64) == &X86::RFP64RegClass) {
790 addLegalFPImmediate(APFloat(+0.0)); // FLD0
791 addLegalFPImmediate(APFloat(+1.0)); // FLD1
792 addLegalFPImmediate(APFloat(-0.0)); // FLD0/FCHS
793 addLegalFPImmediate(APFloat(-1.0)); // FLD1/FCHS
794 } else // SSE immediates.
795 addLegalFPImmediate(APFloat(+0.0)); // xorpd
796 }
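  // For illustration: addLegalFPImmediate keeps these constants out of the
  // constant pool, so e.g. materializing 1.0 on x87 is a single FLD1, and an
  // SSE +0.0 becomes XORPS/XORPD of a register with itself rather than a load.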
797 // Support fp16 0 immediate.
798 if (isTypeLegal(MVT::f16))
799 addLegalFPImmediate(APFloat::getZero(APFloat::IEEEhalf()));
800
801 // Handle constrained floating-point operations of scalar.
814
815 // We don't support FMA.
818
819 // f80 always uses X87.
820 if (UseX87) {
821 addRegisterClass(MVT::f80, &X86::RFP80RegClass);
824 {
826 addLegalFPImmediate(TmpFlt); // FLD0
827 TmpFlt.changeSign();
828 addLegalFPImmediate(TmpFlt); // FLD0/FCHS
829
830 bool ignored;
831 APFloat TmpFlt2(+1.0);
833 &ignored);
834 addLegalFPImmediate(TmpFlt2); // FLD1
835 TmpFlt2.changeSign();
836 addLegalFPImmediate(TmpFlt2); // FLD1/FCHS
837 }
838
839 // Always expand sin/cos functions even though x87 has an instruction.
840 // clang-format off
841 setOperationAction(ISD::FSIN , MVT::f80, Expand);
842 setOperationAction(ISD::FCOS , MVT::f80, Expand);
843 setOperationAction(ISD::FSINCOS, MVT::f80, Expand);
844 setOperationAction(ISD::FTAN , MVT::f80, Expand);
845 setOperationAction(ISD::FASIN , MVT::f80, Expand);
846 setOperationAction(ISD::FACOS , MVT::f80, Expand);
847 setOperationAction(ISD::FATAN , MVT::f80, Expand);
848 setOperationAction(ISD::FATAN2 , MVT::f80, Expand);
849 setOperationAction(ISD::FSINH , MVT::f80, Expand);
850 setOperationAction(ISD::FCOSH , MVT::f80, Expand);
851 setOperationAction(ISD::FTANH , MVT::f80, Expand);
852 // clang-format on
853
854 setOperationAction(ISD::FFLOOR, MVT::f80, Expand);
855 setOperationAction(ISD::FCEIL, MVT::f80, Expand);
856 setOperationAction(ISD::FTRUNC, MVT::f80, Expand);
857 setOperationAction(ISD::FRINT, MVT::f80, Expand);
858 setOperationAction(ISD::FNEARBYINT, MVT::f80, Expand);
859 setOperationAction(ISD::FROUNDEVEN, MVT::f80, Expand);
861 setOperationAction(ISD::LROUND, MVT::f80, LibCall);
862 setOperationAction(ISD::LLROUND, MVT::f80, LibCall);
863 setOperationAction(ISD::LRINT, MVT::f80, Custom);
864 setOperationAction(ISD::LLRINT, MVT::f80, Custom);
865
866 // Handle constrained floating-point operations of scalar.
873 if (isTypeLegal(MVT::f16)) {
874 setOperationAction(ISD::FP_EXTEND, MVT::f80, Custom);
876 } else {
878 }
879 // FIXME: When the target is 64-bit, STRICT_FP_ROUND will be overwritten
880 // as Custom.
882 }
883
884 // f128 uses xmm registers, but most operations require libcalls.
885 if (!Subtarget.useSoftFloat() && Subtarget.is64Bit() && Subtarget.hasSSE1()) {
886 addRegisterClass(MVT::f128, Subtarget.hasVLX() ? &X86::VR128XRegClass
887 : &X86::VR128RegClass);
888
889 addLegalFPImmediate(APFloat::getZero(APFloat::IEEEquad())); // xorps
890
901
902 setOperationAction(ISD::FABS, MVT::f128, Custom);
903 setOperationAction(ISD::FNEG, MVT::f128, Custom);
905
906 // clang-format off
907 setOperationAction(ISD::FSIN, MVT::f128, LibCall);
909 setOperationAction(ISD::FCOS, MVT::f128, LibCall);
911 setOperationAction(ISD::FSINCOS, MVT::f128, LibCall);
912 setOperationAction(ISD::FTAN, MVT::f128, LibCall);
914 // clang-format on
915 // No STRICT_FSINCOS
916 setOperationAction(ISD::FSQRT, MVT::f128, LibCall);
918
919 setOperationAction(ISD::FP_EXTEND, MVT::f128, Custom);
921 // We need to custom handle any FP_ROUND with an f128 input, but
922 // LegalizeDAG uses the result type to know when to run a custom handler.
923 // So we have to list all legal floating point result types here.
924 if (isTypeLegal(MVT::f32)) {
927 }
928 if (isTypeLegal(MVT::f64)) {
931 }
932 if (isTypeLegal(MVT::f80)) {
936 }
937
939
940 setLoadExtAction(ISD::EXTLOAD, MVT::f128, MVT::f32, Expand);
941 setLoadExtAction(ISD::EXTLOAD, MVT::f128, MVT::f64, Expand);
942 setLoadExtAction(ISD::EXTLOAD, MVT::f128, MVT::f80, Expand);
943 setTruncStoreAction(MVT::f128, MVT::f32, Expand);
944 setTruncStoreAction(MVT::f128, MVT::f64, Expand);
945 setTruncStoreAction(MVT::f128, MVT::f80, Expand);
946 }
947
948 // Always use a library call for pow.
949 setOperationAction(ISD::FPOW , MVT::f32 , Expand);
950 setOperationAction(ISD::FPOW , MVT::f64 , Expand);
951 setOperationAction(ISD::FPOW , MVT::f80 , Expand);
952 setOperationAction(ISD::FPOW , MVT::f128 , Expand);
953
954 setOperationAction(ISD::FLOG, MVT::f80, Expand);
955 setOperationAction(ISD::FLOG2, MVT::f80, Expand);
956 setOperationAction(ISD::FLOG10, MVT::f80, Expand);
957 setOperationAction(ISD::FEXP, MVT::f80, Expand);
958 setOperationAction(ISD::FEXP2, MVT::f80, Expand);
959 setOperationAction(ISD::FEXP10, MVT::f80, Expand);
960 setOperationAction(ISD::FMINNUM, MVT::f80, Expand);
961 setOperationAction(ISD::FMAXNUM, MVT::f80, Expand);
962
963 // Some FP actions are always expanded for vector types.
964 for (auto VT : { MVT::v8f16, MVT::v16f16, MVT::v32f16,
965 MVT::v4f32, MVT::v8f32, MVT::v16f32,
966 MVT::v2f64, MVT::v4f64, MVT::v8f64 }) {
967 // clang-format off
968 setOperationAction(ISD::FSIN, VT, Expand);
969 setOperationAction(ISD::FSINCOS, VT, Expand);
970 setOperationAction(ISD::FCOS, VT, Expand);
971 setOperationAction(ISD::FTAN, VT, Expand);
974 setOperationAction(ISD::FPOW, VT, Expand);
975 setOperationAction(ISD::FLOG, VT, Expand);
976 setOperationAction(ISD::FLOG2, VT, Expand);
977 setOperationAction(ISD::FLOG10, VT, Expand);
978 setOperationAction(ISD::FEXP, VT, Expand);
979 setOperationAction(ISD::FEXP2, VT, Expand);
980 setOperationAction(ISD::FEXP10, VT, Expand);
981 // clang-format on
982 }
983
984 // First set operation action for all vector types to either promote
985 // (for widening) or expand (for scalarization). Then we will selectively
986 // turn on ones that can be effectively codegen'd.
997 setOperationAction(ISD::FFLOOR, VT, Expand);
998 setOperationAction(ISD::FCEIL, VT, Expand);
999 setOperationAction(ISD::FTRUNC, VT, Expand);
1000 setOperationAction(ISD::FRINT, VT, Expand);
1001 setOperationAction(ISD::FNEARBYINT, VT, Expand);
1002 setOperationAction(ISD::FROUNDEVEN, VT, Expand);
1026 for (MVT InnerVT : MVT::fixedlen_vector_valuetypes()) {
1027 setTruncStoreAction(InnerVT, VT, Expand);
1028
1029 setLoadExtAction(ISD::SEXTLOAD, InnerVT, VT, Expand);
1030 setLoadExtAction(ISD::ZEXTLOAD, InnerVT, VT, Expand);
1031
1032 // N.b. ISD::EXTLOAD legality is basically ignored except for i1-like
1033 // types, we have to deal with them whether we ask for Expansion or not.
1034 // Setting Expand causes its own optimisation problems though, so leave
1035 // them legal.
1036 if (VT.getVectorElementType() == MVT::i1)
1037 setLoadExtAction(ISD::EXTLOAD, InnerVT, VT, Expand);
1038
1039 // EXTLOAD for MVT::f16 vectors is not legal because f16 vectors are
1040 // split/scalarized right now.
1041 if (VT.getVectorElementType() == MVT::f16 ||
1042 VT.getVectorElementType() == MVT::bf16)
1043 setLoadExtAction(ISD::EXTLOAD, InnerVT, VT, Expand);
1044 }
1045 }
1046
1047 // FIXME: In order to prevent SSE instructions being expanded to MMX ones
1048 // with -msoft-float, disable use of MMX as well.
1049 if (!Subtarget.useSoftFloat() && Subtarget.hasMMX()) {
1050 addRegisterClass(MVT::x86mmx, &X86::VR64RegClass);
1051 // No operations on x86mmx supported, everything uses intrinsics.
1052 }
1053
1054 if (!Subtarget.useSoftFloat() && Subtarget.hasSSE1()) {
1055 addRegisterClass(MVT::v4f32, Subtarget.hasVLX() ? &X86::VR128XRegClass
1056 : &X86::VR128RegClass);
1057
1058 setOperationAction(ISD::FMAXIMUM, MVT::f32, Custom);
1059 setOperationAction(ISD::FMINIMUM, MVT::f32, Custom);
1060 setOperationAction(ISD::FMAXIMUMNUM, MVT::f32, Custom);
1061 setOperationAction(ISD::FMINIMUMNUM, MVT::f32, Custom);
1062
1063 setOperationAction(ISD::FNEG, MVT::v4f32, Custom);
1064 setOperationAction(ISD::FABS, MVT::v4f32, Custom);
1072
1073 setOperationAction(ISD::LOAD, MVT::v2f32, Custom);
1074 setOperationAction(ISD::STORE, MVT::v2f32, Custom);
1076
1082 }
1083
1084 if (!Subtarget.useSoftFloat() && Subtarget.hasSSE2()) {
1085 addRegisterClass(MVT::v2f64, Subtarget.hasVLX() ? &X86::VR128XRegClass
1086 : &X86::VR128RegClass);
1087
1088 // FIXME: Unfortunately, -soft-float and -no-implicit-float mean XMM
1089 // registers cannot be used even for integer operations.
1090 addRegisterClass(MVT::v16i8, Subtarget.hasVLX() ? &X86::VR128XRegClass
1091 : &X86::VR128RegClass);
1092 addRegisterClass(MVT::v8i16, Subtarget.hasVLX() ? &X86::VR128XRegClass
1093 : &X86::VR128RegClass);
1094 addRegisterClass(MVT::v8f16, Subtarget.hasVLX() ? &X86::VR128XRegClass
1095 : &X86::VR128RegClass);
1096 addRegisterClass(MVT::v4i32, Subtarget.hasVLX() ? &X86::VR128XRegClass
1097 : &X86::VR128RegClass);
1098 addRegisterClass(MVT::v2i64, Subtarget.hasVLX() ? &X86::VR128XRegClass
1099 : &X86::VR128RegClass);
1100
1101 for (auto VT : { MVT::f64, MVT::v4f32, MVT::v2f64 }) {
1102 setOperationAction(ISD::FMAXIMUM, VT, Custom);
1103 setOperationAction(ISD::FMINIMUM, VT, Custom);
1104 setOperationAction(ISD::FMAXIMUMNUM, VT, Custom);
1105 setOperationAction(ISD::FMINIMUMNUM, VT, Custom);
1106 }
1107
1108 for (auto VT : { MVT::v2i8, MVT::v4i8, MVT::v8i8,
1109 MVT::v2i16, MVT::v4i16, MVT::v2i32 }) {
1114 }
1115
1116 setOperationAction(ISD::MUL, MVT::v2i8, Custom);
1117 setOperationAction(ISD::MUL, MVT::v4i8, Custom);
1118 setOperationAction(ISD::MUL, MVT::v8i8, Custom);
1119
1120 setOperationAction(ISD::MUL, MVT::v16i8, Custom);
1121 setOperationAction(ISD::MUL, MVT::v4i32, Custom);
1122 setOperationAction(ISD::MUL, MVT::v2i64, Custom);
1123 setOperationAction(ISD::MULHU, MVT::v4i32, Custom);
1124 setOperationAction(ISD::MULHS, MVT::v4i32, Custom);
1125 setOperationAction(ISD::MULHU, MVT::v16i8, Custom);
1126 setOperationAction(ISD::MULHS, MVT::v16i8, Custom);
1127 setOperationAction(ISD::MULHU, MVT::v8i16, Legal);
1128 setOperationAction(ISD::MULHS, MVT::v8i16, Legal);
1129 setOperationAction(ISD::MUL, MVT::v8i16, Legal);
1132
1133 setOperationAction(ISD::SMULO, MVT::v16i8, Custom);
1134 setOperationAction(ISD::UMULO, MVT::v16i8, Custom);
1135 setOperationAction(ISD::UMULO, MVT::v2i32, Custom);
1136
1137 setOperationAction(ISD::FNEG, MVT::v2f64, Custom);
1139 setOperationAction(ISD::FABS, MVT::v2f64, Custom);
1141
1142 setOperationAction(ISD::LRINT, MVT::v4f32, Custom);
1143 setOperationAction(ISD::LRINT, MVT::v2i32, Custom);
1144
1145 for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64 }) {
1146 setOperationAction(ISD::SMAX, VT, VT == MVT::v8i16 ? Legal : Custom);
1147 setOperationAction(ISD::SMIN, VT, VT == MVT::v8i16 ? Legal : Custom);
1148 setOperationAction(ISD::UMAX, VT, VT == MVT::v16i8 ? Legal : Custom);
1149 setOperationAction(ISD::UMIN, VT, VT == MVT::v16i8 ? Legal : Custom);
1150 }
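    // For illustration: before SSE4.1 there is no PMAXSD, so the Custom v4i32
    // SMAX lowering builds the result from a compare and a blend, roughly
    //   smax(a, b) -> m = PCMPGTD a, b;  (a & m) | (b & ~m)
    // whereas SSE4.1+ targets select PMAXSD/PMINSD/PMAXUD/PMINUD directly.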
1151
1162
1167
1168 for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64 }) {
1174
1175 // The condition codes aren't legal in SSE/AVX and under AVX512 we use
1176 // setcc all the way to isel and prefer SETGT in some isel patterns.
1179 }
1180
1181 setOperationAction(ISD::SETCC, MVT::v2f64, Custom);
1182 setOperationAction(ISD::SETCC, MVT::v4f32, Custom);
1187
1188 for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32 }) {
1194 }
1195
1196 for (auto VT : { MVT::v8f16, MVT::v2f64, MVT::v2i64 }) {
1200
1201 if (VT == MVT::v2i64 && !Subtarget.is64Bit())
1202 continue;
1203
1206 }
1207 setF16Action(MVT::v8f16, Expand);
1208 setOperationAction(ISD::FADD, MVT::v8f16, Expand);
1209 setOperationAction(ISD::FSUB, MVT::v8f16, Expand);
1210 setOperationAction(ISD::FMUL, MVT::v8f16, Expand);
1211 setOperationAction(ISD::FDIV, MVT::v8f16, Expand);
1212 setOperationAction(ISD::FNEG, MVT::v8f16, Custom);
1213 setOperationAction(ISD::FABS, MVT::v8f16, Custom);
1215
1216 // Custom lower v2i64 and v2f64 selects.
1223
1230
1231 // Custom legalize these to avoid over promotion or custom promotion.
1232 for (auto VT : {MVT::v2i8, MVT::v4i8, MVT::v8i8, MVT::v2i16, MVT::v4i16}) {
1237 }
1238
1243
1246
1249
1250 // Fast v2f32 UINT_TO_FP( v2i32 ) custom conversion.
1255
1256 setOperationAction(ISD::FP_EXTEND, MVT::v2f32, Custom);
1260
1261 // We want to legalize this to an f64 load rather than an i64 load on
1262 // 64-bit targets and two 32-bit loads on a 32-bit target. Similar for
1263 // store.
1264 setOperationAction(ISD::LOAD, MVT::v2i32, Custom);
1265 setOperationAction(ISD::LOAD, MVT::v4i16, Custom);
1266 setOperationAction(ISD::LOAD, MVT::v8i8, Custom);
1267 setOperationAction(ISD::STORE, MVT::v2i32, Custom);
1268 setOperationAction(ISD::STORE, MVT::v4i16, Custom);
1269 setOperationAction(ISD::STORE, MVT::v8i8, Custom);
1270
1271 // Add 32-bit vector stores to help vectorization opportunities.
1272 setOperationAction(ISD::STORE, MVT::v2i16, Custom);
1273 setOperationAction(ISD::STORE, MVT::v4i8, Custom);
1274
1275 setOperationAction(ISD::BITCAST, MVT::v2i32, Custom);
1276 setOperationAction(ISD::BITCAST, MVT::v4i16, Custom);
1277 setOperationAction(ISD::BITCAST, MVT::v8i8, Custom);
1278 if (!Subtarget.hasAVX512())
1279 setOperationAction(ISD::BITCAST, MVT::v16i1, Custom);
1280
1284
1286
1303
1304 // In the customized shift lowering, the legal v4i32/v2i64 cases
1305 // in AVX2 will be recognized.
1306 for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64 }) {
1310 if (VT == MVT::v2i64) continue;
1315 }
1316
1322 }
1323
1324 if (!Subtarget.useSoftFloat() && Subtarget.hasGFNI()) {
1329
1330 for (auto VT : {MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64}) {
1332 }
1333 }
1334
1335 if (!Subtarget.useSoftFloat() && Subtarget.hasSSSE3()) {
1336 setOperationAction(ISD::ABS, MVT::v16i8, Legal);
1337 setOperationAction(ISD::ABS, MVT::v8i16, Legal);
1338 setOperationAction(ISD::ABS, MVT::v4i32, Legal);
1339
1340 for (auto VT : {MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64}) {
1343 }
1344
1345 // These might be better off as horizontal vector ops.
1350 }
1351
1352 if (!Subtarget.useSoftFloat() && Subtarget.hasSSE41()) {
1353 for (MVT RoundedTy : {MVT::f32, MVT::f64, MVT::v4f32, MVT::v2f64}) {
1354 setOperationAction(ISD::FFLOOR, RoundedTy, Legal);
1356 setOperationAction(ISD::FCEIL, RoundedTy, Legal);
1358 setOperationAction(ISD::FTRUNC, RoundedTy, Legal);
1360 setOperationAction(ISD::FRINT, RoundedTy, Legal);
1362 setOperationAction(ISD::FNEARBYINT, RoundedTy, Legal);
1364 setOperationAction(ISD::FROUNDEVEN, RoundedTy, Legal);
1366
1367 setOperationAction(ISD::FROUND, RoundedTy, Custom);
1368 }
1369
1370 setOperationAction(ISD::SMAX, MVT::v16i8, Legal);
1371 setOperationAction(ISD::SMAX, MVT::v4i32, Legal);
1372 setOperationAction(ISD::UMAX, MVT::v8i16, Legal);
1373 setOperationAction(ISD::UMAX, MVT::v4i32, Legal);
1374 setOperationAction(ISD::SMIN, MVT::v16i8, Legal);
1375 setOperationAction(ISD::SMIN, MVT::v4i32, Legal);
1376 setOperationAction(ISD::UMIN, MVT::v8i16, Legal);
1377 setOperationAction(ISD::UMIN, MVT::v4i32, Legal);
1378
1382
1383 // FIXME: Do we need to handle scalar-to-vector here?
1384 setOperationAction(ISD::MUL, MVT::v4i32, Legal);
1385 setOperationAction(ISD::SMULO, MVT::v2i32, Custom);
1386
1387 // We directly match byte blends in the backend as they match the VSELECT
1388 // condition form.
1390
1391 // SSE41 brings specific instructions for doing vector sign extend even in
1392 // cases where we don't have SRA.
1393 for (auto VT : { MVT::v8i16, MVT::v4i32, MVT::v2i64 }) {
1396 }
1397
1398 // SSE41 also has vector sign/zero extending loads, PMOV[SZ]X
1399 for (auto LoadExtOp : { ISD::SEXTLOAD, ISD::ZEXTLOAD }) {
1400 setLoadExtAction(LoadExtOp, MVT::v8i16, MVT::v8i8, Legal);
1401 setLoadExtAction(LoadExtOp, MVT::v4i32, MVT::v4i8, Legal);
1402 setLoadExtAction(LoadExtOp, MVT::v2i64, MVT::v2i8, Legal);
1403 setLoadExtAction(LoadExtOp, MVT::v4i32, MVT::v4i16, Legal);
1404 setLoadExtAction(LoadExtOp, MVT::v2i64, MVT::v2i16, Legal);
1405 setLoadExtAction(LoadExtOp, MVT::v2i64, MVT::v2i32, Legal);
1406 }
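    // For illustration: with these extending loads Legal, a pattern such as
    //   (zextload v4i8 -> v4i32)  ->  PMOVZXBD xmm, dword ptr [mem]
    // folds the load and the extension into one instruction instead of a load
    // followed by unpack/shuffle sequences.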
1407
1408 if (Subtarget.is64Bit() && !Subtarget.hasAVX512()) {
1409 // We need to scalarize v4i64->v4f32 uint_to_fp using cvtsi2ss, but we can
1410 // do the pre and post work in the vector domain.
1413 // We need to mark SINT_TO_FP as Custom even though we want to expand it
1414 // so that DAG combine doesn't try to turn it into uint_to_fp.
1417 }
1418 }
1419
1420 if (!Subtarget.useSoftFloat() && Subtarget.hasSSE42()) {
1422 }
1423
1424 if (!Subtarget.useSoftFloat() && Subtarget.hasXOP()) {
1425 for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64,
1426 MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64 }) {
1429 }
1430
1431 // XOP can efficiently perform BITREVERSE with VPPERM.
1432 for (auto VT : { MVT::i8, MVT::i16, MVT::i32, MVT::i64 })
1434 }
1435
1436 if (!Subtarget.useSoftFloat() && Subtarget.hasAVX()) {
1437 bool HasInt256 = Subtarget.hasInt256();
1438
1439 addRegisterClass(MVT::v32i8, Subtarget.hasVLX() ? &X86::VR256XRegClass
1440 : &X86::VR256RegClass);
1441 addRegisterClass(MVT::v16i16, Subtarget.hasVLX() ? &X86::VR256XRegClass
1442 : &X86::VR256RegClass);
1443 addRegisterClass(MVT::v16f16, Subtarget.hasVLX() ? &X86::VR256XRegClass
1444 : &X86::VR256RegClass);
1445 addRegisterClass(MVT::v8i32, Subtarget.hasVLX() ? &X86::VR256XRegClass
1446 : &X86::VR256RegClass);
1447 addRegisterClass(MVT::v8f32, Subtarget.hasVLX() ? &X86::VR256XRegClass
1448 : &X86::VR256RegClass);
1449 addRegisterClass(MVT::v4i64, Subtarget.hasVLX() ? &X86::VR256XRegClass
1450 : &X86::VR256RegClass);
1451 addRegisterClass(MVT::v4f64, Subtarget.hasVLX() ? &X86::VR256XRegClass
1452 : &X86::VR256RegClass);
1453
1454 for (auto VT : { MVT::v8f32, MVT::v4f64 }) {
1455 setOperationAction(ISD::FFLOOR, VT, Legal);
1457 setOperationAction(ISD::FCEIL, VT, Legal);
1459 setOperationAction(ISD::FTRUNC, VT, Legal);
1461 setOperationAction(ISD::FRINT, VT, Legal);
1463 setOperationAction(ISD::FNEARBYINT, VT, Legal);
1465 setOperationAction(ISD::FROUNDEVEN, VT, Legal);
1467
1468 setOperationAction(ISD::FROUND, VT, Custom);
1469
1470 setOperationAction(ISD::FNEG, VT, Custom);
1471 setOperationAction(ISD::FABS, VT, Custom);
1473
1474 setOperationAction(ISD::FMAXIMUM, VT, Custom);
1475 setOperationAction(ISD::FMINIMUM, VT, Custom);
1476 setOperationAction(ISD::FMAXIMUMNUM, VT, Custom);
1477 setOperationAction(ISD::FMINIMUMNUM, VT, Custom);
1479 }
1480
1481 setOperationAction(ISD::LRINT, MVT::v8f32, Custom);
1482 setOperationAction(ISD::LRINT, MVT::v4f64, Custom);
1483
1484 // (fp_to_int:v8i16 (v8f32 ..)) requires the result type to be promoted
1485 // even though v8i16 is a legal type.
1486 setOperationPromotedToType(ISD::FP_TO_SINT, MVT::v8i16, MVT::v8i32);
1487 setOperationPromotedToType(ISD::FP_TO_UINT, MVT::v8i16, MVT::v8i32);
1488 setOperationPromotedToType(ISD::STRICT_FP_TO_SINT, MVT::v8i16, MVT::v8i32);
1489 setOperationPromotedToType(ISD::STRICT_FP_TO_UINT, MVT::v8i16, MVT::v8i32);
1493
1496 setOperationAction(ISD::FP_EXTEND, MVT::v8f32, Expand);
1498 setOperationAction(ISD::FP_EXTEND, MVT::v4f64, Custom);
1500
1512
1513 if (!Subtarget.hasAVX512())
1514 setOperationAction(ISD::BITCAST, MVT::v32i1, Custom);
1515
1516 // In the customized shift lowering, the legal v8i32/v4i64 cases
1517 // in AVX2 will be recognized.
1518 for (auto VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64 }) {
1524 if (VT == MVT::v4i64) continue;
1529 }
1530
1531 // These types need custom splitting if their input is a 128-bit vector.
1536
1540 setOperationAction(ISD::SELECT, MVT::v16i16, Custom);
1541 setOperationAction(ISD::SELECT, MVT::v16f16, Custom);
1544
1545 for (auto VT : { MVT::v16i16, MVT::v8i32, MVT::v4i64 }) {
1549 }
1550
1555
1556 for (auto VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64 }) {
1561
1562 // The condition codes aren't legal in SSE/AVX and under AVX512 we use
1563 // setcc all the way to isel and prefer SETGT in some isel patterns.
1566 }
1567
1568 setOperationAction(ISD::SETCC, MVT::v4f64, Custom);
1569 setOperationAction(ISD::SETCC, MVT::v8f32, Custom);
1574
1575 if (Subtarget.hasAnyFMA()) {
1576 for (auto VT : { MVT::f32, MVT::f64, MVT::v4f32, MVT::v8f32,
1577 MVT::v2f64, MVT::v4f64 }) {
1580 }
1581 }
1582
1583 for (auto VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64 }) {
1584 setOperationAction(ISD::ADD, VT, HasInt256 ? Legal : Custom);
1585 setOperationAction(ISD::SUB, VT, HasInt256 ? Legal : Custom);
1586 }
1587
1588 setOperationAction(ISD::MUL, MVT::v4i64, Custom);
1589 setOperationAction(ISD::MUL, MVT::v8i32, HasInt256 ? Legal : Custom);
1590 setOperationAction(ISD::MUL, MVT::v16i16, HasInt256 ? Legal : Custom);
1591 setOperationAction(ISD::MUL, MVT::v32i8, Custom);
1592
1593 setOperationAction(ISD::MULHU, MVT::v8i32, Custom);
1594 setOperationAction(ISD::MULHS, MVT::v8i32, Custom);
1595 setOperationAction(ISD::MULHU, MVT::v16i16, HasInt256 ? Legal : Custom);
1596 setOperationAction(ISD::MULHS, MVT::v16i16, HasInt256 ? Legal : Custom);
1597 setOperationAction(ISD::MULHU, MVT::v32i8, Custom);
1598 setOperationAction(ISD::MULHS, MVT::v32i8, Custom);
1599 setOperationAction(ISD::AVGCEILU, MVT::v16i16, HasInt256 ? Legal : Custom);
1600 setOperationAction(ISD::AVGCEILU, MVT::v32i8, HasInt256 ? Legal : Custom);
1601
1602 setOperationAction(ISD::SMULO, MVT::v32i8, Custom);
1603 setOperationAction(ISD::UMULO, MVT::v32i8, Custom);
1604
1605 setOperationAction(ISD::ABS, MVT::v4i64, Custom);
1606 setOperationAction(ISD::SMAX, MVT::v4i64, Custom);
1607 setOperationAction(ISD::UMAX, MVT::v4i64, Custom);
1608 setOperationAction(ISD::SMIN, MVT::v4i64, Custom);
1609 setOperationAction(ISD::UMIN, MVT::v4i64, Custom);
1610
1611 setOperationAction(ISD::UADDSAT, MVT::v32i8, HasInt256 ? Legal : Custom);
1612 setOperationAction(ISD::SADDSAT, MVT::v32i8, HasInt256 ? Legal : Custom);
1613 setOperationAction(ISD::USUBSAT, MVT::v32i8, HasInt256 ? Legal : Custom);
1614 setOperationAction(ISD::SSUBSAT, MVT::v32i8, HasInt256 ? Legal : Custom);
1615 setOperationAction(ISD::UADDSAT, MVT::v16i16, HasInt256 ? Legal : Custom);
1616 setOperationAction(ISD::SADDSAT, MVT::v16i16, HasInt256 ? Legal : Custom);
1617 setOperationAction(ISD::USUBSAT, MVT::v16i16, HasInt256 ? Legal : Custom);
1618 setOperationAction(ISD::SSUBSAT, MVT::v16i16, HasInt256 ? Legal : Custom);
1623
1624 for (auto VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32 }) {
1625 setOperationAction(ISD::ABS, VT, HasInt256 ? Legal : Custom);
1626 setOperationAction(ISD::SMAX, VT, HasInt256 ? Legal : Custom);
1627 setOperationAction(ISD::UMAX, VT, HasInt256 ? Legal : Custom);
1628 setOperationAction(ISD::SMIN, VT, HasInt256 ? Legal : Custom);
1629 setOperationAction(ISD::UMIN, VT, HasInt256 ? Legal : Custom);
1630 }
1631
1632 for (auto VT : {MVT::v16i16, MVT::v8i32, MVT::v4i64}) {
1635 }
1636
1637 if (HasInt256) {
1638 // The custom lowering for UINT_TO_FP for v8i32 becomes interesting
1639 // when we have a 256bit-wide blend with immediate.
1642
1643 // AVX2 also has wider vector sign/zero extending loads, VPMOV[SZ]X
1644 for (auto LoadExtOp : { ISD::SEXTLOAD, ISD::ZEXTLOAD }) {
1645 setLoadExtAction(LoadExtOp, MVT::v16i16, MVT::v16i8, Legal);
1646 setLoadExtAction(LoadExtOp, MVT::v8i32, MVT::v8i8, Legal);
1647 setLoadExtAction(LoadExtOp, MVT::v4i64, MVT::v4i8, Legal);
1648 setLoadExtAction(LoadExtOp, MVT::v8i32, MVT::v8i16, Legal);
1649 setLoadExtAction(LoadExtOp, MVT::v4i64, MVT::v4i16, Legal);
1650 setLoadExtAction(LoadExtOp, MVT::v4i64, MVT::v4i32, Legal);
1651 }
1652 }
1653
1654 for (auto VT : { MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64,
1655 MVT::v4f32, MVT::v8f32, MVT::v2f64, MVT::v4f64 }) {
1656 setOperationAction(ISD::MLOAD, VT, Subtarget.hasVLX() ? Legal : Custom);
1657 setOperationAction(ISD::MSTORE, VT, Legal);
1658 }
1659
1660 // Extract subvector is special because the value type
1661 // (result) is 128-bit but the source is 256-bit wide.
1662 for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64,
1663 MVT::v8f16, MVT::v4f32, MVT::v2f64 }) {
1665 }
1666
1667 // Custom lower several nodes for 256-bit types.
1668 for (MVT VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64,
1669 MVT::v16f16, MVT::v8f32, MVT::v4f64 }) {
1678 setOperationAction(ISD::STORE, VT, Custom);
1679 }
1680 setF16Action(MVT::v16f16, Expand);
1681 setOperationAction(ISD::FNEG, MVT::v16f16, Custom);
1682 setOperationAction(ISD::FABS, MVT::v16f16, Custom);
1684 setOperationAction(ISD::FADD, MVT::v16f16, Expand);
1685 setOperationAction(ISD::FSUB, MVT::v16f16, Expand);
1686 setOperationAction(ISD::FMUL, MVT::v16f16, Expand);
1687 setOperationAction(ISD::FDIV, MVT::v16f16, Expand);
1688
1689 if (HasInt256) {
1691
1692 // Custom legalize 2x32 to get a little better code.
1693 setOperationAction(ISD::MGATHER, MVT::v2f32, Custom);
1694 setOperationAction(ISD::MGATHER, MVT::v2i32, Custom);
1695
1696 for (auto VT : { MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64,
1697 MVT::v4f32, MVT::v8f32, MVT::v2f64, MVT::v4f64 })
1698 setOperationAction(ISD::MGATHER, VT, Custom);
1699 }
1700 }
1701
1702 if (!Subtarget.useSoftFloat() && !Subtarget.hasFP16() &&
1703 Subtarget.hasF16C()) {
1704 for (MVT VT : { MVT::f16, MVT::v2f16, MVT::v4f16, MVT::v8f16 }) {
1707 }
1708 for (MVT VT : { MVT::f32, MVT::v2f32, MVT::v4f32, MVT::v8f32 }) {
1709 setOperationAction(ISD::FP_EXTEND, VT, Custom);
1711 }
1712 for (unsigned Opc : {ISD::FADD, ISD::FSUB, ISD::FMUL, ISD::FDIV}) {
1713 setOperationPromotedToType(Opc, MVT::v8f16, MVT::v8f32);
1714 setOperationPromotedToType(Opc, MVT::v16f16, MVT::v16f32);
1715 }
1716 setOperationAction(ISD::SETCC, MVT::v8f16, Custom);
1717 setOperationAction(ISD::SETCC, MVT::v16f16, Custom);
1718 }
1719
1720 // This block controls legalization of the mask vector sizes that are
1721 // available with AVX512. 512-bit vectors are in a separate block controlled
1722 // by useAVX512Regs.
1723 if (!Subtarget.useSoftFloat() && Subtarget.hasAVX512()) {
1724 addRegisterClass(MVT::v1i1, &X86::VK1RegClass);
1725 addRegisterClass(MVT::v2i1, &X86::VK2RegClass);
1726 addRegisterClass(MVT::v4i1, &X86::VK4RegClass);
1727 addRegisterClass(MVT::v8i1, &X86::VK8RegClass);
1728 addRegisterClass(MVT::v16i1, &X86::VK16RegClass);
1729
1733
1734 setOperationPromotedToType(ISD::FP_TO_SINT, MVT::v8i1, MVT::v8i32);
1735 setOperationPromotedToType(ISD::FP_TO_UINT, MVT::v8i1, MVT::v8i32);
1736 setOperationPromotedToType(ISD::FP_TO_SINT, MVT::v4i1, MVT::v4i32);
1737 setOperationPromotedToType(ISD::FP_TO_UINT, MVT::v4i1, MVT::v4i32);
1738 setOperationPromotedToType(ISD::STRICT_FP_TO_SINT, MVT::v8i1, MVT::v8i32);
1739 setOperationPromotedToType(ISD::STRICT_FP_TO_UINT, MVT::v8i1, MVT::v8i32);
1740 setOperationPromotedToType(ISD::STRICT_FP_TO_SINT, MVT::v4i1, MVT::v4i32);
1741 setOperationPromotedToType(ISD::STRICT_FP_TO_UINT, MVT::v4i1, MVT::v4i32);
1749
1750 // There is no byte sized k-register load or store without AVX512DQ.
1751 if (!Subtarget.hasDQI()) {
1752 setOperationAction(ISD::LOAD, MVT::v1i1, Custom);
1753 setOperationAction(ISD::LOAD, MVT::v2i1, Custom);
1754 setOperationAction(ISD::LOAD, MVT::v4i1, Custom);
1755 setOperationAction(ISD::LOAD, MVT::v8i1, Custom);
1756
1757 setOperationAction(ISD::STORE, MVT::v1i1, Custom);
1758 setOperationAction(ISD::STORE, MVT::v2i1, Custom);
1759 setOperationAction(ISD::STORE, MVT::v4i1, Custom);
1760 setOperationAction(ISD::STORE, MVT::v8i1, Custom);
1761 }
1762
1763 // Extends of v16i1/v8i1/v4i1/v2i1 to 128-bit vectors.
1764 for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64 }) {
1768 }
1769
1770 for (auto VT : { MVT::v1i1, MVT::v2i1, MVT::v4i1, MVT::v8i1, MVT::v16i1 })
1772
1773 for (auto VT : { MVT::v2i1, MVT::v4i1, MVT::v8i1, MVT::v16i1 }) {
1777
1784 }
1785
1786 for (auto VT : { MVT::v1i1, MVT::v2i1, MVT::v4i1, MVT::v8i1 })
1788 }
1789 if (Subtarget.hasDQI() && Subtarget.hasVLX()) {
1790 for (MVT VT : {MVT::v4f32, MVT::v8f32, MVT::v2f64, MVT::v4f64}) {
1791 setOperationAction(ISD::LRINT, VT, Legal);
1792 setOperationAction(ISD::LLRINT, VT, Legal);
1793 }
1794 }
1795
1796 // This block controls legalization for 512-bit operations with 8/16/32/64 bit
1797 // elements. 512-bits can be disabled based on prefer-vector-width and
1798 // required-vector-width function attributes.
1799 if (!Subtarget.useSoftFloat() && Subtarget.useAVX512Regs()) {
1800 bool HasBWI = Subtarget.hasBWI();
1801
1802 addRegisterClass(MVT::v16i32, &X86::VR512RegClass);
1803 addRegisterClass(MVT::v16f32, &X86::VR512RegClass);
1804 addRegisterClass(MVT::v8i64, &X86::VR512RegClass);
1805 addRegisterClass(MVT::v8f64, &X86::VR512RegClass);
1806 addRegisterClass(MVT::v32i16, &X86::VR512RegClass);
1807 addRegisterClass(MVT::v32f16, &X86::VR512RegClass);
1808 addRegisterClass(MVT::v64i8, &X86::VR512RegClass);
1809
1810 for (auto ExtType : {ISD::ZEXTLOAD, ISD::SEXTLOAD}) {
1811 setLoadExtAction(ExtType, MVT::v16i32, MVT::v16i8, Legal);
1812 setLoadExtAction(ExtType, MVT::v16i32, MVT::v16i16, Legal);
1813 setLoadExtAction(ExtType, MVT::v8i64, MVT::v8i8, Legal);
1814 setLoadExtAction(ExtType, MVT::v8i64, MVT::v8i16, Legal);
1815 setLoadExtAction(ExtType, MVT::v8i64, MVT::v8i32, Legal);
1816 if (HasBWI)
1817 setLoadExtAction(ExtType, MVT::v32i16, MVT::v32i8, Legal);
1818 }
1819
1820 for (MVT VT : { MVT::v16f32, MVT::v8f64 }) {
1821 setOperationAction(ISD::FMAXIMUM, VT, Custom);
1822 setOperationAction(ISD::FMINIMUM, VT, Custom);
1823 setOperationAction(ISD::FMAXIMUMNUM, VT, Custom);
1824 setOperationAction(ISD::FMINIMUMNUM, VT, Custom);
1825 setOperationAction(ISD::FNEG, VT, Custom);
1826 setOperationAction(ISD::FABS, VT, Custom);
1831 }
1832 setOperationAction(ISD::LRINT, MVT::v16f32,
1833 Subtarget.hasDQI() ? Legal : Custom);
1834 setOperationAction(ISD::LRINT, MVT::v8f64,
1835 Subtarget.hasDQI() ? Legal : Custom);
1836 if (Subtarget.hasDQI())
1837 setOperationAction(ISD::LLRINT, MVT::v8f64, Legal);
1838
1839 for (MVT VT : { MVT::v16i1, MVT::v16i8 }) {
1844 }
1845
1846 for (MVT VT : { MVT::v16i16, MVT::v16i32 }) {
1851 }
1852
1857 setOperationAction(ISD::FP_EXTEND, MVT::v8f64, Custom);
1859
1871
1872 setTruncStoreAction(MVT::v8i64, MVT::v8i8, Legal);
1873 setTruncStoreAction(MVT::v8i64, MVT::v8i16, Legal);
1874 setTruncStoreAction(MVT::v8i64, MVT::v8i32, Legal);
1875 setTruncStoreAction(MVT::v16i32, MVT::v16i8, Legal);
1876 setTruncStoreAction(MVT::v16i32, MVT::v16i16, Legal);
1877 if (HasBWI)
1878 setTruncStoreAction(MVT::v32i16, MVT::v32i8, Legal);
1879
1880 // With 512-bit vectors and no VLX, we prefer to widen MLOAD/MSTORE
1881 // to 512-bit rather than use the AVX2 instructions so that we can use
1882 // k-masks.
1883 if (!Subtarget.hasVLX()) {
1884 for (auto VT : {MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64,
1885 MVT::v4f32, MVT::v8f32, MVT::v2f64, MVT::v4f64}) {
1886 setOperationAction(ISD::MLOAD, VT, Custom);
1887 setOperationAction(ISD::MSTORE, VT, Custom);
1888 }
1889 }
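    // For illustration: widening means a masked load of v8f32 is performed as
    // a 512-bit operation, roughly: extend the v8i1 mask to v16i1, do a zmm
    // load with that k-mask, then extract the low v8f32 subvector, rather than
    // using the AVX2 VMASKMOVPS form.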
1890
1892 setOperationAction(ISD::TRUNCATE, MVT::v16i16, Legal);
1893 setOperationAction(ISD::TRUNCATE, MVT::v32i8, HasBWI ? Legal : Custom);
1903
1904 if (HasBWI) {
1905 // Extends from v64i1 masks to 512-bit vectors.
1909 }
1910
1911 for (auto VT : { MVT::v16f32, MVT::v8f64 }) {
1912 setOperationAction(ISD::FFLOOR, VT, Legal);
1914 setOperationAction(ISD::FCEIL, VT, Legal);
1916 setOperationAction(ISD::FTRUNC, VT, Legal);
1918 setOperationAction(ISD::FRINT, VT, Legal);
1920 setOperationAction(ISD::FNEARBYINT, VT, Legal);
1922 setOperationAction(ISD::FROUNDEVEN, VT, Legal);
1924
1925 setOperationAction(ISD::FROUND, VT, Custom);
1926 }
1927
1928 for (auto VT : {MVT::v32i16, MVT::v16i32, MVT::v8i64}) {
1931 }
1932
1933 setOperationAction(ISD::ADD, MVT::v32i16, HasBWI ? Legal : Custom);
1934 setOperationAction(ISD::SUB, MVT::v32i16, HasBWI ? Legal : Custom);
1935 setOperationAction(ISD::ADD, MVT::v64i8, HasBWI ? Legal : Custom);
1936 setOperationAction(ISD::SUB, MVT::v64i8, HasBWI ? Legal : Custom);
1937
1938 setOperationAction(ISD::MUL, MVT::v8i64, Custom);
1939 setOperationAction(ISD::MUL, MVT::v16i32, Legal);
1940 setOperationAction(ISD::MUL, MVT::v32i16, HasBWI ? Legal : Custom);
1941 setOperationAction(ISD::MUL, MVT::v64i8, Custom);
1942
1943 setOperationAction(ISD::MULHU, MVT::v16i32, Custom);
1944 setOperationAction(ISD::MULHS, MVT::v16i32, Custom);
1945 setOperationAction(ISD::MULHS, MVT::v32i16, HasBWI ? Legal : Custom);
1946 setOperationAction(ISD::MULHU, MVT::v32i16, HasBWI ? Legal : Custom);
1947 setOperationAction(ISD::MULHS, MVT::v64i8, Custom);
1948 setOperationAction(ISD::MULHU, MVT::v64i8, Custom);
1949 setOperationAction(ISD::AVGCEILU, MVT::v32i16, HasBWI ? Legal : Custom);
1950 setOperationAction(ISD::AVGCEILU, MVT::v64i8, HasBWI ? Legal : Custom);
1951
1952 setOperationAction(ISD::SMULO, MVT::v64i8, Custom);
1953 setOperationAction(ISD::UMULO, MVT::v64i8, Custom);
1954
1955 for (auto VT : { MVT::v64i8, MVT::v32i16, MVT::v16i32, MVT::v8i64 }) {
1965
1966 // The condition codes aren't legal in SSE/AVX and under AVX512 we use
1967 // setcc all the way to isel and prefer SETGT in some isel patterns.
1970 }
1971
1972 setOperationAction(ISD::SETCC, MVT::v8f64, Custom);
1973 setOperationAction(ISD::SETCC, MVT::v16f32, Custom);
1978
1979 for (auto VT : { MVT::v16i32, MVT::v8i64 }) {
1986 }
1987
1988 for (auto VT : { MVT::v64i8, MVT::v32i16 }) {
1989 setOperationAction(ISD::ABS, VT, HasBWI ? Legal : Custom);
1990 setOperationAction(ISD::CTPOP, VT, Subtarget.hasBITALG() ? Legal : Custom);
1992 setOperationAction(ISD::SMAX, VT, HasBWI ? Legal : Custom);
1993 setOperationAction(ISD::UMAX, VT, HasBWI ? Legal : Custom);
1994 setOperationAction(ISD::SMIN, VT, HasBWI ? Legal : Custom);
1995 setOperationAction(ISD::UMIN, VT, HasBWI ? Legal : Custom);
2000 }
2001
2002 setOperationAction(ISD::FSHL, MVT::v64i8, Custom);
2003 setOperationAction(ISD::FSHR, MVT::v64i8, Custom);
2004 setOperationAction(ISD::FSHL, MVT::v32i16, Custom);
2005 setOperationAction(ISD::FSHR, MVT::v32i16, Custom);
2006 setOperationAction(ISD::FSHL, MVT::v16i32, Custom);
2007 setOperationAction(ISD::FSHR, MVT::v16i32, Custom);
2008
2009 if (Subtarget.hasDQI() || Subtarget.hasFP16())
2013 setOperationAction(Opc, MVT::v8i64, Custom);
2014
2015 if (Subtarget.hasDQI())
2016 setOperationAction(ISD::MUL, MVT::v8i64, Legal);
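    // For example (illustrative; the exact expansion can vary): an IR
    // multiply such as
    //   %r = mul <8 x i64> %a, %b
    // selects to a single vpmullq when AVX512DQ is available (Legal above),
    // while the Custom lowering without DQI expands it into a
    // vpmuludq-based multiply/shift/add sequence on 512-bit registers.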
2017
2018 if (Subtarget.hasCDI()) {
2019 // NonVLX sub-targets extend 128/256 vectors to use the 512 version.
2020 for (auto VT : { MVT::v16i32, MVT::v8i64} ) {
2022 }
2023 } // Subtarget.hasCDI()
2024
2025 if (Subtarget.hasVPOPCNTDQ()) {
2026 for (auto VT : { MVT::v16i32, MVT::v8i64 })
2028 }
2029
2030 // Extract subvector is special because the value type
2031 // (result) is 256-bit but the source is 512-bit wide.
2032 // 128-bit was made Legal under AVX1.
2033 for (auto VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64,
2034 MVT::v16f16, MVT::v8f32, MVT::v4f64 })
2036
2037 for (auto VT : { MVT::v64i8, MVT::v32i16, MVT::v16i32, MVT::v8i64,
2038 MVT::v32f16, MVT::v16f32, MVT::v8f64 }) {
2048 }
2049 setF16Action(MVT::v32f16, Expand);
2052 setOperationAction(ISD::FP_EXTEND, MVT::v16f32, Custom);
2054 for (unsigned Opc : {ISD::FADD, ISD::FSUB, ISD::FMUL, ISD::FDIV})
2055 setOperationPromotedToType(Opc, MVT::v32f16, MVT::v32f32);
2056 setOperationAction(ISD::SETCC, MVT::v32f16, Custom);
2057
2058 for (auto VT : { MVT::v16i32, MVT::v8i64, MVT::v16f32, MVT::v8f64 }) {
2059 setOperationAction(ISD::MLOAD, VT, Legal);
2060 setOperationAction(ISD::MSTORE, VT, Legal);
2061 setOperationAction(ISD::MGATHER, VT, Custom);
2062 setOperationAction(ISD::MSCATTER, VT, Custom);
2063 }
2064 if (HasBWI) {
2065 for (auto VT : { MVT::v64i8, MVT::v32i16 }) {
2066 setOperationAction(ISD::MLOAD, VT, Legal);
2067 setOperationAction(ISD::MSTORE, VT, Legal);
2068 }
2069 } else {
2070 setOperationAction(ISD::STORE, MVT::v32i16, Custom);
2071 setOperationAction(ISD::STORE, MVT::v64i8, Custom);
2072 }
2073
2074 if (Subtarget.hasVBMI2()) {
2075 for (auto VT : {MVT::v32i16, MVT::v16i32, MVT::v8i64}) {
2078 }
2079
2080 setOperationAction(ISD::ROTL, MVT::v32i16, Custom);
2081 setOperationAction(ISD::ROTR, MVT::v32i16, Custom);
2082 }
2083
2084 setOperationAction(ISD::FNEG, MVT::v32f16, Custom);
2085 setOperationAction(ISD::FABS, MVT::v32f16, Custom);
2087 }// useAVX512Regs
2088
2089 if (!Subtarget.useSoftFloat() && Subtarget.hasVBMI2()) {
2090 for (auto VT : {MVT::v8i16, MVT::v4i32, MVT::v2i64, MVT::v16i16, MVT::v8i32,
2091 MVT::v4i64}) {
2094 }
2095 }
2096
2097 // This block controls legalization for operations that don't have
2098 // pre-AVX512 equivalents. Without VLX we use 512-bit operations for
2099 // narrower widths.
2100 if (!Subtarget.useSoftFloat() && Subtarget.hasAVX512()) {
2101 // These operations are handled on non-VLX by artificially widening in
2102 // isel patterns.
2103
2107
2108 if (Subtarget.hasDQI()) {
2109 // Fast v2f32 SINT_TO_FP( v2i64 ) custom conversion.
2110 // v2f32 UINT_TO_FP is already custom under SSE2.
2113 "Unexpected operation action!");
2114 // v2i64 FP_TO_S/UINT(v2f32) custom conversion.
2119 }
2120
2121 for (auto VT : { MVT::v2i64, MVT::v4i64 }) {
2127 }
2128
2129 for (auto VT : { MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64 }) {
2132 }
2133
2134 // Custom legalize 2x32 to get a little better code.
2135 setOperationAction(ISD::MSCATTER, MVT::v2f32, Custom);
2136 setOperationAction(ISD::MSCATTER, MVT::v2i32, Custom);
2137
2138 for (auto VT : { MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64,
2139 MVT::v4f32, MVT::v8f32, MVT::v2f64, MVT::v4f64 })
2140 setOperationAction(ISD::MSCATTER, VT, Custom);
2141
2142 if (Subtarget.hasDQI()) {
2146 setOperationAction(Opc, MVT::v2i64, Custom);
2147 setOperationAction(Opc, MVT::v4i64, Custom);
2148 }
2149 setOperationAction(ISD::MUL, MVT::v2i64, Legal);
2150 setOperationAction(ISD::MUL, MVT::v4i64, Legal);
2151 }
2152
2153 if (Subtarget.hasCDI()) {
2154 for (auto VT : { MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64 }) {
2156 }
2157 } // Subtarget.hasCDI()
2158
2159 if (Subtarget.hasVPOPCNTDQ()) {
2160 for (auto VT : { MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64 })
2162 }
2163
2164 // We can try to convert vectors to different sizes to leverage legal
2165 // `vpcompress` cases. So we mark these supported vector sizes as Custom and
2166 // then specialize to Legal below.
2167 for (MVT VT : {MVT::v8i32, MVT::v8f32, MVT::v4i32, MVT::v4f32, MVT::v4i64,
2168 MVT::v4f64, MVT::v2i64, MVT::v2f64, MVT::v16i8, MVT::v8i16,
2169 MVT::v16i16, MVT::v8i8})
2171
2172 // Legal vpcompress depends on various AVX512 extensions.
2173 // Legal in AVX512F
2174 for (MVT VT : {MVT::v16i32, MVT::v16f32, MVT::v8i64, MVT::v8f64})
2176
2177 // Legal in AVX512F + AVX512VL
2178 if (Subtarget.hasVLX())
2179 for (MVT VT : {MVT::v8i32, MVT::v8f32, MVT::v4i32, MVT::v4f32, MVT::v4i64,
2180 MVT::v4f64, MVT::v2i64, MVT::v2f64})
2182
2183 // Legal in AVX512F + AVX512VBMI2
2184 if (Subtarget.hasVBMI2())
2185 for (MVT VT : {MVT::v32i16, MVT::v64i8})
2187
2188 // Legal in AVX512F + AVX512VL + AVX512VBMI2
2189 if (Subtarget.hasVBMI2() && Subtarget.hasVLX())
2190 for (MVT VT : {MVT::v16i8, MVT::v8i16, MVT::v32i8, MVT::v16i16})
2192 }
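  // Illustrative example (hedged; the chosen widening may vary): without VLX,
  // a vector compress on a narrower type such as v8i32 can be widened to the
  // Legal v16i32 form above, so a single k-masked vpcompressd handles it
  // instead of an element-by-element expansion.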
2193
2194  // This block controls legalization of v32i1/v64i1, which are available with
2195  // AVX512BW.
2196 if (!Subtarget.useSoftFloat() && Subtarget.hasBWI()) {
2197 addRegisterClass(MVT::v32i1, &X86::VK32RegClass);
2198 addRegisterClass(MVT::v64i1, &X86::VK64RegClass);
2199
2200 for (auto VT : { MVT::v32i1, MVT::v64i1 }) {
2211 }
2212
2213 for (auto VT : { MVT::v16i1, MVT::v32i1 })
2215
2216 // Extends from v32i1 masks to 256-bit vectors.
2220
2221 for (auto VT : {MVT::v32i8, MVT::v16i8, MVT::v16i16, MVT::v8i16,
2222 MVT::v16f16, MVT::v8f16}) {
2223 setOperationAction(ISD::MLOAD, VT, Subtarget.hasVLX() ? Legal : Custom);
2224 setOperationAction(ISD::MSTORE, VT, Subtarget.hasVLX() ? Legal : Custom);
2225 }
2226
2227 // These operations are handled on non-VLX by artificially widening in
2228 // isel patterns.
2229 // TODO: Custom widen in lowering on non-VLX and drop the isel patterns?
2230
2231 if (Subtarget.hasBITALG()) {
2232 for (auto VT : { MVT::v16i8, MVT::v32i8, MVT::v8i16, MVT::v16i16 })
2234 }
2235 }
2236
2237 if (!Subtarget.useSoftFloat() && Subtarget.hasFP16()) {
2238 auto setGroup = [&] (MVT VT) {
2247 setOperationAction(ISD::FSQRT, VT, Legal);
2249
2250 setOperationAction(ISD::FFLOOR, VT, Legal);
2252 setOperationAction(ISD::FCEIL, VT, Legal);
2254 setOperationAction(ISD::FTRUNC, VT, Legal);
2256 setOperationAction(ISD::FRINT, VT, Legal);
2258 setOperationAction(ISD::FNEARBYINT, VT, Legal);
2260 setOperationAction(ISD::FROUNDEVEN, VT, Legal);
2262
2263 setOperationAction(ISD::FROUND, VT, Custom);
2264
2265 setOperationAction(ISD::LOAD, VT, Legal);
2266 setOperationAction(ISD::STORE, VT, Legal);
2267
2273
2274 setOperationAction(ISD::FNEG, VT, Custom);
2275 setOperationAction(ISD::FABS, VT, Custom);
2279
2283 };
2284
2285 // AVX512_FP16 scalar operations
2286 setGroup(MVT::f16);
2290 setOperationAction(ISD::BR_CC, MVT::f16, Expand);
2292 setOperationAction(ISD::FROUNDEVEN, MVT::f16, Legal);
2296 setOperationAction(ISD::FMAXIMUM, MVT::f16, Custom);
2297 setOperationAction(ISD::FMINIMUM, MVT::f16, Custom);
2298 setOperationAction(ISD::FMAXIMUMNUM, MVT::f16, Custom);
2299 setOperationAction(ISD::FMINIMUMNUM, MVT::f16, Custom);
2300 setOperationAction(ISD::FP_EXTEND, MVT::f32, Legal);
2302 setOperationAction(ISD::LRINT, MVT::f16, Legal);
2303 setOperationAction(ISD::LLRINT, MVT::f16, Legal);
2304
2307
2308 if (Subtarget.useAVX512Regs()) {
2309 setGroup(MVT::v32f16);
2315 setOperationAction(ISD::FP_ROUND, MVT::v16f16, Legal);
2317 setOperationAction(ISD::FP_EXTEND, MVT::v16f32, Custom);
2319 setOperationAction(ISD::FP_EXTEND, MVT::v8f64, Custom);
2322
2327 setOperationPromotedToType(ISD::FP_TO_SINT, MVT::v32i8, MVT::v32i16);
2329 MVT::v32i16);
2330 setOperationPromotedToType(ISD::FP_TO_UINT, MVT::v32i8, MVT::v32i16);
2332 MVT::v32i16);
2333 setOperationPromotedToType(ISD::FP_TO_SINT, MVT::v32i1, MVT::v32i16);
2335 MVT::v32i16);
2336 setOperationPromotedToType(ISD::FP_TO_UINT, MVT::v32i1, MVT::v32i16);
2338 MVT::v32i16);
2339
2343
2344 setLoadExtAction(ISD::EXTLOAD, MVT::v8f64, MVT::v8f16, Legal);
2345 setLoadExtAction(ISD::EXTLOAD, MVT::v16f32, MVT::v16f16, Legal);
2346
2347 setOperationAction(ISD::FMINIMUM, MVT::v32f16, Custom);
2348 setOperationAction(ISD::FMAXIMUM, MVT::v32f16, Custom);
2349 setOperationAction(ISD::FMINIMUMNUM, MVT::v32f16, Custom);
2350 setOperationAction(ISD::FMAXIMUMNUM, MVT::v32f16, Custom);
2351 setOperationAction(ISD::LRINT, MVT::v32f16, Legal);
2352 setOperationAction(ISD::LLRINT, MVT::v8f16, Legal);
2353 }
2354
2359
2360 if (Subtarget.hasVLX()) {
2361 setGroup(MVT::v8f16);
2362 setGroup(MVT::v16f16);
2363
2374
2377 setOperationAction(ISD::FP_EXTEND, MVT::v8f32, Custom);
2379 setOperationAction(ISD::FP_EXTEND, MVT::v4f64, Custom);
2381
2382 // INSERT_VECTOR_ELT v8f16 extended to VECTOR_SHUFFLE
2385
2389
2390 setLoadExtAction(ISD::EXTLOAD, MVT::v4f64, MVT::v4f16, Legal);
2391 setLoadExtAction(ISD::EXTLOAD, MVT::v2f64, MVT::v2f16, Legal);
2392 setLoadExtAction(ISD::EXTLOAD, MVT::v8f32, MVT::v8f16, Legal);
2393 setLoadExtAction(ISD::EXTLOAD, MVT::v4f32, MVT::v4f16, Legal);
2394
2395 // Need to custom widen these to prevent scalarization.
2396 setOperationAction(ISD::LOAD, MVT::v4f16, Custom);
2397 setOperationAction(ISD::STORE, MVT::v4f16, Custom);
2398
2399 setOperationAction(ISD::FMINIMUM, MVT::v8f16, Custom);
2400 setOperationAction(ISD::FMAXIMUM, MVT::v8f16, Custom);
2401 setOperationAction(ISD::FMINIMUMNUM, MVT::v8f16, Custom);
2402 setOperationAction(ISD::FMAXIMUMNUM, MVT::v8f16, Custom);
2403
2404 setOperationAction(ISD::FMINIMUM, MVT::v16f16, Custom);
2405 setOperationAction(ISD::FMAXIMUM, MVT::v16f16, Custom);
2406 setOperationAction(ISD::FMINIMUMNUM, MVT::v16f16, Custom);
2407 setOperationAction(ISD::FMAXIMUMNUM, MVT::v16f16, Custom);
2408 setOperationAction(ISD::LRINT, MVT::v8f16, Legal);
2409 setOperationAction(ISD::LRINT, MVT::v16f16, Legal);
2410 }
2411 }
2412
2413 if (!Subtarget.useSoftFloat() &&
2414 (Subtarget.hasAVXNECONVERT() || Subtarget.hasBF16())) {
2415 addRegisterClass(MVT::v8bf16, Subtarget.hasAVX512() ? &X86::VR128XRegClass
2416 : &X86::VR128RegClass);
2417 addRegisterClass(MVT::v16bf16, Subtarget.hasAVX512() ? &X86::VR256XRegClass
2418 : &X86::VR256RegClass);
2419 // We set the type action of bf16 to TypeSoftPromoteHalf, but we don't
2420 // provide the method to promote BUILD_VECTOR and INSERT_VECTOR_ELT.
2421    // Set the operation action to Custom so we can do the customization later.
2424 for (auto VT : {MVT::v8bf16, MVT::v16bf16}) {
2425 setF16Action(VT, Expand);
2426 if (!Subtarget.hasBF16())
2432 }
2433 for (unsigned Opc : {ISD::FADD, ISD::FSUB, ISD::FMUL, ISD::FDIV}) {
2434 setOperationPromotedToType(Opc, MVT::v8bf16, MVT::v8f32);
2435 setOperationPromotedToType(Opc, MVT::v16bf16, MVT::v16f32);
2436 }
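    // Illustrative sketch of the promotion above (assuming the usual
    // promote-through-f32 scheme): an operation such as
    //   %r = fadd <8 x bfloat> %a, %b
    // is performed by extending both operands to <8 x float>, doing the add
    // in f32, and truncating the result back to bf16.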
2437 setOperationAction(ISD::SETCC, MVT::v8bf16, Custom);
2438 setOperationAction(ISD::SETCC, MVT::v16bf16, Custom);
2440 addLegalFPImmediate(APFloat::getZero(APFloat::BFloat()));
2441 }
2442
2443 if (!Subtarget.useSoftFloat() && Subtarget.hasBF16() &&
2444 Subtarget.useAVX512Regs()) {
2445 addRegisterClass(MVT::v32bf16, &X86::VR512RegClass);
2446 setF16Action(MVT::v32bf16, Expand);
2447 for (unsigned Opc : {ISD::FADD, ISD::FSUB, ISD::FMUL, ISD::FDIV})
2448 setOperationPromotedToType(Opc, MVT::v32bf16, MVT::v32f32);
2449 setOperationAction(ISD::SETCC, MVT::v32bf16, Custom);
2451 setOperationAction(ISD::FP_ROUND, MVT::v16bf16, Custom);
2455 }
2456
2457 if (!Subtarget.useSoftFloat() && Subtarget.hasAVX10_2()) {
2458 setOperationAction(ISD::FADD, MVT::v32bf16, Legal);
2459 setOperationAction(ISD::FSUB, MVT::v32bf16, Legal);
2460 setOperationAction(ISD::FMUL, MVT::v32bf16, Legal);
2461 setOperationAction(ISD::FDIV, MVT::v32bf16, Legal);
2462 setOperationAction(ISD::FSQRT, MVT::v32bf16, Legal);
2463 setOperationAction(ISD::FMA, MVT::v32bf16, Legal);
2464 setOperationAction(ISD::SETCC, MVT::v32bf16, Custom);
2465 setOperationAction(ISD::FMINIMUM, MVT::v32bf16, Custom);
2466 setOperationAction(ISD::FMAXIMUM, MVT::v32bf16, Custom);
2467 setOperationAction(ISD::FMINIMUMNUM, MVT::v32bf16, Custom);
2468 setOperationAction(ISD::FMAXIMUMNUM, MVT::v32bf16, Custom);
2469 for (auto VT : {MVT::v8bf16, MVT::v16bf16}) {
2474 setOperationAction(ISD::FSQRT, VT, Legal);
2477 setOperationAction(ISD::FMINIMUM, VT, Custom);
2478 setOperationAction(ISD::FMAXIMUM, VT, Custom);
2479 setOperationAction(ISD::FMINIMUMNUM, VT, Custom);
2480 setOperationAction(ISD::FMAXIMUMNUM, VT, Custom);
2481 }
2482 for (auto VT : {MVT::f16, MVT::f32, MVT::f64}) {
2485 }
2486 }
2487
2488 if (!Subtarget.useSoftFloat() && Subtarget.hasVLX()) {
2489 setTruncStoreAction(MVT::v4i64, MVT::v4i8, Legal);
2490 setTruncStoreAction(MVT::v4i64, MVT::v4i16, Legal);
2491 setTruncStoreAction(MVT::v4i64, MVT::v4i32, Legal);
2492 setTruncStoreAction(MVT::v8i32, MVT::v8i8, Legal);
2493 setTruncStoreAction(MVT::v8i32, MVT::v8i16, Legal);
2494
2495 setTruncStoreAction(MVT::v2i64, MVT::v2i8, Legal);
2496 setTruncStoreAction(MVT::v2i64, MVT::v2i16, Legal);
2497 setTruncStoreAction(MVT::v2i64, MVT::v2i32, Legal);
2498 setTruncStoreAction(MVT::v4i32, MVT::v4i8, Legal);
2499 setTruncStoreAction(MVT::v4i32, MVT::v4i16, Legal);
2500
2501 if (Subtarget.hasBWI()) {
2502 setTruncStoreAction(MVT::v16i16, MVT::v16i8, Legal);
2503 setTruncStoreAction(MVT::v8i16, MVT::v8i8, Legal);
2504 }
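    // For example (illustrative): with these truncating stores Legal, a
    // trunc + store pair such as
    //   %t = trunc <8 x i32> %v to <8 x i16>
    //   store <8 x i16> %t, ptr %p
    // can select directly to a vpmovdw with a memory destination rather
    // than a separate shuffle and store.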
2505
2506 if (Subtarget.hasFP16()) {
2507 // vcvttph2[u]dq v4f16 -> v4i32/64, v2f16 -> v2i32/64
2516 // vcvt[u]dq2ph v4i32/64 -> v4f16, v2i32/64 -> v2f16
2525 // vcvtps2phx v4f32 -> v4f16, v2f32 -> v2f16
2530 // vcvtph2psx v4f16 -> v4f32, v2f16 -> v2f32
2531 setOperationAction(ISD::FP_EXTEND, MVT::v2f16, Custom);
2533 setOperationAction(ISD::FP_EXTEND, MVT::v4f16, Custom);
2535 }
2536 }
2537
2538 if (!Subtarget.useSoftFloat() && Subtarget.hasAMXTILE()) {
2539 addRegisterClass(MVT::x86amx, &X86::TILERegClass);
2540 }
2541
2542 // We want to custom lower some of our intrinsics.
2546 if (!Subtarget.is64Bit()) {
2548 }
2549
2550 // Only custom-lower 64-bit SADDO and friends on 64-bit because we don't
2551 // handle type legalization for these operations here.
2552 //
2553 // FIXME: We really should do custom legalization for addition and
2554 // subtraction on x86-32 once PR3203 is fixed. We really can't do much better
2555 // than generic legalization for 64-bit multiplication-with-overflow, though.
2556 for (auto VT : { MVT::i8, MVT::i16, MVT::i32, MVT::i64 }) {
2557 if (VT == MVT::i64 && !Subtarget.is64Bit())
2558 continue;
2559 // Add/Sub/Mul with overflow operations are custom lowered.
2566
2567 // Support carry in as value rather than glue.
2573 }
2574
2575 // Combine sin / cos into _sincos_stret if it is available.
2576 if (getLibcallName(RTLIB::SINCOS_STRET_F32) != nullptr &&
2577 getLibcallName(RTLIB::SINCOS_STRET_F64) != nullptr) {
2578 setOperationAction(ISD::FSINCOS, MVT::f64, Custom);
2579 setOperationAction(ISD::FSINCOS, MVT::f32, Custom);
2580 }
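  // For example (illustrative, on targets such as Darwin that provide the
  // routine): separate sinf and cosf calls on the same value can be combined
  // into one __sincosf_stret call that returns both results.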
2581
2582 if (Subtarget.isTargetWin64()) {
2583 setOperationAction(ISD::SDIV, MVT::i128, Custom);
2584 setOperationAction(ISD::UDIV, MVT::i128, Custom);
2585 setOperationAction(ISD::SREM, MVT::i128, Custom);
2586 setOperationAction(ISD::UREM, MVT::i128, Custom);
2595 }
2596
2597  // On 32-bit MSVC, `fmodf(f32)` is not defined - only `fmod(f64)`
2598  // is. We should promote the value to 64 bits to solve this.
2599 // This is what the CRT headers do - `fmodf` is an inline header
2600 // function casting to f64 and calling `fmod`.
2601 if (Subtarget.is32Bit() &&
2602 (Subtarget.isTargetWindowsMSVC() || Subtarget.isTargetWindowsItanium()))
2603 // clang-format off
2604 for (ISD::NodeType Op :
2605 {ISD::FACOS, ISD::STRICT_FACOS,
2606 ISD::FASIN, ISD::STRICT_FASIN,
2607 ISD::FATAN, ISD::STRICT_FATAN,
2608 ISD::FATAN2, ISD::STRICT_FATAN2,
2609 ISD::FCEIL, ISD::STRICT_FCEIL,
2610 ISD::FCOS, ISD::STRICT_FCOS,
2611 ISD::FCOSH, ISD::STRICT_FCOSH,
2612 ISD::FEXP, ISD::STRICT_FEXP,
2613 ISD::FFLOOR, ISD::STRICT_FFLOOR,
2615 ISD::FLOG, ISD::STRICT_FLOG,
2616 ISD::FLOG10, ISD::STRICT_FLOG10,
2617 ISD::FPOW, ISD::STRICT_FPOW,
2618 ISD::FSIN, ISD::STRICT_FSIN,
2619 ISD::FSINH, ISD::STRICT_FSINH,
2620 ISD::FTAN, ISD::STRICT_FTAN,
2621 ISD::FTANH, ISD::STRICT_FTANH,
2622         // TODO: Add ISD::STRICT_FMODF too once implemented.
2623 ISD::FMODF})
2624 if (isOperationExpand(Op, MVT::f32))
2625 setOperationAction(Op, MVT::f32, Promote);
2626 // clang-format on
2627
2628 // On MSVC, both 32-bit and 64-bit, ldexpf(f32) is not defined. MinGW has
2629 // it, but it's just a wrapper around ldexp.
2630 if (Subtarget.isOSWindows()) {
2631 for (ISD::NodeType Op : {ISD::FLDEXP, ISD::STRICT_FLDEXP, ISD::FFREXP})
2632 if (isOperationExpand(Op, MVT::f32))
2633 setOperationAction(Op, MVT::f32, Promote);
2634 }
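  // For example (illustrative): with FLDEXP promoted for f32, a call like
  //   float r = ldexpf(x, n);
  // is lowered by extending x to double, calling ldexp, and truncating the
  // result back to float, avoiding the missing f32 libcall.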
2635
2636 // We have target-specific dag combine patterns for the following nodes:
2644 ISD::BITCAST,
2647 ISD::SHL,
2648 ISD::SRA,
2649 ISD::SRL,
2650 ISD::OR,
2651 ISD::AND,
2657 ISD::ADD,
2658 ISD::FADD,
2659 ISD::FSUB,
2660 ISD::FNEG,
2661 ISD::FMA,
2663 ISD::FMINNUM,
2664 ISD::FMAXNUM,
2665 ISD::SUB,
2666 ISD::LOAD,
2667 ISD::LRINT,
2668 ISD::LLRINT,
2669 ISD::MLOAD,
2670 ISD::STORE,
2671 ISD::MSTORE,
2687 ISD::SETCC,
2688 ISD::MUL,
2689 ISD::XOR,
2690 ISD::MSCATTER,
2691 ISD::MGATHER,
2692 ISD::FP16_TO_FP,
2693 ISD::FP_EXTEND,
2700
2701 computeRegisterProperties(Subtarget.getRegisterInfo());
2702
2703 MaxStoresPerMemset = 16; // For @llvm.memset -> sequence of stores
2705 MaxStoresPerMemcpy = 8; // For @llvm.memcpy -> sequence of stores
2707 MaxStoresPerMemmove = 8; // For @llvm.memmove -> sequence of stores
2709
2710 // TODO: These control memcmp expansion in CGP and could be raised higher, but
2711  // that needs to be benchmarked and balanced with the potential use of vector
2712 // load/store types (PR33329, PR33914).
2715
2716 // Default loop alignment, which can be overridden by -align-loops.
2718
2719 // An out-of-order CPU can speculatively execute past a predictable branch,
2720 // but a conditional move could be stalled by an expensive earlier operation.
2721 PredictableSelectIsExpensive = Subtarget.getSchedModel().isOutOfOrder();
2722 EnableExtLdPromotion = true;
2724
2726
2727 // Default to having -disable-strictnode-mutation on
2728 IsStrictFPEnabled = true;
2729}
2730
2731// This has so far only been implemented for 64-bit MachO.
2733 return Subtarget.isTargetMachO() && Subtarget.is64Bit();
2734}
2735
2737 // Currently only MSVC CRTs XOR the frame pointer into the stack guard value.
2738 return Subtarget.getTargetTriple().isOSMSVCRT() && !Subtarget.isTargetMachO();
2739}
2740
2742 const SDLoc &DL) const {
2743 EVT PtrTy = getPointerTy(DAG.getDataLayout());
2744 unsigned XorOp = Subtarget.is64Bit() ? X86::XOR64_FP : X86::XOR32_FP;
2745 MachineSDNode *Node = DAG.getMachineNode(XorOp, DL, PtrTy, Val);
2746 return SDValue(Node, 0);
2747}
2748
2751 if ((VT == MVT::v32i1 || VT == MVT::v64i1) && Subtarget.hasAVX512() &&
2752 !Subtarget.hasBWI())
2753 return TypeSplitVector;
2754
2755 // Since v8f16 is legal, widen anything over v4f16.
2756 if (!VT.isScalableVector() && VT.getVectorNumElements() != 1 &&
2757 VT.getVectorNumElements() <= 4 && !Subtarget.hasF16C() &&
2758 VT.getVectorElementType() == MVT::f16)
2759 return TypeSplitVector;
2760
2761 if (!VT.isScalableVector() && VT.getVectorNumElements() != 1 &&
2762 VT.getVectorElementType() != MVT::i1)
2763 return TypeWidenVector;
2764
2766}
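// Illustrative examples (hedged): with AVX512F but no BWI, v64i1 is split;
// a v4f16 value without F16C is likewise split; and most other short
// vectors, e.g. v2i16, are widened to the next legal vector type rather
// than scalarized.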
2767
2768FastISel *
2770 const TargetLibraryInfo *libInfo) const {
2771 return X86::createFastISel(funcInfo, libInfo);
2772}
2773
2774//===----------------------------------------------------------------------===//
2775// Other Lowering Hooks
2776//===----------------------------------------------------------------------===//
2777
2779 bool AssumeSingleUse) {
2780 if (!AssumeSingleUse && !Op.hasOneUse())
2781 return false;
2782 if (!ISD::isNormalLoad(Op.getNode()))
2783 return false;
2784
2785 // If this is an unaligned vector, make sure the target supports folding it.
2786 auto *Ld = cast<LoadSDNode>(Op.getNode());
2787 if (!Subtarget.hasAVX() && !Subtarget.hasSSEUnalignedMem() &&
2788 Ld->getValueSizeInBits(0) == 128 && Ld->getAlign() < Align(16))
2789 return false;
2790
2791 // TODO: If this is a non-temporal load and the target has an instruction
2792 // for it, it should not be folded. See "useNonTemporalLoad()".
2793
2794 return true;
2795}
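// For example (illustrative): a 128-bit vector load with only 8-byte
// alignment cannot be folded into a legacy SSE instruction such as addps
// (which requires aligned memory), but with AVX the VEX-encoded forms
// accept unaligned memory operands, so the fold is allowed.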
2796
2798 const X86Subtarget &Subtarget,
2799 bool AssumeSingleUse) {
2800 assert(Subtarget.hasAVX() && "Expected AVX for broadcast from memory");
2801 if (!X86::mayFoldLoad(Op, Subtarget, AssumeSingleUse))
2802 return false;
2803
2804  // We cannot replace a wide volatile load with a broadcast-from-memory,
2805 // because that would narrow the load, which isn't legal for volatiles.
2806 auto *Ld = cast<LoadSDNode>(Op.getNode());
2807 return !Ld->isVolatile() ||
2808 Ld->getValueSizeInBits(0) == EltVT.getScalarSizeInBits();
2809}
2810
2812 if (!Op.hasOneUse())
2813 return false;
2814 // Peek through (oneuse) bitcast users
2815 SDNode *User = *Op->user_begin();
2816 while (User->getOpcode() == ISD::BITCAST) {
2817 if (!User->hasOneUse())
2818 return false;
2819 User = *User->user_begin();
2820 }
2821 return ISD::isNormalStore(User);
2822}
2823
2825 if (Op.hasOneUse()) {
2826 unsigned Opcode = Op.getNode()->user_begin()->getOpcode();
2827 return (ISD::ZERO_EXTEND == Opcode);
2828 }
2829 return false;
2830}
2831
2832static bool isLogicOp(unsigned Opcode) {
2833 // TODO: Add support for X86ISD::FAND/FOR/FXOR/FANDN with test coverage.
2834 return ISD::isBitwiseLogicOp(Opcode) || X86ISD::ANDNP == Opcode;
2835}
2836
2837static bool isTargetShuffle(unsigned Opcode) {
2838 switch(Opcode) {
2839 default: return false;
2840 case X86ISD::BLENDI:
2841 case X86ISD::PSHUFB:
2842 case X86ISD::PSHUFD:
2843 case X86ISD::PSHUFHW:
2844 case X86ISD::PSHUFLW:
2845 case X86ISD::SHUFP:
2846 case X86ISD::INSERTPS:
2847 case X86ISD::EXTRQI:
2848 case X86ISD::INSERTQI:
2849 case X86ISD::VALIGN:
2850 case X86ISD::PALIGNR:
2851 case X86ISD::VSHLDQ:
2852 case X86ISD::VSRLDQ:
2853 case X86ISD::MOVLHPS:
2854 case X86ISD::MOVHLPS:
2855 case X86ISD::MOVSHDUP:
2856 case X86ISD::MOVSLDUP:
2857 case X86ISD::MOVDDUP:
2858 case X86ISD::MOVSS:
2859 case X86ISD::MOVSD:
2860 case X86ISD::MOVSH:
2861 case X86ISD::UNPCKL:
2862 case X86ISD::UNPCKH:
2863 case X86ISD::VBROADCAST:
2864 case X86ISD::VPERMILPI:
2865 case X86ISD::VPERMILPV:
2866 case X86ISD::VPERM2X128:
2867 case X86ISD::SHUF128:
2868 case X86ISD::VPERMIL2:
2869 case X86ISD::VPERMI:
2870 case X86ISD::VPPERM:
2871 case X86ISD::VPERMV:
2872 case X86ISD::VPERMV3:
2873 case X86ISD::VZEXT_MOVL:
2874 return true;
2875 }
2876}
2877
2878static bool isTargetShuffleVariableMask(unsigned Opcode) {
2879 switch (Opcode) {
2880 default: return false;
2881 // Target Shuffles.
2882 case X86ISD::PSHUFB:
2883 case X86ISD::VPERMILPV:
2884 case X86ISD::VPERMIL2:
2885 case X86ISD::VPPERM:
2886 case X86ISD::VPERMV:
2887 case X86ISD::VPERMV3:
2888 return true;
2889 // 'Faux' Target Shuffles.
2890 case ISD::OR:
2891 case ISD::AND:
2892 case X86ISD::ANDNP:
2893 return true;
2894 }
2895}
2896
2899 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
2901 int ReturnAddrIndex = FuncInfo->getRAIndex();
2902
2903 if (ReturnAddrIndex == 0) {
2904 // Set up a frame object for the return address.
2905 unsigned SlotSize = RegInfo->getSlotSize();
2906 ReturnAddrIndex = MF.getFrameInfo().CreateFixedObject(SlotSize,
2907 -(int64_t)SlotSize,
2908 false);
2909 FuncInfo->setRAIndex(ReturnAddrIndex);
2910 }
2911
2912 return DAG.getFrameIndex(ReturnAddrIndex, getPointerTy(DAG.getDataLayout()));
2913}
2914
2916 bool HasSymbolicDisplacement) {
2917  // Offset should fit into a 32-bit immediate field.
2918 if (!isInt<32>(Offset))
2919 return false;
2920
2921  // If we don't have a symbolic displacement, we don't have any extra
2922 // restrictions.
2923 if (!HasSymbolicDisplacement)
2924 return true;
2925
2926 // We can fold large offsets in the large code model because we always use
2927 // 64-bit offsets.
2928 if (CM == CodeModel::Large)
2929 return true;
2930
2931  // For the kernel code model we know that all objects reside in the negative
2932  // half of the 32-bit address space, so we do not accept negative offsets,
2933  // since they may be just out of range, but we may accept pretty large positive ones.
2934 if (CM == CodeModel::Kernel)
2935 return Offset >= 0;
2936
2937  // For other non-large code models we assume that the last small object ends
2938  // at least 16MB before the end of the 31-bit boundary, so positive offsets
2939  // below 16MB are safe. We may also accept pretty large negative constants,
2940  // knowing that all objects are in the positive half of the address space.
2941 return Offset < 16 * 1024 * 1024;
2942}
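// Worked examples (illustrative): with a symbolic displacement, an offset of
// 8MiB is accepted for the small/medium code models (below the 16MiB bound)
// while 32MiB is rejected; the large code model accepts any 32-bit offset;
// and the kernel code model rejects all negative offsets.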
2943
2944/// Return true if the condition is a signed comparison operation.
2945static bool isX86CCSigned(X86::CondCode X86CC) {
2946 switch (X86CC) {
2947 default:
2948 llvm_unreachable("Invalid integer condition!");
2949 case X86::COND_E:
2950 case X86::COND_NE:
2951 case X86::COND_B:
2952 case X86::COND_A:
2953 case X86::COND_BE:
2954 case X86::COND_AE:
2955 return false;
2956 case X86::COND_G:
2957 case X86::COND_GE:
2958 case X86::COND_L:
2959 case X86::COND_LE:
2960 return true;
2961 }
2962}
2963
2965 switch (SetCCOpcode) {
2966 // clang-format off
2967 default: llvm_unreachable("Invalid integer condition!");
2968 case ISD::SETEQ: return X86::COND_E;
2969 case ISD::SETGT: return X86::COND_G;
2970 case ISD::SETGE: return X86::COND_GE;
2971 case ISD::SETLT: return X86::COND_L;
2972 case ISD::SETLE: return X86::COND_LE;
2973 case ISD::SETNE: return X86::COND_NE;
2974 case ISD::SETULT: return X86::COND_B;
2975 case ISD::SETUGT: return X86::COND_A;
2976 case ISD::SETULE: return X86::COND_BE;
2977 case ISD::SETUGE: return X86::COND_AE;
2978 // clang-format on
2979 }
2980}
2981
2982/// Do a one-to-one translation of an ISD::CondCode to the X86-specific
2983/// condition code, returning the condition code and the LHS/RHS of the
2984/// comparison to make.
2986 bool isFP, SDValue &LHS, SDValue &RHS,
2987 SelectionDAG &DAG) {
2988 if (!isFP) {
2990 if (SetCCOpcode == ISD::SETGT && RHSC->isAllOnes()) {
2991 // X > -1 -> X == 0, jump !sign.
2992 RHS = DAG.getConstant(0, DL, RHS.getValueType());
2993 return X86::COND_NS;
2994 }
2995 if (SetCCOpcode == ISD::SETLT && RHSC->isZero()) {
2996 // X < 0 -> X == 0, jump on sign.
2997 return X86::COND_S;
2998 }
2999 if (SetCCOpcode == ISD::SETGE && RHSC->isZero()) {
3000 // X >= 0 -> X == 0, jump on !sign.
3001 return X86::COND_NS;
3002 }
3003 if (SetCCOpcode == ISD::SETLT && RHSC->isOne()) {
3004 // X < 1 -> X <= 0
3005 RHS = DAG.getConstant(0, DL, RHS.getValueType());
3006 return X86::COND_LE;
3007 }
3008 }
3009
3010 return TranslateIntegerX86CC(SetCCOpcode);
3011 }
3012
3013 // First determine if it is required or is profitable to flip the operands.
3014
3015 // If LHS is a foldable load, but RHS is not, flip the condition.
3016 if (ISD::isNON_EXTLoad(LHS.getNode()) &&
3017 !ISD::isNON_EXTLoad(RHS.getNode())) {
3018 SetCCOpcode = getSetCCSwappedOperands(SetCCOpcode);
3019 std::swap(LHS, RHS);
3020 }
3021
3022 switch (SetCCOpcode) {
3023 default: break;
3024 case ISD::SETOLT:
3025 case ISD::SETOLE:
3026 case ISD::SETUGT:
3027 case ISD::SETUGE:
3028 std::swap(LHS, RHS);
3029 break;
3030 }
3031
3032 // On a floating point condition, the flags are set as follows:
3033 // ZF PF CF op
3034 // 0 | 0 | 0 | X > Y
3035 // 0 | 0 | 1 | X < Y
3036 // 1 | 0 | 0 | X == Y
3037 // 1 | 1 | 1 | unordered
3038 switch (SetCCOpcode) {
3039 // clang-format off
3040 default: llvm_unreachable("Condcode should be pre-legalized away");
3041 case ISD::SETUEQ:
3042 case ISD::SETEQ: return X86::COND_E;
3043 case ISD::SETOLT: // flipped
3044 case ISD::SETOGT:
3045 case ISD::SETGT: return X86::COND_A;
3046 case ISD::SETOLE: // flipped
3047 case ISD::SETOGE:
3048 case ISD::SETGE: return X86::COND_AE;
3049 case ISD::SETUGT: // flipped
3050 case ISD::SETULT:
3051 case ISD::SETLT: return X86::COND_B;
3052 case ISD::SETUGE: // flipped
3053 case ISD::SETULE:
3054 case ISD::SETLE: return X86::COND_BE;
3055 case ISD::SETONE:
3056 case ISD::SETNE: return X86::COND_NE;
3057 case ISD::SETUO: return X86::COND_P;
3058 case ISD::SETO: return X86::COND_NP;
3059 case ISD::SETOEQ:
3060 case ISD::SETUNE: return X86::COND_INVALID;
3061 // clang-format on
3062 }
3063}
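// Worked example (illustrative): for an integer compare "x > -1" the code
// above rewrites the RHS to 0 and returns COND_NS, so it can typically be
// emitted as "test x, x; jns". For an FP compare "setolt x, y" the operands
// are swapped and COND_A is returned, i.e. it is roughly emitted as a
// (v)ucomiss of y against x followed by a "ja".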
3064
3065/// Is there a floating point cmov for the specific X86 condition code?
3066/// The current x86 ISA includes the following FP cmov instructions:
3067/// fcmovb, fcmovbe, fcmove, fcmovu, fcmovae, fcmova, fcmovne, fcmovnu.
3068static bool hasFPCMov(unsigned X86CC) {
3069 switch (X86CC) {
3070 default:
3071 return false;
3072 case X86::COND_B:
3073 case X86::COND_BE:
3074 case X86::COND_E:
3075 case X86::COND_P:
3076 case X86::COND_A:
3077 case X86::COND_AE:
3078 case X86::COND_NE:
3079 case X86::COND_NP:
3080 return true;
3081 }
3082}
3083
3084static bool useVPTERNLOG(const X86Subtarget &Subtarget, MVT VT) {
3085 return Subtarget.hasVLX() || Subtarget.canExtendTo512DQ() ||
3086 VT.is512BitVector();
3087}
3088
3090 const CallInst &I,
3091 MachineFunction &MF,
3092 unsigned Intrinsic) const {
3093 Info.flags = MachineMemOperand::MONone;
3094 Info.offset = 0;
3095
3097 if (!IntrData) {
3098 switch (Intrinsic) {
3099 case Intrinsic::x86_aesenc128kl:
3100 case Intrinsic::x86_aesdec128kl:
3101 Info.opc = ISD::INTRINSIC_W_CHAIN;
3102 Info.ptrVal = I.getArgOperand(1);
3103 Info.memVT = EVT::getIntegerVT(I.getType()->getContext(), 48);
3104 Info.align = Align(1);
3105 Info.flags |= MachineMemOperand::MOLoad;
3106 return true;
3107 case Intrinsic::x86_aesenc256kl:
3108 case Intrinsic::x86_aesdec256kl:
3109 Info.opc = ISD::INTRINSIC_W_CHAIN;
3110 Info.ptrVal = I.getArgOperand(1);
3111 Info.memVT = EVT::getIntegerVT(I.getType()->getContext(), 64);
3112 Info.align = Align(1);
3113 Info.flags |= MachineMemOperand::MOLoad;
3114 return true;
3115 case Intrinsic::x86_aesencwide128kl:
3116 case Intrinsic::x86_aesdecwide128kl:
3117 Info.opc = ISD::INTRINSIC_W_CHAIN;
3118 Info.ptrVal = I.getArgOperand(0);
3119 Info.memVT = EVT::getIntegerVT(I.getType()->getContext(), 48);
3120 Info.align = Align(1);
3121 Info.flags |= MachineMemOperand::MOLoad;
3122 return true;
3123 case Intrinsic::x86_aesencwide256kl:
3124 case Intrinsic::x86_aesdecwide256kl:
3125 Info.opc = ISD::INTRINSIC_W_CHAIN;
3126 Info.ptrVal = I.getArgOperand(0);
3127 Info.memVT = EVT::getIntegerVT(I.getType()->getContext(), 64);
3128 Info.align = Align(1);
3129 Info.flags |= MachineMemOperand::MOLoad;
3130 return true;
3131 case Intrinsic::x86_cmpccxadd32:
3132 case Intrinsic::x86_cmpccxadd64:
3133 case Intrinsic::x86_atomic_bts:
3134 case Intrinsic::x86_atomic_btc:
3135 case Intrinsic::x86_atomic_btr: {
3136 Info.opc = ISD::INTRINSIC_W_CHAIN;
3137 Info.ptrVal = I.getArgOperand(0);
3138 unsigned Size = I.getType()->getScalarSizeInBits();
3139 Info.memVT = EVT::getIntegerVT(I.getType()->getContext(), Size);
3140 Info.align = Align(Size);
3143 return true;
3144 }
3145 case Intrinsic::x86_atomic_bts_rm:
3146 case Intrinsic::x86_atomic_btc_rm:
3147 case Intrinsic::x86_atomic_btr_rm: {
3148 Info.opc = ISD::INTRINSIC_W_CHAIN;
3149 Info.ptrVal = I.getArgOperand(0);
3150 unsigned Size = I.getArgOperand(1)->getType()->getScalarSizeInBits();
3151 Info.memVT = EVT::getIntegerVT(I.getType()->getContext(), Size);
3152 Info.align = Align(Size);
3155 return true;
3156 }
3157 case Intrinsic::x86_aadd32:
3158 case Intrinsic::x86_aadd64:
3159 case Intrinsic::x86_aand32:
3160 case Intrinsic::x86_aand64:
3161 case Intrinsic::x86_aor32:
3162 case Intrinsic::x86_aor64:
3163 case Intrinsic::x86_axor32:
3164 case Intrinsic::x86_axor64:
3165 case Intrinsic::x86_atomic_add_cc:
3166 case Intrinsic::x86_atomic_sub_cc:
3167 case Intrinsic::x86_atomic_or_cc:
3168 case Intrinsic::x86_atomic_and_cc:
3169 case Intrinsic::x86_atomic_xor_cc: {
3170 Info.opc = ISD::INTRINSIC_W_CHAIN;
3171 Info.ptrVal = I.getArgOperand(0);
3172 unsigned Size = I.getArgOperand(1)->getType()->getScalarSizeInBits();
3173 Info.memVT = EVT::getIntegerVT(I.getType()->getContext(), Size);
3174 Info.align = Align(Size);
3177 return true;
3178 }
3179 }
3180 return false;
3181 }
3182
3183 switch (IntrData->Type) {
3186 case TRUNCATE_TO_MEM_VI32: {
3187 Info.opc = ISD::INTRINSIC_VOID;
3188 Info.ptrVal = I.getArgOperand(0);
3189 MVT VT = MVT::getVT(I.getArgOperand(1)->getType());
3191 if (IntrData->Type == TRUNCATE_TO_MEM_VI8)
3192 ScalarVT = MVT::i8;
3193 else if (IntrData->Type == TRUNCATE_TO_MEM_VI16)
3194 ScalarVT = MVT::i16;
3195 else if (IntrData->Type == TRUNCATE_TO_MEM_VI32)
3196 ScalarVT = MVT::i32;
3197
3198 Info.memVT = MVT::getVectorVT(ScalarVT, VT.getVectorNumElements());
3199 Info.align = Align(1);
3200 Info.flags |= MachineMemOperand::MOStore;
3201 break;
3202 }
3203 case GATHER:
3204 case GATHER_AVX2: {
3205 Info.opc = ISD::INTRINSIC_W_CHAIN;
3206 Info.ptrVal = nullptr;
3207 MVT DataVT = MVT::getVT(I.getType());
3208 MVT IndexVT = MVT::getVT(I.getArgOperand(2)->getType());
3209 unsigned NumElts = std::min(DataVT.getVectorNumElements(),
3210 IndexVT.getVectorNumElements());
3211 Info.memVT = MVT::getVectorVT(DataVT.getVectorElementType(), NumElts);
3212 Info.align = Align(1);
3213 Info.flags |= MachineMemOperand::MOLoad;
3214 break;
3215 }
3216 case SCATTER: {
3217 Info.opc = ISD::INTRINSIC_VOID;
3218 Info.ptrVal = nullptr;
3219 MVT DataVT = MVT::getVT(I.getArgOperand(3)->getType());
3220 MVT IndexVT = MVT::getVT(I.getArgOperand(2)->getType());
3221 unsigned NumElts = std::min(DataVT.getVectorNumElements(),
3222 IndexVT.getVectorNumElements());
3223 Info.memVT = MVT::getVectorVT(DataVT.getVectorElementType(), NumElts);
3224 Info.align = Align(1);
3225 Info.flags |= MachineMemOperand::MOStore;
3226 break;
3227 }
3228 default:
3229 return false;
3230 }
3231
3232 return true;
3233}
3234
3235/// Returns true if the target can instruction select the
3236/// specified FP immediate natively. If false, the legalizer will
3237/// materialize the FP immediate as a load from a constant pool.
3239 bool ForCodeSize) const {
3240 for (const APFloat &FPImm : LegalFPImmediates)
3241 if (Imm.bitwiseIsEqual(FPImm))
3242 return true;
3243 return false;
3244}
3245
3247 SDNode *Load, ISD::LoadExtType ExtTy, EVT NewVT,
3248 std::optional<unsigned> ByteOffset) const {
3249 assert(cast<LoadSDNode>(Load)->isSimple() && "illegal to narrow");
3250
3251 auto PeekThroughOneUserBitcasts = [](const SDNode *N) {
3252 while (N->getOpcode() == ISD::BITCAST && N->hasOneUse())
3253 N = *N->user_begin();
3254 return N;
3255 };
3256
3257 // "ELF Handling for Thread-Local Storage" specifies that R_X86_64_GOTTPOFF
3258  // relocations must target a movq or addq instruction, so don't let the load shrink.
3259 SDValue BasePtr = cast<LoadSDNode>(Load)->getBasePtr();
3260 if (BasePtr.getOpcode() == X86ISD::WrapperRIP)
3261 if (const auto *GA = dyn_cast<GlobalAddressSDNode>(BasePtr.getOperand(0)))
3262 return GA->getTargetFlags() != X86II::MO_GOTTPOFF;
3263
3264  // If this is (1) an AVX vector load with (2) multiple uses and either (3)
3265  // all of those uses are extracted directly into a store (so the extract +
3266  // store can be store-folded) or (4) some use is a legal full-width
3267  // instruction, then it's probably not worth splitting the load.
3268 EVT VT = Load->getValueType(0);
3269 if ((VT.is256BitVector() || VT.is512BitVector()) &&
3270 !SDValue(Load, 0).hasOneUse()) {
3271 bool FullWidthUse = false;
3272 bool AllExtractStores = true;
3273 for (SDUse &Use : Load->uses()) {
3274 // Skip uses of the chain value. Result 0 of the node is the load value.
3275 if (Use.getResNo() != 0)
3276 continue;
3277
3278 const SDNode *User = PeekThroughOneUserBitcasts(Use.getUser());
3279
3280 // If this use is an extract + store, it's probably not worth splitting.
3281 if (User->getOpcode() == ISD::EXTRACT_SUBVECTOR &&
3282 all_of(User->uses(), [&](const SDUse &U) {
3283 const SDNode *Inner = PeekThroughOneUserBitcasts(U.getUser());
3284 return Inner->getOpcode() == ISD::STORE;
3285 }))
3286 continue;
3287
3288 AllExtractStores = false;
3289
3290 // If any use is a full width legal/target bin op, then assume its legal
3291 // and won't split.
3292 if (isBinOp(User->getOpcode()) &&
3293 (isOperationLegal(User->getOpcode(), User->getValueType(0)) ||
3294 User->getOpcode() > ISD::BUILTIN_OP_END))
3295 FullWidthUse = true;
3296 }
3297
3298 if (AllExtractStores)
3299 return false;
3300
3301    // If we have a user that uses the full vector width, then the load is
3302 // only worth splitting if the offset isn't 0 (to avoid an
3303 // EXTRACT_SUBVECTOR) or we're loading a scalar integer.
3304 if (FullWidthUse)
3305 return (ByteOffset.value_or(0) > 0) || NewVT.isScalarInteger();
3306 }
3307
3308 return true;
3309}
3310
3311/// Returns true if it is beneficial to convert a load of a constant
3312/// to just the constant itself.
3314 Type *Ty) const {
3315 assert(Ty->isIntegerTy());
3316
3317 unsigned BitSize = Ty->getPrimitiveSizeInBits();
3318 if (BitSize == 0 || BitSize > 64)
3319 return false;
3320 return true;
3321}
3322
3324 // If we are using XMM registers in the ABI and the condition of the select is
3325 // a floating-point compare and we have blendv or conditional move, then it is
3326 // cheaper to select instead of doing a cross-register move and creating a
3327 // load that depends on the compare result.
3328 bool IsFPSetCC = CmpOpVT.isFloatingPoint() && CmpOpVT != MVT::f128;
3329 return !IsFPSetCC || !Subtarget.isTarget64BitLP64() || !Subtarget.hasAVX();
3330}
3331
3333 // TODO: It might be a win to ease or lift this restriction, but the generic
3334 // folds in DAGCombiner conflict with vector folds for an AVX512 target.
3335 if (VT.isVector() && Subtarget.hasAVX512())
3336 return false;
3337
3338 return true;
3339}
3340
3342 SDValue C) const {
3343 // TODO: We handle scalars using custom code, but generic combining could make
3344 // that unnecessary.
3345 APInt MulC;
3346 if (!ISD::isConstantSplatVector(C.getNode(), MulC))
3347 return false;
3348
3349  // Find the type this will be legalized to. Otherwise we might prematurely
3350 // convert this to shl+add/sub and then still have to type legalize those ops.
3351 // Another choice would be to defer the decision for illegal types until
3352 // after type legalization. But constant splat vectors of i64 can't make it
3353 // through type legalization on 32-bit targets so we would need to special
3354 // case vXi64.
3355 while (getTypeAction(Context, VT) != TypeLegal)
3356 VT = getTypeToTransformTo(Context, VT);
3357
3358 // If vector multiply is legal, assume that's faster than shl + add/sub.
3359 // Multiply is a complex op with higher latency and lower throughput in
3360 // most implementations, sub-vXi32 vector multiplies are always fast,
3361 // vXi32 mustn't have a SlowMULLD implementation, and anything larger (vXi64)
3362 // is always going to be slow.
3363 unsigned EltSizeInBits = VT.getScalarSizeInBits();
3364 if (isOperationLegal(ISD::MUL, VT) && EltSizeInBits <= 32 &&
3365 (EltSizeInBits != 32 || !Subtarget.isPMULLDSlow()))
3366 return false;
3367
3368 // shl+add, shl+sub, shl+add+neg
3369 return (MulC + 1).isPowerOf2() || (MulC - 1).isPowerOf2() ||
3370 (1 - MulC).isPowerOf2() || (-(MulC + 1)).isPowerOf2();
3371}
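// Worked examples (illustrative): mul x, 17 decomposes to (x << 4) + x,
// mul x, 15 to (x << 4) - x, and mul x, -9 to -((x << 3) + x); a constant
// such as 10 has none of these power-of-two neighbours, so the multiply is
// kept as-is.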
3372
3374 unsigned Index) const {
3376 return false;
3377
3378 // Mask vectors support all subregister combinations and operations that
3379 // extract half of vector.
3380 if (ResVT.getVectorElementType() == MVT::i1)
3381 return Index == 0 || ((ResVT.getSizeInBits() == SrcVT.getSizeInBits()*2) &&
3382 (Index == ResVT.getVectorNumElements()));
3383
3384 return (Index % ResVT.getVectorNumElements()) == 0;
3385}
3386
3388 unsigned Opc = VecOp.getOpcode();
3389
3390 // Assume target opcodes can't be scalarized.
3391 // TODO - do we have any exceptions?
3392 if (Opc >= ISD::BUILTIN_OP_END || !isBinOp(Opc))
3393 return false;
3394
3395 // If the vector op is not supported, try to convert to scalar.
3396 EVT VecVT = VecOp.getValueType();
3398 return true;
3399
3400 // If the vector op is supported, but the scalar op is not, the transform may
3401 // not be worthwhile.
3402 EVT ScalarVT = VecVT.getScalarType();
3403 return isOperationLegalOrCustomOrPromote(Opc, ScalarVT);
3404}
3405
3407 bool) const {
3408 // TODO: Allow vectors?
3409 if (VT.isVector())
3410 return false;
3411 return VT.isSimple() || !isOperationExpand(Opcode, VT);
3412}
3413
3415 // Speculate cttz only if we can directly use TZCNT/CMOV, can promote to
3416 // i32/i64 or can rely on BSF passthrough value.
3417 return Subtarget.hasBMI() || Subtarget.canUseCMOV() ||
3418 Subtarget.hasBitScanPassThrough() ||
3419 (!Ty->isVectorTy() &&
3420 Ty->getScalarSizeInBits() < (Subtarget.is64Bit() ? 64u : 32u));
3421}
3422
3424 // Speculate ctlz only if we can directly use LZCNT/CMOV, or can rely on BSR
3425 // passthrough value.
3426 return Subtarget.hasLZCNT() || Subtarget.canUseCMOV() ||
3427 Subtarget.hasBitScanPassThrough();
3428}
3429
3431 // Don't shrink FP constpool if SSE2 is available since cvtss2sd is more
3432 // expensive than a straight movsd. On the other hand, it's important to
3433 // shrink long double fp constant since fldt is very slow.
3434 return !Subtarget.hasSSE2() || VT == MVT::f80;
3435}
3436
3438 return (VT == MVT::f64 && Subtarget.hasSSE2()) ||
3439 (VT == MVT::f32 && Subtarget.hasSSE1()) || VT == MVT::f16;
3440}
3441
3443 const SelectionDAG &DAG,
3444 const MachineMemOperand &MMO) const {
3445 if (!Subtarget.hasAVX512() && !LoadVT.isVector() && BitcastVT.isVector() &&
3446 BitcastVT.getVectorElementType() == MVT::i1)
3447 return false;
3448
3449 if (!Subtarget.hasDQI() && BitcastVT == MVT::v8i1 && LoadVT == MVT::i8)
3450 return false;
3451
3452 // If both types are legal vectors, it's always ok to convert them.
3453 if (LoadVT.isVector() && BitcastVT.isVector() &&
3454 isTypeLegal(LoadVT) && isTypeLegal(BitcastVT))
3455 return true;
3456
3457 return TargetLowering::isLoadBitCastBeneficial(LoadVT, BitcastVT, DAG, MMO);
3458}
3459
3461 const MachineFunction &MF) const {
3462  // Do not merge to float value size (128 bits) if no implicit
3463 // float attribute is set.
3464 bool NoFloat = MF.getFunction().hasFnAttribute(Attribute::NoImplicitFloat);
3465
3466 if (NoFloat) {
3467 unsigned MaxIntSize = Subtarget.is64Bit() ? 64 : 32;
3468 return (MemVT.getSizeInBits() <= MaxIntSize);
3469 }
3470 // Make sure we don't merge greater than our preferred vector
3471 // width.
3472 if (MemVT.getSizeInBits() > Subtarget.getPreferVectorWidth())
3473 return false;
3474
3475 return true;
3476}
3477
3479 return Subtarget.hasFastLZCNT();
3480}
3481
3483 const Instruction &AndI) const {
3484 return true;
3485}
3486
3488 EVT VT = Y.getValueType();
3489
3490 if (VT.isVector())
3491 return false;
3492
3493 if (!Subtarget.hasBMI())
3494 return false;
3495
3496 // There are only 32-bit and 64-bit forms for 'andn'.
3497 if (VT != MVT::i32 && VT != MVT::i64)
3498 return false;
3499
3500 return !isa<ConstantSDNode>(Y) || cast<ConstantSDNode>(Y)->isOpaque();
3501}
3502
3504 EVT VT = Y.getValueType();
3505
3506 if (!VT.isVector())
3507 return hasAndNotCompare(Y);
3508
3509 // Vector.
3510
3511 if (!Subtarget.hasSSE1() || VT.getSizeInBits() < 128)
3512 return false;
3513
3514 if (VT == MVT::v4i32)
3515 return true;
3516
3517 return Subtarget.hasSSE2();
3518}
3519
3521 return X.getValueType().isScalarInteger(); // 'bt'
3522}
3523
3527 unsigned OldShiftOpcode, unsigned NewShiftOpcode,
3528 SelectionDAG &DAG) const {
3529 // Does baseline recommend not to perform the fold by default?
3531 X, XC, CC, Y, OldShiftOpcode, NewShiftOpcode, DAG))
3532 return false;
3533 // For scalars this transform is always beneficial.
3534 if (X.getValueType().isScalarInteger())
3535 return true;
3536 // If all the shift amounts are identical, then transform is beneficial even
3537 // with rudimentary SSE2 shifts.
3538 if (DAG.isSplatValue(Y, /*AllowUndefs=*/true))
3539 return true;
3540  // If we have AVX2 with its powerful shift operations, then it's also good.
3541 if (Subtarget.hasAVX2())
3542 return true;
3543  // Pre-AVX2 vector codegen for this pattern is best for the variant with 'shl'.
3544 return NewShiftOpcode == ISD::SHL;
3545}
3546
3548 EVT VT, unsigned ShiftOpc, bool MayTransformRotate,
3549 const APInt &ShiftOrRotateAmt, const std::optional<APInt> &AndMask) const {
3550 if (!VT.isInteger())
3551 return ShiftOpc;
3552
3553 bool PreferRotate = false;
3554 if (VT.isVector()) {
3555    // For vectors, if we have rotate instruction support, then it's definitely
3556    // best. Otherwise it's not clear what's best, so just don't make changes.
3557 PreferRotate = Subtarget.hasAVX512() && (VT.getScalarType() == MVT::i32 ||
3558 VT.getScalarType() == MVT::i64);
3559 } else {
3560    // For scalars, if we have BMI2, prefer rotate for rorx. Otherwise prefer
3561 // rotate unless we have a zext mask+shr.
3562 PreferRotate = Subtarget.hasBMI2();
3563 if (!PreferRotate) {
3564 unsigned MaskBits =
3565 VT.getScalarSizeInBits() - ShiftOrRotateAmt.getZExtValue();
3566 PreferRotate = (MaskBits != 8) && (MaskBits != 16) && (MaskBits != 32);
3567 }
3568 }
3569
3570 if (ShiftOpc == ISD::SHL || ShiftOpc == ISD::SRL) {
3571 assert(AndMask.has_value() && "Null andmask when querying about shift+and");
3572
3573 if (PreferRotate && MayTransformRotate)
3574 return ISD::ROTL;
3575
3576    // For vectors we don't really get much benefit from swapping around constants.
3577 // Maybe we could check if the DAG has the flipped node already in the
3578 // future.
3579 if (VT.isVector())
3580 return ShiftOpc;
3581
3582    // See if it's beneficial to swap the shift type.
3583 if (ShiftOpc == ISD::SHL) {
3584      // If the current setup has an imm64 mask, then the inverse will have
3585      // at least an imm32 mask (or be zext i32 -> i64).
3586 if (VT == MVT::i64)
3587 return AndMask->getSignificantBits() > 32 ? (unsigned)ISD::SRL
3588 : ShiftOpc;
3589
3590      // We can only benefit if the mask requires at least 7 bits. We
3591      // don't want to replace shl by 1, 2 or 3, as those can be implemented
3592      // with lea/add.
3593 return ShiftOrRotateAmt.uge(7) ? (unsigned)ISD::SRL : ShiftOpc;
3594 }
3595
3596 if (VT == MVT::i64)
3597 // Keep exactly 32-bit imm64, this is zext i32 -> i64 which is
3598 // extremely efficient.
3599 return AndMask->getSignificantBits() > 33 ? (unsigned)ISD::SHL : ShiftOpc;
3600
3601 // Keep small shifts as shl so we can generate add/lea.
3602 return ShiftOrRotateAmt.ult(7) ? (unsigned)ISD::SHL : ShiftOpc;
3603 }
3604
3605  // We prefer rotate for vectors, or if we won't get a zext mask with SRL
3606 // (PreferRotate will be set in the latter case).
3607 if (PreferRotate || !MayTransformRotate || VT.isVector())
3608 return ShiftOpc;
3609
3610 // Non-vector type and we have a zext mask with SRL.
3611 return ISD::SRL;
3612}
3613
3616 const Value *Lhs,
3617 const Value *Rhs) const {
3618 using namespace llvm::PatternMatch;
3619 int BaseCost = BrMergingBaseCostThresh.getValue();
3620 // With CCMP, branches can be merged in a more efficient way.
3621 if (BaseCost >= 0 && Subtarget.hasCCMP())
3622 BaseCost += BrMergingCcmpBias;
3623 // a == b && a == c is a fast pattern on x86.
3624 if (BaseCost >= 0 && Opc == Instruction::And &&
3627 BaseCost += 1;
3628 return {BaseCost, BrMergingLikelyBias.getValue(),
3629 BrMergingUnlikelyBias.getValue()};
3630}
3631
3633 return N->getOpcode() != ISD::FP_EXTEND;
3634}
3635
3637 const SDNode *N, CombineLevel Level) const {
3638 assert(((N->getOpcode() == ISD::SHL &&
3639 N->getOperand(0).getOpcode() == ISD::SRL) ||
3640 (N->getOpcode() == ISD::SRL &&
3641 N->getOperand(0).getOpcode() == ISD::SHL)) &&
3642 "Expected shift-shift mask");
3643 // TODO: Should we always create i64 masks? Or only folded immediates?
3644 EVT VT = N->getValueType(0);
3645 if ((Subtarget.hasFastVectorShiftMasks() && VT.isVector()) ||
3646 (Subtarget.hasFastScalarShiftMasks() && !VT.isVector())) {
3647 // Only fold if the shift values are equal - so it folds to AND.
3648 // TODO - we should fold if either is a non-uniform vector but we don't do
3649 // the fold for non-splats yet.
3650 return N->getOperand(1) == N->getOperand(0).getOperand(1);
3651 }
3653}
3654
3656 EVT VT = Y.getValueType();
3657
3658 // For vectors, we don't have a preference, but we probably want a mask.
3659 if (VT.isVector())
3660 return false;
3661
3662 // 64-bit shifts on 32-bit targets produce really bad bloated code.
3663 if (VT == MVT::i64 && !Subtarget.is64Bit())
3664 return false;
3665
3666 return true;
3667}
3668
3671 SelectionDAG &DAG, SDNode *N, unsigned ExpansionFactor) const {
3673 !Subtarget.isOSWindows())
3676 ExpansionFactor);
3677}
3678
3680 // Any legal vector type can be splatted more efficiently than
3681 // loading/spilling from memory.
3682 return isTypeLegal(VT);
3683}
3684
3686 MVT VT = MVT::getIntegerVT(NumBits);
3687 if (isTypeLegal(VT))
3688 return VT;
3689
3690 // PMOVMSKB can handle this.
3691 if (NumBits == 128 && isTypeLegal(MVT::v16i8))
3692 return MVT::v16i8;
3693
3694 // VPMOVMSKB can handle this.
3695 if (NumBits == 256 && isTypeLegal(MVT::v32i8))
3696 return MVT::v32i8;
3697
3698 // TODO: Allow 64-bit type for 32-bit target.
3699 // TODO: 512-bit types should be allowed, but make sure that those
3700 // cases are handled in combineVectorSizedSetCCEquality().
3701
3703}
3704
3705/// Val is the undef sentinel value or equal to the specified value.
3706static bool isUndefOrEqual(int Val, int CmpVal) {
3707 return ((Val == SM_SentinelUndef) || (Val == CmpVal));
3708}
3709
3710/// Return true if every element in Mask is the undef sentinel value or equal to
3711/// the specified value.
3712static bool isUndefOrEqual(ArrayRef<int> Mask, int CmpVal) {
3713 return llvm::all_of(Mask, [CmpVal](int M) {
3714 return (M == SM_SentinelUndef) || (M == CmpVal);
3715 });
3716}
3717
3718/// Return true if every element in Mask, beginning from position Pos and ending
3719/// in Pos+Size is the undef sentinel value or equal to the specified value.
3720static bool isUndefOrEqualInRange(ArrayRef<int> Mask, int CmpVal, unsigned Pos,
3721 unsigned Size) {
3722 return llvm::all_of(Mask.slice(Pos, Size),
3723 [CmpVal](int M) { return isUndefOrEqual(M, CmpVal); });
3724}
3725
3726/// Val is either the undef or zero sentinel value.
3727static bool isUndefOrZero(int Val) {
3728 return ((Val == SM_SentinelUndef) || (Val == SM_SentinelZero));
3729}
3730
3731/// Return true if every element in Mask, beginning from position Pos and ending
3732/// in Pos+Size is the undef sentinel value.
3733static bool isUndefInRange(ArrayRef<int> Mask, unsigned Pos, unsigned Size) {
3734 return llvm::all_of(Mask.slice(Pos, Size),
3735 [](int M) { return M == SM_SentinelUndef; });
3736}
3737
3738/// Return true if the mask creates a vector whose lower half is undefined.
3740 unsigned NumElts = Mask.size();
3741 return isUndefInRange(Mask, 0, NumElts / 2);
3742}
3743
3744/// Return true if the mask creates a vector whose upper half is undefined.
3746 unsigned NumElts = Mask.size();
3747 return isUndefInRange(Mask, NumElts / 2, NumElts / 2);
3748}
3749
3750/// Return true if Val falls within the specified range [Low, Hi).
3751static bool isInRange(int Val, int Low, int Hi) {
3752 return (Val >= Low && Val < Hi);
3753}
3754
3755/// Return true if the value of any element in Mask falls within the specified
3756/// range [Low, Hi).
3757static bool isAnyInRange(ArrayRef<int> Mask, int Low, int Hi) {
3758 return llvm::any_of(Mask, [Low, Hi](int M) { return isInRange(M, Low, Hi); });
3759}
3760
3761/// Return true if the value of any element in Mask is the zero sentinel value.
3762static bool isAnyZero(ArrayRef<int> Mask) {
3763 return llvm::any_of(Mask, [](int M) { return M == SM_SentinelZero; });
3764}
3765
3766/// Return true if Val is undef or if its value falls within the
3767/// specified range [Low, Hi).
3768static bool isUndefOrInRange(int Val, int Low, int Hi) {
3769 return (Val == SM_SentinelUndef) || isInRange(Val, Low, Hi);
3770}
3771
3772/// Return true if every element in Mask is undef or if its value
3773/// falls within the specified range [Low, Hi).
3774static bool isUndefOrInRange(ArrayRef<int> Mask, int Low, int Hi) {
3775 return llvm::all_of(
3776 Mask, [Low, Hi](int M) { return isUndefOrInRange(M, Low, Hi); });
3777}
3778
3779/// Return true if Val is undef, zero or if its value falls within the
3780/// specified range [Low, Hi).
3781static bool isUndefOrZeroOrInRange(int Val, int Low, int Hi) {
3782 return isUndefOrZero(Val) || isInRange(Val, Low, Hi);
3783}
3784
3785/// Return true if every element in Mask is undef, zero or if its value
3786/// falls within the specified range [Low, Hi).
3787static bool isUndefOrZeroOrInRange(ArrayRef<int> Mask, int Low, int Hi) {
3788 return llvm::all_of(
3789 Mask, [Low, Hi](int M) { return isUndefOrZeroOrInRange(M, Low, Hi); });
3790}
3791
3792/// Return true if every element in Mask is an in-place blend/select mask or is
3793/// undef.
3795 unsigned NumElts = Mask.size();
3796 for (auto [I, M] : enumerate(Mask))
3797 if (!isUndefOrEqual(M, I) && !isUndefOrEqual(M, I + NumElts))
3798 return false;
3799 return true;
3800}
3801
3802/// Return true if every element in Mask, beginning
3803/// from position Pos and ending in Pos + Size, falls within the specified
3804/// sequence (Low, Low + Step, ..., Low + (Size - 1) * Step) or is undef.
3805static bool isSequentialOrUndefInRange(ArrayRef<int> Mask, unsigned Pos,
3806 unsigned Size, int Low, int Step = 1) {
3807 for (unsigned i = Pos, e = Pos + Size; i != e; ++i, Low += Step)
3808 if (!isUndefOrEqual(Mask[i], Low))
3809 return false;
3810 return true;
3811}
3812
3813/// Return true if every element in Mask, beginning
3814/// from position Pos and ending in Pos+Size, falls within the specified
3815/// sequential range (Low, Low+Size], or is undef or is zero.
3817 unsigned Size, int Low,
3818 int Step = 1) {
3819 for (unsigned i = Pos, e = Pos + Size; i != e; ++i, Low += Step)
3820 if (!isUndefOrZero(Mask[i]) && Mask[i] != Low)
3821 return false;
3822 return true;
3823}
3824
3825/// Return true if every element in Mask, beginning
3826/// from position Pos and ending in Pos+Size is undef or is zero.
3827static bool isUndefOrZeroInRange(ArrayRef<int> Mask, unsigned Pos,
3828 unsigned Size) {
3829 return llvm::all_of(Mask.slice(Pos, Size), isUndefOrZero);
3830}
3831
3832/// Return true if every element of a single input is referenced by the shuffle
3833/// mask. i.e. it just permutes them all.
3835 unsigned NumElts = Mask.size();
3836 APInt DemandedElts = APInt::getZero(NumElts);
3837 for (int M : Mask)
3838 if (isInRange(M, 0, NumElts))
3839 DemandedElts.setBit(M);
3840 return DemandedElts.isAllOnes();
3841}
3842
3843/// Helper function to test whether a shuffle mask could be
3844/// simplified by widening the elements being shuffled.
3845///
3846/// Appends the mask for wider elements in WidenedMask if valid. Otherwise
3847/// leaves it in an unspecified state.
3848///
3849/// NOTE: This must handle normal vector shuffle masks and *target* vector
3850/// shuffle masks. The latter have the special property of a '-2' representing
3851/// a zero-ed lane of a vector.
3853 SmallVectorImpl<int> &WidenedMask) {
3854 WidenedMask.assign(Mask.size() / 2, 0);
3855 for (int i = 0, Size = Mask.size(); i < Size; i += 2) {
3856 int M0 = Mask[i];
3857 int M1 = Mask[i + 1];
3858
3859    // If both elements are undef, it's trivial.
3860 if (M0 == SM_SentinelUndef && M1 == SM_SentinelUndef) {
3861 WidenedMask[i / 2] = SM_SentinelUndef;
3862 continue;
3863 }
3864
3865 // Check for an undef mask and a mask value properly aligned to fit with
3866 // a pair of values. If we find such a case, use the non-undef mask's value.
3867 if (M0 == SM_SentinelUndef && M1 >= 0 && (M1 % 2) == 1) {
3868 WidenedMask[i / 2] = M1 / 2;
3869 continue;
3870 }
3871 if (M1 == SM_SentinelUndef && M0 >= 0 && (M0 % 2) == 0) {
3872 WidenedMask[i / 2] = M0 / 2;
3873 continue;
3874 }
3875
3876 // When zeroing, we need to spread the zeroing across both lanes to widen.
3877 if (M0 == SM_SentinelZero || M1 == SM_SentinelZero) {
3878 if ((M0 == SM_SentinelZero || M0 == SM_SentinelUndef) &&
3880 WidenedMask[i / 2] = SM_SentinelZero;
3881 continue;
3882 }
3883 return false;
3884 }
3885
3886 // Finally check if the two mask values are adjacent and aligned with
3887 // a pair.
3888 if (M0 != SM_SentinelUndef && (M0 % 2) == 0 && (M0 + 1) == M1) {
3889 WidenedMask[i / 2] = M0 / 2;
3890 continue;
3891 }
3892
3893 // Otherwise we can't safely widen the elements used in this shuffle.
3894 return false;
3895 }
3896 assert(WidenedMask.size() == Mask.size() / 2 &&
3897 "Incorrect size of mask after widening the elements!");
3898
3899 return true;
3900}
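// Worked example (illustrative): the 8-element mask <0,1,6,7,-1,-1,10,11>
// widens to the 4-element mask <0,3,-1,5>, since every pair is either
// both-undef or an aligned adjacent pair; a mask starting <1,2,...> cannot
// be widened because the pair (1,2) straddles a wide-element boundary.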
3901
3903 const APInt &Zeroable,
3904 bool V2IsZero,
3905 SmallVectorImpl<int> &WidenedMask) {
3906 // Create an alternative mask with info about zeroable elements.
3907 // Here we do not set undef elements as zeroable.
3908 SmallVector<int, 64> ZeroableMask(Mask);
3909 if (V2IsZero) {
3910 assert(!Zeroable.isZero() && "V2's non-undef elements are used?!");
3911 for (int i = 0, Size = Mask.size(); i != Size; ++i)
3912 if (Mask[i] != SM_SentinelUndef && Zeroable[i])
3913 ZeroableMask[i] = SM_SentinelZero;
3914 }
3915 return canWidenShuffleElements(ZeroableMask, WidenedMask);
3916}
3917
3918static bool canWidenShuffleElements(ArrayRef<int> Mask) {
3919  SmallVector<int, 32> WidenedMask;
3920 return canWidenShuffleElements(Mask, WidenedMask);
3921}
3922
3923// Attempt to narrow/widen shuffle mask until it matches the target number of
3924// elements.
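// For example, scaling <0, 2> up to 4 elements gives <0, 1, 4, 5>, and scaling
// <0, 1, 6, 7> down to 2 elements gives <0, 3>.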
3925static bool scaleShuffleElements(ArrayRef<int> Mask, unsigned NumDstElts,
3926 SmallVectorImpl<int> &ScaledMask) {
3927 unsigned NumSrcElts = Mask.size();
3928 assert(((NumSrcElts % NumDstElts) == 0 || (NumDstElts % NumSrcElts) == 0) &&
3929 "Illegal shuffle scale factor");
3930
3931 // Narrowing is guaranteed to work.
3932 if (NumDstElts >= NumSrcElts) {
3933 int Scale = NumDstElts / NumSrcElts;
3934 llvm::narrowShuffleMaskElts(Scale, Mask, ScaledMask);
3935 return true;
3936 }
3937
3938 // We have to repeat the widening until we reach the target size, but we can
3939 // split out the first widening as it sets up ScaledMask for us.
3940 if (canWidenShuffleElements(Mask, ScaledMask)) {
3941 while (ScaledMask.size() > NumDstElts) {
3942 SmallVector<int, 16> WidenedMask;
3943 if (!canWidenShuffleElements(ScaledMask, WidenedMask))
3944 return false;
3945 ScaledMask = std::move(WidenedMask);
3946 }
3947 return true;
3948 }
3949
3950 return false;
3951}
3952
3953static bool canScaleShuffleElements(ArrayRef<int> Mask, unsigned NumDstElts) {
3954 SmallVector<int, 32> ScaledMask;
3955 return scaleShuffleElements(Mask, NumDstElts, ScaledMask);
3956}
3957
3958// Helper to grow the shuffle mask for a larger value type.
3959// NOTE: This differs from scaleShuffleElements, which keeps the overall vector size the same.
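// For example, growing the 2-element mask <1, 5> from 128 to 256 bits gives <1, 9, -1, -1>.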
3960static void growShuffleMask(ArrayRef<int> SrcMask,
3961 SmallVectorImpl<int> &DstMask,
3962 unsigned SrcSizeInBits, unsigned DstSizeInBits) {
3963  assert(DstMask.empty() && "Expected an empty shuffle mask");
3964 assert((DstSizeInBits % SrcSizeInBits) == 0 && "Illegal shuffle scale");
3965 unsigned Scale = DstSizeInBits / SrcSizeInBits;
3966 unsigned NumSrcElts = SrcMask.size();
3967 DstMask.assign(SrcMask.begin(), SrcMask.end());
3968 for (int &M : DstMask) {
3969 if (M < 0)
3970 continue;
3971 M = (M % NumSrcElts) + ((M / NumSrcElts) * Scale * NumSrcElts);
3972 }
3973 DstMask.append((Scale - 1) * NumSrcElts, SM_SentinelUndef);
3974}
3975
3976/// Returns true if Elt is a constant zero or a floating point constant +0.0.
3978 return isNullConstant(Elt) || isNullFPConstant(Elt);
3979}
3980
3981// Build a vector of constants.
3982// Use an UNDEF node if MaskElt == -1.
3983// Split 64-bit constants into 32-bit halves in 32-bit mode.
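// For example, on a 32-bit target a v2i64 constant is built as a v4i32 vector
// and then bitcast back to v2i64.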
3984static SDValue getConstVector(ArrayRef<int> Values, MVT VT, SelectionDAG &DAG,
3985                              const SDLoc &dl, bool IsMask = false) {
3986
3987  SmallVector<SDValue, 32> Ops;
3988  bool Split = false;
3989
3990 MVT ConstVecVT = VT;
3991 unsigned NumElts = VT.getVectorNumElements();
3992 bool In64BitMode = DAG.getTargetLoweringInfo().isTypeLegal(MVT::i64);
3993 if (!In64BitMode && VT.getVectorElementType() == MVT::i64) {
3994 ConstVecVT = MVT::getVectorVT(MVT::i32, NumElts * 2);
3995 Split = true;
3996 }
3997
3998 MVT EltVT = ConstVecVT.getVectorElementType();
3999 for (unsigned i = 0; i < NumElts; ++i) {
4000 bool IsUndef = Values[i] < 0 && IsMask;
4001 SDValue OpNode = IsUndef ? DAG.getUNDEF(EltVT) :
4002 DAG.getConstant(Values[i], dl, EltVT);
4003 Ops.push_back(OpNode);
4004 if (Split)
4005 Ops.push_back(IsUndef ? DAG.getUNDEF(EltVT) :
4006 DAG.getConstant(0, dl, EltVT));
4007 }
4008 SDValue ConstsNode = DAG.getBuildVector(ConstVecVT, dl, Ops);
4009 if (Split)
4010 ConstsNode = DAG.getBitcast(VT, ConstsNode);
4011 return ConstsNode;
4012}
4013
4014static SDValue getConstVector(ArrayRef<APInt> Bits, const APInt &Undefs,
4015 MVT VT, SelectionDAG &DAG, const SDLoc &dl) {
4016 assert(Bits.size() == Undefs.getBitWidth() &&
4017 "Unequal constant and undef arrays");
4018  SmallVector<SDValue, 32> Ops;
4019  bool Split = false;
4020
4021 MVT ConstVecVT = VT;
4022 unsigned NumElts = VT.getVectorNumElements();
4023 bool In64BitMode = DAG.getTargetLoweringInfo().isTypeLegal(MVT::i64);
4024 if (!In64BitMode && VT.getVectorElementType() == MVT::i64) {
4025 ConstVecVT = MVT::getVectorVT(MVT::i32, NumElts * 2);
4026 Split = true;
4027 }
4028
4029 MVT EltVT = ConstVecVT.getVectorElementType();
4030 MVT EltIntVT = EltVT.changeTypeToInteger();
4031 for (unsigned i = 0, e = Bits.size(); i != e; ++i) {
4032 if (Undefs[i]) {
4033 Ops.append(Split ? 2 : 1, DAG.getUNDEF(EltVT));
4034 continue;
4035 }
4036 const APInt &V = Bits[i];
4037 assert(V.getBitWidth() == VT.getScalarSizeInBits() && "Unexpected sizes");
4038 if (Split) {
4039 Ops.push_back(DAG.getConstant(V.extractBits(32, 0), dl, EltVT));
4040 Ops.push_back(DAG.getConstant(V.extractBits(32, 32), dl, EltVT));
4041 } else {
4042 Ops.push_back(DAG.getBitcast(EltVT, DAG.getConstant(V, dl, EltIntVT)));
4043 }
4044 }
4045
4046 SDValue ConstsNode = DAG.getBuildVector(ConstVecVT, dl, Ops);
4047 return DAG.getBitcast(VT, ConstsNode);
4048}
4049
4050static SDValue getConstVector(ArrayRef<APInt> Bits, MVT VT,
4051                              SelectionDAG &DAG, const SDLoc &dl) {
4052 APInt Undefs = APInt::getZero(Bits.size());
4053 return getConstVector(Bits, Undefs, VT, DAG, dl);
4054}
4055
4056/// Returns a vector of specified type with all zero elements.
4057static SDValue getZeroVector(MVT VT, const X86Subtarget &Subtarget,
4058 SelectionDAG &DAG, const SDLoc &dl) {
4059 assert((VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector() ||
4060 VT.getVectorElementType() == MVT::i1) &&
4061 "Unexpected vector type");
4062
4063 // Try to build SSE/AVX zero vectors as <N x i32> bitcasted to their dest
4064 // type. This ensures they get CSE'd. But if the integer type is not
4065 // available, use a floating-point +0.0 instead.
4066 SDValue Vec;
4067 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
4068 if (!Subtarget.hasSSE2() && VT.is128BitVector()) {
4069 Vec = DAG.getConstantFP(+0.0, dl, MVT::v4f32);
4070 } else if (VT.isFloatingPoint() &&
4071             TLI.isTypeLegal(VT.getVectorElementType())) {
4072    Vec = DAG.getConstantFP(+0.0, dl, VT);
4073 } else if (VT.getVectorElementType() == MVT::i1) {
4074 assert((Subtarget.hasBWI() || VT.getVectorNumElements() <= 16) &&
4075 "Unexpected vector type");
4076 Vec = DAG.getConstant(0, dl, VT);
4077 } else {
4078 unsigned Num32BitElts = VT.getSizeInBits() / 32;
4079 Vec = DAG.getConstant(0, dl, MVT::getVectorVT(MVT::i32, Num32BitElts));
4080 }
4081 return DAG.getBitcast(VT, Vec);
4082}
4083
4084// Helper to determine if LHS and RHS are subvectors extracted from the same
4085// single source. If we allow commute they don't have to be in order (Lo/Hi).
4086static SDValue getSplitVectorSrc(SDValue LHS, SDValue RHS, bool AllowCommute) {
4087 if (LHS.getOpcode() != ISD::EXTRACT_SUBVECTOR ||
4088 RHS.getOpcode() != ISD::EXTRACT_SUBVECTOR ||
4089 LHS.getValueType() != RHS.getValueType() ||
4090 LHS.getOperand(0) != RHS.getOperand(0))
4091 return SDValue();
4092
4093 SDValue Src = LHS.getOperand(0);
4094 if (Src.getValueSizeInBits() != (LHS.getValueSizeInBits() * 2))
4095 return SDValue();
4096
4097 unsigned NumElts = LHS.getValueType().getVectorNumElements();
4098 if ((LHS.getConstantOperandAPInt(1) == 0 &&
4099 RHS.getConstantOperandAPInt(1) == NumElts) ||
4100 (AllowCommute && RHS.getConstantOperandAPInt(1) == 0 &&
4101 LHS.getConstantOperandAPInt(1) == NumElts))
4102 return Src;
4103
4104 return SDValue();
4105}
4106
4107static SDValue extractSubVector(SDValue Vec, unsigned IdxVal, SelectionDAG &DAG,
4108 const SDLoc &dl, unsigned vectorWidth) {
4109 EVT VT = Vec.getValueType();
4110 EVT ElVT = VT.getVectorElementType();
4111 unsigned ResultNumElts =
4112 (VT.getVectorNumElements() * vectorWidth) / VT.getSizeInBits();
4113 EVT ResultVT = EVT::getVectorVT(*DAG.getContext(), ElVT, ResultNumElts);
4114
4115 assert(ResultVT.getSizeInBits() == vectorWidth &&
4116 "Illegal subvector extraction");
4117
4118 // Extract the relevant vectorWidth bits. Generate an EXTRACT_SUBVECTOR
4119 unsigned ElemsPerChunk = vectorWidth / ElVT.getSizeInBits();
4120 assert(isPowerOf2_32(ElemsPerChunk) && "Elements per chunk not power of 2");
4121
4122 // This is the index of the first element of the vectorWidth-bit chunk
4123 // we want. Since ElemsPerChunk is a power of 2 just need to clear bits.
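  // e.g. when extracting a 128-bit chunk from v8i32, ElemsPerChunk is 4, so IdxVal 5 is rounded down to 4.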
4124 IdxVal &= ~(ElemsPerChunk - 1);
4125
4126 // If the input is a buildvector just emit a smaller one.
4127 if (Vec.getOpcode() == ISD::BUILD_VECTOR)
4128 return DAG.getBuildVector(ResultVT, dl,
4129 Vec->ops().slice(IdxVal, ElemsPerChunk));
4130
4131 // Check if we're extracting the upper undef of a widening pattern.
4132 if (Vec.getOpcode() == ISD::INSERT_SUBVECTOR && Vec.getOperand(0).isUndef() &&
4133 Vec.getOperand(1).getValueType().getVectorNumElements() <= IdxVal &&
4134 isNullConstant(Vec.getOperand(2)))
4135 return DAG.getUNDEF(ResultVT);
4136
4137 return DAG.getExtractSubvector(dl, ResultVT, Vec, IdxVal);
4138}
4139
4140/// Generate a DAG to grab 128-bits from a vector > 128 bits. This
4141/// sets things up to match to an AVX VEXTRACTF128 / VEXTRACTI128
4142/// or AVX-512 VEXTRACTF32x4 / VEXTRACTI32x4
4143/// instructions or a simple subregister reference. Idx is an index in the
4144/// 128 bits we want. It need not be aligned to a 128-bit boundary. That makes
4145/// lowering EXTRACT_VECTOR_ELT operations easier.
4146static SDValue extract128BitVector(SDValue Vec, unsigned IdxVal,
4147 SelectionDAG &DAG, const SDLoc &dl) {
4148  assert((Vec.getValueType().is256BitVector() ||
4149          Vec.getValueType().is512BitVector()) &&
4150 "Unexpected vector size!");
4151 return extractSubVector(Vec, IdxVal, DAG, dl, 128);
4152}
4153
4154/// Generate a DAG to grab 256-bits from a 512-bit vector.
4155static SDValue extract256BitVector(SDValue Vec, unsigned IdxVal,
4156 SelectionDAG &DAG, const SDLoc &dl) {
4157 assert(Vec.getValueType().is512BitVector() && "Unexpected vector size!");
4158 return extractSubVector(Vec, IdxVal, DAG, dl, 256);
4159}
4160
4161static SDValue insertSubVector(SDValue Result, SDValue Vec, unsigned IdxVal,
4162 SelectionDAG &DAG, const SDLoc &dl,
4163 unsigned vectorWidth) {
4164 assert((vectorWidth == 128 || vectorWidth == 256) &&
4165 "Unsupported vector width");
4166  // Inserting UNDEF is a no-op; just return Result.
4167 if (Vec.isUndef())
4168 return Result;
4169
4170 // Insert the relevant vectorWidth bits.
4171 EVT VT = Vec.getValueType();
4172 unsigned ElemsPerChunk = vectorWidth / VT.getScalarSizeInBits();
4173 assert(isPowerOf2_32(ElemsPerChunk) && "Elements per chunk not power of 2");
4174
4175 // This is the index of the first element of the vectorWidth-bit chunk
4176 // we want. Since ElemsPerChunk is a power of 2 just need to clear bits.
4177 IdxVal &= ~(ElemsPerChunk - 1);
4178 return DAG.getInsertSubvector(dl, Result, Vec, IdxVal);
4179}
4180
4181/// Generate a DAG to put 128-bits into a vector > 128 bits. This
4182/// sets things up to match to an AVX VINSERTF128/VINSERTI128 or
4183/// AVX-512 VINSERTF32x4/VINSERTI32x4 instructions or a
4184/// simple superregister reference. Idx is an index in the 128 bits
4185/// we want. It need not be aligned to a 128-bit boundary. That makes
4186/// lowering INSERT_VECTOR_ELT operations easier.
4187static SDValue insert128BitVector(SDValue Result, SDValue Vec, unsigned IdxVal,
4188 SelectionDAG &DAG, const SDLoc &dl) {
4189 assert(Vec.getValueType().is128BitVector() && "Unexpected vector size!");
4190 return insertSubVector(Result, Vec, IdxVal, DAG, dl, 128);
4191}
4192
4193/// Widen a vector to a larger size with the same scalar type, with the new
4194/// elements either zero or undef.
4195static SDValue widenSubVector(MVT VT, SDValue Vec, bool ZeroNewElements,
4196 const X86Subtarget &Subtarget, SelectionDAG &DAG,
4197 const SDLoc &dl) {
4198 EVT VecVT = Vec.getValueType();
4199  assert(VecVT.getSizeInBits() <= VT.getSizeInBits() &&
4200         VecVT.getScalarType() == VT.getScalarType() &&
4201 "Unsupported vector widening type");
4202 // If the upper 128-bits of a build vector are already undef/zero, then try to
4203 // widen from the lower 128-bits.
4204 if (Vec.getOpcode() == ISD::BUILD_VECTOR && VecVT.is256BitVector()) {
4205 unsigned NumSrcElts = VecVT.getVectorNumElements();
4206 ArrayRef<SDUse> Hi = Vec->ops().drop_front(NumSrcElts / 2);
4207 if (all_of(Hi, [&](SDValue V) {
4208 return V.isUndef() || (ZeroNewElements && X86::isZeroNode(V));
4209 }))
4210 Vec = extract128BitVector(Vec, 0, DAG, dl);
4211 }
4212 SDValue Res = ZeroNewElements ? getZeroVector(VT, Subtarget, DAG, dl)
4213 : DAG.getUNDEF(VT);
4214 return DAG.getInsertSubvector(dl, Res, Vec, 0);
4215}
4216
4217/// Widen a vector to a larger size with the same scalar type, with the new
4218/// elements either zero or undef.
4219static SDValue widenSubVector(SDValue Vec, bool ZeroNewElements,
4220 const X86Subtarget &Subtarget, SelectionDAG &DAG,
4221 const SDLoc &dl, unsigned WideSizeInBits) {
4222 assert(Vec.getValueSizeInBits() <= WideSizeInBits &&
4223 (WideSizeInBits % Vec.getScalarValueSizeInBits()) == 0 &&
4224 "Unsupported vector widening type");
4225 unsigned WideNumElts = WideSizeInBits / Vec.getScalarValueSizeInBits();
4226 MVT SVT = Vec.getSimpleValueType().getScalarType();
4227 MVT VT = MVT::getVectorVT(SVT, WideNumElts);
4228 return widenSubVector(VT, Vec, ZeroNewElements, Subtarget, DAG, dl);
4229}
4230
4231/// Widen a mask vector type to a minimum of v8i1/v16i1 to allow use of KSHIFT
4232/// and bitcast with integer types.
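/// For example, v4i1 widens to v8i1 on DQI targets and to v16i1 otherwise,
/// while v8i1 is only left unwidened when DQI is available.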
4233static MVT widenMaskVectorType(MVT VT, const X86Subtarget &Subtarget) {
4234 assert(VT.getVectorElementType() == MVT::i1 && "Expected bool vector");
4235 unsigned NumElts = VT.getVectorNumElements();
4236 if ((!Subtarget.hasDQI() && NumElts == 8) || NumElts < 8)
4237 return Subtarget.hasDQI() ? MVT::v8i1 : MVT::v16i1;
4238 return VT;
4239}
4240
4241/// Widen a mask vector to a minimum of v8i1/v16i1 to allow use of KSHIFT and
4242/// bitcast with integer types.
4243static SDValue widenMaskVector(SDValue Vec, bool ZeroNewElements,
4244 const X86Subtarget &Subtarget, SelectionDAG &DAG,
4245 const SDLoc &dl) {
4246 MVT VT = widenMaskVectorType(Vec.getSimpleValueType(), Subtarget);
4247 return widenSubVector(VT, Vec, ZeroNewElements, Subtarget, DAG, dl);
4248}
4249
4250// Helper function to collect subvector ops that are concatenated together,
4251// either by ISD::CONCAT_VECTORS or a ISD::INSERT_SUBVECTOR series.
4252// The subvectors in Ops are guaranteed to be the same type.
4253static bool collectConcatOps(SDNode *N, SmallVectorImpl<SDValue> &Ops,
4254                             SelectionDAG &DAG) {
4255 assert(Ops.empty() && "Expected an empty ops vector");
4256
4257 if (N->getOpcode() == ISD::CONCAT_VECTORS) {
4258 Ops.append(N->op_begin(), N->op_end());
4259 return true;
4260 }
4261
4262 if (N->getOpcode() == ISD::INSERT_SUBVECTOR) {
4263 SDValue Src = N->getOperand(0);
4264 SDValue Sub = N->getOperand(1);
4265 const APInt &Idx = N->getConstantOperandAPInt(2);
4266 EVT VT = Src.getValueType();
4267 EVT SubVT = Sub.getValueType();
4268
4269 if (VT.getSizeInBits() == (SubVT.getSizeInBits() * 2)) {
4270 // insert_subvector(undef, x, lo)
4271 if (Idx == 0 && Src.isUndef()) {
4272 Ops.push_back(Sub);
4273 Ops.push_back(DAG.getUNDEF(SubVT));
4274 return true;
4275 }
4276 if (Idx == (VT.getVectorNumElements() / 2)) {
4277 // insert_subvector(insert_subvector(undef, x, lo), y, hi)
4278 if (Src.getOpcode() == ISD::INSERT_SUBVECTOR &&
4279 Src.getOperand(1).getValueType() == SubVT &&
4280 isNullConstant(Src.getOperand(2))) {
4281 // Attempt to recurse into inner (matching) concats.
4282 SDValue Lo = Src.getOperand(1);
4283 SDValue Hi = Sub;
4284 SmallVector<SDValue, 2> LoOps, HiOps;
4285 if (collectConcatOps(Lo.getNode(), LoOps, DAG) &&
4286 collectConcatOps(Hi.getNode(), HiOps, DAG) &&
4287 LoOps.size() == HiOps.size()) {
4288 Ops.append(LoOps);
4289 Ops.append(HiOps);
4290 return true;
4291 }
4292 Ops.push_back(Lo);
4293 Ops.push_back(Hi);
4294 return true;
4295 }
4296 // insert_subvector(x, extract_subvector(x, lo), hi)
4297 if (Sub.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
4298 Sub.getOperand(0) == Src && isNullConstant(Sub.getOperand(1))) {
4299 Ops.append(2, Sub);
4300 return true;
4301 }
4302 // insert_subvector(undef, x, hi)
4303 if (Src.isUndef()) {
4304 Ops.push_back(DAG.getUNDEF(SubVT));
4305 Ops.push_back(Sub);
4306 return true;
4307 }
4308 }
4309 }
4310 }
4311
4312 if (N->getOpcode() == ISD::EXTRACT_SUBVECTOR) {
4313 EVT VT = N->getValueType(0);
4314 SDValue Src = N->getOperand(0);
4315 uint64_t Idx = N->getConstantOperandVal(1);
4316
4317 // Collect all the subvectors from the source vector and slice off the
4318 // extraction.
4319    SmallVector<SDValue, 4> SrcOps;
4320    if (collectConcatOps(Src.getNode(), SrcOps, DAG) &&
4321 VT.getSizeInBits() > SrcOps[0].getValueSizeInBits() &&
4322 (VT.getSizeInBits() % SrcOps[0].getValueSizeInBits()) == 0 &&
4323 (Idx % SrcOps[0].getValueType().getVectorNumElements()) == 0) {
4324 unsigned SubIdx = Idx / SrcOps[0].getValueType().getVectorNumElements();
4325 unsigned NumSubs = VT.getSizeInBits() / SrcOps[0].getValueSizeInBits();
4326 Ops.append(SrcOps.begin() + SubIdx, SrcOps.begin() + SubIdx + NumSubs);
4327 return true;
4328 }
4329 }
4330
4331 assert(Ops.empty() && "Expected an empty ops vector");
4332 return false;
4333}
4334
4335// Helper to check if \p V can be split into subvectors and the upper subvectors
4336// are all undef, in which case return the concatenation of the lower subvectors.
4337static SDValue isUpperSubvectorUndef(SDValue V, const SDLoc &DL,
4338                                     SelectionDAG &DAG) {
4339 SmallVector<SDValue> SubOps;
4340 if (!collectConcatOps(V.getNode(), SubOps, DAG))
4341 return SDValue();
4342
4343 unsigned NumSubOps = SubOps.size();
4344 unsigned HalfNumSubOps = NumSubOps / 2;
4345 assert((NumSubOps % 2) == 0 && "Unexpected number of subvectors");
4346
4347 ArrayRef<SDValue> UpperOps(SubOps.begin() + HalfNumSubOps, SubOps.end());
4348 if (any_of(UpperOps, [](SDValue Op) { return !Op.isUndef(); }))
4349 return SDValue();
4350
4351 EVT HalfVT = V.getValueType().getHalfNumVectorElementsVT(*DAG.getContext());
4352 ArrayRef<SDValue> LowerOps(SubOps.begin(), SubOps.begin() + HalfNumSubOps);
4353 return DAG.getNode(ISD::CONCAT_VECTORS, DL, HalfVT, LowerOps);
4354}
4355
4356// Helper to check if we can access all the constituent subvectors without any
4357// extract ops.
4358static bool isFreeToSplitVector(SDValue V, SelectionDAG &DAG) {
4359  SmallVector<SDValue> Ops;
4360  return collectConcatOps(V.getNode(), Ops, DAG);
4361}
4362
4363static std::pair<SDValue, SDValue> splitVector(SDValue Op, SelectionDAG &DAG,
4364 const SDLoc &dl) {
4365 EVT VT = Op.getValueType();
4366 unsigned NumElems = VT.getVectorNumElements();
4367 unsigned SizeInBits = VT.getSizeInBits();
4368 assert((NumElems % 2) == 0 && (SizeInBits % 2) == 0 &&
4369 "Can't split odd sized vector");
4370
4371  SmallVector<SDValue, 4> SubOps;
4372  if (collectConcatOps(Op.getNode(), SubOps, DAG)) {
4373 assert((SubOps.size() % 2) == 0 && "Can't split odd sized vector concat");
4374 unsigned HalfOps = SubOps.size() / 2;
4375 EVT HalfVT = VT.getHalfNumVectorElementsVT(*DAG.getContext());
4376 SmallVector<SDValue, 2> LoOps(SubOps.begin(), SubOps.begin() + HalfOps);
4377 SmallVector<SDValue, 2> HiOps(SubOps.begin() + HalfOps, SubOps.end());
4378 SDValue Lo = DAG.getNode(ISD::CONCAT_VECTORS, dl, HalfVT, LoOps);
4379 SDValue Hi = DAG.getNode(ISD::CONCAT_VECTORS, dl, HalfVT, HiOps);
4380 return std::make_pair(Lo, Hi);
4381 }
4382
4383 // If this is a splat value (with no-undefs) then use the lower subvector,
4384 // which should be a free extraction.
4385 SDValue Lo = extractSubVector(Op, 0, DAG, dl, SizeInBits / 2);
4386 if (DAG.isSplatValue(Op, /*AllowUndefs*/ false))
4387 return std::make_pair(Lo, Lo);
4388
4389 SDValue Hi = extractSubVector(Op, NumElems / 2, DAG, dl, SizeInBits / 2);
4390 return std::make_pair(Lo, Hi);
4391}
4392
4393/// Break an operation into 2 half sized ops and then concatenate the results.
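/// For example, a 512-bit operation becomes two 256-bit operations whose
/// results are concatenated back together.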
4394static SDValue splitVectorOp(SDValue Op, SelectionDAG &DAG, const SDLoc &dl) {
4395  unsigned NumOps = Op.getNumOperands();
4396 EVT VT = Op.getValueType();
4397
4398  // Split each operand into Lo/Hi vectors.
4399  SmallVector<SDValue> LoOps(NumOps);
4400  SmallVector<SDValue> HiOps(NumOps);
4401  for (unsigned I = 0; I != NumOps; ++I) {
4402 SDValue SrcOp = Op.getOperand(I);
4403 if (!SrcOp.getValueType().isVector()) {
4404 LoOps[I] = HiOps[I] = SrcOp;
4405 continue;
4406 }
4407 std::tie(LoOps[I], HiOps[I]) = splitVector(SrcOp, DAG, dl);
4408 }
4409
4410 EVT LoVT, HiVT;
4411 std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(VT);
4412 return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT,
4413 DAG.getNode(Op.getOpcode(), dl, LoVT, LoOps),
4414 DAG.getNode(Op.getOpcode(), dl, HiVT, HiOps));
4415}
4416
4417/// Break a unary integer operation into 2 half sized ops and then
4418/// concatenate the result back.
4419static SDValue splitVectorIntUnary(SDValue Op, SelectionDAG &DAG,
4420                                   const SDLoc &dl) {
4421 // Make sure we only try to split 256/512-bit types to avoid creating
4422 // narrow vectors.
4423 [[maybe_unused]] EVT VT = Op.getValueType();
4424 assert((Op.getOperand(0).getValueType().is256BitVector() ||
4425 Op.getOperand(0).getValueType().is512BitVector()) &&
4426 (VT.is256BitVector() || VT.is512BitVector()) && "Unsupported VT!");
4427 assert(Op.getOperand(0).getValueType().getVectorNumElements() ==
4428 VT.getVectorNumElements() &&
4429 "Unexpected VTs!");
4430 return splitVectorOp(Op, DAG, dl);
4431}
4432
4433/// Break a binary integer operation into 2 half sized ops and then
4434/// concatenate the result back.
4435static SDValue splitVectorIntBinary(SDValue Op, SelectionDAG &DAG,
4436                                    const SDLoc &dl) {
4437 // Assert that all the types match.
4438 [[maybe_unused]] EVT VT = Op.getValueType();
4439 assert(Op.getOperand(0).getValueType() == VT &&
4440 Op.getOperand(1).getValueType() == VT && "Unexpected VTs!");
4441 assert((VT.is256BitVector() || VT.is512BitVector()) && "Unsupported VT!");
4442 return splitVectorOp(Op, DAG, dl);
4443}
4444
4445// Helper for splitting operands of an operation to legal target size and
4446// apply a function on each part.
4447// Useful for operations that are available on SSE2 in 128-bit, on AVX2 in
4448// 256-bit and on AVX512BW in 512-bit. The argument VT is the type used for
4449// deciding if/how to split Ops. Ops elements do *not* have to be of type VT.
4450// The argument Builder is a function that will be applied on each split part:
4451// SDValue Builder(SelectionDAG&G, SDLoc, ArrayRef<SDValue>)
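// For example, with CheckBWI a v64i8 operation on an AVX2-only target is built
// as two v32i8 Builder calls whose results are concatenated.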
4452template <typename F>
4453static SDValue SplitOpsAndApply(SelectionDAG &DAG, const X86Subtarget &Subtarget,
4454                                const SDLoc &DL, EVT VT, ArrayRef<SDValue> Ops,
4455 F Builder, bool CheckBWI = true) {
4456 assert(Subtarget.hasSSE2() && "Target assumed to support at least SSE2");
4457 unsigned NumSubs = 1;
4458 if ((CheckBWI && Subtarget.useBWIRegs()) ||
4459 (!CheckBWI && Subtarget.useAVX512Regs())) {
4460 if (VT.getSizeInBits() > 512) {
4461 NumSubs = VT.getSizeInBits() / 512;
4462 assert((VT.getSizeInBits() % 512) == 0 && "Illegal vector size");
4463 }
4464 } else if (Subtarget.hasAVX2()) {
4465 if (VT.getSizeInBits() > 256) {
4466 NumSubs = VT.getSizeInBits() / 256;
4467 assert((VT.getSizeInBits() % 256) == 0 && "Illegal vector size");
4468 }
4469 } else {
4470 if (VT.getSizeInBits() > 128) {
4471 NumSubs = VT.getSizeInBits() / 128;
4472 assert((VT.getSizeInBits() % 128) == 0 && "Illegal vector size");
4473 }
4474 }
4475
4476 if (NumSubs == 1)
4477 return Builder(DAG, DL, Ops);
4478
4480 for (unsigned i = 0; i != NumSubs; ++i) {
4481    SmallVector<SDValue, 2> SubOps;
4482    for (SDValue Op : Ops) {
4483 EVT OpVT = Op.getValueType();
4484 unsigned NumSubElts = OpVT.getVectorNumElements() / NumSubs;
4485 unsigned SizeSub = OpVT.getSizeInBits() / NumSubs;
4486 SubOps.push_back(extractSubVector(Op, i * NumSubElts, DAG, DL, SizeSub));
4487 }
4488 Subs.push_back(Builder(DAG, DL, SubOps));
4489 }
4490 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Subs);
4491}
4492
4493// Helper function that extends a non-512-bit vector op to 512-bits on non-VLX
4494// targets.
4495static SDValue getAVX512Node(unsigned Opcode, const SDLoc &DL, MVT VT,
4496                             ArrayRef<SDValue> Ops, SelectionDAG &DAG,
4497                             const X86Subtarget &Subtarget) {
4498 assert(Subtarget.hasAVX512() && "AVX512 target expected");
4499 MVT SVT = VT.getScalarType();
4500
4501 // If we have a 32/64 splatted constant, splat it to DstTy to
4502 // encourage a foldable broadcast'd operand.
4503 auto MakeBroadcastOp = [&](SDValue Op, MVT OpVT, MVT DstVT) {
4504 unsigned OpEltSizeInBits = OpVT.getScalarSizeInBits();
4505 // AVX512 broadcasts 32/64-bit operands.
4506 // TODO: Support float once getAVX512Node is used by fp-ops.
4507 if (!OpVT.isInteger() || OpEltSizeInBits < 32 ||
4509 return SDValue();
4510 // If we're not widening, don't bother if we're not bitcasting.
4511 if (OpVT == DstVT && Op.getOpcode() != ISD::BITCAST)
4512 return SDValue();
4514 APInt SplatValue, SplatUndef;
4515 unsigned SplatBitSize;
4516 bool HasAnyUndefs;
4517 if (BV->isConstantSplat(SplatValue, SplatUndef, SplatBitSize,
4518 HasAnyUndefs, OpEltSizeInBits) &&
4519 !HasAnyUndefs && SplatValue.getBitWidth() == OpEltSizeInBits)
4520 return DAG.getConstant(SplatValue, DL, DstVT);
4521 }
4522 return SDValue();
4523 };
4524
4525 bool Widen = !(Subtarget.hasVLX() || VT.is512BitVector());
4526
4527 MVT DstVT = VT;
4528 if (Widen)
4529 DstVT = MVT::getVectorVT(SVT, 512 / SVT.getSizeInBits());
4530
4531 // Canonicalize src operands.
4532 SmallVector<SDValue> SrcOps(Ops);
4533 for (SDValue &Op : SrcOps) {
4534 MVT OpVT = Op.getSimpleValueType();
4535 // Just pass through scalar operands.
4536 if (!OpVT.isVector())
4537 continue;
4538 assert(OpVT == VT && "Vector type mismatch");
4539
4540 if (SDValue BroadcastOp = MakeBroadcastOp(Op, OpVT, DstVT)) {
4541 Op = BroadcastOp;
4542 continue;
4543 }
4544
4545 // Just widen the subvector by inserting into an undef wide vector.
4546 if (Widen)
4547 Op = widenSubVector(Op, false, Subtarget, DAG, DL, 512);
4548 }
4549
4550 SDValue Res = DAG.getNode(Opcode, DL, DstVT, SrcOps);
4551
4552 // Perform the 512-bit op then extract the bottom subvector.
4553 if (Widen)
4554 Res = extractSubVector(Res, 0, DAG, DL, VT.getSizeInBits());
4555 return Res;
4556}
4557
4558/// Insert i1-subvector to i1-vector.
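/// This is lowered by widening to a legal mask type and combining the pieces
/// with KSHIFTL/KSHIFTR and OR.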
4559static SDValue insert1BitVector(SDValue Op, SelectionDAG &DAG,
4560                                const X86Subtarget &Subtarget) {
4561
4562 SDLoc dl(Op);
4563 SDValue Vec = Op.getOperand(0);
4564 SDValue SubVec = Op.getOperand(1);
4565 SDValue Idx = Op.getOperand(2);
4566 unsigned IdxVal = Op.getConstantOperandVal(2);
4567
4568 // Inserting undef is a nop. We can just return the original vector.
4569 if (SubVec.isUndef())
4570 return Vec;
4571
4572 if (IdxVal == 0 && Vec.isUndef()) // the operation is legal
4573 return Op;
4574
4575 MVT OpVT = Op.getSimpleValueType();
4576 unsigned NumElems = OpVT.getVectorNumElements();
4577 SDValue ZeroIdx = DAG.getVectorIdxConstant(0, dl);
4578
4579 // Extend to natively supported kshift.
4580 MVT WideOpVT = widenMaskVectorType(OpVT, Subtarget);
4581
4582 // Inserting into the lsbs of a zero vector is legal. ISel will insert shifts
4583 // if necessary.
4584 if (IdxVal == 0 && ISD::isBuildVectorAllZeros(Vec.getNode())) {
4585 // May need to promote to a legal type.
4586 Op = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT,
4587 DAG.getConstant(0, dl, WideOpVT),
4588 SubVec, Idx);
4589 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, OpVT, Op, ZeroIdx);
4590 }
4591
4592 MVT SubVecVT = SubVec.getSimpleValueType();
4593 unsigned SubVecNumElems = SubVecVT.getVectorNumElements();
4594 assert(IdxVal + SubVecNumElems <= NumElems &&
4595 IdxVal % SubVecVT.getSizeInBits() == 0 &&
4596 "Unexpected index value in INSERT_SUBVECTOR");
4597
4598 SDValue Undef = DAG.getUNDEF(WideOpVT);
4599
4600 if (IdxVal == 0) {
4601 // Zero lower bits of the Vec
4602 SDValue ShiftBits = DAG.getTargetConstant(SubVecNumElems, dl, MVT::i8);
4603 Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT, Undef, Vec,
4604 ZeroIdx);
4605 Vec = DAG.getNode(X86ISD::KSHIFTR, dl, WideOpVT, Vec, ShiftBits);
4606 Vec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, Vec, ShiftBits);
4607 // Merge them together, SubVec should be zero extended.
4608 SubVec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT,
4609 DAG.getConstant(0, dl, WideOpVT),
4610 SubVec, ZeroIdx);
4611 Op = DAG.getNode(ISD::OR, dl, WideOpVT, Vec, SubVec);
4612 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, OpVT, Op, ZeroIdx);
4613 }
4614
4615 SubVec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT,
4616 Undef, SubVec, ZeroIdx);
4617
4618 if (Vec.isUndef()) {
4619 assert(IdxVal != 0 && "Unexpected index");
4620 SubVec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, SubVec,
4621 DAG.getTargetConstant(IdxVal, dl, MVT::i8));
4622 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, OpVT, SubVec, ZeroIdx);
4623 }
4624
4625  if (ISD::isBuildVectorAllZeros(Vec.getNode())) {
4626    assert(IdxVal != 0 && "Unexpected index");
4627 // If upper elements of Vec are known undef, then just shift into place.
4628 if (llvm::all_of(Vec->ops().slice(IdxVal + SubVecNumElems),
4629 [](SDValue V) { return V.isUndef(); })) {
4630 SubVec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, SubVec,
4631 DAG.getTargetConstant(IdxVal, dl, MVT::i8));
4632 } else {
4633 NumElems = WideOpVT.getVectorNumElements();
4634 unsigned ShiftLeft = NumElems - SubVecNumElems;
4635 unsigned ShiftRight = NumElems - SubVecNumElems - IdxVal;
4636 SubVec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, SubVec,
4637 DAG.getTargetConstant(ShiftLeft, dl, MVT::i8));
4638 if (ShiftRight != 0)
4639 SubVec = DAG.getNode(X86ISD::KSHIFTR, dl, WideOpVT, SubVec,
4640 DAG.getTargetConstant(ShiftRight, dl, MVT::i8));
4641 }
4642 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, OpVT, SubVec, ZeroIdx);
4643 }
4644
4645 // Simple case when we put subvector in the upper part
4646 if (IdxVal + SubVecNumElems == NumElems) {
4647 SubVec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, SubVec,
4648 DAG.getTargetConstant(IdxVal, dl, MVT::i8));
4649 if (SubVecNumElems * 2 == NumElems) {
4650 // Special case, use legal zero extending insert_subvector. This allows
4651 // isel to optimize when bits are known zero.
4652 Vec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, SubVecVT, Vec, ZeroIdx);
4653 Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT,
4654 DAG.getConstant(0, dl, WideOpVT),
4655 Vec, ZeroIdx);
4656 } else {
4657 // Otherwise use explicit shifts to zero the bits.
4658 Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT,
4659 Undef, Vec, ZeroIdx);
4660 NumElems = WideOpVT.getVectorNumElements();
4661 SDValue ShiftBits = DAG.getTargetConstant(NumElems - IdxVal, dl, MVT::i8);
4662 Vec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, Vec, ShiftBits);
4663 Vec = DAG.getNode(X86ISD::KSHIFTR, dl, WideOpVT, Vec, ShiftBits);
4664 }
4665 Op = DAG.getNode(ISD::OR, dl, WideOpVT, Vec, SubVec);
4666 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, OpVT, Op, ZeroIdx);
4667 }
4668
4669 // Inserting into the middle is more complicated.
4670
4671 NumElems = WideOpVT.getVectorNumElements();
4672
4673 // Widen the vector if needed.
4674 Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT, Undef, Vec, ZeroIdx);
4675
4676 unsigned ShiftLeft = NumElems - SubVecNumElems;
4677 unsigned ShiftRight = NumElems - SubVecNumElems - IdxVal;
4678
4679 // Do an optimization for the most frequently used types.
4680 if (WideOpVT != MVT::v64i1 || Subtarget.is64Bit()) {
4681 APInt Mask0 = APInt::getBitsSet(NumElems, IdxVal, IdxVal + SubVecNumElems);
4682 Mask0.flipAllBits();
4683 SDValue CMask0 = DAG.getConstant(Mask0, dl, MVT::getIntegerVT(NumElems));
4684 SDValue VMask0 = DAG.getNode(ISD::BITCAST, dl, WideOpVT, CMask0);
4685 Vec = DAG.getNode(ISD::AND, dl, WideOpVT, Vec, VMask0);
4686 SubVec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, SubVec,
4687 DAG.getTargetConstant(ShiftLeft, dl, MVT::i8));
4688 SubVec = DAG.getNode(X86ISD::KSHIFTR, dl, WideOpVT, SubVec,
4689 DAG.getTargetConstant(ShiftRight, dl, MVT::i8));
4690 Op = DAG.getNode(ISD::OR, dl, WideOpVT, Vec, SubVec);
4691
4692 // Reduce to original width if needed.
4693 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, OpVT, Op, ZeroIdx);
4694 }
4695
4696 // Clear the upper bits of the subvector and move it to its insert position.
4697 SubVec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, SubVec,
4698 DAG.getTargetConstant(ShiftLeft, dl, MVT::i8));
4699 SubVec = DAG.getNode(X86ISD::KSHIFTR, dl, WideOpVT, SubVec,
4700 DAG.getTargetConstant(ShiftRight, dl, MVT::i8));
4701
4702 // Isolate the bits below the insertion point.
4703 unsigned LowShift = NumElems - IdxVal;
4704 SDValue Low = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, Vec,
4705 DAG.getTargetConstant(LowShift, dl, MVT::i8));
4706 Low = DAG.getNode(X86ISD::KSHIFTR, dl, WideOpVT, Low,
4707 DAG.getTargetConstant(LowShift, dl, MVT::i8));
4708
4709 // Isolate the bits after the last inserted bit.
4710 unsigned HighShift = IdxVal + SubVecNumElems;
4711 SDValue High = DAG.getNode(X86ISD::KSHIFTR, dl, WideOpVT, Vec,
4712 DAG.getTargetConstant(HighShift, dl, MVT::i8));
4713 High = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, High,
4714 DAG.getTargetConstant(HighShift, dl, MVT::i8));
4715
4716 // Now OR all 3 pieces together.
4717 Vec = DAG.getNode(ISD::OR, dl, WideOpVT, Low, High);
4718 SubVec = DAG.getNode(ISD::OR, dl, WideOpVT, SubVec, Vec);
4719
4720 // Reduce to original width if needed.
4721 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, OpVT, SubVec, ZeroIdx);
4722}
4723
4724static SDValue concatSubVectors(SDValue V1, SDValue V2, SelectionDAG &DAG,
4725                                const SDLoc &dl) {
4726 assert(V1.getValueType() == V2.getValueType() && "subvector type mismatch");
4727 EVT SubVT = V1.getValueType();
4728 EVT SubSVT = SubVT.getScalarType();
4729 unsigned SubNumElts = SubVT.getVectorNumElements();
4730 unsigned SubVectorWidth = SubVT.getSizeInBits();
4731 EVT VT = EVT::getVectorVT(*DAG.getContext(), SubSVT, 2 * SubNumElts);
4732 SDValue V = insertSubVector(DAG.getUNDEF(VT), V1, 0, DAG, dl, SubVectorWidth);
4733 return insertSubVector(V, V2, SubNumElts, DAG, dl, SubVectorWidth);
4734}
4735
4736/// Returns a vector of specified type with all bits set.
4737/// Always build ones vectors as <4 x i32>, <8 x i32> or <16 x i32>.
4738/// Then bitcast to their original type, ensuring they get CSE'd.
4739static SDValue getOnesVector(EVT VT, SelectionDAG &DAG, const SDLoc &dl) {
4740 assert((VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector()) &&
4741 "Expected a 128/256/512-bit vector type");
4742 unsigned NumElts = VT.getSizeInBits() / 32;
4743 SDValue Vec = DAG.getAllOnesConstant(dl, MVT::getVectorVT(MVT::i32, NumElts));
4744 return DAG.getBitcast(VT, Vec);
4745}
4746
4747static SDValue getEXTEND_VECTOR_INREG(unsigned Opcode, const SDLoc &DL, EVT VT,
4748 SDValue In, SelectionDAG &DAG) {
4749 EVT InVT = In.getValueType();
4750 assert(VT.isVector() && InVT.isVector() && "Expected vector VTs.");
4751
4752 // Canonicalize Opcode to general extension version.
4753 switch (Opcode) {
4754 case ISD::ANY_EXTEND:
4755  case ISD::ANY_EXTEND_VECTOR_INREG:
4756    Opcode = ISD::ANY_EXTEND;
4757 break;
4758 case ISD::SIGN_EXTEND:
4759  case ISD::SIGN_EXTEND_VECTOR_INREG:
4760    Opcode = ISD::SIGN_EXTEND;
4761 break;
4762 case ISD::ZERO_EXTEND:
4763  case ISD::ZERO_EXTEND_VECTOR_INREG:
4764    Opcode = ISD::ZERO_EXTEND;
4765 break;
4766 default:
4767 llvm_unreachable("Unknown extension opcode");
4768 }
4769
4770 // For 256-bit vectors, we only need the lower (128-bit) input half.
4771 // For 512-bit vectors, we only need the lower input half or quarter.
4772 if (InVT.getSizeInBits() > 128) {
4773 assert(VT.getSizeInBits() == InVT.getSizeInBits() &&
4774 "Expected VTs to be the same size!");
4775 unsigned Scale = VT.getScalarSizeInBits() / InVT.getScalarSizeInBits();
4776 In = extractSubVector(In, 0, DAG, DL,
4777 std::max(128U, (unsigned)VT.getSizeInBits() / Scale));
4778 InVT = In.getValueType();
4779 }
4780
4781 if (VT.getVectorNumElements() != InVT.getVectorNumElements())
4782 Opcode = DAG.getOpcode_EXTEND_VECTOR_INREG(Opcode);
4783
4784 return DAG.getNode(Opcode, DL, VT, In);
4785}
4786
4787// Create OR(AND(LHS,MASK),AND(RHS,~MASK)) bit select pattern
4788static SDValue getBitSelect(const SDLoc &DL, MVT VT, SDValue LHS, SDValue RHS,
4789                            SDValue Mask, SelectionDAG &DAG) {
4790 LHS = DAG.getNode(ISD::AND, DL, VT, LHS, Mask);
4791 RHS = DAG.getNode(X86ISD::ANDNP, DL, VT, Mask, RHS);
4792 return DAG.getNode(ISD::OR, DL, VT, LHS, RHS);
4793}
4794
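// Build a shuffle mask equivalent to PUNPCKL/PUNPCKH within each 128-bit lane,
// e.g. the binary unpacklo mask for v8i32 is <0, 8, 1, 9, 4, 12, 5, 13>.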
4795static void createUnpackShuffleMask(EVT VT, SmallVectorImpl<int> &Mask,
4796                                    bool Lo, bool Unary) {
4797 assert(VT.getScalarType().isSimple() && (VT.getSizeInBits() % 128) == 0 &&
4798 "Illegal vector type to unpack");
4799 assert(Mask.empty() && "Expected an empty shuffle mask vector");
4800 int NumElts = VT.getVectorNumElements();
4801 int NumEltsInLane = 128 / VT.getScalarSizeInBits();
4802 for (int i = 0; i < NumElts; ++i) {
4803 unsigned LaneStart = (i / NumEltsInLane) * NumEltsInLane;
4804 int Pos = (i % NumEltsInLane) / 2 + LaneStart;
4805 Pos += (Unary ? 0 : NumElts * (i % 2));
4806 Pos += (Lo ? 0 : NumEltsInLane / 2);
4807 Mask.push_back(Pos);
4808 }
4809}
4810
4811/// Similar to unpacklo/unpackhi, but without the 128-bit lane limitation
4812/// imposed by AVX and specific to the unary pattern. Example:
4813/// v8iX Lo --> <0, 0, 1, 1, 2, 2, 3, 3>
4814/// v8iX Hi --> <4, 4, 5, 5, 6, 6, 7, 7>
4815static void createSplat2ShuffleMask(MVT VT, SmallVectorImpl<int> &Mask,
4816                                    bool Lo) {
4817 assert(Mask.empty() && "Expected an empty shuffle mask vector");
4818 int NumElts = VT.getVectorNumElements();
4819 for (int i = 0; i < NumElts; ++i) {
4820 int Pos = i / 2;
4821 Pos += (Lo ? 0 : NumElts / 2);
4822 Mask.push_back(Pos);
4823 }
4824}
4825
4826// Attempt to constant fold, else just create a VECTOR_SHUFFLE.
4827static SDValue getVectorShuffle(SelectionDAG &DAG, EVT VT, const SDLoc &dl,
4828 SDValue V1, SDValue V2, ArrayRef<int> Mask) {
4829  if ((ISD::isBuildVectorOfConstantSDNodes(V1.getNode()) || V1.isUndef()) &&
4830      (ISD::isBuildVectorOfConstantSDNodes(V2.getNode()) || V2.isUndef())) {
4831    SmallVector<SDValue> Ops(Mask.size(), DAG.getUNDEF(VT.getScalarType()));
4832 for (int I = 0, NumElts = Mask.size(); I != NumElts; ++I) {
4833 int M = Mask[I];
4834 if (M < 0)
4835 continue;
4836 SDValue V = (M < NumElts) ? V1 : V2;
4837 if (V.isUndef())
4838 continue;
4839 Ops[I] = V.getOperand(M % NumElts);
4840 }
4841 return DAG.getBuildVector(VT, dl, Ops);
4842 }
4843
4844 return DAG.getVectorShuffle(VT, dl, V1, V2, Mask);
4845}
4846
4847/// Returns a vector_shuffle node for an unpackl operation.
4848static SDValue getUnpackl(SelectionDAG &DAG, const SDLoc &dl, EVT VT,
4849 SDValue V1, SDValue V2) {
4850  SmallVector<int, 8> Mask;
4851  createUnpackShuffleMask(VT, Mask, /* Lo = */ true, /* Unary = */ false);
4852 return getVectorShuffle(DAG, VT, dl, V1, V2, Mask);
4853}
4854
4855/// Returns a vector_shuffle node for an unpackh operation.
4856static SDValue getUnpackh(SelectionDAG &DAG, const SDLoc &dl, EVT VT,
4857 SDValue V1, SDValue V2) {
4858  SmallVector<int, 8> Mask;
4859  createUnpackShuffleMask(VT, Mask, /* Lo = */ false, /* Unary = */ false);
4860 return getVectorShuffle(DAG, VT, dl, V1, V2, Mask);
4861}
4862
4863/// Returns a node that packs the LHS + RHS nodes together at half width.
4864/// May return X86ISD::PACKSS/PACKUS, packing the top/bottom half.
4865/// TODO: Add subvector splitting if/when we have a need for it.
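/// For example, packing two v8i32 operands into a v16i16 result keeps either
/// the low or the high 16 bits of every 32-bit element, selected by PackHiHalf.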
4866static SDValue getPack(SelectionDAG &DAG, const X86Subtarget &Subtarget,
4867 const SDLoc &dl, MVT VT, SDValue LHS, SDValue RHS,
4868 bool PackHiHalf = false) {
4869 MVT OpVT = LHS.getSimpleValueType();
4870 unsigned EltSizeInBits = VT.getScalarSizeInBits();
4871 bool UsePackUS = Subtarget.hasSSE41() || EltSizeInBits == 8;
4872 assert(OpVT == RHS.getSimpleValueType() &&
4873 VT.getSizeInBits() == OpVT.getSizeInBits() &&
4874 (EltSizeInBits * 2) == OpVT.getScalarSizeInBits() &&
4875 "Unexpected PACK operand types");
4876 assert((EltSizeInBits == 8 || EltSizeInBits == 16 || EltSizeInBits == 32) &&
4877 "Unexpected PACK result type");
4878
4879 // Rely on vector shuffles for vXi64 -> vXi32 packing.
4880 if (EltSizeInBits == 32) {
4881 SmallVector<int> PackMask;
4882 int Offset = PackHiHalf ? 1 : 0;
4883 int NumElts = VT.getVectorNumElements();
4884 for (int I = 0; I != NumElts; I += 4) {
4885 PackMask.push_back(I + Offset);
4886 PackMask.push_back(I + Offset + 2);
4887 PackMask.push_back(I + Offset + NumElts);
4888 PackMask.push_back(I + Offset + NumElts + 2);
4889 }
4890 return DAG.getVectorShuffle(VT, dl, DAG.getBitcast(VT, LHS),
4891 DAG.getBitcast(VT, RHS), PackMask);
4892 }
4893
4894 // See if we already have sufficient leading bits for PACKSS/PACKUS.
4895 if (!PackHiHalf) {
4896 if (UsePackUS &&
4897 DAG.computeKnownBits(LHS).countMaxActiveBits() <= EltSizeInBits &&
4898 DAG.computeKnownBits(RHS).countMaxActiveBits() <= EltSizeInBits)
4899 return DAG.getNode(X86ISD::PACKUS, dl, VT, LHS, RHS);
4900
4901 if (DAG.ComputeMaxSignificantBits(LHS) <= EltSizeInBits &&
4902 DAG.ComputeMaxSignificantBits(RHS) <= EltSizeInBits)
4903 return DAG.getNode(X86ISD::PACKSS, dl, VT, LHS, RHS);
4904 }
4905
4906 // Fallback to sign/zero extending the requested half and pack.
4907 SDValue Amt = DAG.getTargetConstant(EltSizeInBits, dl, MVT::i8);
4908 if (UsePackUS) {
4909 if (PackHiHalf) {
4910 LHS = DAG.getNode(X86ISD::VSRLI, dl, OpVT, LHS, Amt);
4911 RHS = DAG.getNode(X86ISD::VSRLI, dl, OpVT, RHS, Amt);
4912 } else {
4913 SDValue Mask = DAG.getConstant((1ULL << EltSizeInBits) - 1, dl, OpVT);
4914 LHS = DAG.getNode(ISD::AND, dl, OpVT, LHS, Mask);
4915 RHS = DAG.getNode(ISD::AND, dl, OpVT, RHS, Mask);
4916 };
4917 return DAG.getNode(X86ISD::PACKUS, dl, VT, LHS, RHS);
4918 };
4919
4920 if (!PackHiHalf) {
4921 LHS = DAG.getNode(X86ISD::VSHLI, dl, OpVT, LHS, Amt);
4922 RHS = DAG.getNode(X86ISD::VSHLI, dl, OpVT, RHS, Amt);
4923 }
4924 LHS = DAG.getNode(X86ISD::VSRAI, dl, OpVT, LHS, Amt);
4925 RHS = DAG.getNode(X86ISD::VSRAI, dl, OpVT, RHS, Amt);
4926 return DAG.getNode(X86ISD::PACKSS, dl, VT, LHS, RHS);
4927}
4928
4929/// Return a vector_shuffle of the specified vector and a zero or undef vector.
4930/// This produces a shuffle where the low element of V2 is swizzled into the
4931/// zero/undef vector, landing at element Idx.
4932/// This produces a shuffle mask like 4,1,2,3 (idx=0) or 0,1,2,4 (idx=3).
4933static SDValue getShuffleVectorZeroOrUndef(SDValue V2, int Idx,
4934                                           bool IsZero,
4935 const X86Subtarget &Subtarget,
4936 SelectionDAG &DAG) {
4937 MVT VT = V2.getSimpleValueType();
4938 SDValue V1 = IsZero
4939 ? getZeroVector(VT, Subtarget, DAG, SDLoc(V2)) : DAG.getUNDEF(VT);
4940 int NumElems = VT.getVectorNumElements();
4941 SmallVector<int, 16> MaskVec(NumElems);
4942 for (int i = 0; i != NumElems; ++i)
4943 // If this is the insertion idx, put the low elt of V2 here.
4944 MaskVec[i] = (i == Idx) ? NumElems : i;
4945 return DAG.getVectorShuffle(VT, SDLoc(V2), V1, V2, MaskVec);
4946}
4947
4948static ConstantPoolSDNode *getTargetConstantPoolFromBasePtr(SDValue Ptr) {
4949  if (Ptr.getOpcode() == X86ISD::Wrapper ||
4950 Ptr.getOpcode() == X86ISD::WrapperRIP)
4951 Ptr = Ptr.getOperand(0);
4952  return dyn_cast<ConstantPoolSDNode>(Ptr);
4953}
4954
4955// TODO: Add support for non-zero offsets.
4956static const Constant *getTargetConstantFromBasePtr(SDValue Ptr) {
4957  ConstantPoolSDNode *CNode = getTargetConstantPoolFromBasePtr(Ptr);
4958  if (!CNode || CNode->isMachineConstantPoolEntry() || CNode->getOffset() != 0)
4959 return nullptr;
4960 return CNode->getConstVal();
4961}
4962
4963static const Constant *getTargetConstantFromNode(LoadSDNode *Load) {
4964  if (!Load || !ISD::isNormalLoad(Load))
4965 return nullptr;
4966 return getTargetConstantFromBasePtr(Load->getBasePtr());
4967}
4968
4969static const Constant *getTargetConstantFromNode(SDValue Op) {
4970  Op = peekThroughBitcasts(Op);
4971  return getTargetConstantFromNode(dyn_cast<LoadSDNode>(Op));
4972}
4973
4974const Constant *
4975X86TargetLowering::getTargetConstantFromLoad(LoadSDNode *LD) const {
4976  assert(LD && "Unexpected null LoadSDNode");
4977 return getTargetConstantFromNode(LD);
4978}
4979
4981 // Do not fold (vselect not(C), X, 0s) to (vselect C, Os, X)
4982 SDValue Cond = N->getOperand(0);
4983 SDValue RHS = N->getOperand(2);
4984 EVT CondVT = Cond.getValueType();
4985 return N->getOpcode() == ISD::VSELECT && Subtarget.hasAVX512() &&
4986 CondVT.getVectorElementType() == MVT::i1 &&
4987 ISD::isBuildVectorAllZeros(RHS.getNode());
4988}
4989
4990// Extract raw constant bits from constant pools.
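// AllowWholeUndefs permits elements that are entirely undef; AllowPartialUndefs
// permits elements with only some undef bits, which are then treated as zero.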
4991static bool getTargetConstantBitsFromNode(SDValue Op, unsigned EltSizeInBits,
4992 APInt &UndefElts,
4993 SmallVectorImpl<APInt> &EltBits,
4994 bool AllowWholeUndefs = true,
4995 bool AllowPartialUndefs = false) {
4996 assert(EltBits.empty() && "Expected an empty EltBits vector");
4997
4999
5000 EVT VT = Op.getValueType();
5001 unsigned SizeInBits = VT.getSizeInBits();
5002 unsigned NumElts = SizeInBits / EltSizeInBits;
5003
5004 // Can't split constant.
5005 if ((SizeInBits % EltSizeInBits) != 0)
5006 return false;
5007
5008 // Bitcast a source array of element bits to the target size.
5009 auto CastBitData = [&](APInt &UndefSrcElts, ArrayRef<APInt> SrcEltBits) {
5010 unsigned NumSrcElts = UndefSrcElts.getBitWidth();
5011 unsigned SrcEltSizeInBits = SrcEltBits[0].getBitWidth();
5012 assert((NumSrcElts * SrcEltSizeInBits) == SizeInBits &&
5013 "Constant bit sizes don't match");
5014
5015 // Don't split if we don't allow undef bits.
5016 bool AllowUndefs = AllowWholeUndefs || AllowPartialUndefs;
5017 if (UndefSrcElts.getBoolValue() && !AllowUndefs)
5018 return false;
5019
5020 // If we're already the right size, don't bother bitcasting.
5021 if (NumSrcElts == NumElts) {
5022 UndefElts = UndefSrcElts;
5023 EltBits.assign(SrcEltBits.begin(), SrcEltBits.end());
5024 return true;
5025 }
5026
5027 // Extract all the undef/constant element data and pack into single bitsets.
5028 APInt UndefBits(SizeInBits, 0);
5029 APInt MaskBits(SizeInBits, 0);
5030
5031 for (unsigned i = 0; i != NumSrcElts; ++i) {
5032 unsigned BitOffset = i * SrcEltSizeInBits;
5033 if (UndefSrcElts[i])
5034 UndefBits.setBits(BitOffset, BitOffset + SrcEltSizeInBits);
5035 MaskBits.insertBits(SrcEltBits[i], BitOffset);
5036 }
5037
5038 // Split the undef/constant single bitset data into the target elements.
5039 UndefElts = APInt(NumElts, 0);
5040 EltBits.resize(NumElts, APInt(EltSizeInBits, 0));
5041
5042 for (unsigned i = 0; i != NumElts; ++i) {
5043 unsigned BitOffset = i * EltSizeInBits;
5044 APInt UndefEltBits = UndefBits.extractBits(EltSizeInBits, BitOffset);
5045
5046 // Only treat an element as UNDEF if all bits are UNDEF.
5047 if (UndefEltBits.isAllOnes()) {
5048 if (!AllowWholeUndefs)
5049 return false;
5050 UndefElts.setBit(i);
5051 continue;
5052 }
5053
5054 // If only some bits are UNDEF then treat them as zero (or bail if not
5055 // supported).
5056 if (UndefEltBits.getBoolValue() && !AllowPartialUndefs)
5057 return false;
5058
5059 EltBits[i] = MaskBits.extractBits(EltSizeInBits, BitOffset);
5060 }
5061 return true;
5062 };
5063
5064 // Collect constant bits and insert into mask/undef bit masks.
5065 auto CollectConstantBits = [](const Constant *Cst, APInt &Mask, APInt &Undefs,
5066 unsigned UndefBitIndex) {
5067 if (!Cst)
5068 return false;
5069 if (isa<UndefValue>(Cst)) {
5070 Undefs.setBit(UndefBitIndex);
5071 return true;
5072 }
5073 if (auto *CInt = dyn_cast<ConstantInt>(Cst)) {
5074 Mask = CInt->getValue();
5075 return true;
5076 }
5077 if (auto *CFP = dyn_cast<ConstantFP>(Cst)) {
5078 Mask = CFP->getValueAPF().bitcastToAPInt();
5079 return true;
5080 }
5081 if (auto *CDS = dyn_cast<ConstantDataSequential>(Cst)) {
5082 Type *Ty = CDS->getType();
5083 Mask = APInt::getZero(Ty->getPrimitiveSizeInBits());
5084 Type *EltTy = CDS->getElementType();
5085 bool IsInteger = EltTy->isIntegerTy();
5086 bool IsFP =
5087 EltTy->isHalfTy() || EltTy->isFloatTy() || EltTy->isDoubleTy();
5088 if (!IsInteger && !IsFP)
5089 return false;
5090 unsigned EltBits = EltTy->getPrimitiveSizeInBits();
5091 for (unsigned I = 0, E = CDS->getNumElements(); I != E; ++I)
5092 if (IsInteger)
5093 Mask.insertBits(CDS->getElementAsAPInt(I), I * EltBits);
5094 else
5095 Mask.insertBits(CDS->getElementAsAPFloat(I).bitcastToAPInt(),
5096 I * EltBits);
5097 return true;
5098 }
5099 return false;
5100 };
5101
5102 // Handle UNDEFs.
5103 if (Op.isUndef()) {
5104 APInt UndefSrcElts = APInt::getAllOnes(NumElts);
5105 SmallVector<APInt, 64> SrcEltBits(NumElts, APInt(EltSizeInBits, 0));
5106 return CastBitData(UndefSrcElts, SrcEltBits);
5107 }
5108
5109 // Extract scalar constant bits.
5110 if (auto *Cst = dyn_cast<ConstantSDNode>(Op)) {
5111 APInt UndefSrcElts = APInt::getZero(1);
5112 SmallVector<APInt, 64> SrcEltBits(1, Cst->getAPIntValue());
5113 return CastBitData(UndefSrcElts, SrcEltBits);
5114 }
5115 if (auto *Cst = dyn_cast<ConstantFPSDNode>(Op)) {
5116 APInt UndefSrcElts = APInt::getZero(1);
5117 APInt RawBits = Cst->getValueAPF().bitcastToAPInt();
5118 SmallVector<APInt, 64> SrcEltBits(1, RawBits);
5119 return CastBitData(UndefSrcElts, SrcEltBits);
5120 }
5121
5122 // Extract constant bits from build vector.
5123 if (auto *BV = dyn_cast<BuildVectorSDNode>(Op)) {
5124 BitVector Undefs;
5125 SmallVector<APInt> SrcEltBits;
5126 unsigned SrcEltSizeInBits = VT.getScalarSizeInBits();
5127 if (BV->getConstantRawBits(true, SrcEltSizeInBits, SrcEltBits, Undefs)) {
5128 APInt UndefSrcElts = APInt::getZero(SrcEltBits.size());
5129 for (unsigned I = 0, E = SrcEltBits.size(); I != E; ++I)
5130 if (Undefs[I])
5131 UndefSrcElts.setBit(I);
5132 return CastBitData(UndefSrcElts, SrcEltBits);
5133 }
5134 }
5135
5136 // Extract constant bits from constant pool vector.
5137 if (auto *Cst = getTargetConstantFromNode(Op)) {
5138 Type *CstTy = Cst->getType();
5139 unsigned CstSizeInBits = CstTy->getPrimitiveSizeInBits();
5140 if (!CstTy->isVectorTy() || (CstSizeInBits % SizeInBits) != 0)
5141 return false;
5142
5143 unsigned SrcEltSizeInBits = CstTy->getScalarSizeInBits();
5144 unsigned NumSrcElts = SizeInBits / SrcEltSizeInBits;
5145 if ((SizeInBits % SrcEltSizeInBits) != 0)
5146 return false;
5147
5148 APInt UndefSrcElts(NumSrcElts, 0);
5149 SmallVector<APInt, 64> SrcEltBits(NumSrcElts, APInt(SrcEltSizeInBits, 0));
5150 for (unsigned i = 0; i != NumSrcElts; ++i)
5151 if (!CollectConstantBits(Cst->getAggregateElement(i), SrcEltBits[i],
5152 UndefSrcElts, i))
5153 return false;
5154
5155 return CastBitData(UndefSrcElts, SrcEltBits);
5156 }
5157
5158 // Extract constant bits from a broadcasted constant pool scalar.
5159 if (Op.getOpcode() == X86ISD::VBROADCAST_LOAD &&
5160 EltSizeInBits <= VT.getScalarSizeInBits()) {
5161 auto *MemIntr = cast<MemIntrinsicSDNode>(Op);
5162 if (MemIntr->getMemoryVT().getStoreSizeInBits() != VT.getScalarSizeInBits())
5163 return false;
5164
5165 SDValue Ptr = MemIntr->getBasePtr();
5166    if (const Constant *C = getTargetConstantFromBasePtr(Ptr)) {
5167      unsigned SrcEltSizeInBits = VT.getScalarSizeInBits();
5168 unsigned NumSrcElts = SizeInBits / SrcEltSizeInBits;
5169
5170 APInt UndefSrcElts(NumSrcElts, 0);
5171 SmallVector<APInt, 64> SrcEltBits(1, APInt(SrcEltSizeInBits, 0));
5172 if (CollectConstantBits(C, SrcEltBits[0], UndefSrcElts, 0)) {
5173 if (UndefSrcElts[0])
5174 UndefSrcElts.setBits(0, NumSrcElts);
5175 if (SrcEltBits[0].getBitWidth() != SrcEltSizeInBits)
5176 SrcEltBits[0] = SrcEltBits[0].trunc(SrcEltSizeInBits);
5177 SrcEltBits.append(NumSrcElts - 1, SrcEltBits[0]);
5178 return CastBitData(UndefSrcElts, SrcEltBits);
5179 }
5180 }
5181 }
5182
5183 // Extract constant bits from a subvector broadcast.
5184 if (Op.getOpcode() == X86ISD::SUBV_BROADCAST_LOAD) {
5185 auto *MemIntr = cast<MemIntrinsicSDNode>(Op);
5186 SDValue Ptr = MemIntr->getBasePtr();
5187 // The source constant may be larger than the subvector broadcast,
5188 // ensure we extract the correct subvector constants.
5189 if (const Constant *Cst = getTargetConstantFromBasePtr(Ptr)) {
5190 Type *CstTy = Cst->getType();
5191 unsigned CstSizeInBits = CstTy->getPrimitiveSizeInBits();
5192 unsigned SubVecSizeInBits = MemIntr->getMemoryVT().getStoreSizeInBits();
5193 if (!CstTy->isVectorTy() || (CstSizeInBits % SubVecSizeInBits) != 0 ||
5194 (SizeInBits % SubVecSizeInBits) != 0)
5195 return false;
5196 unsigned CstEltSizeInBits = CstTy->getScalarSizeInBits();
5197 unsigned NumSubElts = SubVecSizeInBits / CstEltSizeInBits;
5198 unsigned NumSubVecs = SizeInBits / SubVecSizeInBits;
5199 APInt UndefSubElts(NumSubElts, 0);
5200 SmallVector<APInt, 64> SubEltBits(NumSubElts * NumSubVecs,
5201 APInt(CstEltSizeInBits, 0));
5202 for (unsigned i = 0; i != NumSubElts; ++i) {
5203 if (!CollectConstantBits(Cst->getAggregateElement(i), SubEltBits[i],
5204 UndefSubElts, i))
5205 return false;
5206 for (unsigned j = 1; j != NumSubVecs; ++j)
5207 SubEltBits[i + (j * NumSubElts)] = SubEltBits[i];
5208 }
5209 UndefSubElts = APInt::getSplat(NumSubVecs * UndefSubElts.getBitWidth(),
5210 UndefSubElts);
5211 return CastBitData(UndefSubElts, SubEltBits);
5212 }
5213 }
5214
5215 // Extract a rematerialized scalar constant insertion.
5216 if (Op.getOpcode() == X86ISD::VZEXT_MOVL &&
5217 Op.getOperand(0).getOpcode() == ISD::SCALAR_TO_VECTOR &&
5218 isa<ConstantSDNode>(Op.getOperand(0).getOperand(0))) {
5219 unsigned SrcEltSizeInBits = VT.getScalarSizeInBits();
5220 unsigned NumSrcElts = SizeInBits / SrcEltSizeInBits;
5221
5222 APInt UndefSrcElts(NumSrcElts, 0);
5223 SmallVector<APInt, 64> SrcEltBits;
5224 const APInt &C = Op.getOperand(0).getConstantOperandAPInt(0);
5225 SrcEltBits.push_back(C.zextOrTrunc(SrcEltSizeInBits));
5226 SrcEltBits.append(NumSrcElts - 1, APInt(SrcEltSizeInBits, 0));
5227 return CastBitData(UndefSrcElts, SrcEltBits);
5228 }
5229
5230 // Insert constant bits from a base and sub vector sources.
5231 if (Op.getOpcode() == ISD::INSERT_SUBVECTOR) {
5232    // If bitcasting to larger elements we might lose track of undefs, so
5233    // don't allow any to be safe.
5234 unsigned SrcEltSizeInBits = VT.getScalarSizeInBits();
5235 bool AllowUndefs = EltSizeInBits >= SrcEltSizeInBits;
5236
5237 APInt UndefSrcElts, UndefSubElts;
5238 SmallVector<APInt, 32> EltSrcBits, EltSubBits;
5239 if (getTargetConstantBitsFromNode(Op.getOperand(1), SrcEltSizeInBits,
5240 UndefSubElts, EltSubBits,
5241 AllowWholeUndefs && AllowUndefs,
5242 AllowPartialUndefs && AllowUndefs) &&
5243 getTargetConstantBitsFromNode(Op.getOperand(0), SrcEltSizeInBits,
5244 UndefSrcElts, EltSrcBits,
5245 AllowWholeUndefs && AllowUndefs,
5246 AllowPartialUndefs && AllowUndefs)) {
5247 unsigned BaseIdx = Op.getConstantOperandVal(2);
5248 UndefSrcElts.insertBits(UndefSubElts, BaseIdx);
5249 for (unsigned i = 0, e = EltSubBits.size(); i != e; ++i)
5250 EltSrcBits[BaseIdx + i] = EltSubBits[i];
5251 return CastBitData(UndefSrcElts, EltSrcBits);
5252 }
5253 }
5254
5255 // Extract constant bits from a subvector's source.
5256 if (Op.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
5257 getTargetConstantBitsFromNode(Op.getOperand(0), EltSizeInBits, UndefElts,
5258 EltBits, AllowWholeUndefs,
5259 AllowPartialUndefs)) {
5260 EVT SrcVT = Op.getOperand(0).getValueType();
5261 unsigned NumSrcElts = SrcVT.getSizeInBits() / EltSizeInBits;
5262 unsigned NumSubElts = VT.getSizeInBits() / EltSizeInBits;
5263 unsigned BaseOfs = Op.getConstantOperandVal(1) * VT.getScalarSizeInBits();
5264 unsigned BaseIdx = BaseOfs / EltSizeInBits;
5265 assert((SrcVT.getSizeInBits() % EltSizeInBits) == 0 &&
5266 (VT.getSizeInBits() % EltSizeInBits) == 0 &&
5267 (BaseOfs % EltSizeInBits) == 0 && "Bad subvector index");
5268
5269 UndefElts = UndefElts.extractBits(NumSubElts, BaseIdx);
5270 if ((BaseIdx + NumSubElts) != NumSrcElts)
5271 EltBits.erase(EltBits.begin() + BaseIdx + NumSubElts, EltBits.end());
5272 if (BaseIdx != 0)
5273 EltBits.erase(EltBits.begin(), EltBits.begin() + BaseIdx);
5274 return true;
5275 }
5276
5277 // Extract constant bits from shuffle node sources.
5278 if (auto *SVN = dyn_cast<ShuffleVectorSDNode>(Op)) {
5279 // TODO - support shuffle through bitcasts.
5280 if (EltSizeInBits != VT.getScalarSizeInBits())
5281 return false;
5282
5283 ArrayRef<int> Mask = SVN->getMask();
5284 if ((!AllowWholeUndefs || !AllowPartialUndefs) &&
5285 llvm::any_of(Mask, [](int M) { return M < 0; }))
5286 return false;
5287
5288 APInt UndefElts0, UndefElts1;
5289 SmallVector<APInt, 32> EltBits0, EltBits1;
5290 if (isAnyInRange(Mask, 0, NumElts) &&
5291 !getTargetConstantBitsFromNode(Op.getOperand(0), EltSizeInBits,
5292 UndefElts0, EltBits0, AllowWholeUndefs,
5293 AllowPartialUndefs))
5294 return false;
5295 if (isAnyInRange(Mask, NumElts, 2 * NumElts) &&
5296 !getTargetConstantBitsFromNode(Op.getOperand(1), EltSizeInBits,
5297 UndefElts1, EltBits1, AllowWholeUndefs,
5298 AllowPartialUndefs))
5299 return false;
5300
5301 UndefElts = APInt::getZero(NumElts);
5302 for (int i = 0; i != (int)NumElts; ++i) {
5303 int M = Mask[i];
5304 if (M < 0) {
5305 UndefElts.setBit(i);
5306 EltBits.push_back(APInt::getZero(EltSizeInBits));
5307 } else if (M < (int)NumElts) {
5308 if (UndefElts0[M])
5309 UndefElts.setBit(i);
5310 EltBits.push_back(EltBits0[M]);
5311 } else {
5312 if (UndefElts1[M - NumElts])
5313 UndefElts.setBit(i);
5314 EltBits.push_back(EltBits1[M - NumElts]);
5315 }
5316 }
5317 return true;
5318 }
5319
5320 return false;
5321}
5322
5323namespace llvm {
5324namespace X86 {
5325bool isConstantSplat(SDValue Op, APInt &SplatVal, bool AllowPartialUndefs) {
5326 APInt UndefElts;
5327 SmallVector<APInt, 16> EltBits;
5328  if (getTargetConstantBitsFromNode(
5329          Op, Op.getScalarValueSizeInBits(), UndefElts, EltBits,
5330 /*AllowWholeUndefs*/ true, AllowPartialUndefs)) {
5331 int SplatIndex = -1;
5332 for (int i = 0, e = EltBits.size(); i != e; ++i) {
5333 if (UndefElts[i])
5334 continue;
5335 if (0 <= SplatIndex && EltBits[i] != EltBits[SplatIndex]) {
5336 SplatIndex = -1;
5337 break;
5338 }
5339 SplatIndex = i;
5340 }
5341 if (0 <= SplatIndex) {
5342 SplatVal = EltBits[SplatIndex];
5343 return true;
5344 }
5345 }
5346
5347 return false;
5348}
5349} // namespace X86
5350} // namespace llvm
5351
5352static bool getTargetShuffleMaskIndices(SDValue MaskNode,
5353                                        unsigned MaskEltSizeInBits,
5354                                        SmallVectorImpl<uint64_t> &RawMask,
5355                                        APInt &UndefElts) {
5356 // Extract the raw target constant bits.
5357 SmallVector<APInt, 64> EltBits;
5358 if (!getTargetConstantBitsFromNode(MaskNode, MaskEltSizeInBits, UndefElts,
5359 EltBits, /* AllowWholeUndefs */ true,
5360 /* AllowPartialUndefs */ false))
5361 return false;
5362
5363 // Insert the extracted elements into the mask.
5364 for (const APInt &Elt : EltBits)
5365 RawMask.push_back(Elt.getZExtValue());
5366
5367 return true;
5368}
5369
5370static bool isConstantPowerOf2(SDValue V, unsigned EltSizeInBIts,
5371 bool AllowUndefs) {
5372 APInt UndefElts;
5373 SmallVector<APInt, 64> EltBits;
5374 if (!getTargetConstantBitsFromNode(V, EltSizeInBIts, UndefElts, EltBits,
5375 /*AllowWholeUndefs*/ AllowUndefs,
5376 /*AllowPartialUndefs*/ false))
5377 return false;
5378
5379 bool IsPow2OrUndef = true;
5380 for (unsigned I = 0, E = EltBits.size(); I != E; ++I)
5381 IsPow2OrUndef &= UndefElts[I] || EltBits[I].isPowerOf2();
5382 return IsPow2OrUndef;
5383}
5384
5385// Helper to attempt to return a cheaper, bit-inverted version of \p V.
5387 // TODO: don't always ignore oneuse constraints.
5388 V = peekThroughBitcasts(V);
5389 EVT VT = V.getValueType();
5390
5391 // Match not(xor X, -1) -> X.
5392 if (V.getOpcode() == ISD::XOR &&
5393 (ISD::isBuildVectorAllOnes(V.getOperand(1).getNode()) ||
5394 isAllOnesConstant(V.getOperand(1))))
5395 return V.getOperand(0);
5396
5397 // Match not(extract_subvector(not(X))) -> extract_subvector(X).
5398 if (V.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
5399 (isNullConstant(V.getOperand(1)) || V.getOperand(0).hasOneUse())) {
5400 if (SDValue Not = IsNOT(V.getOperand(0), DAG)) {
5401 Not = DAG.getBitcast(V.getOperand(0).getValueType(), Not);
5402 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, SDLoc(Not), VT, Not,
5403 V.getOperand(1));
5404 }
5405 }
5406
5407 // Match not(pcmpgt(C, X)) -> pcmpgt(X, C - 1).
5408 if (V.getOpcode() == X86ISD::PCMPGT &&
5409 !ISD::isBuildVectorAllZeros(V.getOperand(0).getNode()) &&
5410 !ISD::isBuildVectorAllOnes(V.getOperand(0).getNode()) &&
5411 V.getOperand(0).hasOneUse()) {
5412 APInt UndefElts;
5413 SmallVector<APInt> EltBits;
5414 if (getTargetConstantBitsFromNode(V.getOperand(0),
5415 V.getScalarValueSizeInBits(), UndefElts,
5416 EltBits) &&
5417 !ISD::isBuildVectorOfConstantSDNodes(V.getOperand(1).getNode())) {
5418 // Don't fold min_signed_value -> (min_signed_value - 1)
5419 bool MinSigned = false;
5420 for (APInt &Elt : EltBits) {
5421 MinSigned |= Elt.isMinSignedValue();
5422 Elt -= 1;
5423 }
5424 if (!MinSigned) {
5425 SDLoc DL(V);
5426 MVT VT = V.getSimpleValueType();
5427 return DAG.getNode(X86ISD::PCMPGT, DL, VT, V.getOperand(1),
5428 getConstVector(EltBits, UndefElts, VT, DAG, DL));
5429 }
5430 }
5431 }
5432
5433 // Match not(concat_vectors(not(X), not(Y))) -> concat_vectors(X, Y).
5434 SmallVector<SDValue, 2> CatOps;
5435 if (collectConcatOps(V.getNode(), CatOps, DAG)) {
5436 for (SDValue &CatOp : CatOps) {
5437 SDValue NotCat = IsNOT(CatOp, DAG);
5438 if (!NotCat)
5439 return SDValue();
5440 CatOp = DAG.getBitcast(CatOp.getValueType(), NotCat);
5441 }
5442 return DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(V), VT, CatOps);
5443 }
5444
5445 // Match not(or(not(X),not(Y))) -> and(X, Y).
5446 if (V.getOpcode() == ISD::OR && DAG.getTargetLoweringInfo().isTypeLegal(VT) &&
5447 V.getOperand(0).hasOneUse() && V.getOperand(1).hasOneUse()) {
5448 // TODO: Handle cases with single NOT operand -> ANDNP
5449 if (SDValue Op1 = IsNOT(V.getOperand(1), DAG))
5450 if (SDValue Op0 = IsNOT(V.getOperand(0), DAG))
5451 return DAG.getNode(ISD::AND, SDLoc(V), VT, DAG.getBitcast(VT, Op0),
5452 DAG.getBitcast(VT, Op1));
5453 }
5454
5455 return SDValue();
5456}
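// Worked example: IsNOT(xor(X, splat(-1))) returns X, and
// IsNOT(pcmpgt(splat(C), X)) can return pcmpgt(X, splat(C - 1)), except when
// any constant element is the minimum signed value (where C - 1 would wrap
// and change the signed comparison).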
5457
5458/// Create a shuffle mask that matches the PACKSS/PACKUS truncation.
5459/// A multi-stage pack shuffle mask is created by specifying NumStages > 1.
5460/// Note: This ignores saturation, so inputs must be checked first.
5461 static void createPackShuffleMask(MVT VT, SmallVectorImpl<int> &Mask,
5462 bool Unary, unsigned NumStages = 1) {
5463 assert(Mask.empty() && "Expected an empty shuffle mask vector");
5464 unsigned NumElts = VT.getVectorNumElements();
5465 unsigned NumLanes = VT.getSizeInBits() / 128;
5466 unsigned NumEltsPerLane = 128 / VT.getScalarSizeInBits();
5467 unsigned Offset = Unary ? 0 : NumElts;
5468 unsigned Repetitions = 1u << (NumStages - 1);
5469 unsigned Increment = 1u << NumStages;
5470 assert((NumEltsPerLane >> NumStages) > 0 && "Illegal packing compaction");
5471
5472 for (unsigned Lane = 0; Lane != NumLanes; ++Lane) {
5473 for (unsigned Stage = 0; Stage != Repetitions; ++Stage) {
5474 for (unsigned Elt = 0; Elt != NumEltsPerLane; Elt += Increment)
5475 Mask.push_back(Elt + (Lane * NumEltsPerLane));
5476 for (unsigned Elt = 0; Elt != NumEltsPerLane; Elt += Increment)
5477 Mask.push_back(Elt + (Lane * NumEltsPerLane) + Offset);
5478 }
5479 }
5480}
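// Worked example: for MVT::v16i8 with Unary=false and NumStages=1 (a single
// PACKSSWB/PACKUSWB of two v8i16 inputs) this produces the truncation mask
// { 0,2,4,6,8,10,12,14, 16,18,20,22,24,26,28,30 }.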
5481
5482// Split the demanded elts of a PACKSS/PACKUS node between its operands.
5483static void getPackDemandedElts(EVT VT, const APInt &DemandedElts,
5484 APInt &DemandedLHS, APInt &DemandedRHS) {
5485 int NumLanes = VT.getSizeInBits() / 128;
5486 int NumElts = DemandedElts.getBitWidth();
5487 int NumInnerElts = NumElts / 2;
5488 int NumEltsPerLane = NumElts / NumLanes;
5489 int NumInnerEltsPerLane = NumInnerElts / NumLanes;
5490
5491 DemandedLHS = APInt::getZero(NumInnerElts);
5492 DemandedRHS = APInt::getZero(NumInnerElts);
5493
5494 // Map DemandedElts to the packed operands.
5495 for (int Lane = 0; Lane != NumLanes; ++Lane) {
5496 for (int Elt = 0; Elt != NumInnerEltsPerLane; ++Elt) {
5497 int OuterIdx = (Lane * NumEltsPerLane) + Elt;
5498 int InnerIdx = (Lane * NumInnerEltsPerLane) + Elt;
5499 if (DemandedElts[OuterIdx])
5500 DemandedLHS.setBit(InnerIdx);
5501 if (DemandedElts[OuterIdx + NumInnerEltsPerLane])
5502 DemandedRHS.setBit(InnerIdx);
5503 }
5504 }
5505}
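// Worked example: for a 256-bit pack (v32i8 result from two v16i16 inputs),
// demanding result element 20 (lane 1, low half) sets DemandedLHS bit 12,
// while demanding result element 26 (lane 1, high half) sets DemandedRHS
// bit 10.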
5506
5507// Split the demanded elts of a HADD/HSUB node between its operands.
5508static void getHorizDemandedElts(EVT VT, const APInt &DemandedElts,
5509 APInt &DemandedLHS, APInt &DemandedRHS) {
5510 getHorizDemandedEltsForFirstOperand(VT.getSizeInBits(), DemandedElts,
5511 DemandedLHS, DemandedRHS);
5512 DemandedLHS |= DemandedLHS << 1;
5513 DemandedRHS |= DemandedRHS << 1;
5514}
5515
5516/// Calculates the shuffle mask corresponding to the target-specific opcode.
5517/// If the mask could be calculated, returns it in \p Mask, returns the shuffle
5518/// operands in \p Ops, and returns true.
5519/// Sets \p IsUnary to true if only one source is used. Note that this will set
5520/// IsUnary for shuffles which use a single input multiple times, and in those
5521/// cases it will adjust the mask to only have indices within that single input.
5522/// It is an error to call this with non-empty Mask/Ops vectors.
5523static bool getTargetShuffleMask(SDValue N, bool AllowSentinelZero,
5524 SmallVectorImpl<SDValue> &Ops,
5525 SmallVectorImpl<int> &Mask, bool &IsUnary) {
5526 if (!isTargetShuffle(N.getOpcode()))
5527 return false;
5528
5529 MVT VT = N.getSimpleValueType();
5530 unsigned NumElems = VT.getVectorNumElements();
5531 unsigned MaskEltSize = VT.getScalarSizeInBits();
5532 SmallVector<uint64_t, 32> RawMask;
5533 APInt RawUndefs;
5534 uint64_t ImmN;
5535
5536 assert(Mask.empty() && "getTargetShuffleMask expects an empty Mask vector");
5537 assert(Ops.empty() && "getTargetShuffleMask expects an empty Ops vector");
5538
5539 IsUnary = false;
5540 bool IsFakeUnary = false;
5541 switch (N.getOpcode()) {
5542 case X86ISD::BLENDI:
5543 assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
5544 assert(N.getOperand(1).getValueType() == VT && "Unexpected value type");
5545 ImmN = N.getConstantOperandVal(N.getNumOperands() - 1);
5546 DecodeBLENDMask(NumElems, ImmN, Mask);
5547 IsUnary = IsFakeUnary = N.getOperand(0) == N.getOperand(1);
5548 break;
5549 case X86ISD::SHUFP:
5550 assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
5551 assert(N.getOperand(1).getValueType() == VT && "Unexpected value type");
5552 ImmN = N.getConstantOperandVal(N.getNumOperands() - 1);
5553 DecodeSHUFPMask(NumElems, MaskEltSize, ImmN, Mask);
5554 IsUnary = IsFakeUnary = N.getOperand(0) == N.getOperand(1);
5555 break;
5556 case X86ISD::INSERTPS:
5557 assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
5558 assert(N.getOperand(1).getValueType() == VT && "Unexpected value type");
5559 ImmN = N.getConstantOperandVal(N.getNumOperands() - 1);
5560 DecodeINSERTPSMask(ImmN, Mask, /*SrcIsMem=*/false);
5561 IsUnary = IsFakeUnary = N.getOperand(0) == N.getOperand(1);
5562 break;
5563 case X86ISD::EXTRQI:
5564 assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
5565 if (isa<ConstantSDNode>(N.getOperand(1)) &&
5566 isa<ConstantSDNode>(N.getOperand(2))) {
5567 int BitLen = N.getConstantOperandVal(1);
5568 int BitIdx = N.getConstantOperandVal(2);
5569 DecodeEXTRQIMask(NumElems, MaskEltSize, BitLen, BitIdx, Mask);
5570 IsUnary = true;
5571 }
5572 break;
5573 case X86ISD::INSERTQI:
5574 assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
5575 assert(N.getOperand(1).getValueType() == VT && "Unexpected value type");
5576 if (isa<ConstantSDNode>(N.getOperand(2)) &&
5577 isa<ConstantSDNode>(N.getOperand(3))) {
5578 int BitLen = N.getConstantOperandVal(2);
5579 int BitIdx = N.getConstantOperandVal(3);
5580 DecodeINSERTQIMask(NumElems, MaskEltSize, BitLen, BitIdx, Mask);
5581 IsUnary = IsFakeUnary = N.getOperand(0) == N.getOperand(1);
5582 }
5583 break;
5584 case X86ISD::UNPCKH:
5585 assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
5586 assert(N.getOperand(1).getValueType() == VT && "Unexpected value type");
5587 DecodeUNPCKHMask(NumElems, MaskEltSize, Mask);
5588 IsUnary = IsFakeUnary = N.getOperand(0) == N.getOperand(1);
5589 break;
5590 case X86ISD::UNPCKL:
5591 assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
5592 assert(N.getOperand(1).getValueType() == VT && "Unexpected value type");
5593 DecodeUNPCKLMask(NumElems, MaskEltSize, Mask);
5594 IsUnary = IsFakeUnary = N.getOperand(0) == N.getOperand(1);
5595 break;
5596 case X86ISD::MOVHLPS:
5597 assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
5598 assert(N.getOperand(1).getValueType() == VT && "Unexpected value type");
5599 DecodeMOVHLPSMask(NumElems, Mask);
5600 IsUnary = IsFakeUnary = N.getOperand(0) == N.getOperand(1);
5601 break;
5602 case X86ISD::MOVLHPS:
5603 assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
5604 assert(N.getOperand(1).getValueType() == VT && "Unexpected value type");
5605 DecodeMOVLHPSMask(NumElems, Mask);
5606 IsUnary = IsFakeUnary = N.getOperand(0) == N.getOperand(1);
5607 break;
5608 case X86ISD::VALIGN:
5609 assert((VT.getScalarType() == MVT::i32 || VT.getScalarType() == MVT::i64) &&
5610 "Only 32-bit and 64-bit elements are supported!");
5611 assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
5612 assert(N.getOperand(1).getValueType() == VT && "Unexpected value type");
5613 ImmN = N.getConstantOperandVal(N.getNumOperands() - 1);
5614 DecodeVALIGNMask(NumElems, ImmN, Mask);
5615 IsUnary = IsFakeUnary = N.getOperand(0) == N.getOperand(1);
5616 Ops.push_back(N.getOperand(1));
5617 Ops.push_back(N.getOperand(0));
5618 break;
5619 case X86ISD::PALIGNR:
5620 assert(VT.getScalarType() == MVT::i8 && "Byte vector expected");
5621 assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
5622 assert(N.getOperand(1).getValueType() == VT && "Unexpected value type");
5623 ImmN = N.getConstantOperandVal(N.getNumOperands() - 1);
5624 DecodePALIGNRMask(NumElems, ImmN, Mask);
5625 IsUnary = IsFakeUnary = N.getOperand(0) == N.getOperand(1);
5626 Ops.push_back(N.getOperand(1));
5627 Ops.push_back(N.getOperand(0));
5628 break;
5629 case X86ISD::VSHLDQ:
5630 assert(VT.getScalarType() == MVT::i8 && "Byte vector expected");
5631 assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
5632 ImmN = N.getConstantOperandVal(N.getNumOperands() - 1);
5633 DecodePSLLDQMask(NumElems, ImmN, Mask);
5634 IsUnary = true;
5635 break;
5636 case X86ISD::VSRLDQ:
5637 assert(VT.getScalarType() == MVT::i8 && "Byte vector expected");
5638 assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
5639 ImmN = N.getConstantOperandVal(N.getNumOperands() - 1);
5640 DecodePSRLDQMask(NumElems, ImmN, Mask);
5641 IsUnary = true;
5642 break;
5643 case X86ISD::PSHUFD:
5644 case X86ISD::VPERMILPI:
5645 assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
5646 ImmN = N.getConstantOperandVal(N.getNumOperands() - 1);
5647 DecodePSHUFMask(NumElems, MaskEltSize, ImmN, Mask);
5648 IsUnary = true;
5649 break;
5650 case X86ISD::PSHUFHW:
5651 assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
5652 ImmN = N.getConstantOperandVal(N.getNumOperands() - 1);
5653 DecodePSHUFHWMask(NumElems, ImmN, Mask);
5654 IsUnary = true;
5655 break;
5656 case X86ISD::PSHUFLW:
5657 assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
5658 ImmN = N.getConstantOperandVal(N.getNumOperands() - 1);
5659 DecodePSHUFLWMask(NumElems, ImmN, Mask);
5660 IsUnary = true;
5661 break;
5662 case X86ISD::VZEXT_MOVL:
5663 assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
5664 DecodeZeroMoveLowMask(NumElems, Mask);
5665 IsUnary = true;
5666 break;
5667 case X86ISD::VBROADCAST:
5668 // We only decode broadcasts of same-sized vectors, as peeking through to
5669 // extracted subvectors is likely to cause hasOneUse issues with
5670 // SimplifyDemandedBits etc.
5671 if (N.getOperand(0).getValueType() == VT) {
5672 DecodeVectorBroadcast(NumElems, Mask);
5673 IsUnary = true;
5674 break;
5675 }
5676 return false;
5677 case X86ISD::VPERMILPV: {
5678 assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
5679 IsUnary = true;
5680 SDValue MaskNode = N.getOperand(1);
5681 if (getTargetShuffleMaskIndices(MaskNode, MaskEltSize, RawMask,
5682 RawUndefs)) {
5683 DecodeVPERMILPMask(NumElems, MaskEltSize, RawMask, RawUndefs, Mask);
5684 break;
5685 }
5686 return false;
5687 }
5688 case X86ISD::PSHUFB: {
5689 assert(VT.getScalarType() == MVT::i8 && "Byte vector expected");
5690 assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
5691 assert(N.getOperand(1).getValueType() == VT && "Unexpected value type");
5692 IsUnary = true;
5693 SDValue MaskNode = N.getOperand(1);
5694 if (getTargetShuffleMaskIndices(MaskNode, 8, RawMask, RawUndefs)) {
5695 DecodePSHUFBMask(RawMask, RawUndefs, Mask);
5696 break;
5697 }
5698 return false;
5699 }
5700 case X86ISD::VPERMI:
5701 assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
5702 ImmN = N.getConstantOperandVal(N.getNumOperands() - 1);
5703 DecodeVPERMMask(NumElems, ImmN, Mask);
5704 IsUnary = true;
5705 break;
5706 case X86ISD::MOVSS:
5707 case X86ISD::MOVSD:
5708 case X86ISD::MOVSH:
5709 assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
5710 assert(N.getOperand(1).getValueType() == VT && "Unexpected value type");
5711 DecodeScalarMoveMask(NumElems, /* IsLoad */ false, Mask);
5712 break;
5713 case X86ISD::VPERM2X128:
5714 assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
5715 assert(N.getOperand(1).getValueType() == VT && "Unexpected value type");
5716 ImmN = N.getConstantOperandVal(N.getNumOperands() - 1);
5717 DecodeVPERM2X128Mask(NumElems, ImmN, Mask);
5718 IsUnary = IsFakeUnary = N.getOperand(0) == N.getOperand(1);
5719 break;
5720 case X86ISD::SHUF128:
5721 assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
5722 assert(N.getOperand(1).getValueType() == VT && "Unexpected value type");
5723 ImmN = N.getConstantOperandVal(N.getNumOperands() - 1);
5724 decodeVSHUF64x2FamilyMask(NumElems, MaskEltSize, ImmN, Mask);
5725 IsUnary = IsFakeUnary = N.getOperand(0) == N.getOperand(1);
5726 break;
5727 case X86ISD::MOVSLDUP:
5728 assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
5729 DecodeMOVSLDUPMask(NumElems, Mask);
5730 IsUnary = true;
5731 break;
5732 case X86ISD::MOVSHDUP:
5733 assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
5734 DecodeMOVSHDUPMask(NumElems, Mask);
5735 IsUnary = true;
5736 break;
5737 case X86ISD::MOVDDUP:
5738 assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
5739 DecodeMOVDDUPMask(NumElems, Mask);
5740 IsUnary = true;
5741 break;
5742 case X86ISD::VPERMIL2: {
5743 assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
5744 assert(N.getOperand(1).getValueType() == VT && "Unexpected value type");
5745 IsUnary = IsFakeUnary = N.getOperand(0) == N.getOperand(1);
5746 SDValue MaskNode = N.getOperand(2);
5747 SDValue CtrlNode = N.getOperand(3);
5748 if (ConstantSDNode *CtrlOp = dyn_cast<ConstantSDNode>(CtrlNode)) {
5749 unsigned CtrlImm = CtrlOp->getZExtValue();
5750 if (getTargetShuffleMaskIndices(MaskNode, MaskEltSize, RawMask,
5751 RawUndefs)) {
5752 DecodeVPERMIL2PMask(NumElems, MaskEltSize, CtrlImm, RawMask, RawUndefs,
5753 Mask);
5754 break;
5755 }
5756 }
5757 return false;
5758 }
5759 case X86ISD::VPPERM: {
5760 assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
5761 assert(N.getOperand(1).getValueType() == VT && "Unexpected value type");
5762 IsUnary = IsFakeUnary = N.getOperand(0) == N.getOperand(1);
5763 SDValue MaskNode = N.getOperand(2);
5764 if (getTargetShuffleMaskIndices(MaskNode, 8, RawMask, RawUndefs)) {
5765 DecodeVPPERMMask(RawMask, RawUndefs, Mask);
5766 break;
5767 }
5768 return false;
5769 }
5770 case X86ISD::VPERMV: {
5771 assert(N.getOperand(1).getValueType() == VT && "Unexpected value type");
5772 IsUnary = true;
5773 // Unlike most shuffle nodes, VPERMV's mask operand is operand 0.
5774 Ops.push_back(N.getOperand(1));
5775 SDValue MaskNode = N.getOperand(0);
5776 if (getTargetShuffleMaskIndices(MaskNode, MaskEltSize, RawMask,
5777 RawUndefs)) {
5778 DecodeVPERMVMask(RawMask, RawUndefs, Mask);
5779 break;
5780 }
5781 return false;
5782 }
5783 case X86ISD::VPERMV3: {
5784 assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
5785 assert(N.getOperand(2).getValueType() == VT && "Unexpected value type");
5786 IsUnary = IsFakeUnary = N.getOperand(0) == N.getOperand(2);
5787 // Unlike most shuffle nodes, VPERMV3's mask operand is the middle one.
5788 Ops.push_back(N.getOperand(0));
5789 Ops.push_back(N.getOperand(2));
5790 SDValue MaskNode = N.getOperand(1);
5791 if (getTargetShuffleMaskIndices(MaskNode, MaskEltSize, RawMask,
5792 RawUndefs)) {
5793 DecodeVPERMV3Mask(RawMask, RawUndefs, Mask);
5794 break;
5795 }
5796 return false;
5797 }
5798 default:
5799 llvm_unreachable("unknown target shuffle node");
5800 }
5801
5802 // Empty mask indicates the decode failed.
5803 if (Mask.empty())
5804 return false;
5805
5806 // Check if we're getting a shuffle mask with zero'd elements.
5807 if (!AllowSentinelZero && isAnyZero(Mask))
5808 return false;
5809
5810 // If we have a fake unary shuffle, the shuffle mask is spread across two
5811 // inputs that are actually the same node. Re-map the mask to always point
5812 // into the first input.
5813 if (IsFakeUnary)
5814 for (int &M : Mask)
5815 if (M >= (int)Mask.size())
5816 M -= Mask.size();
5817
5818 // If we didn't already add operands in the opcode-specific code, default to
5819 // adding 1 or 2 operands starting at 0.
5820 if (Ops.empty()) {
5821 Ops.push_back(N.getOperand(0));
5822 if (!IsUnary || IsFakeUnary)
5823 Ops.push_back(N.getOperand(1));
5824 }
5825
5826 return true;
5827}
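// Worked example: for X86ISD::UNPCKL on v4i32 this returns
// Mask = { 0, 4, 1, 5 } with Ops = { Op0, Op1 }; if both operands are the
// same node (a "fake unary" shuffle) the mask is remapped to { 0, 0, 1, 1 }.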
5828
5829// Wrapper for getTargetShuffleMask that discards the IsUnary result.
5830static bool getTargetShuffleMask(SDValue N, bool AllowSentinelZero,
5831 SmallVectorImpl<SDValue> &Ops,
5832 SmallVectorImpl<int> &Mask) {
5833 bool IsUnary;
5834 return getTargetShuffleMask(N, AllowSentinelZero, Ops, Mask, IsUnary);
5835}
5836
5837/// Compute whether each element of a shuffle is zeroable.
5838///
5839/// A "zeroable" vector shuffle element is one which can be lowered to zero.
5840/// Either it is an undef element in the shuffle mask, the element of the input
5841/// referenced is undef, or the element of the input referenced is known to be
5842/// zero. Many x86 shuffles can zero lanes cheaply and we often want to handle
5843/// as many lanes with this technique as possible to simplify the remaining
5844/// shuffle.
5845 static void computeZeroableShuffleElements(ArrayRef<int> Mask,
5846 SDValue V1, SDValue V2,
5847 APInt &KnownUndef, APInt &KnownZero) {
5848 int Size = Mask.size();
5849 KnownUndef = KnownZero = APInt::getZero(Size);
5850
5851 V1 = peekThroughBitcasts(V1);
5852 V2 = peekThroughBitcasts(V2);
5853
5854 bool V1IsZero = ISD::isBuildVectorAllZeros(V1.getNode());
5855 bool V2IsZero = ISD::isBuildVectorAllZeros(V2.getNode());
5856
5857 int VectorSizeInBits = V1.getValueSizeInBits();
5858 int ScalarSizeInBits = VectorSizeInBits / Size;
5859 assert(!(VectorSizeInBits % ScalarSizeInBits) && "Illegal shuffle mask size");
5860
5861 for (int i = 0; i < Size; ++i) {
5862 int M = Mask[i];
5863 // Handle the easy cases.
5864 if (M < 0) {
5865 KnownUndef.setBit(i);
5866 continue;
5867 }
5868 if ((M >= 0 && M < Size && V1IsZero) || (M >= Size && V2IsZero)) {
5869 KnownZero.setBit(i);
5870 continue;
5871 }
5872
5873 // Determine shuffle input and normalize the mask.
5874 SDValue V = M < Size ? V1 : V2;
5875 M %= Size;
5876
5877 // Currently we can only search BUILD_VECTOR for UNDEF/ZERO elements.
5878 if (V.getOpcode() != ISD::BUILD_VECTOR)
5879 continue;
5880
5881 // If the BUILD_VECTOR has fewer elements than the bitcasted portion of
5882 // the (larger) source element must be UNDEF/ZERO.
5883 if ((Size % V.getNumOperands()) == 0) {
5884 int Scale = Size / V->getNumOperands();
5885 SDValue Op = V.getOperand(M / Scale);
5886 if (Op.isUndef())
5887 KnownUndef.setBit(i);
5888 if (X86::isZeroNode(Op))
5889 KnownZero.setBit(i);
5890 else if (ConstantSDNode *Cst = dyn_cast<ConstantSDNode>(Op)) {
5891 APInt Val = Cst->getAPIntValue();
5892 Val = Val.extractBits(ScalarSizeInBits, (M % Scale) * ScalarSizeInBits);
5893 if (Val == 0)
5894 KnownZero.setBit(i);
5895 } else if (ConstantFPSDNode *Cst = dyn_cast<ConstantFPSDNode>(Op)) {
5896 APInt Val = Cst->getValueAPF().bitcastToAPInt();
5897 Val = Val.extractBits(ScalarSizeInBits, (M % Scale) * ScalarSizeInBits);
5898 if (Val == 0)
5899 KnownZero.setBit(i);
5900 }
5901 continue;
5902 }
5903
5904 // If the BUILD_VECTOR has more elements than all the (smaller) source
5905 // elements must be UNDEF or ZERO.
5906 if ((V.getNumOperands() % Size) == 0) {
5907 int Scale = V->getNumOperands() / Size;
5908 bool AllUndef = true;
5909 bool AllZero = true;
5910 for (int j = 0; j < Scale; ++j) {
5911 SDValue Op = V.getOperand((M * Scale) + j);
5912 AllUndef &= Op.isUndef();
5913 AllZero &= X86::isZeroNode(Op);
5914 }
5915 if (AllUndef)
5916 KnownUndef.setBit(i);
5917 if (AllZero)
5918 KnownZero.setBit(i);
5919 continue;
5920 }
5921 }
5922}
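// Worked example: for Mask = { 0, 4, 1, 7 } with V2 an all-zeros
// BUILD_VECTOR, KnownZero gets bits 1 and 3 set; a negative (sentinel) mask
// element would set the corresponding KnownUndef bit instead.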
5923
5924/// Decode a target shuffle mask and inputs and see if any values are
5925/// known to be undef or zero from their inputs.
5926/// Returns true if the target shuffle mask was decoded.
5927/// FIXME: Merge this with computeZeroableShuffleElements?
5928 static bool getTargetShuffleAndZeroables(SDValue N, SmallVectorImpl<int> &Mask,
5929 SmallVectorImpl<SDValue> &Ops,
5930 APInt &KnownUndef, APInt &KnownZero) {
5931 bool IsUnary;
5932 if (!isTargetShuffle(N.getOpcode()))
5933 return false;
5934
5935 MVT VT = N.getSimpleValueType();
5936 if (!getTargetShuffleMask(N, true, Ops, Mask, IsUnary))
5937 return false;
5938
5939 int Size = Mask.size();
5940 SDValue V1 = Ops[0];
5941 SDValue V2 = IsUnary ? V1 : Ops[1];
5942 KnownUndef = KnownZero = APInt::getZero(Size);
5943
5944 V1 = peekThroughBitcasts(V1);
5945 V2 = peekThroughBitcasts(V2);
5946
5947 assert((VT.getSizeInBits() % Size) == 0 &&
5948 "Illegal split of shuffle value type");
5949 unsigned EltSizeInBits = VT.getSizeInBits() / Size;
5950
5951 // Extract known constant input data.
5952 APInt UndefSrcElts[2];
5953 SmallVector<APInt, 32> SrcEltBits[2];
5954 bool IsSrcConstant[2] = {
5955 getTargetConstantBitsFromNode(V1, EltSizeInBits, UndefSrcElts[0],
5956 SrcEltBits[0], /*AllowWholeUndefs*/ true,
5957 /*AllowPartialUndefs*/ false),
5958 getTargetConstantBitsFromNode(V2, EltSizeInBits, UndefSrcElts[1],
5959 SrcEltBits[1], /*AllowWholeUndefs*/ true,
5960 /*AllowPartialUndefs*/ false)};
5961
5962 for (int i = 0; i < Size; ++i) {
5963 int M = Mask[i];
5964
5965 // Already decoded as SM_SentinelZero / SM_SentinelUndef.
5966 if (M < 0) {
5967 assert(isUndefOrZero(M) && "Unknown shuffle sentinel value!");
5968 if (SM_SentinelUndef == M)
5969 KnownUndef.setBit(i);
5970 if (SM_SentinelZero == M)
5971 KnownZero.setBit(i);
5972 continue;
5973 }
5974
5975 // Determine shuffle input and normalize the mask.
5976 unsigned SrcIdx = M / Size;
5977 SDValue V = M < Size ? V1 : V2;
5978 M %= Size;
5979
5980 // We are referencing an UNDEF input.
5981 if (V.isUndef()) {
5982 KnownUndef.setBit(i);
5983 continue;
5984 }
5985
5986 // SCALAR_TO_VECTOR - only the first element is defined, and the rest UNDEF.
5987 // TODO: We currently only set UNDEF for integer types - floats use the same
5988 // registers as vectors and many of the scalar folded loads rely on the
5989 // SCALAR_TO_VECTOR pattern.
5990 if (V.getOpcode() == ISD::SCALAR_TO_VECTOR &&
5991 (Size % V.getValueType().getVectorNumElements()) == 0) {
5992 int Scale = Size / V.getValueType().getVectorNumElements();
5993 int Idx = M / Scale;
5994 if (Idx != 0 && !VT.isFloatingPoint())
5995 KnownUndef.setBit(i);
5996 else if (Idx == 0 && X86::isZeroNode(V.getOperand(0)))
5997 KnownZero.setBit(i);
5998 continue;
5999 }
6000
6001 // INSERT_SUBVECTOR - to widen vectors we often insert them into UNDEF
6002 // base vectors.
6003 if (V.getOpcode() == ISD::INSERT_SUBVECTOR) {
6004 SDValue Vec = V.getOperand(0);
6005 int NumVecElts = Vec.getValueType().getVectorNumElements();
6006 if (Vec.isUndef() && Size == NumVecElts) {
6007 int Idx = V.getConstantOperandVal(2);
6008 int NumSubElts = V.getOperand(1).getValueType().getVectorNumElements();
6009 if (M < Idx || (Idx + NumSubElts) <= M)
6010 KnownUndef.setBit(i);
6011 }
6012 continue;
6013 }
6014
6015 // Attempt to extract from the source's constant bits.
6016 if (IsSrcConstant[SrcIdx]) {
6017 if (UndefSrcElts[SrcIdx][M])
6018 KnownUndef.setBit(i);
6019 else if (SrcEltBits[SrcIdx][M] == 0)
6020 KnownZero.setBit(i);
6021 }
6022 }
6023
6024 assert(VT.getVectorNumElements() == (unsigned)Size &&
6025 "Different mask size from vector size!");
6026 return true;
6027}
6028
6029// Replace target shuffle mask elements with known undef/zero sentinels.
6030 static void resolveTargetShuffleFromZeroables(SmallVectorImpl<int> &Mask,
6031 const APInt &KnownUndef,
6032 const APInt &KnownZero,
6033 bool ResolveKnownZeros = true) {
6034 unsigned NumElts = Mask.size();
6035 assert(KnownUndef.getBitWidth() == NumElts &&
6036 KnownZero.getBitWidth() == NumElts && "Shuffle mask size mismatch");
6037
6038 for (unsigned i = 0; i != NumElts; ++i) {
6039 if (KnownUndef[i])
6040 Mask[i] = SM_SentinelUndef;
6041 else if (ResolveKnownZeros && KnownZero[i])
6042 Mask[i] = SM_SentinelZero;
6043 }
6044}
6045
6046// Extract target shuffle mask sentinel elements to known undef/zero bitmasks.
6047 static void resolveZeroablesFromTargetShuffle(const SmallVectorImpl<int> &Mask,
6048 APInt &KnownUndef,
6049 APInt &KnownZero) {
6050 unsigned NumElts = Mask.size();
6051 KnownUndef = KnownZero = APInt::getZero(NumElts);
6052
6053 for (unsigned i = 0; i != NumElts; ++i) {
6054 int M = Mask[i];
6055 if (SM_SentinelUndef == M)
6056 KnownUndef.setBit(i);
6057 if (SM_SentinelZero == M)
6058 KnownZero.setBit(i);
6059 }
6060}
6061
6062// Attempt to create a shuffle mask from a VSELECT/BLENDV condition mask.
6063 static bool createShuffleMaskFromVSELECT(SmallVectorImpl<int> &Mask,
6064 SDValue Cond, bool IsBLENDV = false) {
6065 EVT CondVT = Cond.getValueType();
6066 unsigned EltSizeInBits = CondVT.getScalarSizeInBits();
6067 unsigned NumElts = CondVT.getVectorNumElements();
6068
6069 APInt UndefElts;
6070 SmallVector<APInt, 32> EltBits;
6071 if (!getTargetConstantBitsFromNode(Cond, EltSizeInBits, UndefElts, EltBits,
6072 /*AllowWholeUndefs*/ true,
6073 /*AllowPartialUndefs*/ false))
6074 return false;
6075
6076 Mask.resize(NumElts, SM_SentinelUndef);
6077
6078 for (int i = 0; i != (int)NumElts; ++i) {
6079 Mask[i] = i;
6080 // Arbitrarily choose from the 2nd operand if the select condition element
6081 // is undef.
6082 // TODO: Can we do better by matching patterns such as even/odd?
6083 if (UndefElts[i] || (!IsBLENDV && EltBits[i].isZero()) ||
6084 (IsBLENDV && EltBits[i].isNonNegative()))
6085 Mask[i] += NumElts;
6086 }
6087
6088 return true;
6089}
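// Worked example: a v4i32 VSELECT with constant condition <-1, 0, -1, 0>
// yields Mask = { 0, 5, 2, 7 }, i.e. elements 0/2 come from the first value
// operand and elements 1/3 from the second.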
6090
6091// Forward declaration (for getFauxShuffleMask recursive check).
6092static bool getTargetShuffleInputs(SDValue Op, const APInt &DemandedElts,
6093 SmallVectorImpl<SDValue> &Inputs,
6094 SmallVectorImpl<int> &Mask,
6095 const SelectionDAG &DAG, unsigned Depth,
6096 bool ResolveKnownElts);
6097
6098// Attempt to decode ops that could be represented as a shuffle mask.
6099// The decoded shuffle mask may contain a different number of elements to the
6100// destination value type.
6101// TODO: Merge into getTargetShuffleInputs()
6102static bool getFauxShuffleMask(SDValue N, const APInt &DemandedElts,
6103 SmallVectorImpl<int> &Mask,
6104 SmallVectorImpl<SDValue> &Ops,
6105 const SelectionDAG &DAG, unsigned Depth,
6106 bool ResolveKnownElts) {
6107 Mask.clear();
6108 Ops.clear();
6109
6110 MVT VT = N.getSimpleValueType();
6111 unsigned NumElts = VT.getVectorNumElements();
6112 unsigned NumSizeInBits = VT.getSizeInBits();
6113 unsigned NumBitsPerElt = VT.getScalarSizeInBits();
6114 if ((NumBitsPerElt % 8) != 0 || (NumSizeInBits % 8) != 0)
6115 return false;
6116 assert(NumElts == DemandedElts.getBitWidth() && "Unexpected vector size");
6117 unsigned NumSizeInBytes = NumSizeInBits / 8;
6118 unsigned NumBytesPerElt = NumBitsPerElt / 8;
6119
6120 unsigned Opcode = N.getOpcode();
6121 switch (Opcode) {
6122 case ISD::VECTOR_SHUFFLE: {
6123 // Don't treat ISD::VECTOR_SHUFFLE as a target shuffle so decode it here.
6124 ArrayRef<int> ShuffleMask = cast<ShuffleVectorSDNode>(N)->getMask();
6125 if (isUndefOrInRange(ShuffleMask, 0, 2 * NumElts)) {
6126 Mask.append(ShuffleMask.begin(), ShuffleMask.end());
6127 Ops.push_back(N.getOperand(0));
6128 Ops.push_back(N.getOperand(1));
6129 return true;
6130 }
6131 return false;
6132 }
6133 case ISD::AND:
6134 case X86ISD::ANDNP: {
6135 // Attempt to decode as a per-byte mask.
6136 APInt UndefElts;
6137 SmallVector<APInt, 32> EltBits;
6138 SDValue N0 = N.getOperand(0);
6139 SDValue N1 = N.getOperand(1);
6140 bool IsAndN = (X86ISD::ANDNP == Opcode);
6141 uint64_t ZeroMask = IsAndN ? 255 : 0;
6142 if (!getTargetConstantBitsFromNode(IsAndN ? N0 : N1, 8, UndefElts, EltBits,
6143 /*AllowWholeUndefs*/ false,
6144 /*AllowPartialUndefs*/ false))
6145 return false;
6146 // We can't assume an undef src element gives an undef dst - the other src
6147 // might be zero.
6148 assert(UndefElts.isZero() && "Unexpected UNDEF element in AND/ANDNP mask");
6149 for (int i = 0, e = (int)EltBits.size(); i != e; ++i) {
6150 const APInt &ByteBits = EltBits[i];
6151 if (ByteBits != 0 && ByteBits != 255)
6152 return false;
6153 Mask.push_back(ByteBits == ZeroMask ? SM_SentinelZero : i);
6154 }
6155 Ops.push_back(IsAndN ? N1 : N0);
6156 return true;
6157 }
6158 case ISD::OR: {
6159 // Handle OR(SHUFFLE,SHUFFLE) case where one source is zero and the other
6160 // is a valid shuffle index.
6161 SDValue N0 = peekThroughBitcasts(N.getOperand(0));
6162 SDValue N1 = peekThroughBitcasts(N.getOperand(1));
6163 if (!N0.getValueType().isVector() || !N1.getValueType().isVector())
6164 return false;
6165
6166 SmallVector<int, 64> SrcMask0, SrcMask1;
6167 SmallVector<SDValue, 2> SrcInputs0, SrcInputs1;
6168 APInt Demand0 = APInt::getAllOnes(N0.getValueType().getVectorNumElements());
6169 APInt Demand1 = APInt::getAllOnes(N1.getValueType().getVectorNumElements());
6170 if (!getTargetShuffleInputs(N0, Demand0, SrcInputs0, SrcMask0, DAG,
6171 Depth + 1, true) ||
6172 !getTargetShuffleInputs(N1, Demand1, SrcInputs1, SrcMask1, DAG,
6173 Depth + 1, true))
6174 return false;
6175
6176 size_t MaskSize = std::max(SrcMask0.size(), SrcMask1.size());
6177 SmallVector<int, 64> Mask0, Mask1;
6178 narrowShuffleMaskElts(MaskSize / SrcMask0.size(), SrcMask0, Mask0);
6179 narrowShuffleMaskElts(MaskSize / SrcMask1.size(), SrcMask1, Mask1);
6180 for (int i = 0; i != (int)MaskSize; ++i) {
6181 // NOTE: Don't handle SM_SentinelUndef, as we can end up in infinite
6182 // loops converting between OR and BLEND shuffles due to
6183 // canWidenShuffleElements merging away undef elements, meaning we
6184 // fail to recognise the OR as the undef element isn't known zero.
6185 if (Mask0[i] == SM_SentinelZero && Mask1[i] == SM_SentinelZero)
6186 Mask.push_back(SM_SentinelZero);
6187 else if (Mask1[i] == SM_SentinelZero)
6188 Mask.push_back(i);
6189 else if (Mask0[i] == SM_SentinelZero)
6190 Mask.push_back(i + MaskSize);
6191 else
6192 return false;
6193 }
6194 Ops.push_back(N.getOperand(0));
6195 Ops.push_back(N.getOperand(1));
6196 return true;
6197 }
6198 case ISD::CONCAT_VECTORS: {
6199 // Limit this to vXi64 vector cases to make the most of cross lane shuffles.
6200 unsigned NumSubElts = N.getOperand(0).getValueType().getVectorNumElements();
6201 if (NumBitsPerElt == 64) {
6202 for (unsigned I = 0, E = N.getNumOperands(); I != E; ++I) {
6203 for (unsigned M = 0; M != NumSubElts; ++M)
6204 Mask.push_back((I * NumElts) + M);
6205 Ops.push_back(N.getOperand(I));
6206 }
6207 return true;
6208 }
6209 return false;
6210 }
6211 case ISD::INSERT_SUBVECTOR: {
6212 SDValue Src = N.getOperand(0);
6213 SDValue Sub = N.getOperand(1);
6214 EVT SubVT = Sub.getValueType();
6215 unsigned NumSubElts = SubVT.getVectorNumElements();
6216 uint64_t InsertIdx = N.getConstantOperandVal(2);
6217 // Subvector isn't demanded - just return the base vector.
6218 if (DemandedElts.extractBits(NumSubElts, InsertIdx) == 0) {
6219 Mask.resize(NumElts);
6220 std::iota(Mask.begin(), Mask.end(), 0);
6221 Ops.push_back(Src);
6222 return true;
6223 }
6224 // Handle CONCAT(SUB0, SUB1).
6225 // Limit to vXi64/splat cases to make the most of cross lane shuffles.
6226 if (Depth > 0 && InsertIdx == NumSubElts && NumElts == (2 * NumSubElts) &&
6227 Src.getOpcode() == ISD::INSERT_SUBVECTOR &&
6228 Src.getOperand(0).isUndef() &&
6229 Src.getOperand(1).getValueType() == SubVT &&
6230 Src.getConstantOperandVal(2) == 0 &&
6231 (NumBitsPerElt == 64 || Src.getOperand(1) == Sub) &&
6232 SDNode::areOnlyUsersOf({N.getNode(), Src.getNode()}, Sub.getNode())) {
6233 Mask.resize(NumElts);
6234 std::iota(Mask.begin(), Mask.begin() + NumSubElts, 0);
6235 std::iota(Mask.begin() + NumSubElts, Mask.end(), NumElts);
6236 Ops.push_back(Src.getOperand(1));
6237 Ops.push_back(Sub);
6238 return true;
6239 }
6240 if (!N->isOnlyUserOf(Sub.getNode()))
6241 return false;
6242
6243 SmallVector<int, 64> SubMask;
6244 SmallVector<SDValue, 2> SubInputs;
6245 SDValue SubSrc = peekThroughOneUseBitcasts(Sub);
6246 EVT SubSrcVT = SubSrc.getValueType();
6247 if (!SubSrcVT.isVector())
6248 return false;
6249
6250 // Handle INSERT_SUBVECTOR(SRC0, EXTRACT_SUBVECTOR(SRC1)).
6251 if (SubSrc.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
6252 SubSrc.getOperand(0).getValueSizeInBits() == NumSizeInBits) {
6253 uint64_t ExtractIdx = SubSrc.getConstantOperandVal(1);
6254 SDValue SubSrcSrc = SubSrc.getOperand(0);
6255 unsigned NumSubSrcSrcElts =
6256 SubSrcSrc.getValueType().getVectorNumElements();
6257 unsigned MaxElts = std::max(NumElts, NumSubSrcSrcElts);
6258 assert((MaxElts % NumElts) == 0 && (MaxElts % NumSubSrcSrcElts) == 0 &&
6259 "Subvector valuetype mismatch");
6260 InsertIdx *= (MaxElts / NumElts);
6261 ExtractIdx *= (MaxElts / NumSubSrcSrcElts);
6262 NumSubElts *= (MaxElts / NumElts);
6263 bool SrcIsUndef = Src.isUndef();
6264 for (int i = 0; i != (int)MaxElts; ++i)
6265 Mask.push_back(SrcIsUndef ? SM_SentinelUndef : i);
6266 for (int i = 0; i != (int)NumSubElts; ++i)
6267 Mask[InsertIdx + i] = (SrcIsUndef ? 0 : MaxElts) + ExtractIdx + i;
6268 if (!SrcIsUndef)
6269 Ops.push_back(Src);
6270 Ops.push_back(SubSrcSrc);
6271 return true;
6272 }
6273
6274 // Handle INSERT_SUBVECTOR(SRC0, SHUFFLE(SRC1)).
6275 APInt SubDemand = APInt::getAllOnes(SubSrcVT.getVectorNumElements());
6276 if (!getTargetShuffleInputs(SubSrc, SubDemand, SubInputs, SubMask, DAG,
6277 Depth + 1, ResolveKnownElts))
6278 return false;
6279
6280 // Subvector shuffle inputs must not be larger than the subvector.
6281 if (llvm::any_of(SubInputs, [SubVT](SDValue SubInput) {
6282 return SubVT.getFixedSizeInBits() <
6283 SubInput.getValueSizeInBits().getFixedValue();
6284 }))
6285 return false;
6286
6287 if (SubMask.size() != NumSubElts) {
6288 assert(((SubMask.size() % NumSubElts) == 0 ||
6289 (NumSubElts % SubMask.size()) == 0) &&
6290 "Illegal submask scale");
6291 if ((NumSubElts % SubMask.size()) == 0) {
6292 int Scale = NumSubElts / SubMask.size();
6293 SmallVector<int, 64> ScaledSubMask;
6294 narrowShuffleMaskElts(Scale, SubMask, ScaledSubMask);
6295 SubMask = ScaledSubMask;
6296 } else {
6297 int Scale = SubMask.size() / NumSubElts;
6298 NumSubElts = SubMask.size();
6299 NumElts *= Scale;
6300 InsertIdx *= Scale;
6301 }
6302 }
6303 Ops.push_back(Src);
6304 Ops.append(SubInputs.begin(), SubInputs.end());
6305 if (ISD::isBuildVectorAllZeros(Src.getNode()))
6306 Mask.append(NumElts, SM_SentinelZero);
6307 else
6308 for (int i = 0; i != (int)NumElts; ++i)
6309 Mask.push_back(i);
6310 for (int i = 0; i != (int)NumSubElts; ++i) {
6311 int M = SubMask[i];
6312 if (0 <= M) {
6313 int InputIdx = M / NumSubElts;
6314 M = (NumElts * (1 + InputIdx)) + (M % NumSubElts);
6315 }
6316 Mask[i + InsertIdx] = M;
6317 }
6318 return true;
6319 }
6320 case X86ISD::PINSRB:
6321 case X86ISD::PINSRW:
6322 case ISD::SCALAR_TO_VECTOR:
6323 case ISD::INSERT_VECTOR_ELT: {
6324 // Match against an insert_vector_elt/scalar_to_vector of an extract from a
6325 // vector, for matching src/dst vector types.
6326 SDValue Scl = N.getOperand(Opcode == ISD::SCALAR_TO_VECTOR ? 0 : 1);
6327
6328 unsigned DstIdx = 0;
6329 if (Opcode != ISD::SCALAR_TO_VECTOR) {
6330 // Check we have an in-range constant insertion index.
6331 if (!isa<ConstantSDNode>(N.getOperand(2)) ||
6332 N.getConstantOperandAPInt(2).uge(NumElts))
6333 return false;
6334 DstIdx = N.getConstantOperandVal(2);
6335
6336 // Attempt to recognise an INSERT*(VEC, 0, DstIdx) shuffle pattern.
6337 if (X86::isZeroNode(Scl)) {
6338 Ops.push_back(N.getOperand(0));
6339 for (unsigned i = 0; i != NumElts; ++i)
6340 Mask.push_back(i == DstIdx ? SM_SentinelZero : (int)i);
6341 return true;
6342 }
6343 }
6344
6345 // Peek through trunc/aext/zext/bitcast.
6346 // TODO: aext shouldn't require SM_SentinelZero padding.
6347 // TODO: handle shift of scalars.
6348 unsigned MinBitsPerElt = Scl.getScalarValueSizeInBits();
6349 while (Scl.getOpcode() == ISD::TRUNCATE ||
6350 Scl.getOpcode() == ISD::ANY_EXTEND ||
6351 Scl.getOpcode() == ISD::ZERO_EXTEND ||
6352 (Scl.getOpcode() == ISD::BITCAST &&
6353 Scl.getScalarValueSizeInBits() ==
6354 Scl.getOperand(0).getScalarValueSizeInBits())) {
6355 Scl = Scl.getOperand(0);
6356 MinBitsPerElt =
6357 std::min<unsigned>(MinBitsPerElt, Scl.getScalarValueSizeInBits());
6358 }
6359 if ((MinBitsPerElt % 8) != 0)
6360 return false;
6361
6362 // Attempt to find the source vector the scalar was extracted from.
6363 SDValue SrcExtract;
6364 if ((Scl.getOpcode() == ISD::EXTRACT_VECTOR_ELT ||
6365 Scl.getOpcode() == X86ISD::PEXTRW ||
6366 Scl.getOpcode() == X86ISD::PEXTRB) &&
6367 Scl.getOperand(0).getValueSizeInBits() == NumSizeInBits) {
6368 SrcExtract = Scl;
6369 }
6370 if (!SrcExtract || !isa<ConstantSDNode>(SrcExtract.getOperand(1)))
6371 return false;
6372
6373 SDValue SrcVec = SrcExtract.getOperand(0);
6374 EVT SrcVT = SrcVec.getValueType();
6375 if (!SrcVT.getScalarType().isByteSized())
6376 return false;
6377 unsigned SrcIdx = SrcExtract.getConstantOperandVal(1);
6378 unsigned SrcByte = SrcIdx * (SrcVT.getScalarSizeInBits() / 8);
6379 unsigned DstByte = DstIdx * NumBytesPerElt;
6380 MinBitsPerElt =
6381 std::min<unsigned>(MinBitsPerElt, SrcVT.getScalarSizeInBits());
6382
6383 // Create 'identity' byte level shuffle mask and then add inserted bytes.
6384 if (Opcode == ISD::SCALAR_TO_VECTOR) {
6385 Ops.push_back(SrcVec);
6386 Mask.append(NumSizeInBytes, SM_SentinelUndef);
6387 } else {
6388 Ops.push_back(SrcVec);
6389 Ops.push_back(N.getOperand(0));
6390 for (int i = 0; i != (int)NumSizeInBytes; ++i)
6391 Mask.push_back(NumSizeInBytes + i);
6392 }
6393
6394 unsigned MinBytesPerElts = MinBitsPerElt / 8;
6395 MinBytesPerElts = std::min(MinBytesPerElts, NumBytesPerElt);
6396 for (unsigned i = 0; i != MinBytesPerElts; ++i)
6397 Mask[DstByte + i] = SrcByte + i;
6398 for (unsigned i = MinBytesPerElts; i < NumBytesPerElt; ++i)
6399 Mask[DstByte + i] = SM_SentinelZero;
6400 return true;
6401 }
6402 case X86ISD::PACKSS:
6403 case X86ISD::PACKUS: {
6404 SDValue N0 = N.getOperand(0);
6405 SDValue N1 = N.getOperand(1);
6406 assert(N0.getValueType().getVectorNumElements() == (NumElts / 2) &&
6407 N1.getValueType().getVectorNumElements() == (NumElts / 2) &&
6408 "Unexpected input value type");
6409
6410 APInt EltsLHS, EltsRHS;
6411 getPackDemandedElts(VT, DemandedElts, EltsLHS, EltsRHS);
6412
6413 // If we know input saturation won't happen (or we don't care for particular
6414 // lanes), we can treat this as a truncation shuffle.
6415 bool Offset0 = false, Offset1 = false;
6416 if (Opcode == X86ISD::PACKSS) {
6417 if ((!(N0.isUndef() || EltsLHS.isZero()) &&
6418 DAG.ComputeNumSignBits(N0, EltsLHS, Depth + 1) <= NumBitsPerElt) ||
6419 (!(N1.isUndef() || EltsRHS.isZero()) &&
6420 DAG.ComputeNumSignBits(N1, EltsRHS, Depth + 1) <= NumBitsPerElt))
6421 return false;
6422 // We can't easily fold ASHR into a shuffle, but if it was feeding a
6423 // PACKSS then it was likely being used for sign-extension for a
6424 // truncation, so just peek through and adjust the mask accordingly.
6425 if (N0.getOpcode() == X86ISD::VSRAI && N->isOnlyUserOf(N0.getNode()) &&
6426 N0.getConstantOperandAPInt(1) == NumBitsPerElt) {
6427 Offset0 = true;
6428 N0 = N0.getOperand(0);
6429 }
6430 if (N1.getOpcode() == X86ISD::VSRAI && N->isOnlyUserOf(N1.getNode()) &&
6431 N1.getConstantOperandAPInt(1) == NumBitsPerElt) {
6432 Offset1 = true;
6433 N1 = N1.getOperand(0);
6434 }
6435 } else {
6436 APInt ZeroMask = APInt::getHighBitsSet(2 * NumBitsPerElt, NumBitsPerElt);
6437 if ((!(N0.isUndef() || EltsLHS.isZero()) &&
6438 !DAG.MaskedValueIsZero(N0, ZeroMask, EltsLHS, Depth + 1)) ||
6439 (!(N1.isUndef() || EltsRHS.isZero()) &&
6440 !DAG.MaskedValueIsZero(N1, ZeroMask, EltsRHS, Depth + 1)))
6441 return false;
6442 }
6443
6444 bool IsUnary = (N0 == N1);
6445
6446 Ops.push_back(N0);
6447 if (!IsUnary)
6448 Ops.push_back(N1);
6449
6450 createPackShuffleMask(VT, Mask, IsUnary);
6451
6452 if (Offset0 || Offset1) {
6453 for (int &M : Mask)
6454 if ((Offset0 && isInRange(M, 0, NumElts)) ||
6455 (Offset1 && isInRange(M, NumElts, 2 * NumElts)))
6456 ++M;
6457 }
6458 return true;
6459 }
6460 case ISD::VSELECT:
6461 case X86ISD::BLENDV: {
6462 SDValue Cond = N.getOperand(0);
6463 if (createShuffleMaskFromVSELECT(Mask, Cond, Opcode == X86ISD::BLENDV)) {
6464 Ops.push_back(N.getOperand(1));
6465 Ops.push_back(N.getOperand(2));
6466 return true;
6467 }
6468 return false;
6469 }
6470 case X86ISD::VTRUNC: {
6471 SDValue Src = N.getOperand(0);
6472 EVT SrcVT = Src.getValueType();
6473 if (SrcVT.getSizeInBits() != NumSizeInBits)
6474 return false;
6475 unsigned NumSrcElts = SrcVT.getVectorNumElements();
6476 unsigned NumBitsPerSrcElt = SrcVT.getScalarSizeInBits();
6477 unsigned Scale = NumBitsPerSrcElt / NumBitsPerElt;
6478 assert((NumBitsPerSrcElt % NumBitsPerElt) == 0 && "Illegal truncation");
6479 for (unsigned i = 0; i != NumSrcElts; ++i)
6480 Mask.push_back(i * Scale);
6481 Mask.append(NumElts - NumSrcElts, SM_SentinelZero);
6482 Ops.push_back(Src);
6483 return true;
6484 }
6485 case ISD::SHL:
6486 case ISD::SRL: {
6487 APInt UndefElts;
6488 SmallVector<APInt, 32> EltBits;
6489 if (!getTargetConstantBitsFromNode(N.getOperand(1), NumBitsPerElt,
6490 UndefElts, EltBits,
6491 /*AllowWholeUndefs*/ true,
6492 /*AllowPartialUndefs*/ false))
6493 return false;
6494
6495 // We can only decode 'whole byte' bit shifts as shuffles.
6496 for (unsigned I = 0; I != NumElts; ++I)
6497 if (DemandedElts[I] && !UndefElts[I] &&
6498 (EltBits[I].urem(8) != 0 || EltBits[I].uge(NumBitsPerElt)))
6499 return false;
6500
6501 Mask.append(NumSizeInBytes, SM_SentinelUndef);
6502 Ops.push_back(N.getOperand(0));
6503
6504 for (unsigned I = 0; I != NumElts; ++I) {
6505 if (!DemandedElts[I] || UndefElts[I])
6506 continue;
6507 unsigned ByteShift = EltBits[I].getZExtValue() / 8;
6508 unsigned Lo = I * NumBytesPerElt;
6509 unsigned Hi = Lo + NumBytesPerElt;
6510 // Clear mask to all zeros and insert the shifted byte indices.
6511 std::fill(Mask.begin() + Lo, Mask.begin() + Hi, SM_SentinelZero);
6512 if (ISD::SHL == Opcode)
6513 std::iota(Mask.begin() + Lo + ByteShift, Mask.begin() + Hi, Lo);
6514 else
6515 std::iota(Mask.begin() + Lo, Mask.begin() + Hi - ByteShift,
6516 Lo + ByteShift);
6517 }
6518 return true;
6519 }
6520 case X86ISD::VSHLI:
6521 case X86ISD::VSRLI: {
6522 uint64_t ShiftVal = N.getConstantOperandVal(1);
6523 // Out of range bit shifts are guaranteed to be zero.
6524 if (NumBitsPerElt <= ShiftVal) {
6525 Mask.append(NumElts, SM_SentinelZero);
6526 return true;
6527 }
6528
6529 // We can only decode 'whole byte' bit shifts as shuffles.
6530 if ((ShiftVal % 8) != 0)
6531 break;
6532
6533 uint64_t ByteShift = ShiftVal / 8;
6534 Ops.push_back(N.getOperand(0));
6535
6536 // Clear mask to all zeros and insert the shifted byte indices.
6537 Mask.append(NumSizeInBytes, SM_SentinelZero);
6538
6539 if (X86ISD::VSHLI == Opcode) {
6540 for (unsigned i = 0; i != NumSizeInBytes; i += NumBytesPerElt)
6541 for (unsigned j = ByteShift; j != NumBytesPerElt; ++j)
6542 Mask[i + j] = i + j - ByteShift;
6543 } else {
6544 for (unsigned i = 0; i != NumSizeInBytes; i += NumBytesPerElt)
6545 for (unsigned j = ByteShift; j != NumBytesPerElt; ++j)
6546 Mask[i + j - ByteShift] = i + j;
6547 }
6548 return true;
6549 }
6550 case X86ISD::VROTLI:
6551 case X86ISD::VROTRI: {
6552 // We can only decode 'whole byte' bit rotates as shuffles.
6553 uint64_t RotateVal = N.getConstantOperandAPInt(1).urem(NumBitsPerElt);
6554 if ((RotateVal % 8) != 0)
6555 return false;
6556 Ops.push_back(N.getOperand(0));
6557 int Offset = RotateVal / 8;
6558 Offset = (X86ISD::VROTLI == Opcode ? NumBytesPerElt - Offset : Offset);
6559 for (int i = 0; i != (int)NumElts; ++i) {
6560 int BaseIdx = i * NumBytesPerElt;
6561 for (int j = 0; j != (int)NumBytesPerElt; ++j) {
6562 Mask.push_back(BaseIdx + ((Offset + j) % NumBytesPerElt));
6563 }
6564 }
6565 return true;
6566 }
6567 case X86ISD::VBROADCAST: {
6568 SDValue Src = N.getOperand(0);
6569 if (!Src.getSimpleValueType().isVector()) {
6570 if (Src.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
6571 !isNullConstant(Src.getOperand(1)) ||
6572 Src.getOperand(0).getValueType().getScalarType() !=
6573 VT.getScalarType())
6574 return false;
6575 Src = Src.getOperand(0);
6576 }
6577 Ops.push_back(Src);
6578 Mask.append(NumElts, 0);
6579 return true;
6580 }
6581 case ISD::SIGN_EXTEND_VECTOR_INREG: {
6582 SDValue Src = N.getOperand(0);
6583 EVT SrcVT = Src.getValueType();
6584 unsigned NumBitsPerSrcElt = SrcVT.getScalarSizeInBits();
6585
6586 // Extended source must be a simple vector.
6587 if (!SrcVT.isSimple() || (SrcVT.getSizeInBits() % 128) != 0 ||
6588 (NumBitsPerSrcElt % 8) != 0)
6589 return false;
6590
6591 // We can only handle all-signbits extensions.
6592 APInt DemandedSrcElts =
6593 DemandedElts.zextOrTrunc(SrcVT.getVectorNumElements());
6594 if (DAG.ComputeNumSignBits(Src, DemandedSrcElts) != NumBitsPerSrcElt)
6595 return false;
6596
6597 assert((NumBitsPerElt % NumBitsPerSrcElt) == 0 && "Unexpected extension");
6598 unsigned Scale = NumBitsPerElt / NumBitsPerSrcElt;
6599 for (unsigned I = 0; I != NumElts; ++I)
6600 Mask.append(Scale, I);
6601 Ops.push_back(Src);
6602 return true;
6603 }
6604 case ISD::ZERO_EXTEND:
6605 case ISD::ANY_EXTEND:
6606 case ISD::ZERO_EXTEND_VECTOR_INREG:
6607 case ISD::ANY_EXTEND_VECTOR_INREG: {
6608 SDValue Src = N.getOperand(0);
6609 EVT SrcVT = Src.getValueType();
6610
6611 // Extended source must be a simple vector.
6612 if (!SrcVT.isSimple() || (SrcVT.getSizeInBits() % 128) != 0 ||
6613 (SrcVT.getScalarSizeInBits() % 8) != 0)
6614 return false;
6615
6616 bool IsAnyExtend =
6617 (ISD::ANY_EXTEND == Opcode || ISD::ANY_EXTEND_VECTOR_INREG == Opcode);
6618 DecodeZeroExtendMask(SrcVT.getScalarSizeInBits(), NumBitsPerElt, NumElts,
6619 IsAnyExtend, Mask);
6620 Ops.push_back(Src);
6621 return true;
6622 }
6623 }
6624
6625 return false;
6626}
6627
6628/// Removes unused/repeated shuffle source inputs and adjusts the shuffle mask.
6629 static void resolveTargetShuffleInputsAndMask(SmallVectorImpl<SDValue> &Inputs,
6630 SmallVectorImpl<int> &Mask) {
6631 int MaskWidth = Mask.size();
6632 SmallVector<SDValue, 16> UsedInputs;
6633 for (int i = 0, e = Inputs.size(); i < e; ++i) {
6634 int lo = UsedInputs.size() * MaskWidth;
6635 int hi = lo + MaskWidth;
6636
6637 // Strip UNDEF input usage.
6638 if (Inputs[i].isUndef())
6639 for (int &M : Mask)
6640 if ((lo <= M) && (M < hi))
6641 M = SM_SentinelUndef;
6642
6643 // Check for unused inputs.
6644 if (none_of(Mask, [lo, hi](int i) { return (lo <= i) && (i < hi); })) {
6645 for (int &M : Mask)
6646 if (lo <= M)
6647 M -= MaskWidth;
6648 continue;
6649 }
6650
6651 // Check for repeated inputs.
6652 bool IsRepeat = false;
6653 for (int j = 0, ue = UsedInputs.size(); j != ue; ++j) {
6654 if (UsedInputs[j] != Inputs[i])
6655 continue;
6656 for (int &M : Mask)
6657 if (lo <= M)
6658 M = (M < hi) ? ((M - lo) + (j * MaskWidth)) : (M - MaskWidth);
6659 IsRepeat = true;
6660 break;
6661 }
6662 if (IsRepeat)
6663 continue;
6664
6665 UsedInputs.push_back(Inputs[i]);
6666 }
6667 Inputs = UsedInputs;
6668}
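// Worked example: with Inputs = { A, B, A } and MaskWidth = 4, mask indices
// 8-11 (the repeated A) are remapped to 0-3 and the duplicate input is
// dropped, references to an undef input become SM_SentinelUndef, and Inputs
// shrinks to { A, B }.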
6669
6670/// Calls getTargetShuffleAndZeroables to resolve a target shuffle mask's inputs
6671/// and then sets the SM_SentinelUndef and SM_SentinelZero values.
6672/// Returns true if the target shuffle mask was decoded.
6673static bool getTargetShuffleInputs(SDValue Op, const APInt &DemandedElts,
6674 SmallVectorImpl<SDValue> &Inputs,
6675 SmallVectorImpl<int> &Mask,
6676 APInt &KnownUndef, APInt &KnownZero,
6677 const SelectionDAG &DAG, unsigned Depth,
6678 bool ResolveKnownElts) {
6679 if (Depth >= SelectionDAG::MaxRecursionDepth)
6680 return false; // Limit search depth.
6681
6682 EVT VT = Op.getValueType();
6683 if (!VT.isSimple() || !VT.isVector())
6684 return false;
6685
6686 if (getTargetShuffleAndZeroables(Op, Mask, Inputs, KnownUndef, KnownZero)) {
6687 if (ResolveKnownElts)
6688 resolveTargetShuffleFromZeroables(Mask, KnownUndef, KnownZero);
6689 return true;
6690 }
6691 if (getFauxShuffleMask(Op, DemandedElts, Mask, Inputs, DAG, Depth,
6692 ResolveKnownElts)) {
6693 resolveZeroablesFromTargetShuffle(Mask, KnownUndef, KnownZero);
6694 return true;
6695 }
6696 return false;
6697}
6698
6699static bool getTargetShuffleInputs(SDValue Op, const APInt &DemandedElts,
6700 SmallVectorImpl<SDValue> &Inputs,
6701 SmallVectorImpl<int> &Mask,
6702 const SelectionDAG &DAG, unsigned Depth,
6703 bool ResolveKnownElts) {
6704 APInt KnownUndef, KnownZero;
6705 return getTargetShuffleInputs(Op, DemandedElts, Inputs, Mask, KnownUndef,
6706 KnownZero, DAG, Depth, ResolveKnownElts);
6707}
6708
6709 static bool getTargetShuffleInputs(SDValue Op, SmallVectorImpl<SDValue> &Inputs,
6710 SmallVectorImpl<int> &Mask,
6711 const SelectionDAG &DAG, unsigned Depth = 0,
6712 bool ResolveKnownElts = true) {
6713 EVT VT = Op.getValueType();
6714 if (!VT.isSimple() || !VT.isVector())
6715 return false;
6716
6717 unsigned NumElts = Op.getValueType().getVectorNumElements();
6718 APInt DemandedElts = APInt::getAllOnes(NumElts);
6719 return getTargetShuffleInputs(Op, DemandedElts, Inputs, Mask, DAG, Depth,
6720 ResolveKnownElts);
6721}
6722
6723// Attempt to create a scalar/subvector broadcast from the base MemSDNode.
6724static SDValue getBROADCAST_LOAD(unsigned Opcode, const SDLoc &DL, EVT VT,
6725 EVT MemVT, MemSDNode *Mem, unsigned Offset,
6726 SelectionDAG &DAG) {
6727 assert((Opcode == X86ISD::VBROADCAST_LOAD ||
6728 Opcode == X86ISD::SUBV_BROADCAST_LOAD) &&
6729 "Unknown broadcast load type");
6730
6731 // Ensure this is a simple (non-atomic, non-volatile), temporal read memop.
6732 if (!Mem || !Mem->readMem() || !Mem->isSimple() || Mem->isNonTemporal())
6733 return SDValue();
6734
6735 SDValue Ptr = DAG.getMemBasePlusOffset(Mem->getBasePtr(),
6736 TypeSize::getFixed(Offset), DL);
6737 SDVTList Tys = DAG.getVTList(VT, MVT::Other);
6738 SDValue Ops[] = {Mem->getChain(), Ptr};
6739 SDValue BcstLd = DAG.getMemIntrinsicNode(
6740 Opcode, DL, Tys, Ops, MemVT,
6741 DAG.getMachineFunction().getMachineMemOperand(
6742 Mem->getMemOperand(), Offset, MemVT.getStoreSize()));
6743 DAG.makeEquivalentMemoryOrdering(SDValue(Mem, 1), BcstLd.getValue(1));
6744 return BcstLd;
6745}
6746
6747/// Returns the scalar element that will make up the i'th
6748/// element of the result of the vector shuffle.
6749static SDValue getShuffleScalarElt(SDValue Op, unsigned Index,
6750 SelectionDAG &DAG, unsigned Depth) {
6751 if (Depth >= SelectionDAG::MaxRecursionDepth)
6752 return SDValue(); // Limit search depth.
6753
6754 EVT VT = Op.getValueType();
6755 unsigned Opcode = Op.getOpcode();
6756 unsigned NumElems = VT.getVectorNumElements();
6757
6758 // Recurse into ISD::VECTOR_SHUFFLE node to find scalars.
6759 if (auto *SV = dyn_cast<ShuffleVectorSDNode>(Op)) {
6760 int Elt = SV->getMaskElt(Index);
6761
6762 if (Elt < 0)
6763 return DAG.getUNDEF(VT.getVectorElementType());
6764
6765 SDValue Src = (Elt < (int)NumElems) ? SV->getOperand(0) : SV->getOperand(1);
6766 return getShuffleScalarElt(Src, Elt % NumElems, DAG, Depth + 1);
6767 }
6768
6769 // Recurse into target specific vector shuffles to find scalars.
6770 if (isTargetShuffle(Opcode)) {
6771 MVT ShufVT = VT.getSimpleVT();
6772 MVT ShufSVT = ShufVT.getVectorElementType();
6773 int NumElems = (int)ShufVT.getVectorNumElements();
6774 SmallVector<int, 16> ShuffleMask;
6775 SmallVector<SDValue, 16> ShuffleOps;
6776 if (!getTargetShuffleMask(Op, true, ShuffleOps, ShuffleMask))
6777 return SDValue();
6778
6779 int Elt = ShuffleMask[Index];
6780 if (Elt == SM_SentinelZero)
6781 return ShufSVT.isInteger() ? DAG.getConstant(0, SDLoc(Op), ShufSVT)
6782 : DAG.getConstantFP(+0.0, SDLoc(Op), ShufSVT);
6783 if (Elt == SM_SentinelUndef)
6784 return DAG.getUNDEF(ShufSVT);
6785
6786 assert(0 <= Elt && Elt < (2 * NumElems) && "Shuffle index out of range");
6787 SDValue Src = (Elt < NumElems) ? ShuffleOps[0] : ShuffleOps[1];
6788 return getShuffleScalarElt(Src, Elt % NumElems, DAG, Depth + 1);
6789 }
6790
6791 // Recurse into insert_subvector base/sub vector to find scalars.
6792 if (Opcode == ISD::INSERT_SUBVECTOR) {
6793 SDValue Vec = Op.getOperand(0);
6794 SDValue Sub = Op.getOperand(1);
6795 uint64_t SubIdx = Op.getConstantOperandVal(2);
6796 unsigned NumSubElts = Sub.getValueType().getVectorNumElements();
6797
6798 if (SubIdx <= Index && Index < (SubIdx + NumSubElts))
6799 return getShuffleScalarElt(Sub, Index - SubIdx, DAG, Depth + 1);
6800 return getShuffleScalarElt(Vec, Index, DAG, Depth + 1);
6801 }
6802
6803 // Recurse into concat_vectors sub vector to find scalars.
6804 if (Opcode == ISD::CONCAT_VECTORS) {
6805 EVT SubVT = Op.getOperand(0).getValueType();
6806 unsigned NumSubElts = SubVT.getVectorNumElements();
6807 uint64_t SubIdx = Index / NumSubElts;
6808 uint64_t SubElt = Index % NumSubElts;
6809 return getShuffleScalarElt(Op.getOperand(SubIdx), SubElt, DAG, Depth + 1);
6810 }
6811
6812 // Recurse into extract_subvector src vector to find scalars.
6813 if (Opcode == ISD::EXTRACT_SUBVECTOR) {
6814 SDValue Src = Op.getOperand(0);
6815 uint64_t SrcIdx = Op.getConstantOperandVal(1);
6816 return getShuffleScalarElt(Src, Index + SrcIdx, DAG, Depth + 1);
6817 }
6818
6819 // We only peek through bitcasts of the same vector width.
6820 if (Opcode == ISD::BITCAST) {
6821 SDValue Src = Op.getOperand(0);
6822 EVT SrcVT = Src.getValueType();
6823 if (SrcVT.isVector() && SrcVT.getVectorNumElements() == NumElems)
6824 return getShuffleScalarElt(Src, Index, DAG, Depth + 1);
6825 return SDValue();
6826 }
6827
6828 // Actual nodes that may contain scalar elements
6829
6830 // For insert_vector_elt - either return the index matching scalar or recurse
6831 // into the base vector.
6832 if (Opcode == ISD::INSERT_VECTOR_ELT &&
6833 isa<ConstantSDNode>(Op.getOperand(2))) {
6834 if (Op.getConstantOperandAPInt(2) == Index)
6835 return Op.getOperand(1);
6836 return getShuffleScalarElt(Op.getOperand(0), Index, DAG, Depth + 1);
6837 }
6838
6839 if (Opcode == ISD::SCALAR_TO_VECTOR)
6840 return (Index == 0) ? Op.getOperand(0)
6841 : DAG.getUNDEF(VT.getVectorElementType());
6842
6843 if (Opcode == ISD::BUILD_VECTOR)
6844 return Op.getOperand(Index);
6845
6846 return SDValue();
6847}
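// Worked example: for a shufflevector of two v4f32 values with mask element
// 6 at the queried index, this recurses into element 2 of the second source
// operand; it returns an empty SDValue if the scalar cannot be determined.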
6848
6849// Use PINSRB/PINSRW/PINSRD to create a build vector.
6850 static SDValue LowerBuildVectorAsInsert(SDValue Op, const SDLoc &DL,
6851 const APInt &NonZeroMask,
6852 unsigned NumNonZero, unsigned NumZero,
6853 SelectionDAG &DAG,
6854 const X86Subtarget &Subtarget) {
6855 MVT VT = Op.getSimpleValueType();
6856 unsigned NumElts = VT.getVectorNumElements();
6857 assert(((VT == MVT::v8i16 && Subtarget.hasSSE2()) ||
6858 ((VT == MVT::v16i8 || VT == MVT::v4i32) && Subtarget.hasSSE41())) &&
6859 "Illegal vector insertion");
6860
6861 SDValue V;
6862 bool First = true;
6863
6864 for (unsigned i = 0; i < NumElts; ++i) {
6865 bool IsNonZero = NonZeroMask[i];
6866 if (!IsNonZero)
6867 continue;
6868
6869 // If the build vector contains zeros or our first insertion is not the
6870 // first index, then insert into a zero vector to break any register
6871 // dependency; otherwise use SCALAR_TO_VECTOR.
6872 if (First) {
6873 First = false;
6874 if (NumZero || 0 != i)
6875 V = getZeroVector(VT, Subtarget, DAG, DL);
6876 else {
6877 assert(0 == i && "Expected insertion into zero-index");
6878 V = DAG.getAnyExtOrTrunc(Op.getOperand(i), DL, MVT::i32);
6879 V = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v4i32, V);
6880 V = DAG.getBitcast(VT, V);
6881 continue;
6882 }
6883 }
6884 V = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, VT, V, Op.getOperand(i),
6885 DAG.getVectorIdxConstant(i, DL));
6886 }
6887
6888 return V;
6889}
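// Worked example: building v4i32 { x, 0, y, 0 } starts from a zero vector
// (NumZero != 0) and issues two INSERT_VECTOR_ELT nodes (lowered to PINSRD)
// for elements 0 and 2.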
6890
6891/// Custom lower build_vector of v16i8.
6892 static SDValue LowerBuildVectorv16i8(SDValue Op, const SDLoc &DL,
6893 const APInt &NonZeroMask,
6894 unsigned NumNonZero, unsigned NumZero,
6895 SelectionDAG &DAG,
6896 const X86Subtarget &Subtarget) {
6897 if (NumNonZero > 8 && !Subtarget.hasSSE41())
6898 return SDValue();
6899
6900 // SSE4.1 - use PINSRB to insert each byte directly.
6901 if (Subtarget.hasSSE41())
6902 return LowerBuildVectorAsInsert(Op, DL, NonZeroMask, NumNonZero, NumZero,
6903 DAG, Subtarget);
6904
6905 SDValue V;
6906
6907 // Pre-SSE4.1 - merge byte pairs and insert with PINSRW.
6908 // If both of the two lowest 16-bit words are non-zero, then convert to MOVD.
6909 if (!NonZeroMask.extractBits(2, 0).isZero() &&
6910 !NonZeroMask.extractBits(2, 2).isZero()) {
6911 for (unsigned I = 0; I != 4; ++I) {
6912 if (!NonZeroMask[I])
6913 continue;
6914 SDValue Elt = DAG.getZExtOrTrunc(Op.getOperand(I), DL, MVT::i32);
6915 if (I != 0)
6916 Elt = DAG.getNode(ISD::SHL, DL, MVT::i32, Elt,
6917 DAG.getConstant(I * 8, DL, MVT::i8));
6918 V = V ? DAG.getNode(ISD::OR, DL, MVT::i32, V, Elt) : Elt;
6919 }
6920 assert(V && "Failed to fold v16i8 vector to zero");
6921 V = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v4i32, V);
6922 V = DAG.getNode(X86ISD::VZEXT_MOVL, DL, MVT::v4i32, V);
6923 V = DAG.getBitcast(MVT::v8i16, V);
6924 }
6925 for (unsigned i = V ? 4 : 0; i < 16; i += 2) {
6926 bool ThisIsNonZero = NonZeroMask[i];
6927 bool NextIsNonZero = NonZeroMask[i + 1];
6928 if (!ThisIsNonZero && !NextIsNonZero)
6929 continue;
6930
6931 SDValue Elt;
6932 if (ThisIsNonZero) {
6933 if (NumZero || NextIsNonZero)
6934 Elt = DAG.getZExtOrTrunc(Op.getOperand(i), DL, MVT::i32);
6935 else
6936 Elt = DAG.getAnyExtOrTrunc(Op.getOperand(i), DL, MVT::i32);
6937 }
6938
6939 if (NextIsNonZero) {
6940 SDValue NextElt = Op.getOperand(i + 1);
6941 if (i == 0 && NumZero)
6942 NextElt = DAG.getZExtOrTrunc(NextElt, DL, MVT::i32);
6943 else
6944 NextElt = DAG.getAnyExtOrTrunc(NextElt, DL, MVT::i32);
6945 NextElt = DAG.getNode(ISD::SHL, DL, MVT::i32, NextElt,
6946 DAG.getConstant(8, DL, MVT::i8));
6947 if (ThisIsNonZero)
6948 Elt = DAG.getNode(ISD::OR, DL, MVT::i32, NextElt, Elt);
6949 else
6950 Elt = NextElt;
6951 }
6952
6953 // If our first insertion is not the first index or zeros are needed, then
6954 // insert into zero vector. Otherwise, use SCALAR_TO_VECTOR (leaves high
6955 // elements undefined).
6956 if (!V) {
6957 if (i != 0 || NumZero)
6958 V = getZeroVector(MVT::v8i16, Subtarget, DAG, DL);
6959 else {
6960 V = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v4i32, Elt);
6961 V = DAG.getBitcast(MVT::v8i16, V);
6962 continue;
6963 }
6964 }
6965 Elt = DAG.getNode(ISD::TRUNCATE, DL, MVT::i16, Elt);
6966 V = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, MVT::v8i16, V, Elt,
6967 DAG.getVectorIdxConstant(i / 2, DL));
6968 }
6969
6970 return DAG.getBitcast(MVT::v16i8, V);
6971}
6972
6973/// Custom lower build_vector of v8i16.
6974 static SDValue LowerBuildVectorv8i16(SDValue Op, const SDLoc &DL,
6975                                      const APInt &NonZeroMask,
6976 unsigned NumNonZero, unsigned NumZero,
6977 SelectionDAG &DAG,
6978 const X86Subtarget &Subtarget) {
6979 if (NumNonZero > 4 && !Subtarget.hasSSE41())
6980 return SDValue();
6981
6982   // Use PINSRW to insert each element directly.
6983 return LowerBuildVectorAsInsert(Op, DL, NonZeroMask, NumNonZero, NumZero, DAG,
6984 Subtarget);
6985}
6986
6987/// Custom lower build_vector of v4i32 or v4f32.
6988 static SDValue LowerBuildVectorv4x32(SDValue Op, const SDLoc &DL,
6989                                      SelectionDAG &DAG,
6990 const X86Subtarget &Subtarget) {
6991 // If this is a splat of a pair of elements, use MOVDDUP (unless the target
6992 // has XOP; in that case defer lowering to potentially use VPERMIL2PS).
6993 // Because we're creating a less complicated build vector here, we may enable
6994 // further folding of the MOVDDUP via shuffle transforms.
6995 if (Subtarget.hasSSE3() && !Subtarget.hasXOP() &&
6996 Op.getOperand(0) == Op.getOperand(2) &&
6997 Op.getOperand(1) == Op.getOperand(3) &&
6998 Op.getOperand(0) != Op.getOperand(1)) {
6999 MVT VT = Op.getSimpleValueType();
7000 MVT EltVT = VT.getVectorElementType();
7001 // Create a new build vector with the first 2 elements followed by undef
7002 // padding, bitcast to v2f64, duplicate, and bitcast back.
7003 SDValue Ops[4] = { Op.getOperand(0), Op.getOperand(1),
7004 DAG.getUNDEF(EltVT), DAG.getUNDEF(EltVT) };
7005 SDValue NewBV = DAG.getBitcast(MVT::v2f64, DAG.getBuildVector(VT, DL, Ops));
7006 SDValue Dup = DAG.getNode(X86ISD::MOVDDUP, DL, MVT::v2f64, NewBV);
7007 return DAG.getBitcast(VT, Dup);
7008 }
7009
7010 // Find all zeroable elements.
7011 std::bitset<4> Zeroable, Undefs;
7012 for (int i = 0; i < 4; ++i) {
7013 SDValue Elt = Op.getOperand(i);
7014 Undefs[i] = Elt.isUndef();
7015 Zeroable[i] = (Elt.isUndef() || X86::isZeroNode(Elt));
7016 }
7017 assert(Zeroable.size() - Zeroable.count() > 1 &&
7018 "We expect at least two non-zero elements!");
7019
7020 // We only know how to deal with build_vector nodes where elements are either
7021 // zeroable or extract_vector_elt with constant index.
7022 SDValue FirstNonZero;
7023 unsigned FirstNonZeroIdx;
7024 for (unsigned i = 0; i < 4; ++i) {
7025 if (Zeroable[i])
7026 continue;
7027 SDValue Elt = Op.getOperand(i);
7028 if (Elt.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
7029         !isa<ConstantSDNode>(Elt.getOperand(1)))
7030       return SDValue();
7031 // Make sure that this node is extracting from a 128-bit vector.
7032 MVT VT = Elt.getOperand(0).getSimpleValueType();
7033 if (!VT.is128BitVector())
7034 return SDValue();
7035 if (!FirstNonZero.getNode()) {
7036 FirstNonZero = Elt;
7037 FirstNonZeroIdx = i;
7038 }
7039 }
7040
7041 assert(FirstNonZero.getNode() && "Unexpected build vector of all zeros!");
7042 SDValue V1 = FirstNonZero.getOperand(0);
7043 MVT VT = V1.getSimpleValueType();
7044
7045 // See if this build_vector can be lowered as a blend with zero.
7046 SDValue Elt;
7047 unsigned EltMaskIdx, EltIdx;
7048 int Mask[4];
7049 for (EltIdx = 0; EltIdx < 4; ++EltIdx) {
7050 if (Zeroable[EltIdx]) {
7051 // The zero vector will be on the right hand side.
7052 Mask[EltIdx] = EltIdx+4;
7053 continue;
7054 }
7055
7056 Elt = Op->getOperand(EltIdx);
7057 // By construction, Elt is a EXTRACT_VECTOR_ELT with constant index.
7058 EltMaskIdx = Elt.getConstantOperandVal(1);
7059 if (Elt.getOperand(0) != V1 || EltMaskIdx != EltIdx)
7060 break;
7061 Mask[EltIdx] = EltIdx;
7062 }
7063
7064 if (EltIdx == 4) {
7065 // Let the shuffle legalizer deal with blend operations.
7066 SDValue VZeroOrUndef = (Zeroable == Undefs)
7067 ? DAG.getUNDEF(VT)
7068 : getZeroVector(VT, Subtarget, DAG, DL);
7069 if (V1.getSimpleValueType() != VT)
7070 V1 = DAG.getBitcast(VT, V1);
7071 return DAG.getVectorShuffle(VT, SDLoc(V1), V1, VZeroOrUndef, Mask);
7072 }
7073
7074 // See if we can lower this build_vector to a INSERTPS.
7075 if (!Subtarget.hasSSE41())
7076 return SDValue();
7077
7078 SDValue V2 = Elt.getOperand(0);
7079 if (Elt == FirstNonZero && EltIdx == FirstNonZeroIdx)
7080 V1 = SDValue();
7081
7082 bool CanFold = true;
7083 for (unsigned i = EltIdx + 1; i < 4 && CanFold; ++i) {
7084 if (Zeroable[i])
7085 continue;
7086
7087 SDValue Current = Op->getOperand(i);
7088 SDValue SrcVector = Current->getOperand(0);
7089 if (!V1.getNode())
7090 V1 = SrcVector;
7091 CanFold = (SrcVector == V1) && (Current.getConstantOperandAPInt(1) == i);
7092 }
7093
7094 if (!CanFold)
7095 return SDValue();
7096
7097 assert(V1.getNode() && "Expected at least two non-zero elements!");
7098 if (V1.getSimpleValueType() != MVT::v4f32)
7099 V1 = DAG.getBitcast(MVT::v4f32, V1);
7100 if (V2.getSimpleValueType() != MVT::v4f32)
7101 V2 = DAG.getBitcast(MVT::v4f32, V2);
7102
7103 // Ok, we can emit an INSERTPS instruction.
7104 unsigned ZMask = Zeroable.to_ulong();
7105
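  // INSERTPS immediate layout: bits [7:6] select the source element of V2,
  // bits [5:4] select the destination element, and bits [3:0] are the zero
  // mask applied to the result.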
7106 unsigned InsertPSMask = EltMaskIdx << 6 | EltIdx << 4 | ZMask;
7107 assert((InsertPSMask & ~0xFFu) == 0 && "Invalid mask!");
7108 SDValue Result =
7109 DAG.getNode(X86ISD::INSERTPS, DL, MVT::v4f32, V1, V2,
7110 DAG.getTargetConstant(InsertPSMask, DL, MVT::i8));
7111 return DAG.getBitcast(VT, Result);
7112}
7113
7114/// Return a vector logical shift node.
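/// e.g. getVShift(true, MVT::v2i64, X, 64, ...) emits an X86ISD::VSHLDQ
/// (PSLLDQ) of X by 64 / 8 = 8 bytes.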
7115static SDValue getVShift(bool isLeft, EVT VT, SDValue SrcOp, unsigned NumBits,
7116 SelectionDAG &DAG, const TargetLowering &TLI,
7117 const SDLoc &dl) {
7118 assert(VT.is128BitVector() && "Unknown type for VShift");
7119 MVT ShVT = MVT::v16i8;
7120 unsigned Opc = isLeft ? X86ISD::VSHLDQ : X86ISD::VSRLDQ;
7121 SrcOp = DAG.getBitcast(ShVT, SrcOp);
7122 assert(NumBits % 8 == 0 && "Only support byte sized shifts");
7123 SDValue ShiftVal = DAG.getTargetConstant(NumBits / 8, dl, MVT::i8);
7124 return DAG.getBitcast(VT, DAG.getNode(Opc, dl, ShVT, SrcOp, ShiftVal));
7125}
7126
7127 static SDValue LowerAsSplatVectorLoad(SDValue SrcOp, MVT VT, const SDLoc &dl,
7128                                       SelectionDAG &DAG) {
7129
7130 // Check if the scalar load can be widened into a vector load. And if
7131 // the address is "base + cst" see if the cst can be "absorbed" into
7132 // the shuffle mask.
7133   if (LoadSDNode *LD = dyn_cast<LoadSDNode>(SrcOp)) {
7134     SDValue Ptr = LD->getBasePtr();
7135 if (!ISD::isNormalLoad(LD) || !LD->isSimple())
7136 return SDValue();
7137 EVT PVT = LD->getValueType(0);
7138 if (PVT != MVT::i32 && PVT != MVT::f32)
7139 return SDValue();
7140
7141 int FI = -1;
7142 int64_t Offset = 0;
7143     if (FrameIndexSDNode *FINode = dyn_cast<FrameIndexSDNode>(Ptr)) {
7144       FI = FINode->getIndex();
7145 Offset = 0;
7146 } else if (DAG.isBaseWithConstantOffset(Ptr) &&
7147 isa<FrameIndexSDNode>(Ptr.getOperand(0))) {
7148 FI = cast<FrameIndexSDNode>(Ptr.getOperand(0))->getIndex();
7149 Offset = Ptr.getConstantOperandVal(1);
7150 Ptr = Ptr.getOperand(0);
7151 } else {
7152 return SDValue();
7153 }
7154
7155 // FIXME: 256-bit vector instructions don't require a strict alignment,
7156 // improve this code to support it better.
7157 Align RequiredAlign(VT.getSizeInBits() / 8);
7158 SDValue Chain = LD->getChain();
7159 // Make sure the stack object alignment is at least 16 or 32.
7160     MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
7161     MaybeAlign InferredAlign = DAG.InferPtrAlign(Ptr);
7162 if (!InferredAlign || *InferredAlign < RequiredAlign) {
7163 if (MFI.isFixedObjectIndex(FI)) {
7164       // Can't change the alignment. FIXME: It's possible to compute
7165       // the exact stack offset and reference FI + adjusted offset instead;
7166       // if someone *really* cares about this, that's the way to implement it.
7167 return SDValue();
7168 } else {
7169 MFI.setObjectAlignment(FI, RequiredAlign);
7170 }
7171 }
7172
7173   // (Offset % 16 or 32) must be a multiple of 4. The address is then
7174   // Ptr + (Offset & ~15).
7175 if (Offset < 0)
7176 return SDValue();
7177 if ((Offset % RequiredAlign.value()) & 3)
7178 return SDValue();
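  // e.g. with a 16-byte RequiredAlign, Offset = 20 gives StartOffset = 16 and
  // the splat uses element (20 - 16) / 4 = 1 of the widened load.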
7179 int64_t StartOffset = Offset & ~int64_t(RequiredAlign.value() - 1);
7180 if (StartOffset) {
7181 SDLoc DL(Ptr);
7182 Ptr = DAG.getNode(ISD::ADD, DL, Ptr.getValueType(), Ptr,
7183 DAG.getConstant(StartOffset, DL, Ptr.getValueType()));
7184 }
7185
7186 int EltNo = (Offset - StartOffset) >> 2;
7187 unsigned NumElems = VT.getVectorNumElements();
7188
7189 EVT NVT = EVT::getVectorVT(*DAG.getContext(), PVT, NumElems);
7190 SDValue V1 = DAG.getLoad(NVT, dl, Chain, Ptr,
7191 LD->getPointerInfo().getWithOffset(StartOffset));
7192
7193 SmallVector<int, 8> Mask(NumElems, EltNo);
7194
7195 return DAG.getVectorShuffle(NVT, dl, V1, DAG.getUNDEF(NVT), Mask);
7196 }
7197
7198 return SDValue();
7199}
7200
7201 // Recurse to find a LoadSDNode source and the accumulated ByteOffset.
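// e.g. (trunc (srl (load i64 %p), 32)) returns the i64 load with ByteOffset 4.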
7202static bool findEltLoadSrc(SDValue Elt, LoadSDNode *&Ld, int64_t &ByteOffset) {
7203 if (ISD::isNON_EXTLoad(Elt.getNode())) {
7204 auto *BaseLd = cast<LoadSDNode>(Elt);
7205 if (!BaseLd->isSimple())
7206 return false;
7207 Ld = BaseLd;
7208 ByteOffset = 0;
7209 return true;
7210 }
7211
7212 switch (Elt.getOpcode()) {
7213 case ISD::BITCAST:
7214 case ISD::TRUNCATE:
7215   case ISD::SCALAR_TO_VECTOR:
7216     return findEltLoadSrc(Elt.getOperand(0), Ld, ByteOffset);
7217 case ISD::SRL:
7218 if (auto *AmtC = dyn_cast<ConstantSDNode>(Elt.getOperand(1))) {
7219 uint64_t Amt = AmtC->getZExtValue();
7220 if ((Amt % 8) == 0 && findEltLoadSrc(Elt.getOperand(0), Ld, ByteOffset)) {
7221 ByteOffset += Amt / 8;
7222 return true;
7223 }
7224 }
7225 break;
7226   case ISD::EXTRACT_VECTOR_ELT:
7227     if (auto *IdxC = dyn_cast<ConstantSDNode>(Elt.getOperand(1))) {
7228 SDValue Src = Elt.getOperand(0);
7229 unsigned SrcSizeInBits = Src.getScalarValueSizeInBits();
7230 unsigned DstSizeInBits = Elt.getScalarValueSizeInBits();
7231 if (DstSizeInBits == SrcSizeInBits && (SrcSizeInBits % 8) == 0 &&
7232 findEltLoadSrc(Src, Ld, ByteOffset)) {
7233 uint64_t Idx = IdxC->getZExtValue();
7234 ByteOffset += Idx * (SrcSizeInBits / 8);
7235 return true;
7236 }
7237 }
7238 break;
7239 }
7240
7241 return false;
7242}
7243
7244/// Given the initializing elements 'Elts' of a vector of type 'VT', see if the
7245/// elements can be replaced by a single large load which has the same value as
7246/// a build_vector or insert_subvector whose loaded operands are 'Elts'.
7247///
7248/// Example: <load i32 *a, load i32 *a+4, zero, undef> -> zextload a
7249 static SDValue EltsFromConsecutiveLoads(EVT VT, ArrayRef<SDValue> Elts,
7250                                         const SDLoc &DL, SelectionDAG &DAG,
7251 const X86Subtarget &Subtarget,
7252 bool IsAfterLegalize) {
7253 if ((VT.getScalarSizeInBits() % 8) != 0)
7254 return SDValue();
7255
7256 unsigned NumElems = Elts.size();
7257
7258 int LastLoadedElt = -1;
7259 APInt LoadMask = APInt::getZero(NumElems);
7260 APInt ZeroMask = APInt::getZero(NumElems);
7261 APInt UndefMask = APInt::getZero(NumElems);
7262
7263 SmallVector<LoadSDNode*, 8> Loads(NumElems, nullptr);
7264 SmallVector<int64_t, 8> ByteOffsets(NumElems, 0);
7265
7266 // For each element in the initializer, see if we've found a load, zero or an
7267 // undef.
7268 for (unsigned i = 0; i < NumElems; ++i) {
7269 SDValue Elt = peekThroughBitcasts(Elts[i]);
7270 if (!Elt.getNode())
7271 return SDValue();
7272 if (Elt.isUndef()) {
7273 UndefMask.setBit(i);
7274 continue;
7275 }
7276     if (X86::isZeroNode(Elt) || ISD::isBuildVectorAllZeros(Elt.getNode())) {
7277       ZeroMask.setBit(i);
7278 continue;
7279 }
7280
7281 // Each loaded element must be the correct fractional portion of the
7282 // requested vector load.
7283 unsigned EltSizeInBits = Elt.getValueSizeInBits();
7284 if ((NumElems * EltSizeInBits) != VT.getSizeInBits())
7285 return SDValue();
7286
7287 if (!findEltLoadSrc(Elt, Loads[i], ByteOffsets[i]) || ByteOffsets[i] < 0)
7288 return SDValue();
7289 unsigned LoadSizeInBits = Loads[i]->getValueSizeInBits(0);
7290 if (((ByteOffsets[i] * 8) + EltSizeInBits) > LoadSizeInBits)
7291 return SDValue();
7292
7293 LoadMask.setBit(i);
7294 LastLoadedElt = i;
7295 }
7296 assert((ZeroMask.popcount() + UndefMask.popcount() + LoadMask.popcount()) ==
7297 NumElems &&
7298 "Incomplete element masks");
7299
7300 // Handle Special Cases - all undef or undef/zero.
7301 if (UndefMask.popcount() == NumElems)
7302 return DAG.getUNDEF(VT);
7303 if ((ZeroMask.popcount() + UndefMask.popcount()) == NumElems)
7304 return VT.isInteger() ? DAG.getConstant(0, DL, VT)
7305 : DAG.getConstantFP(0.0, DL, VT);
7306
7307 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
7308 int FirstLoadedElt = LoadMask.countr_zero();
7309 SDValue EltBase = peekThroughBitcasts(Elts[FirstLoadedElt]);
7310 EVT EltBaseVT = EltBase.getValueType();
7311 assert(EltBaseVT.getSizeInBits() == EltBaseVT.getStoreSizeInBits() &&
7312 "Register/Memory size mismatch");
7313 LoadSDNode *LDBase = Loads[FirstLoadedElt];
7314 assert(LDBase && "Did not find base load for merging consecutive loads");
7315 unsigned BaseSizeInBits = EltBaseVT.getStoreSizeInBits();
7316 unsigned BaseSizeInBytes = BaseSizeInBits / 8;
7317 int NumLoadedElts = (1 + LastLoadedElt - FirstLoadedElt);
7318 int LoadSizeInBits = NumLoadedElts * BaseSizeInBits;
7319 assert((BaseSizeInBits % 8) == 0 && "Sub-byte element loads detected");
7320
7321 // TODO: Support offsetting the base load.
7322 if (ByteOffsets[FirstLoadedElt] != 0)
7323 return SDValue();
7324
7325 // Check to see if the element's load is consecutive to the base load
7326 // or offset from a previous (already checked) load.
7327 auto CheckConsecutiveLoad = [&](LoadSDNode *Base, int EltIdx) {
7328 LoadSDNode *Ld = Loads[EltIdx];
7329 int64_t ByteOffset = ByteOffsets[EltIdx];
7330 if (ByteOffset && (ByteOffset % BaseSizeInBytes) == 0) {
7331 int64_t BaseIdx = EltIdx - (ByteOffset / BaseSizeInBytes);
7332 return (0 <= BaseIdx && BaseIdx < (int)NumElems && LoadMask[BaseIdx] &&
7333 Loads[BaseIdx] == Ld && ByteOffsets[BaseIdx] == 0);
7334 }
7335 return DAG.areNonVolatileConsecutiveLoads(Ld, Base, BaseSizeInBytes,
7336 EltIdx - FirstLoadedElt);
7337 };
7338
7339 // Consecutive loads can contain UNDEFS but not ZERO elements.
7340   // Consecutive loads with UNDEF and ZERO elements require
7341   // an additional shuffle stage to clear the ZERO elements.
7342 bool IsConsecutiveLoad = true;
7343 bool IsConsecutiveLoadWithZeros = true;
7344 for (int i = FirstLoadedElt + 1; i <= LastLoadedElt; ++i) {
7345 if (LoadMask[i]) {
7346 if (!CheckConsecutiveLoad(LDBase, i)) {
7347 IsConsecutiveLoad = false;
7348 IsConsecutiveLoadWithZeros = false;
7349 break;
7350 }
7351 } else if (ZeroMask[i]) {
7352 IsConsecutiveLoad = false;
7353 }
7354 }
7355
7356 auto CreateLoad = [&DAG, &DL, &Loads](EVT VT, LoadSDNode *LDBase) {
7357 auto MMOFlags = LDBase->getMemOperand()->getFlags();
7358 assert(LDBase->isSimple() &&
7359 "Cannot merge volatile or atomic loads.");
7360 SDValue NewLd =
7361 DAG.getLoad(VT, DL, LDBase->getChain(), LDBase->getBasePtr(),
7362 LDBase->getPointerInfo(), LDBase->getBaseAlign(), MMOFlags);
7363 for (auto *LD : Loads)
7364 if (LD)
7365 DAG.makeEquivalentMemoryOrdering(LD, NewLd);
7366 return NewLd;
7367 };
7368
7369 // Check if the base load is entirely dereferenceable.
7370 bool IsDereferenceable = LDBase->getPointerInfo().isDereferenceable(
7371 VT.getSizeInBits() / 8, *DAG.getContext(), DAG.getDataLayout());
7372
7373 // LOAD - all consecutive load/undefs (must start/end with a load or be
7374 // entirely dereferenceable). If we have found an entire vector of loads and
7375 // undefs, then return a large load of the entire vector width starting at the
7376 // base pointer. If the vector contains zeros, then attempt to shuffle those
7377 // elements.
7378 if (FirstLoadedElt == 0 &&
7379 (NumLoadedElts == (int)NumElems || IsDereferenceable) &&
7380 (IsConsecutiveLoad || IsConsecutiveLoadWithZeros)) {
7381 if (IsAfterLegalize && !TLI.isOperationLegal(ISD::LOAD, VT))
7382 return SDValue();
7383
7384 // Don't create 256-bit non-temporal aligned loads without AVX2 as these
7385 // will lower to regular temporal loads and use the cache.
7386 if (LDBase->isNonTemporal() && LDBase->getAlign() >= Align(32) &&
7387 VT.is256BitVector() && !Subtarget.hasInt256())
7388 return SDValue();
7389
7390 if (NumElems == 1)
7391 return DAG.getBitcast(VT, Elts[FirstLoadedElt]);
7392
7393 if (!ZeroMask)
7394 return CreateLoad(VT, LDBase);
7395
7396 // IsConsecutiveLoadWithZeros - we need to create a shuffle of the loaded
7397 // vector and a zero vector to clear out the zero elements.
7398 if (!IsAfterLegalize && VT.isVector()) {
7399 unsigned NumMaskElts = VT.getVectorNumElements();
7400 if ((NumMaskElts % NumElems) == 0) {
7401 unsigned Scale = NumMaskElts / NumElems;
7402 SmallVector<int, 4> ClearMask(NumMaskElts, -1);
7403 for (unsigned i = 0; i < NumElems; ++i) {
7404 if (UndefMask[i])
7405 continue;
7406 int Offset = ZeroMask[i] ? NumMaskElts : 0;
7407 for (unsigned j = 0; j != Scale; ++j)
7408 ClearMask[(i * Scale) + j] = (i * Scale) + j + Offset;
7409 }
7410 SDValue V = CreateLoad(VT, LDBase);
7411 SDValue Z = VT.isInteger() ? DAG.getConstant(0, DL, VT)
7412 : DAG.getConstantFP(0.0, DL, VT);
7413 return DAG.getVectorShuffle(VT, DL, V, Z, ClearMask);
7414 }
7415 }
7416 }
7417
7418 // If the upper half of a ymm/zmm load is undef then just load the lower half.
7419 if (VT.is256BitVector() || VT.is512BitVector()) {
7420 unsigned HalfNumElems = NumElems / 2;
7421 if (UndefMask.extractBits(HalfNumElems, HalfNumElems).isAllOnes()) {
7422 EVT HalfVT =
7423 EVT::getVectorVT(*DAG.getContext(), VT.getScalarType(), HalfNumElems);
7424 SDValue HalfLD =
7425 EltsFromConsecutiveLoads(HalfVT, Elts.drop_back(HalfNumElems), DL,
7426 DAG, Subtarget, IsAfterLegalize);
7427 if (HalfLD)
7428 return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, DAG.getUNDEF(VT),
7429 HalfLD, DAG.getVectorIdxConstant(0, DL));
7430 }
7431 }
7432
7433 // VZEXT_LOAD - consecutive 32/64-bit load/undefs followed by zeros/undefs.
7434 if (IsConsecutiveLoad && FirstLoadedElt == 0 &&
7435 ((LoadSizeInBits == 16 && Subtarget.hasFP16()) || LoadSizeInBits == 32 ||
7436 LoadSizeInBits == 64) &&
7437 ((VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector()))) {
7438 MVT VecSVT = VT.isFloatingPoint() ? MVT::getFloatingPointVT(LoadSizeInBits)
7439 : MVT::getIntegerVT(LoadSizeInBits);
7440 MVT VecVT = MVT::getVectorVT(VecSVT, VT.getSizeInBits() / LoadSizeInBits);
7441 // Allow v4f32 on SSE1 only targets.
7442 // FIXME: Add more isel patterns so we can just use VT directly.
7443 if (!Subtarget.hasSSE2() && VT == MVT::v4f32)
7444 VecVT = MVT::v4f32;
7445 if (TLI.isTypeLegal(VecVT)) {
7446 SDVTList Tys = DAG.getVTList(VecVT, MVT::Other);
7447 SDValue Ops[] = { LDBase->getChain(), LDBase->getBasePtr() };
7448 SDValue ResNode = DAG.getMemIntrinsicNode(
7449 X86ISD::VZEXT_LOAD, DL, Tys, Ops, VecSVT, LDBase->getPointerInfo(),
7450           LDBase->getBaseAlign(), MachineMemOperand::MOLoad);
7451       for (auto *LD : Loads)
7452 if (LD)
7453 DAG.makeEquivalentMemoryOrdering(LD, ResNode);
7454 return DAG.getBitcast(VT, ResNode);
7455 }
7456 }
7457
7458 // BROADCAST - match the smallest possible repetition pattern, load that
7459 // scalar/subvector element and then broadcast to the entire vector.
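  // e.g. a v8i32 build vector whose loads repeat every two elements (64 bits)
  // can be lowered as a single 64-bit load that is broadcast to every lane.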
7460 if (ZeroMask.isZero() && isPowerOf2_32(NumElems) && Subtarget.hasAVX() &&
7461 (VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector())) {
7462 for (unsigned SubElems = 1; SubElems < NumElems; SubElems *= 2) {
7463 unsigned RepeatSize = SubElems * BaseSizeInBits;
7464 unsigned ScalarSize = std::min(RepeatSize, 64u);
7465 if (!Subtarget.hasAVX2() && ScalarSize < 32)
7466 continue;
7467
7468 // Don't attempt a 1:N subvector broadcast - it should be caught by
7469       // combineConcatVectorOps, otherwise it will cause infinite loops.
7470 if (RepeatSize > ScalarSize && SubElems == 1)
7471 continue;
7472
7473 bool Match = true;
7474 SmallVector<SDValue, 8> RepeatedLoads(SubElems, DAG.getUNDEF(EltBaseVT));
7475 for (unsigned i = 0; i != NumElems && Match; ++i) {
7476 if (!LoadMask[i])
7477 continue;
7478 SDValue Elt = peekThroughBitcasts(Elts[i]);
7479 if (RepeatedLoads[i % SubElems].isUndef())
7480 RepeatedLoads[i % SubElems] = Elt;
7481 else
7482 Match &= (RepeatedLoads[i % SubElems] == Elt);
7483 }
7484
7485 // We must have loads at both ends of the repetition.
7486 Match &= !RepeatedLoads.front().isUndef();
7487 Match &= !RepeatedLoads.back().isUndef();
7488 if (!Match)
7489 continue;
7490
7491 EVT RepeatVT =
7492 VT.isInteger() && (RepeatSize != 64 || TLI.isTypeLegal(MVT::i64))
7493 ? EVT::getIntegerVT(*DAG.getContext(), ScalarSize)
7494 : EVT::getFloatingPointVT(ScalarSize);
7495 if (RepeatSize > ScalarSize)
7496 RepeatVT = EVT::getVectorVT(*DAG.getContext(), RepeatVT,
7497 RepeatSize / ScalarSize);
7498 EVT BroadcastVT =
7499 EVT::getVectorVT(*DAG.getContext(), RepeatVT.getScalarType(),
7500 VT.getSizeInBits() / ScalarSize);
7501 if (TLI.isTypeLegal(BroadcastVT)) {
7502 if (SDValue RepeatLoad = EltsFromConsecutiveLoads(
7503 RepeatVT, RepeatedLoads, DL, DAG, Subtarget, IsAfterLegalize)) {
7504 SDValue Broadcast = RepeatLoad;
7505 if (RepeatSize > ScalarSize) {
7506 while (Broadcast.getValueSizeInBits() < VT.getSizeInBits())
7507 Broadcast = concatSubVectors(Broadcast, Broadcast, DAG, DL);
7508 } else {
7509 if (!Subtarget.hasAVX2() &&
7510                 !X86::mayFoldLoadIntoBroadcastFromMem(
7511                     RepeatLoad, RepeatVT.getScalarType().getSimpleVT(),
7512 Subtarget,
7513 /*AssumeSingleUse=*/true))
7514 return SDValue();
7515 Broadcast =
7516 DAG.getNode(X86ISD::VBROADCAST, DL, BroadcastVT, RepeatLoad);
7517 }
7518 return DAG.getBitcast(VT, Broadcast);
7519 }
7520 }
7521 }
7522 }
7523
7524 return SDValue();
7525}
7526
7527 // Combine vector ops (shuffles etc.) that are equal to build_vector load1,
7528// load2, load3, load4, <0, 1, 2, 3> into a vector load if the load addresses
7529// are consecutive, non-overlapping, and in the right order.
7530 static SDValue combineToConsecutiveLoads(EVT VT, SDValue Op, const SDLoc &DL,
7531                                          SelectionDAG &DAG,
7532 const X86Subtarget &Subtarget,
7533 bool IsAfterLegalize) {
7534   SmallVector<SDValue, 64> Elts;
7535   for (unsigned i = 0, e = VT.getVectorNumElements(); i != e; ++i) {
7536 if (SDValue Elt = getShuffleScalarElt(Op, i, DAG, 0)) {
7537 Elts.push_back(Elt);
7538 continue;
7539 }
7540 return SDValue();
7541 }
7542 assert(Elts.size() == VT.getVectorNumElements());
7543 return EltsFromConsecutiveLoads(VT, Elts, DL, DAG, Subtarget,
7544 IsAfterLegalize);
7545}
7546
7547 static Constant *getConstantVector(MVT VT, ArrayRef<APInt> Bits,
7548                                    const APInt &Undefs, LLVMContext &C) {
7549 unsigned ScalarSize = VT.getScalarSizeInBits();
7550 Type *Ty = EVT(VT.getScalarType()).getTypeForEVT(C);
7551
7552 auto getConstantScalar = [&](const APInt &Val) -> Constant * {
7553 if (VT.isFloatingPoint()) {
7554 if (ScalarSize == 16)
7555 return ConstantFP::get(C, APFloat(APFloat::IEEEhalf(), Val));
7556 if (ScalarSize == 32)
7557 return ConstantFP::get(C, APFloat(APFloat::IEEEsingle(), Val));
7558 assert(ScalarSize == 64 && "Unsupported floating point scalar size");
7559 return ConstantFP::get(C, APFloat(APFloat::IEEEdouble(), Val));
7560 }
7561 return Constant::getIntegerValue(Ty, Val);
7562 };
7563
7564 SmallVector<Constant *, 32> ConstantVec;
7565 for (unsigned I = 0, E = Bits.size(); I != E; ++I)
7566 ConstantVec.push_back(Undefs[I] ? UndefValue::get(Ty)
7567 : getConstantScalar(Bits[I]));
7568
7569 return ConstantVector::get(ArrayRef<Constant *>(ConstantVec));
7570}
7571
7572static Constant *getConstantVector(MVT VT, const APInt &SplatValue,
7573 unsigned SplatBitSize, LLVMContext &C) {
7574 unsigned ScalarSize = VT.getScalarSizeInBits();
7575
7576 auto getConstantScalar = [&](const APInt &Val) -> Constant * {
7577 if (VT.isFloatingPoint()) {
7578 if (ScalarSize == 16)
7579 return ConstantFP::get(C, APFloat(APFloat::IEEEhalf(), Val));
7580 if (ScalarSize == 32)
7581 return ConstantFP::get(C, APFloat(APFloat::IEEEsingle(), Val));
7582 assert(ScalarSize == 64 && "Unsupported floating point scalar size");
7583 return ConstantFP::get(C, APFloat(APFloat::IEEEdouble(), Val));
7584 }
7585 return Constant::getIntegerValue(Type::getIntNTy(C, ScalarSize), Val);
7586 };
7587
7588 if (ScalarSize == SplatBitSize)
7589 return getConstantScalar(SplatValue);
7590
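  // The splat constant is wider than a single element - split it into NumElm
  // element-sized pieces and build a constant vector from them.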
7591 unsigned NumElm = SplatBitSize / ScalarSize;
7592 SmallVector<Constant *, 32> ConstantVec;
7593 for (unsigned I = 0; I != NumElm; ++I) {
7594 APInt Val = SplatValue.extractBits(ScalarSize, ScalarSize * I);
7595 ConstantVec.push_back(getConstantScalar(Val));
7596 }
7597 return ConstantVector::get(ArrayRef<Constant *>(ConstantVec));
7598}
7599
7600 static bool isFoldableUseOfShuffle(SDNode *N) {
7601   for (auto *U : N->users()) {
7602 unsigned Opc = U->getOpcode();
7603 // VPERMV/VPERMV3 shuffles can never fold their index operands.
7604 if (Opc == X86ISD::VPERMV && U->getOperand(0).getNode() == N)
7605 return false;
7606 if (Opc == X86ISD::VPERMV3 && U->getOperand(1).getNode() == N)
7607 return false;
7608 if (isTargetShuffle(Opc))
7609 return true;
7610 if (Opc == ISD::BITCAST) // Ignore bitcasts
7611 return isFoldableUseOfShuffle(U);
7612 if (N->hasOneUse()) {
7613       // TODO: There may be some general way to know if an SDNode can
7614       // be folded. We currently only know whether an MI is foldable.
7615 if (Opc == X86ISD::VPDPBUSD && U->getOperand(2).getNode() != N)
7616 return false;
7617 return true;
7618 }
7619 }
7620 return false;
7621}
7622
7623// If the node has a single use by a VSELECT then AVX512 targets may be able to
7624// fold as a predicated instruction.
7625static bool isMaskableNode(SDValue V, const X86Subtarget &Subtarget) {
7626 unsigned SizeInBits = V.getValueSizeInBits();
7627 if ((SizeInBits == 512 && Subtarget.hasAVX512()) ||
7628 (SizeInBits >= 128 && Subtarget.hasVLX())) {
7629 if (V.hasOneUse() && V->user_begin()->getOpcode() == ISD::VSELECT &&
7630 V->user_begin()->getOperand(0).getScalarValueSizeInBits() == 1) {
7631 return true;
7632 }
7633 }
7634 return false;
7635}
7636
7637/// Attempt to use the vbroadcast instruction to generate a splat value
7638/// from a splat BUILD_VECTOR which uses:
7639/// a. A single scalar load, or a constant.
7640/// b. Repeated pattern of constants (e.g. <0,1,0,1> or <0,1,2,3,0,1,2,3>).
7641///
7642/// The VBROADCAST node is returned when a pattern is found,
7643/// or SDValue() otherwise.
7644 static SDValue lowerBuildVectorAsBroadcast(BuildVectorSDNode *BVOp,
7645                                            const SDLoc &dl,
7646 const X86Subtarget &Subtarget,
7647 SelectionDAG &DAG) {
7648 // VBROADCAST requires AVX.
7649 // TODO: Splats could be generated for non-AVX CPUs using SSE
7650 // instructions, but there's less potential gain for only 128-bit vectors.
7651 if (!Subtarget.hasAVX())
7652 return SDValue();
7653
7654 MVT VT = BVOp->getSimpleValueType(0);
7655 unsigned NumElts = VT.getVectorNumElements();
7656 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
7657 assert((VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector()) &&
7658 "Unsupported vector type for broadcast.");
7659
7660 // See if the build vector is a repeating sequence of scalars (inc. splat).
7661 SDValue Ld;
7662 BitVector UndefElements;
7663 SmallVector<SDValue, 16> Sequence;
7664 if (BVOp->getRepeatedSequence(Sequence, &UndefElements)) {
7665 assert((NumElts % Sequence.size()) == 0 && "Sequence doesn't fit.");
7666 if (Sequence.size() == 1)
7667 Ld = Sequence[0];
7668 }
7669
7670 // Attempt to use VBROADCASTM
7671 // From this pattern:
7672 // a. t0 = (zext_i64 (bitcast_i8 v2i1 X))
7673 // b. t1 = (build_vector t0 t0)
7674 //
7675 // Create (VBROADCASTM v2i1 X)
7676 if (!Sequence.empty() && Subtarget.hasCDI()) {
7677 // If not a splat, are the upper sequence values zeroable?
7678 unsigned SeqLen = Sequence.size();
7679 bool UpperZeroOrUndef =
7680 SeqLen == 1 ||
7681 llvm::all_of(ArrayRef(Sequence).drop_front(),
7682 [](SDValue V) { return !V || isNullConstantOrUndef(V); });
7683 SDValue Op0 = Sequence[0];
7684 if (UpperZeroOrUndef && ((Op0.getOpcode() == ISD::BITCAST) ||
7685 (Op0.getOpcode() == ISD::ZERO_EXTEND &&
7686 Op0.getOperand(0).getOpcode() == ISD::BITCAST))) {
7687 SDValue BOperand = Op0.getOpcode() == ISD::BITCAST
7688 ? Op0.getOperand(0)
7689 : Op0.getOperand(0).getOperand(0);
7690 MVT MaskVT = BOperand.getSimpleValueType();
7691 MVT EltType = MVT::getIntegerVT(VT.getScalarSizeInBits() * SeqLen);
7692 if ((EltType == MVT::i64 && MaskVT == MVT::v8i1) || // for broadcastmb2q
7693 (EltType == MVT::i32 && MaskVT == MVT::v16i1)) { // for broadcastmw2d
7694 MVT BcstVT = MVT::getVectorVT(EltType, NumElts / SeqLen);
7695 if (!VT.is512BitVector() && !Subtarget.hasVLX()) {
7696 unsigned Scale = 512 / VT.getSizeInBits();
7697 BcstVT = MVT::getVectorVT(EltType, Scale * (NumElts / SeqLen));
7698 }
7699 SDValue Bcst = DAG.getNode(X86ISD::VBROADCASTM, dl, BcstVT, BOperand);
7700 if (BcstVT.getSizeInBits() != VT.getSizeInBits())
7701 Bcst = extractSubVector(Bcst, 0, DAG, dl, VT.getSizeInBits());
7702 return DAG.getBitcast(VT, Bcst);
7703 }
7704 }
7705 }
7706
7707 unsigned NumUndefElts = UndefElements.count();
7708 if (!Ld || (NumElts - NumUndefElts) <= 1) {
7709 APInt SplatValue, Undef;
7710 unsigned SplatBitSize;
7711 bool HasUndef;
7712 // Check if this is a repeated constant pattern suitable for broadcasting.
7713 if (BVOp->isConstantSplat(SplatValue, Undef, SplatBitSize, HasUndef) &&
7714 SplatBitSize > VT.getScalarSizeInBits() &&
7715 SplatBitSize < VT.getSizeInBits()) {
7716 // Avoid replacing with broadcast when it's a use of a shuffle
7717 // instruction to preserve the present custom lowering of shuffles.
7718 if (isFoldableUseOfShuffle(BVOp))
7719 return SDValue();
7720       // Replace BUILD_VECTOR with a broadcast of the repeated constants.
7721 LLVMContext *Ctx = DAG.getContext();
7722 MVT PVT = TLI.getPointerTy(DAG.getDataLayout());
7723 if (SplatBitSize == 32 || SplatBitSize == 64 ||
7724 (SplatBitSize < 32 && Subtarget.hasAVX2())) {
7725 // Load the constant scalar/subvector and broadcast it.
7726 MVT CVT = MVT::getIntegerVT(SplatBitSize);
7727 Constant *C = getConstantVector(VT, SplatValue, SplatBitSize, *Ctx);
7728 SDValue CP = DAG.getConstantPool(C, PVT);
7729 unsigned Repeat = VT.getSizeInBits() / SplatBitSize;
7730
7731 Align Alignment = cast<ConstantPoolSDNode>(CP)->getAlign();
7732 SDVTList Tys = DAG.getVTList(MVT::getVectorVT(CVT, Repeat), MVT::Other);
7733 SDValue Ops[] = {DAG.getEntryNode(), CP};
7734 MachinePointerInfo MPI =
7735           MachinePointerInfo::getConstantPool(DAG.getMachineFunction());
7736       SDValue Brdcst =
7737           DAG.getMemIntrinsicNode(X86ISD::VBROADCAST_LOAD, dl, Tys, Ops, CVT,
7738                                   MPI, Alignment, MachineMemOperand::MOLoad);
7739 return DAG.getBitcast(VT, Brdcst);
7740 }
7741 if (SplatBitSize > 64) {
7742 // Load the vector of constants and broadcast it.
7743 Constant *VecC = getConstantVector(VT, SplatValue, SplatBitSize, *Ctx);
7744 SDValue VCP = DAG.getConstantPool(VecC, PVT);
7745 unsigned NumElm = SplatBitSize / VT.getScalarSizeInBits();
7746 MVT VVT = MVT::getVectorVT(VT.getScalarType(), NumElm);
7747 Align Alignment = cast<ConstantPoolSDNode>(VCP)->getAlign();
7748 SDVTList Tys = DAG.getVTList(VT, MVT::Other);
7749 SDValue Ops[] = {DAG.getEntryNode(), VCP};
7750 MachinePointerInfo MPI =
7751             MachinePointerInfo::getConstantPool(DAG.getMachineFunction());
7752         return DAG.getMemIntrinsicNode(X86ISD::SUBV_BROADCAST_LOAD, dl, Tys,
7753                                        Ops, VVT, MPI, Alignment,
7754                                        MachineMemOperand::MOLoad);
7755       }
7756 }
7757
7758 // If we are moving a scalar into a vector (Ld must be set and all elements
7759 // but 1 are undef) and that operation is not obviously supported by
7760 // vmovd/vmovq/vmovss/vmovsd, then keep trying to form a broadcast.
7761 // That's better than general shuffling and may eliminate a load to GPR and
7762 // move from scalar to vector register.
7763 if (!Ld || NumElts - NumUndefElts != 1)
7764 return SDValue();
7765 unsigned ScalarSize = Ld.getValueSizeInBits();
7766 if (!(UndefElements[0] || (ScalarSize != 32 && ScalarSize != 64)))
7767 return SDValue();
7768 }
7769
7770 bool ConstSplatVal =
7771 (Ld.getOpcode() == ISD::Constant || Ld.getOpcode() == ISD::ConstantFP);
7772 bool IsLoad = ISD::isNormalLoad(Ld.getNode());
7773
7774 // TODO: Handle broadcasts of non-constant sequences.
7775
7776 // Make sure that all of the users of a non-constant load are from the
7777 // BUILD_VECTOR node.
7778 // FIXME: Is the use count needed for non-constant, non-load case?
7779 if (!ConstSplatVal && !IsLoad && !BVOp->isOnlyUserOf(Ld.getNode()))
7780 return SDValue();
7781
7782 unsigned ScalarSize = Ld.getValueSizeInBits();
7783 bool IsGE256 = (VT.getSizeInBits() >= 256);
7784
7785 // When optimizing for size, generate up to 5 extra bytes for a broadcast
7786 // instruction to save 8 or more bytes of constant pool data.
7787 // TODO: If multiple splats are generated to load the same constant,
7788 // it may be detrimental to overall size. There needs to be a way to detect
7789 // that condition to know if this is truly a size win.
7790 bool OptForSize = DAG.shouldOptForSize();
7791
7792 // Handle broadcasting a single constant scalar from the constant pool
7793 // into a vector.
7794 // On Sandybridge (no AVX2), it is still better to load a constant vector
7795 // from the constant pool and not to broadcast it from a scalar.
7796 // But override that restriction when optimizing for size.
7797 // TODO: Check if splatting is recommended for other AVX-capable CPUs.
7798 if (ConstSplatVal && (Subtarget.hasAVX2() || OptForSize)) {
7799 EVT CVT = Ld.getValueType();
7800 assert(!CVT.isVector() && "Must not broadcast a vector type");
7801
7802 // Splat f16, f32, i32, v4f64, v4i64 in all cases with AVX2.
7803 // For size optimization, also splat v2f64 and v2i64, and for size opt
7804 // with AVX2, also splat i8 and i16.
7805 // With pattern matching, the VBROADCAST node may become a VMOVDDUP.
7806 if (ScalarSize == 32 ||
7807 (ScalarSize == 64 && (IsGE256 || Subtarget.hasVLX())) ||
7808 (CVT == MVT::f16 && Subtarget.hasAVX2()) ||
7809 (OptForSize && (ScalarSize == 64 || Subtarget.hasAVX2()))) {
7810 const Constant *C = nullptr;
7811       if (auto *CI = dyn_cast<ConstantSDNode>(Ld))
7812         C = CI->getConstantIntValue();
7813       else if (auto *CF = dyn_cast<ConstantFPSDNode>(Ld))
7814         C = CF->getConstantFPValue();
7815
7816 assert(C && "Invalid constant type");
7817
7818 SDValue CP =
7819           DAG.getConstantPool(C, TLI.getPointerTy(DAG.getDataLayout()));
7820       Align Alignment = cast<ConstantPoolSDNode>(CP)->getAlign();
7821
7822 SDVTList Tys = DAG.getVTList(VT, MVT::Other);
7823 SDValue Ops[] = {DAG.getEntryNode(), CP};
7824 MachinePointerInfo MPI =
7826 return DAG.getMemIntrinsicNode(X86ISD::VBROADCAST_LOAD, dl, Tys, Ops, CVT,
7827 MPI, Alignment, MachineMemOperand::MOLoad);
7828 }
7829 }
7830
7831 // Handle AVX2 in-register broadcasts.
7832 if (!IsLoad && Subtarget.hasInt256() &&
7833 (ScalarSize == 32 || (IsGE256 && ScalarSize == 64)))
7834 return DAG.getNode(X86ISD::VBROADCAST, dl, VT, Ld);
7835
7836 // The scalar source must be a normal load.
7837 if (!IsLoad)
7838 return SDValue();
7839
7840 // Make sure the non-chain result is only used by this build vector.
7841 if (!Ld->hasNUsesOfValue(NumElts - NumUndefElts, 0))
7842 return SDValue();
7843
7844 if (ScalarSize == 32 || (IsGE256 && ScalarSize == 64) ||
7845 (Subtarget.hasVLX() && ScalarSize == 64)) {
7846 auto *LN = cast<LoadSDNode>(Ld);
7847 SDVTList Tys = DAG.getVTList(VT, MVT::Other);
7848 SDValue Ops[] = {LN->getChain(), LN->getBasePtr()};
7849 SDValue BCast =
7850         DAG.getMemIntrinsicNode(X86ISD::VBROADCAST_LOAD, dl, Tys, Ops,
7851                                 LN->getMemoryVT(), LN->getMemOperand());
7852 DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), BCast.getValue(1));
7853 return BCast;
7854 }
7855
7856   // The integer check is needed for the 64-bit into 128-bit case, so it doesn't
7857   // match double, since there is no vbroadcastsd xmm.
7858 if (Subtarget.hasInt256() && Ld.getValueType().isInteger() &&
7859 (ScalarSize == 8 || ScalarSize == 16 || ScalarSize == 64)) {
7860 auto *LN = cast<LoadSDNode>(Ld);
7861 SDVTList Tys = DAG.getVTList(VT, MVT::Other);
7862 SDValue Ops[] = {LN->getChain(), LN->getBasePtr()};
7863 SDValue BCast =
7864         DAG.getMemIntrinsicNode(X86ISD::VBROADCAST_LOAD, dl, Tys, Ops,
7865                                 LN->getMemoryVT(), LN->getMemOperand());
7866 DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), BCast.getValue(1));
7867 return BCast;
7868 }
7869
7870 if (ScalarSize == 16 && Subtarget.hasFP16() && IsGE256)
7871 return DAG.getNode(X86ISD::VBROADCAST, dl, VT, Ld);
7872
7873 // Unsupported broadcast.
7874 return SDValue();
7875}
7876
7877/// For an EXTRACT_VECTOR_ELT with a constant index return the real
7878/// underlying vector and index.
7879///
7880/// Modifies \p ExtractedFromVec to the real vector and returns the real
7881/// index.
7882static int getUnderlyingExtractedFromVec(SDValue &ExtractedFromVec,
7883 SDValue ExtIdx) {
7884 int Idx = ExtIdx->getAsZExtVal();
7885 if (!isa<ShuffleVectorSDNode>(ExtractedFromVec))
7886 return Idx;
7887
7888 // For 256-bit vectors, LowerEXTRACT_VECTOR_ELT_SSE4 may have already
7889 // lowered this:
7890 // (extract_vector_elt (v8f32 %1), Constant<6>)
7891 // to:
7892 // (extract_vector_elt (vector_shuffle<2,u,u,u>
7893 // (extract_subvector (v8f32 %0), Constant<4>),
7894 // undef)
7895 // Constant<0>)
7896 // In this case the vector is the extract_subvector expression and the index
7897 // is 2, as specified by the shuffle.
7898 ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(ExtractedFromVec);
7899 SDValue ShuffleVec = SVOp->getOperand(0);
7900 MVT ShuffleVecVT = ShuffleVec.getSimpleValueType();
7901 assert(ShuffleVecVT.getVectorElementType() ==
7902 ExtractedFromVec.getSimpleValueType().getVectorElementType());
7903
7904 int ShuffleIdx = SVOp->getMaskElt(Idx);
7905 if (isUndefOrInRange(ShuffleIdx, 0, ShuffleVecVT.getVectorNumElements())) {
7906 ExtractedFromVec = ShuffleVec;
7907 return ShuffleIdx;
7908 }
7909 return Idx;
7910}
7911
7912 static SDValue buildFromShuffleMostly(SDValue Op, const SDLoc &DL,
7913                                       SelectionDAG &DAG) {
7914 MVT VT = Op.getSimpleValueType();
7915
7916 // Skip if insert_vec_elt is not supported.
7917 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
7918   if (!TLI.isOperationLegalOrCustom(ISD::INSERT_VECTOR_ELT, VT))
7919     return SDValue();
7920
7921 unsigned NumElems = Op.getNumOperands();
7922 SDValue VecIn1;
7923 SDValue VecIn2;
7924 SmallVector<unsigned, 4> InsertIndices;
7925 SmallVector<int, 8> Mask(NumElems, -1);
7926
7927 for (unsigned i = 0; i != NumElems; ++i) {
7928 unsigned Opc = Op.getOperand(i).getOpcode();
7929
7930 if (Opc == ISD::UNDEF)
7931 continue;
7932
7933     if (Opc != ISD::EXTRACT_VECTOR_ELT) {
7934       // Quit if more than 1 element needs inserting.
7935 if (InsertIndices.size() > 1)
7936 return SDValue();
7937
7938 InsertIndices.push_back(i);
7939 continue;
7940 }
7941
7942 SDValue ExtractedFromVec = Op.getOperand(i).getOperand(0);
7943 SDValue ExtIdx = Op.getOperand(i).getOperand(1);
7944
7945 // Quit if non-constant index.
7946 if (!isa<ConstantSDNode>(ExtIdx))
7947 return SDValue();
7948 int Idx = getUnderlyingExtractedFromVec(ExtractedFromVec, ExtIdx);
7949
7950 // Quit if extracted from vector of different type.
7951 if (ExtractedFromVec.getValueType() != VT)
7952 return SDValue();
7953
7954 if (!VecIn1.getNode())
7955 VecIn1 = ExtractedFromVec;
7956 else if (VecIn1 != ExtractedFromVec) {
7957 if (!VecIn2.getNode())
7958 VecIn2 = ExtractedFromVec;
7959 else if (VecIn2 != ExtractedFromVec)
7960 // Quit if more than 2 vectors to shuffle
7961 return SDValue();
7962 }
7963
7964 if (ExtractedFromVec == VecIn1)
7965 Mask[i] = Idx;
7966 else if (ExtractedFromVec == VecIn2)
7967 Mask[i] = Idx + NumElems;
7968 }
7969
7970 if (!VecIn1.getNode())
7971 return SDValue();
7972
7973 VecIn2 = VecIn2.getNode() ? VecIn2 : DAG.getUNDEF(VT);
7974 SDValue NV = DAG.getVectorShuffle(VT, DL, VecIn1, VecIn2, Mask);
7975
7976 for (unsigned Idx : InsertIndices)
7977 NV = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, VT, NV, Op.getOperand(Idx),
7978 DAG.getVectorIdxConstant(Idx, DL));
7979
7980 return NV;
7981}
7982
7983// Lower BUILD_VECTOR operation for v8bf16, v16bf16 and v32bf16 types.
7984 static SDValue LowerBUILD_VECTORvXbf16(SDValue Op, SelectionDAG &DAG,
7985                                        const X86Subtarget &Subtarget) {
7986 MVT VT = Op.getSimpleValueType();
7987 MVT IVT =
7988 VT.changeVectorElementType(Subtarget.hasFP16() ? MVT::f16 : MVT::i16);
7989   SmallVector<SDValue, 32> NewOps;
7990   for (unsigned I = 0, E = Op.getNumOperands(); I != E; ++I)
7991 NewOps.push_back(DAG.getBitcast(Subtarget.hasFP16() ? MVT::f16 : MVT::i16,
7992 Op.getOperand(I)));
7993 SDValue Res = DAG.getNode(ISD::BUILD_VECTOR, SDLoc(), IVT, NewOps);
7994 return DAG.getBitcast(VT, Res);
7995}
7996
7997// Lower BUILD_VECTOR operation for v8i1 and v16i1 types.
7998 static SDValue LowerBUILD_VECTORvXi1(SDValue Op, const SDLoc &dl,
7999                                      SelectionDAG &DAG,
8000 const X86Subtarget &Subtarget) {
8001
8002 MVT VT = Op.getSimpleValueType();
8003 assert((VT.getVectorElementType() == MVT::i1) &&
8004 "Unexpected type in LowerBUILD_VECTORvXi1!");
8005 if (ISD::isBuildVectorAllZeros(Op.getNode()) ||
8006 ISD::isBuildVectorAllOnes(Op.getNode()))
8007 return Op;
8008
8009 uint64_t Immediate = 0;
8010 SmallVector<unsigned, 16> NonConstIdx;
8011 bool IsSplat = true;
8012 bool HasConstElts = false;
8013 int SplatIdx = -1;
8014 for (unsigned idx = 0, e = Op.getNumOperands(); idx < e; ++idx) {
8015 SDValue In = Op.getOperand(idx);
8016 if (In.isUndef())
8017 continue;
8018 if (auto *InC = dyn_cast<ConstantSDNode>(In)) {
8019 Immediate |= (InC->getZExtValue() & 0x1) << idx;
8020 HasConstElts = true;
8021 } else {
8022 NonConstIdx.push_back(idx);
8023 }
8024 if (SplatIdx < 0)
8025 SplatIdx = idx;
8026 else if (In != Op.getOperand(SplatIdx))
8027 IsSplat = false;
8028 }
8029
8030   // For a splat, use "(select i1 splat_elt, all-ones, all-zeroes)".
8031 if (IsSplat) {
8032 // The build_vector allows the scalar element to be larger than the vector
8033 // element type. We need to mask it to use as a condition unless we know
8034 // the upper bits are zero.
8035 // FIXME: Use computeKnownBits instead of checking specific opcode?
8036 SDValue Cond = Op.getOperand(SplatIdx);
8037 assert(Cond.getValueType() == MVT::i8 && "Unexpected VT!");
8038 if (Cond.getOpcode() != ISD::SETCC)
8039 Cond = DAG.getNode(ISD::AND, dl, MVT::i8, Cond,
8040 DAG.getConstant(1, dl, MVT::i8));
8041
8042 // Perform the select in the scalar domain so we can use cmov.
8043 if (VT == MVT::v64i1 && !Subtarget.is64Bit()) {
8044 SDValue Select = DAG.getSelect(dl, MVT::i32, Cond,
8045 DAG.getAllOnesConstant(dl, MVT::i32),
8046 DAG.getConstant(0, dl, MVT::i32));
8047 Select = DAG.getBitcast(MVT::v32i1, Select);
8048 return DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v64i1, Select, Select);
8049 } else {
8050 MVT ImmVT = MVT::getIntegerVT(std::max((unsigned)VT.getSizeInBits(), 8U));
8051 SDValue Select = DAG.getSelect(dl, ImmVT, Cond,
8052 DAG.getAllOnesConstant(dl, ImmVT),
8053 DAG.getConstant(0, dl, ImmVT));
8054 MVT VecVT = VT.getSizeInBits() >= 8 ? VT : MVT::v8i1;
8055 Select = DAG.getBitcast(VecVT, Select);
8056 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, Select,
8057 DAG.getVectorIdxConstant(0, dl));
8058 }
8059 }
8060
8061 // insert elements one by one
8062 SDValue DstVec;
8063 if (HasConstElts) {
8064 if (VT == MVT::v64i1 && !Subtarget.is64Bit()) {
8065 SDValue ImmL = DAG.getConstant(Lo_32(Immediate), dl, MVT::i32);
8066 SDValue ImmH = DAG.getConstant(Hi_32(Immediate), dl, MVT::i32);
8067 ImmL = DAG.getBitcast(MVT::v32i1, ImmL);
8068 ImmH = DAG.getBitcast(MVT::v32i1, ImmH);
8069 DstVec = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v64i1, ImmL, ImmH);
8070 } else {
8071 MVT ImmVT = MVT::getIntegerVT(std::max((unsigned)VT.getSizeInBits(), 8U));
8072 SDValue Imm = DAG.getConstant(Immediate, dl, ImmVT);
8073 MVT VecVT = VT.getSizeInBits() >= 8 ? VT : MVT::v8i1;
8074 DstVec = DAG.getBitcast(VecVT, Imm);
8075 DstVec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, DstVec,
8076 DAG.getVectorIdxConstant(0, dl));
8077 }
8078 } else
8079 DstVec = DAG.getUNDEF(VT);
8080
8081 for (unsigned InsertIdx : NonConstIdx) {
8082 DstVec = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, DstVec,
8083 Op.getOperand(InsertIdx),
8084 DAG.getVectorIdxConstant(InsertIdx, dl));
8085 }
8086 return DstVec;
8087}
8088
8089LLVM_ATTRIBUTE_UNUSED static bool isHorizOp(unsigned Opcode) {
8090 switch (Opcode) {
8091 case X86ISD::PACKSS:
8092 case X86ISD::PACKUS:
8093 case X86ISD::FHADD:
8094 case X86ISD::FHSUB:
8095 case X86ISD::HADD:
8096 case X86ISD::HSUB:
8097 return true;
8098 }
8099 return false;
8100}
8101
8102/// This is a helper function of LowerToHorizontalOp().
8103 /// This function checks that the input build_vector \p N implements a
8104/// 128-bit partial horizontal operation on a 256-bit vector, but that operation
8105/// may not match the layout of an x86 256-bit horizontal instruction.
8106/// In other words, if this returns true, then some extraction/insertion will
8107/// be required to produce a valid horizontal instruction.
8108///
8109/// Parameter \p Opcode defines the kind of horizontal operation to match.
8110/// For example, if \p Opcode is equal to ISD::ADD, then this function
8111/// checks if \p N implements a horizontal arithmetic add; if instead \p Opcode
8112/// is equal to ISD::SUB, then this function checks if this is a horizontal
8113/// arithmetic sub.
8114///
8115/// This function only analyzes elements of \p N whose indices are
8116/// in range [BaseIdx, LastIdx).
8117///
8118/// TODO: This function was originally used to match both real and fake partial
8119/// horizontal operations, but the index-matching logic is incorrect for that.
8120/// See the corrected implementation in isHopBuildVector(). Can we reduce this
8121/// code because it is only used for partial h-op matching now?
8122static bool isHorizontalBinOpPart(const BuildVectorSDNode *N, unsigned Opcode,
8123 const SDLoc &DL, SelectionDAG &DAG,
8124 unsigned BaseIdx, unsigned LastIdx,
8125 SDValue &V0, SDValue &V1) {
8126 EVT VT = N->getValueType(0);
8127 assert(VT.is256BitVector() && "Only use for matching partial 256-bit h-ops");
8128 assert(BaseIdx * 2 <= LastIdx && "Invalid Indices in input!");
8129 assert(VT.isVector() && VT.getVectorNumElements() >= LastIdx &&
8130 "Invalid Vector in input!");
8131
8132 bool IsCommutable = (Opcode == ISD::ADD || Opcode == ISD::FADD);
8133 bool CanFold = true;
8134 unsigned ExpectedVExtractIdx = BaseIdx;
8135 unsigned NumElts = LastIdx - BaseIdx;
8136 V0 = DAG.getUNDEF(VT);
8137 V1 = DAG.getUNDEF(VT);
8138
8139 // Check if N implements a horizontal binop.
8140 for (unsigned i = 0, e = NumElts; i != e && CanFold; ++i) {
8141 SDValue Op = N->getOperand(i + BaseIdx);
8142
8143 // Skip UNDEFs.
8144 if (Op->isUndef()) {
8145 // Update the expected vector extract index.
8146 if (i * 2 == NumElts)
8147 ExpectedVExtractIdx = BaseIdx;
8148 ExpectedVExtractIdx += 2;
8149 continue;
8150 }
8151
8152 CanFold = Op->getOpcode() == Opcode && Op->hasOneUse();
8153
8154 if (!CanFold)
8155 break;
8156
8157 SDValue Op0 = Op.getOperand(0);
8158 SDValue Op1 = Op.getOperand(1);
8159
8160 // Try to match the following pattern:
8161 // (BINOP (extract_vector_elt A, I), (extract_vector_elt A, I+1))
8162 CanFold = (Op0.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
8163                Op1.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
8164                Op0.getOperand(0) == Op1.getOperand(0) &&
8165                isa<ConstantSDNode>(Op0.getOperand(1)) &&
8166                isa<ConstantSDNode>(Op1.getOperand(1)));
8167     if (!CanFold)
8168 break;
8169
8170 unsigned I0 = Op0.getConstantOperandVal(1);
8171 unsigned I1 = Op1.getConstantOperandVal(1);
8172
8173 if (i * 2 < NumElts) {
8174 if (V0.isUndef()) {
8175 V0 = Op0.getOperand(0);
8176 if (V0.getValueType() != VT)
8177 return false;
8178 }
8179 } else {
8180 if (V1.isUndef()) {
8181 V1 = Op0.getOperand(0);
8182 if (V1.getValueType() != VT)
8183 return false;
8184 }
8185 if (i * 2 == NumElts)
8186 ExpectedVExtractIdx = BaseIdx;
8187 }
8188
8189 SDValue Expected = (i * 2 < NumElts) ? V0 : V1;
8190 if (I0 == ExpectedVExtractIdx)
8191 CanFold = I1 == I0 + 1 && Op0.getOperand(0) == Expected;
8192 else if (IsCommutable && I1 == ExpectedVExtractIdx) {
8193 // Try to match the following dag sequence:
8194 // (BINOP (extract_vector_elt A, I+1), (extract_vector_elt A, I))
8195 CanFold = I0 == I1 + 1 && Op1.getOperand(0) == Expected;
8196 } else
8197 CanFold = false;
8198
8199 ExpectedVExtractIdx += 2;
8200 }
8201
8202 return CanFold;
8203}
8204
8205/// Emit a sequence of two 128-bit horizontal add/sub followed by
8206/// a concat_vector.
8207///
8208/// This is a helper function of LowerToHorizontalOp().
8209/// This function expects two 256-bit vectors called V0 and V1.
8210/// At first, each vector is split into two separate 128-bit vectors.
8211/// Then, the resulting 128-bit vectors are used to implement two
8212/// horizontal binary operations.
8213///
8214/// The kind of horizontal binary operation is defined by \p X86Opcode.
8215///
8216/// \p Mode specifies how the 128-bit parts of V0 and V1 are passed in input to
8217/// the two new horizontal binop.
8218/// When Mode is set, the first horizontal binop dag node would take as input
8219/// the lower 128-bit of V0 and the upper 128-bit of V0. The second
8220/// horizontal binop dag node would take as input the lower 128-bit of V1
8221/// and the upper 128-bit of V1.
8222/// Example:
8223/// HADD V0_LO, V0_HI
8224/// HADD V1_LO, V1_HI
8225///
8226/// Otherwise, the first horizontal binop dag node takes as input the lower
8227/// 128-bit of V0 and the lower 128-bit of V1, and the second horizontal binop
8228/// dag node takes the upper 128-bit of V0 and the upper 128-bit of V1.
8229/// Example:
8230/// HADD V0_LO, V1_LO
8231/// HADD V0_HI, V1_HI
8232///
8233/// If \p isUndefLO is set, then the algorithm propagates UNDEF to the lower
8234/// 128-bits of the result. If \p isUndefHI is set, then UNDEF is propagated to
8235/// the upper 128-bits of the result.
8236static SDValue ExpandHorizontalBinOp(const SDValue &V0, const SDValue &V1,
8237 const SDLoc &DL, SelectionDAG &DAG,
8238 unsigned X86Opcode, bool Mode,
8239 bool isUndefLO, bool isUndefHI) {
8240 MVT VT = V0.getSimpleValueType();
8241 assert(VT.is256BitVector() && VT == V1.getSimpleValueType() &&
8242 "Invalid nodes in input!");
8243
8244 unsigned NumElts = VT.getVectorNumElements();
8245 SDValue V0_LO = extract128BitVector(V0, 0, DAG, DL);
8246 SDValue V0_HI = extract128BitVector(V0, NumElts/2, DAG, DL);
8247 SDValue V1_LO = extract128BitVector(V1, 0, DAG, DL);
8248 SDValue V1_HI = extract128BitVector(V1, NumElts/2, DAG, DL);
8249 MVT NewVT = V0_LO.getSimpleValueType();
8250
8251 SDValue LO = DAG.getUNDEF(NewVT);
8252 SDValue HI = DAG.getUNDEF(NewVT);
8253
8254 if (Mode) {
8255 // Don't emit a horizontal binop if the result is expected to be UNDEF.
8256 if (!isUndefLO && !V0->isUndef())
8257 LO = DAG.getNode(X86Opcode, DL, NewVT, V0_LO, V0_HI);
8258 if (!isUndefHI && !V1->isUndef())
8259 HI = DAG.getNode(X86Opcode, DL, NewVT, V1_LO, V1_HI);
8260 } else {
8261 // Don't emit a horizontal binop if the result is expected to be UNDEF.
8262 if (!isUndefLO && (!V0_LO->isUndef() || !V1_LO->isUndef()))
8263 LO = DAG.getNode(X86Opcode, DL, NewVT, V0_LO, V1_LO);
8264
8265 if (!isUndefHI && (!V0_HI->isUndef() || !V1_HI->isUndef()))
8266 HI = DAG.getNode(X86Opcode, DL, NewVT, V0_HI, V1_HI);
8267 }
8268
8269 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, LO, HI);
8270}
8271
8272/// Returns true iff \p BV builds a vector with the result equivalent to
8273/// the result of ADDSUB/SUBADD operation.
8274/// If true is returned then the operands of ADDSUB = Opnd0 +- Opnd1
8275/// (SUBADD = Opnd0 -+ Opnd1) operation are written to the parameters
8276/// \p Opnd0 and \p Opnd1.
8277 static bool isAddSubOrSubAdd(const BuildVectorSDNode *BV,
8278                              const X86Subtarget &Subtarget, SelectionDAG &DAG,
8279 SDValue &Opnd0, SDValue &Opnd1,
8280 unsigned &NumExtracts, bool &IsSubAdd,
8281 bool &HasAllowContract) {
8282 using namespace SDPatternMatch;
8283
8284 MVT VT = BV->getSimpleValueType(0);
8285 if (!Subtarget.hasSSE3() || !VT.isFloatingPoint())
8286 return false;
8287
8288 unsigned NumElts = VT.getVectorNumElements();
8289 SDValue InVec0 = DAG.getUNDEF(VT);
8290 SDValue InVec1 = DAG.getUNDEF(VT);
8291
8292 NumExtracts = 0;
8293 HasAllowContract = NumElts != 0;
8294
8295 // Odd-numbered elements in the input build vector are obtained from
8296 // adding/subtracting two integer/float elements.
8297 // Even-numbered elements in the input build vector are obtained from
8298 // subtracting/adding two integer/float elements.
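  // e.g. <(a0-b0), (a1+b1), (a2-b2), (a3+b3)> matches ADDSUB(A, B), while the
  // opposite parity (add in even lanes, sub in odd lanes) matches SUBADD.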
8299 unsigned Opc[2] = {0, 0};
8300 for (unsigned i = 0, e = NumElts; i != e; ++i) {
8301 SDValue Op = BV->getOperand(i);
8302
8303 // Skip 'undef' values.
8304 unsigned Opcode = Op.getOpcode();
8305 if (Opcode == ISD::UNDEF)
8306 continue;
8307
8308 // Early exit if we found an unexpected opcode.
8309 if (Opcode != ISD::FADD && Opcode != ISD::FSUB)
8310 return false;
8311
8312 SDValue Op0 = Op.getOperand(0);
8313 SDValue Op1 = Op.getOperand(1);
8314
8315 // Try to match the following pattern:
8316 // (BINOP (extract_vector_elt A, i), (extract_vector_elt B, i))
8317 // Early exit if we cannot match that sequence.
8318 if (!sd_match(Op0, m_ExtractElt(m_SpecificVT(VT), m_SpecificInt(i))) ||
8319 !sd_match(Op1, m_ExtractElt(m_SpecificVT(VT), m_SpecificInt(i))))
8320 return false;
8321
8322     // We found a valid add/sub node, make sure it's the same opcode as previous
8323 // elements for this parity.
8324 if (Opc[i % 2] != 0 && Opc[i % 2] != Opcode)
8325 return false;
8326 Opc[i % 2] = Opcode;
8327
8328 // Update InVec0 and InVec1.
8329 if (InVec0.isUndef())
8330 InVec0 = Op0.getOperand(0);
8331 if (InVec1.isUndef())
8332 InVec1 = Op1.getOperand(0);
8333
8334     // Make sure that the input operands to each add/sub node always
8335     // come from the same pair of vectors.
8336 if (InVec0 != Op0.getOperand(0)) {
8337 if (Opcode == ISD::FSUB)
8338 return false;
8339
8340 // FADD is commutable. Try to commute the operands
8341 // and then test again.
8342 std::swap(Op0, Op1);
8343 if (InVec0 != Op0.getOperand(0))
8344 return false;
8345 }
8346
8347 if (InVec1 != Op1.getOperand(0))
8348 return false;
8349
8350 // Increment the number of extractions done.
8351 ++NumExtracts;
8352 HasAllowContract &= Op->getFlags().hasAllowContract();
8353 }
8354
8355 // Ensure we have found an opcode for both parities and that they are
8356 // different. Don't try to fold this build_vector into an ADDSUB/SUBADD if the
8357 // inputs are undef.
8358 if (!Opc[0] || !Opc[1] || Opc[0] == Opc[1] ||
8359 InVec0.isUndef() || InVec1.isUndef())
8360 return false;
8361
8362 IsSubAdd = Opc[0] == ISD::FADD;
8363
8364 Opnd0 = InVec0;
8365 Opnd1 = InVec1;
8366 return true;
8367}
8368
8369 /// Returns true if it is possible to fold a MUL and an idiom that has already been
8370/// recognized as ADDSUB/SUBADD(\p Opnd0, \p Opnd1) into
8371/// FMADDSUB/FMSUBADD(x, y, \p Opnd1). If (and only if) true is returned, the
8372/// operands of FMADDSUB/FMSUBADD are written to parameters \p Opnd0, \p Opnd1, \p Opnd2.
8373///
8374/// Prior to calling this function it should be known that there is some
8375/// SDNode that potentially can be replaced with an X86ISD::ADDSUB operation
8376/// using \p Opnd0 and \p Opnd1 as operands. Also, this method is called
8377/// before replacement of such SDNode with ADDSUB operation. Thus the number
8378/// of \p Opnd0 uses is expected to be equal to 2.
8379/// For example, this function may be called for the following IR:
8380/// %AB = fmul fast <2 x double> %A, %B
8381/// %Sub = fsub fast <2 x double> %AB, %C
8382/// %Add = fadd fast <2 x double> %AB, %C
8383/// %Addsub = shufflevector <2 x double> %Sub, <2 x double> %Add,
8384/// <2 x i32> <i32 0, i32 3>
8385/// There is a def for %Addsub here, which potentially can be replaced by
8386/// X86ISD::ADDSUB operation:
8387/// %Addsub = X86ISD::ADDSUB %AB, %C
8388/// and such ADDSUB can further be replaced with FMADDSUB:
8389/// %Addsub = FMADDSUB %A, %B, %C.
8390///
8391/// The main reason why this method is called before the replacement of the
8392/// recognized ADDSUB idiom with ADDSUB operation is that such replacement
8393/// is illegal sometimes. E.g. 512-bit ADDSUB is not available, while 512-bit
8394/// FMADDSUB is.
8395static bool isFMAddSubOrFMSubAdd(const X86Subtarget &Subtarget,
8396 SelectionDAG &DAG, SDValue &Opnd0,
8397 SDValue &Opnd1, SDValue &Opnd2,
8398 unsigned ExpectedUses,
8399 bool AllowSubAddOrAddSubContract) {
8400 if (Opnd0.getOpcode() != ISD::FMUL ||
8401 !Opnd0->hasNUsesOfValue(ExpectedUses, 0) || !Subtarget.hasAnyFMA())
8402 return false;
8403
8404 // FIXME: These checks must match the similar ones in
8405 // DAGCombiner::visitFADDForFMACombine. It would be good to have one
8406 // function that would answer if it is Ok to fuse MUL + ADD to FMADD
8407 // or MUL + ADDSUB to FMADDSUB.
8408 const TargetOptions &Options = DAG.getTarget().Options;
8409 bool AllowFusion =
8410 Options.AllowFPOpFusion == FPOpFusion::Fast ||
8411 (AllowSubAddOrAddSubContract && Opnd0->getFlags().hasAllowContract());
8412 if (!AllowFusion)
8413 return false;
8414
8415 Opnd2 = Opnd1;
8416 Opnd1 = Opnd0.getOperand(1);
8417 Opnd0 = Opnd0.getOperand(0);
8418
8419 return true;
8420}
8421
8422/// Try to fold a build_vector that performs an 'addsub' or 'fmaddsub' or
8423/// 'fsubadd' operation accordingly to X86ISD::ADDSUB or X86ISD::FMADDSUB or
8424/// X86ISD::FMSUBADD node.
8425 static SDValue lowerToAddSubOrFMAddSub(const BuildVectorSDNode *BV,
8426                                        const SDLoc &DL,
8427 const X86Subtarget &Subtarget,
8428 SelectionDAG &DAG) {
8429 SDValue Opnd0, Opnd1;
8430 unsigned NumExtracts;
8431 bool IsSubAdd;
8432 bool HasAllowContract;
8433 if (!isAddSubOrSubAdd(BV, Subtarget, DAG, Opnd0, Opnd1, NumExtracts, IsSubAdd,
8434 HasAllowContract))
8435 return SDValue();
8436
8437 MVT VT = BV->getSimpleValueType(0);
8438
8439 // Try to generate X86ISD::FMADDSUB node here.
8440 SDValue Opnd2;
8441 if (isFMAddSubOrFMSubAdd(Subtarget, DAG, Opnd0, Opnd1, Opnd2, NumExtracts,
8442 HasAllowContract)) {
8443 unsigned Opc = IsSubAdd ? X86ISD::FMSUBADD : X86ISD::FMADDSUB;
8444 return DAG.getNode(Opc, DL, VT, Opnd0, Opnd1, Opnd2);
8445 }
8446
8447 // We only support ADDSUB.
8448 if (IsSubAdd)
8449 return SDValue();
8450
8451 // There are no known X86 targets with 512-bit ADDSUB instructions!
8452 // Convert to blend(fsub,fadd).
8453 if (VT.is512BitVector()) {
8454 SmallVector<int> Mask;
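// The mask interleaves even elements of the FSUB result with odd elements
// of the FADD result, e.g. for v16f32 (E = 16) it is {0, 17, 2, 19, ...},
// which reproduces the ADDSUB lane pattern.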
8455 for (int I = 0, E = VT.getVectorNumElements(); I != E; I += 2) {
8456 Mask.push_back(I);
8457 Mask.push_back(I + E + 1);
8458 }
8459 SDValue Sub = DAG.getNode(ISD::FSUB, DL, VT, Opnd0, Opnd1);
8460 SDValue Add = DAG.getNode(ISD::FADD, DL, VT, Opnd0, Opnd1);
8461 return DAG.getVectorShuffle(VT, DL, Sub, Add, Mask);
8462 }
8463
8464 return DAG.getNode(X86ISD::ADDSUB, DL, VT, Opnd0, Opnd1);
8465}
8466
8467static bool isHopBuildVector(const BuildVectorSDNode *BV, SelectionDAG &DAG,
8468 unsigned &HOpcode, SDValue &V0, SDValue &V1) {
8469 // Initialize outputs to known values.
8470 MVT VT = BV->getSimpleValueType(0);
8471 HOpcode = ISD::DELETED_NODE;
8472 V0 = DAG.getUNDEF(VT);
8473 V1 = DAG.getUNDEF(VT);
8474
8475 // x86 256-bit horizontal ops are defined in a non-obvious way. Each 128-bit
8476 // half of the result is calculated independently from the 128-bit halves of
8477 // the inputs, so that makes the index-checking logic below more complicated.
8478 unsigned NumElts = VT.getVectorNumElements();
8479 unsigned GenericOpcode = ISD::DELETED_NODE;
8480 unsigned Num128BitChunks = VT.is256BitVector() ? 2 : 1;
8481 unsigned NumEltsIn128Bits = NumElts / Num128BitChunks;
8482 unsigned NumEltsIn64Bits = NumEltsIn128Bits / 2;
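// For example, for a 256-bit v8i32 build vector: Num128BitChunks = 2,
// NumEltsIn128Bits = 4 and NumEltsIn64Bits = 2.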
8483 for (unsigned i = 0; i != Num128BitChunks; ++i) {
8484 for (unsigned j = 0; j != NumEltsIn128Bits; ++j) {
8485 // Ignore undef elements.
8486 SDValue Op = BV->getOperand(i * NumEltsIn128Bits + j);
8487 if (Op.isUndef())
8488 continue;
8489
8490 // If there's an opcode mismatch, we're done.
8491 if (HOpcode != ISD::DELETED_NODE && Op.getOpcode() != GenericOpcode)
8492 return false;
8493
8494 // Initialize horizontal opcode.
8495 if (HOpcode == ISD::DELETED_NODE) {
8496 GenericOpcode = Op.getOpcode();
8497 switch (GenericOpcode) {
8498 // clang-format off
8499 case ISD::ADD: HOpcode = X86ISD::HADD; break;
8500 case ISD::SUB: HOpcode = X86ISD::HSUB; break;
8501 case ISD::FADD: HOpcode = X86ISD::FHADD; break;
8502 case ISD::FSUB: HOpcode = X86ISD::FHSUB; break;
8503 default: return false;
8504 // clang-format on
8505 }
8506 }
8507
8508 SDValue Op0 = Op.getOperand(0);
8509 SDValue Op1 = Op.getOperand(1);
8510 if (Op0.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
8511 Op1.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
8512 Op0.getOperand(0) != Op1.getOperand(0) ||
8513 !isa<ConstantSDNode>(Op0.getOperand(1)) ||
8514 !isa<ConstantSDNode>(Op1.getOperand(1)) || !Op.hasOneUse())
8515 return false;
8516
8517 // The source vector is chosen based on which 64-bit half of the
8518 // destination vector is being calculated.
8519 if (j < NumEltsIn64Bits) {
8520 if (V0.isUndef())
8521 V0 = Op0.getOperand(0);
8522 } else {
8523 if (V1.isUndef())
8524 V1 = Op0.getOperand(0);
8525 }
8526
8527 SDValue SourceVec = (j < NumEltsIn64Bits) ? V0 : V1;
8528 if (SourceVec != Op0.getOperand(0))
8529 return false;
8530
8531 // op (extract_vector_elt A, I), (extract_vector_elt A, I+1)
8532 unsigned ExtIndex0 = Op0.getConstantOperandVal(1);
8533 unsigned ExtIndex1 = Op1.getConstantOperandVal(1);
8534 unsigned ExpectedIndex = i * NumEltsIn128Bits +
8535 (j % NumEltsIn64Bits) * 2;
8536 if (ExpectedIndex == ExtIndex0 && ExtIndex1 == ExtIndex0 + 1)
8537 continue;
8538
8539 // If this is not a commutative op, this does not match.
8540 if (GenericOpcode != ISD::ADD && GenericOpcode != ISD::FADD)
8541 return false;
8542
8543 // Addition is commutative, so try swapping the extract indexes.
8544 // op (extract_vector_elt A, I+1), (extract_vector_elt A, I)
8545 if (ExpectedIndex == ExtIndex1 && ExtIndex0 == ExtIndex1 + 1)
8546 continue;
8547
8548 // Extract indexes do not match horizontal requirement.
8549 return false;
8550 }
8551 }
8552 // We matched. Opcode and operands are returned by reference as arguments.
8553 return true;
8554}
8555
8556static SDValue getHopForBuildVector(const BuildVectorSDNode *BV,
8557 const SDLoc &DL, SelectionDAG &DAG,
8558 unsigned HOpcode, SDValue V0, SDValue V1) {
8559 // If either input vector is not the same size as the build vector,
8560 // extract/insert the low bits to the correct size.
8561 // This is free (examples: zmm --> xmm, xmm --> ymm).
8562 MVT VT = BV->getSimpleValueType(0);
8563 unsigned Width = VT.getSizeInBits();
8564 if (V0.getValueSizeInBits() > Width)
8565 V0 = extractSubVector(V0, 0, DAG, DL, Width);
8566 else if (V0.getValueSizeInBits() < Width)
8567 V0 = insertSubVector(DAG.getUNDEF(VT), V0, 0, DAG, DL, Width);
8568
8569 if (V1.getValueSizeInBits() > Width)
8570 V1 = extractSubVector(V1, 0, DAG, DL, Width);
8571 else if (V1.getValueSizeInBits() < Width)
8572 V1 = insertSubVector(DAG.getUNDEF(VT), V1, 0, DAG, DL, Width);
8573
8574 unsigned NumElts = VT.getVectorNumElements();
8575 APInt DemandedElts = APInt::getAllOnes(NumElts);
8576 for (unsigned i = 0; i != NumElts; ++i)
8577 if (BV->getOperand(i).isUndef())
8578 DemandedElts.clearBit(i);
8579
8580 // If we don't need the upper xmm, then perform as a xmm hop.
8581 unsigned HalfNumElts = NumElts / 2;
8582 if (VT.is256BitVector() && DemandedElts.lshr(HalfNumElts) == 0) {
8583 MVT HalfVT = VT.getHalfNumVectorElementsVT();
8584 V0 = extractSubVector(V0, 0, DAG, DL, 128);
8585 V1 = extractSubVector(V1, 0, DAG, DL, 128);
8586 SDValue Half = DAG.getNode(HOpcode, DL, HalfVT, V0, V1);
8587 return insertSubVector(DAG.getUNDEF(VT), Half, 0, DAG, DL, 256);
8588 }
8589
8590 return DAG.getNode(HOpcode, DL, VT, V0, V1);
8591}
8592
8593/// Lower BUILD_VECTOR to a horizontal add/sub operation if possible.
8594static SDValue LowerToHorizontalOp(const BuildVectorSDNode *BV, const SDLoc &DL,
8595 const X86Subtarget &Subtarget,
8596 SelectionDAG &DAG) {
8597 // We need at least 2 non-undef elements to make this worthwhile by default.
8598 unsigned NumNonUndefs =
8599 count_if(BV->op_values(), [](SDValue V) { return !V.isUndef(); });
8600 if (NumNonUndefs < 2)
8601 return SDValue();
8602
8603 // There are 4 sets of horizontal math operations distinguished by type:
8604 // int/FP at 128-bit/256-bit. Each type was introduced with a different
8605 // subtarget feature. Try to match those "native" patterns first.
8606 MVT VT = BV->getSimpleValueType(0);
8607 if (((VT == MVT::v4f32 || VT == MVT::v2f64) && Subtarget.hasSSE3()) ||
8608 ((VT == MVT::v8i16 || VT == MVT::v4i32) && Subtarget.hasSSSE3()) ||
8609 ((VT == MVT::v8f32 || VT == MVT::v4f64) && Subtarget.hasAVX()) ||
8610 ((VT == MVT::v16i16 || VT == MVT::v8i32) && Subtarget.hasAVX2())) {
8611 unsigned HOpcode;
8612 SDValue V0, V1;
8613 if (isHopBuildVector(BV, DAG, HOpcode, V0, V1))
8614 return getHopForBuildVector(BV, DL, DAG, HOpcode, V0, V1);
8615 }
8616
8617 // Try harder to match 256-bit ops by using extract/concat.
8618 if (!Subtarget.hasAVX() || !VT.is256BitVector())
8619 return SDValue();
8620
8621 // Count the number of UNDEF operands in the input build_vector.
8622 unsigned NumElts = VT.getVectorNumElements();
8623 unsigned Half = NumElts / 2;
8624 unsigned NumUndefsLO = 0;
8625 unsigned NumUndefsHI = 0;
8626 for (unsigned i = 0, e = Half; i != e; ++i)
8627 if (BV->getOperand(i)->isUndef())
8628 NumUndefsLO++;
8629
8630 for (unsigned i = Half, e = NumElts; i != e; ++i)
8631 if (BV->getOperand(i)->isUndef())
8632 NumUndefsHI++;
8633
8634 SDValue InVec0, InVec1;
8635 if (VT == MVT::v8i32 || VT == MVT::v16i16) {
8636 SDValue InVec2, InVec3;
8637 unsigned X86Opcode;
8638 bool CanFold = true;
8639
8640 if (isHorizontalBinOpPart(BV, ISD::ADD, DL, DAG, 0, Half, InVec0, InVec1) &&
8641 isHorizontalBinOpPart(BV, ISD::ADD, DL, DAG, Half, NumElts, InVec2,
8642 InVec3) &&
8643 ((InVec0.isUndef() || InVec2.isUndef()) || InVec0 == InVec2) &&
8644 ((InVec1.isUndef() || InVec3.isUndef()) || InVec1 == InVec3))
8645 X86Opcode = X86ISD::HADD;
8646 else if (isHorizontalBinOpPart(BV, ISD::SUB, DL, DAG, 0, Half, InVec0,
8647 InVec1) &&
8648 isHorizontalBinOpPart(BV, ISD::SUB, DL, DAG, Half, NumElts, InVec2,
8649 InVec3) &&
8650 ((InVec0.isUndef() || InVec2.isUndef()) || InVec0 == InVec2) &&
8651 ((InVec1.isUndef() || InVec3.isUndef()) || InVec1 == InVec3))
8652 X86Opcode = X86ISD::HSUB;
8653 else
8654 CanFold = false;
8655
8656 if (CanFold) {
8657 // Do not try to expand this build_vector into a pair of horizontal
8658 // add/sub if we can emit a pair of scalar add/sub.
8659 if (NumUndefsLO + 1 == Half || NumUndefsHI + 1 == Half)
8660 return SDValue();
8661
8662 // Convert this build_vector into a pair of horizontal binops followed by
8663 // a concat vector. We must adjust the outputs from the partial horizontal
8664 // matching calls above to account for undefined vector halves.
8665 SDValue V0 = InVec0.isUndef() ? InVec2 : InVec0;
8666 SDValue V1 = InVec1.isUndef() ? InVec3 : InVec1;
8667 assert((!V0.isUndef() || !V1.isUndef()) && "Horizontal-op of undefs?");
8668 bool isUndefLO = NumUndefsLO == Half;
8669 bool isUndefHI = NumUndefsHI == Half;
8670 return ExpandHorizontalBinOp(V0, V1, DL, DAG, X86Opcode, false, isUndefLO,
8671 isUndefHI);
8672 }
8673 }
8674
8675 if (VT == MVT::v8f32 || VT == MVT::v4f64 || VT == MVT::v8i32 ||
8676 VT == MVT::v16i16) {
8677 unsigned X86Opcode;
8678 if (isHorizontalBinOpPart(BV, ISD::ADD, DL, DAG, 0, NumElts, InVec0,
8679 InVec1))
8680 X86Opcode = X86ISD::HADD;
8681 else if (isHorizontalBinOpPart(BV, ISD::SUB, DL, DAG, 0, NumElts, InVec0,
8682 InVec1))
8683 X86Opcode = X86ISD::HSUB;
8684 else if (isHorizontalBinOpPart(BV, ISD::FADD, DL, DAG, 0, NumElts, InVec0,
8685 InVec1))
8686 X86Opcode = X86ISD::FHADD;
8687 else if (isHorizontalBinOpPart(BV, ISD::FSUB, DL, DAG, 0, NumElts, InVec0,
8688 InVec1))
8689 X86Opcode = X86ISD::FHSUB;
8690 else
8691 return SDValue();
8692
8693 // Don't try to expand this build_vector into a pair of horizontal add/sub
8694 // if we can simply emit a pair of scalar add/sub.
8695 if (NumUndefsLO + 1 == Half || NumUndefsHI + 1 == Half)
8696 return SDValue();
8697
8698 // Convert this build_vector into two horizontal add/sub followed by
8699 // a concat vector.
8700 bool isUndefLO = NumUndefsLO == Half;
8701 bool isUndefHI = NumUndefsHI == Half;
8702 return ExpandHorizontalBinOp(InVec0, InVec1, DL, DAG, X86Opcode, true,
8703 isUndefLO, isUndefHI);
8704 }
8705
8706 return SDValue();
8707}
8708
8709static SDValue LowerShift(SDValue Op, const X86Subtarget &Subtarget,
8710 SelectionDAG &DAG);
8711
8712/// If a BUILD_VECTOR's source elements all apply the same bit operation and
8713/// one of their operands is constant, lower to a pair of BUILD_VECTOR and
8714/// just apply the bit operation to the vectors.
8715/// NOTE: It's not in our interest to start making a general-purpose vectorizer
8716/// from this, but enough scalar bit operations are created from the later
8717/// legalization + scalarization stages to need basic support.
8718static SDValue lowerBuildVectorToBitOp(BuildVectorSDNode *Op, const SDLoc &DL,
8719 const X86Subtarget &Subtarget,
8720 SelectionDAG &DAG) {
8721 MVT VT = Op->getSimpleValueType(0);
8722 unsigned NumElems = VT.getVectorNumElements();
8723 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
8724
8725 // Check that all elements have the same opcode.
8726 // TODO: Should we allow UNDEFS and if so how many?
8727 unsigned Opcode = Op->getOperand(0).getOpcode();
8728 for (unsigned i = 1; i < NumElems; ++i)
8729 if (Opcode != Op->getOperand(i).getOpcode())
8730 return SDValue();
8731
8732 // TODO: We may be able to add support for other Ops (ADD/SUB + shifts).
8733 bool IsShift = false;
8734 switch (Opcode) {
8735 default:
8736 return SDValue();
8737 case ISD::SHL:
8738 case ISD::SRL:
8739 case ISD::SRA:
8740 IsShift = true;
8741 break;
8742 case ISD::AND:
8743 case ISD::XOR:
8744 case ISD::OR:
8745 // Don't do this if the buildvector is a splat - we'd replace one
8746 // constant with an entire vector.
8747 if (Op->getSplatValue())
8748 return SDValue();
8749 if (!TLI.isOperationLegalOrPromote(Opcode, VT))
8750 return SDValue();
8751 break;
8752 }
8753
8754 SmallVector<SDValue, 4> LHSElts, RHSElts;
8755 for (SDValue Elt : Op->ops()) {
8756 SDValue LHS = Elt.getOperand(0);
8757 SDValue RHS = Elt.getOperand(1);
8758
8759 // We expect the canonicalized RHS operand to be the constant.
8760 if (!isa<ConstantSDNode>(RHS))
8761 return SDValue();
8762
8763 // Extend shift amounts.
8764 if (RHS.getValueSizeInBits() != VT.getScalarSizeInBits()) {
8765 if (!IsShift)
8766 return SDValue();
8767 RHS = DAG.getZExtOrTrunc(RHS, DL, VT.getScalarType());
8768 }
8769
8770 LHSElts.push_back(LHS);
8771 RHSElts.push_back(RHS);
8772 }
8773
8774 // Limit to shifts by uniform immediates.
8775 // TODO: Only accept vXi8/vXi64 special cases?
8776 // TODO: Permit non-uniform XOP/AVX2/MULLO cases?
8777 if (IsShift && any_of(RHSElts, [&](SDValue V) { return RHSElts[0] != V; }))
8778 return SDValue();
8779
8780 SDValue LHS = DAG.getBuildVector(VT, DL, LHSElts);
8781 SDValue RHS = DAG.getBuildVector(VT, DL, RHSElts);
8782 SDValue Res = DAG.getNode(Opcode, DL, VT, LHS, RHS);
8783
8784 if (!IsShift)
8785 return Res;
8786
8787 // Immediately lower the shift to ensure the constant build vector doesn't
8788 // get converted to a constant pool before the shift is lowered.
8789 return LowerShift(Res, Subtarget, DAG);
8790}
8791
8792static bool isShuffleFoldableLoad(SDValue);
8793
8794/// Attempt to lower a BUILD_VECTOR of scalar values to a shuffle of splats
8795/// representing a blend.
8797 X86Subtarget const &Subtarget,
8798 SelectionDAG &DAG) {
8799 MVT VT = BVOp->getSimpleValueType(0u);
8800
8801 if (VT != MVT::v4f64)
8802 return SDValue();
8803
8804 // Collect unique operands.
8805 auto UniqueOps = SmallSet<SDValue, 16u>();
8806 for (SDValue Op : BVOp->ops()) {
8807 if (isIntOrFPConstant(Op) || Op.isUndef())
8808 return SDValue();
8809 UniqueOps.insert(Op);
8810 }
8811
8812 // Candidate BUILD_VECTOR must have 2 unique operands.
8813 if (UniqueOps.size() != 2u)
8814 return SDValue();
8815
8816 SDValue Op0 = BVOp->getOperand(0u);
8817 UniqueOps.erase(Op0);
8818 SDValue Op1 = *UniqueOps.begin();
8819
8820 if (Subtarget.hasAVX2() || isShuffleFoldableLoad(Op0) ||
8821 isShuffleFoldableLoad(Op1)) {
8822 // Create shuffle mask.
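// e.g. for <A, B, B, A> the mask becomes {0, 5, 6, 3}, selecting from the
// splat of Op0 (elements 0..3) and the splat of Op1 (elements 4..7).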
8823 auto const NumElems = VT.getVectorNumElements();
8824 SmallVector<int, 16u> Mask(NumElems);
8825 for (auto I = 0u; I < NumElems; ++I) {
8826 SDValue Op = BVOp->getOperand(I);
8827 Mask[I] = Op == Op0 ? I : I + NumElems;
8828 }
8829 // Create shuffle of splats.
8830 SDValue NewOp0 = DAG.getSplatBuildVector(VT, DL, Op0);
8831 SDValue NewOp1 = DAG.getSplatBuildVector(VT, DL, Op1);
8832 return DAG.getVectorShuffle(VT, DL, NewOp0, NewOp1, Mask);
8833 }
8834
8835 return SDValue();
8836}
8837
8838/// Create a vector constant without a load. SSE/AVX provide the bare minimum
8839/// functionality to do this, so it's all zeros, all ones, or some derivation
8840/// that is cheap to calculate.
8842 SelectionDAG &DAG,
8843 const X86Subtarget &Subtarget) {
8844 MVT VT = Op.getSimpleValueType();
8845
8846 // Vectors containing all zeros can be matched by pxor and xorps.
8847 if (ISD::isBuildVectorAllZeros(Op.getNode()))
8848 return Op;
8849
8850 // Vectors containing all ones can be matched by pcmpeqd on 128-bit width
8851 // vectors or broken into v4i32 operations on 256-bit vectors. AVX2 can use
8852 // vpcmpeqd on 256-bit vectors.
8853 if (Subtarget.hasSSE2() && ISD::isBuildVectorAllOnes(Op.getNode())) {
8854 if (VT == MVT::v4i32 || VT == MVT::v8i32 || VT == MVT::v16i32)
8855 return Op;
8856
8857 return getOnesVector(VT, DAG, DL);
8858 }
8859
8860 return SDValue();
8861}
8862
8863/// Look for opportunities to create a VPERMV/VPERMILPV/PSHUFB variable permute
8864/// from a vector of source values and a vector of extraction indices.
8865/// The vectors might be manipulated to match the type of the permute op.
8866static SDValue createVariablePermute(MVT VT, SDValue SrcVec, SDValue IndicesVec,
8867 const SDLoc &DL, SelectionDAG &DAG,
8868 const X86Subtarget &Subtarget) {
8869 MVT ShuffleVT = VT;
8870 EVT IndicesVT = EVT(VT).changeVectorElementTypeToInteger();
8871 unsigned NumElts = VT.getVectorNumElements();
8872 unsigned SizeInBits = VT.getSizeInBits();
8873
8874 // Adjust IndicesVec to match VT size.
8875 assert(IndicesVec.getValueType().getVectorNumElements() >= NumElts &&
8876 "Illegal variable permute mask size");
8877 if (IndicesVec.getValueType().getVectorNumElements() > NumElts) {
8878 // Narrow/widen the indices vector to the correct size.
8879 if (IndicesVec.getValueSizeInBits() > SizeInBits)
8880 IndicesVec = extractSubVector(IndicesVec, 0, DAG, SDLoc(IndicesVec),
8881 NumElts * VT.getScalarSizeInBits());
8882 else if (IndicesVec.getValueSizeInBits() < SizeInBits)
8883 IndicesVec = widenSubVector(IndicesVec, false, Subtarget, DAG,
8884 SDLoc(IndicesVec), SizeInBits);
8885 // Zero-extend the index elements within the vector.
8886 if (IndicesVec.getValueType().getVectorNumElements() > NumElts)
8887 IndicesVec = DAG.getNode(ISD::ZERO_EXTEND_VECTOR_INREG, SDLoc(IndicesVec),
8888 IndicesVT, IndicesVec);
8889 }
8890 IndicesVec = DAG.getZExtOrTrunc(IndicesVec, SDLoc(IndicesVec), IndicesVT);
8891
8892 // Handle a SrcVec that doesn't match the VT size.
8893 if (SrcVec.getValueSizeInBits() != SizeInBits) {
8894 if ((SrcVec.getValueSizeInBits() % SizeInBits) == 0) {
8895 // Handle larger SrcVec by treating it as a larger permute.
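// e.g. a v4i32 permute with a 256-bit source is performed as a v8i32
// permute and the low 128 bits of the result are extracted afterwards.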
8896 unsigned Scale = SrcVec.getValueSizeInBits() / SizeInBits;
8897 VT = MVT::getVectorVT(VT.getScalarType(), Scale * NumElts);
8898 IndicesVT = EVT(VT).changeVectorElementTypeToInteger();
8899 IndicesVec = widenSubVector(IndicesVT.getSimpleVT(), IndicesVec, false,
8900 Subtarget, DAG, SDLoc(IndicesVec));
8901 SDValue NewSrcVec =
8902 createVariablePermute(VT, SrcVec, IndicesVec, DL, DAG, Subtarget);
8903 if (NewSrcVec)
8904 return extractSubVector(NewSrcVec, 0, DAG, DL, SizeInBits);
8905 return SDValue();
8906 } else if (SrcVec.getValueSizeInBits() < SizeInBits) {
8907 // Widen smaller SrcVec to match VT.
8908 SrcVec = widenSubVector(VT, SrcVec, false, Subtarget, DAG, SDLoc(SrcVec));
8909 } else
8910 return SDValue();
8911 }
8912
8913 auto ScaleIndices = [&DAG](SDValue Idx, uint64_t Scale) {
8914 assert(isPowerOf2_64(Scale) && "Illegal variable permute shuffle scale");
8915 EVT SrcVT = Idx.getValueType();
8916 unsigned NumDstBits = SrcVT.getScalarSizeInBits() / Scale;
8917 uint64_t IndexScale = 0;
8918 uint64_t IndexOffset = 0;
8919
8920 // If we're scaling a smaller permute op, then we need to repeat the
8921 // indices, scaling and offsetting them as well.
8922 // e.g. v4i32 -> v16i8 (Scale = 4)
8923 // IndexScale = v4i32 Splat(4 << 24 | 4 << 16 | 4 << 8 | 4)
8924 // IndexOffset = v4i32 Splat(3 << 24 | 2 << 16 | 1 << 8 | 0)
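// With NumDstBits = 8 this gives IndexScale = 0x04040404 and
// IndexOffset = 0x03020100, so a source index of 2 expands to the byte
// indices {8, 9, 10, 11} after the bitcast to v16i8.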
8925 for (uint64_t i = 0; i != Scale; ++i) {
8926 IndexScale |= Scale << (i * NumDstBits);
8927 IndexOffset |= i << (i * NumDstBits);
8928 }
8929
8930 Idx = DAG.getNode(ISD::MUL, SDLoc(Idx), SrcVT, Idx,
8931 DAG.getConstant(IndexScale, SDLoc(Idx), SrcVT));
8932 Idx = DAG.getNode(ISD::ADD, SDLoc(Idx), SrcVT, Idx,
8933 DAG.getConstant(IndexOffset, SDLoc(Idx), SrcVT));
8934 return Idx;
8935 };
8936
8937 unsigned Opcode = 0;
8938 switch (VT.SimpleTy) {
8939 default:
8940 break;
8941 case MVT::v16i8:
8942 if (Subtarget.hasSSSE3())
8943 Opcode = X86ISD::PSHUFB;
8944 break;
8945 case MVT::v8i16:
8946 if (Subtarget.hasVLX() && Subtarget.hasBWI())
8947 Opcode = X86ISD::VPERMV;
8948 else if (Subtarget.hasSSSE3()) {
8949 Opcode = X86ISD::PSHUFB;
8950 ShuffleVT = MVT::v16i8;
8951 }
8952 break;
8953 case MVT::v4f32:
8954 case MVT::v4i32:
8955 if (Subtarget.hasAVX()) {
8956 Opcode = X86ISD::VPERMILPV;
8957 ShuffleVT = MVT::v4f32;
8958 } else if (Subtarget.hasSSSE3()) {
8959 Opcode = X86ISD::PSHUFB;
8960 ShuffleVT = MVT::v16i8;
8961 }
8962 break;
8963 case MVT::v2f64:
8964 case MVT::v2i64:
8965 if (Subtarget.hasAVX()) {
8966 // VPERMILPD selects using bit#1 of the index vector, so scale IndicesVec.
8967 IndicesVec = DAG.getNode(ISD::ADD, DL, IndicesVT, IndicesVec, IndicesVec);
8968 Opcode = X86ISD::VPERMILPV;
8969 ShuffleVT = MVT::v2f64;
8970 } else if (Subtarget.hasSSE41()) {
8971 // SSE41 can compare v2i64 - select between indices 0 and 1.
8972 return DAG.getSelectCC(
8973 DL, IndicesVec,
8974 getZeroVector(IndicesVT.getSimpleVT(), Subtarget, DAG, DL),
8975 DAG.getVectorShuffle(VT, DL, SrcVec, SrcVec, {0, 0}),
8976 DAG.getVectorShuffle(VT, DL, SrcVec, SrcVec, {1, 1}),
8977 ISD::SETEQ);
8978 }
8979 break;
8980 case MVT::v32i8:
8981 if (Subtarget.hasVLX() && Subtarget.hasVBMI())
8982 Opcode = X86ISD::VPERMV;
8983 else if (Subtarget.hasXOP()) {
8984 SDValue LoSrc = extract128BitVector(SrcVec, 0, DAG, DL);
8985 SDValue HiSrc = extract128BitVector(SrcVec, 16, DAG, DL);
8986 SDValue LoIdx = extract128BitVector(IndicesVec, 0, DAG, DL);
8987 SDValue HiIdx = extract128BitVector(IndicesVec, 16, DAG, DL);
8988 return DAG.getNode(
8989 ISD::CONCAT_VECTORS, DL, MVT::v32i8,
8990 DAG.getNode(X86ISD::VPPERM, DL, MVT::v16i8, LoSrc, HiSrc, LoIdx),
8991 DAG.getNode(X86ISD::VPPERM, DL, MVT::v16i8, LoSrc, HiSrc, HiIdx));
8992 } else if (Subtarget.hasAVX()) {
8993 SDValue Lo = extract128BitVector(SrcVec, 0, DAG, DL);
8994 SDValue Hi = extract128BitVector(SrcVec, 16, DAG, DL);
8995 SDValue LoLo = DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Lo, Lo);
8996 SDValue HiHi = DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Hi, Hi);
8997 auto PSHUFBBuilder = [](SelectionDAG &DAG, const SDLoc &DL,
8998 ArrayRef<SDValue> Ops) {
8999 // Permute Lo and Hi and then select based on index range.
9000 // This works as PSHUFB uses bits[3:0] to permute elements and we don't
9001 // care about bit[7] as it's just an index vector.
9002 SDValue Idx = Ops[2];
9003 EVT VT = Idx.getValueType();
9004 return DAG.getSelectCC(DL, Idx, DAG.getConstant(15, DL, VT),
9005 DAG.getNode(X86ISD::PSHUFB, DL, VT, Ops[1], Idx),
9006 DAG.getNode(X86ISD::PSHUFB, DL, VT, Ops[0], Idx),
9007 ISD::CondCode::SETGT);
9008 };
9009 SDValue Ops[] = {LoLo, HiHi, IndicesVec};
9010 return SplitOpsAndApply(DAG, Subtarget, DL, MVT::v32i8, Ops,
9011 PSHUFBBuilder);
9012 }
9013 break;
9014 case MVT::v16i16:
9015 if (Subtarget.hasVLX() && Subtarget.hasBWI())
9016 Opcode = X86ISD::VPERMV;
9017 else if (Subtarget.hasAVX()) {
9018 // Scale to v32i8 and perform as v32i8.
9019 IndicesVec = ScaleIndices(IndicesVec, 2);
9020 return DAG.getBitcast(
9021 VT, createVariablePermute(
9022 MVT::v32i8, DAG.getBitcast(MVT::v32i8, SrcVec),
9023 DAG.getBitcast(MVT::v32i8, IndicesVec), DL, DAG, Subtarget));
9024 }
9025 break;
9026 case MVT::v8f32:
9027 case MVT::v8i32:
9028 if (Subtarget.hasAVX2())
9029 Opcode = X86ISD::VPERMV;
9030 else if (Subtarget.hasAVX()) {
9031 SrcVec = DAG.getBitcast(MVT::v8f32, SrcVec);
9032 SDValue LoLo = DAG.getVectorShuffle(MVT::v8f32, DL, SrcVec, SrcVec,
9033 {0, 1, 2, 3, 0, 1, 2, 3});
9034 SDValue HiHi = DAG.getVectorShuffle(MVT::v8f32, DL, SrcVec, SrcVec,
9035 {4, 5, 6, 7, 4, 5, 6, 7});
9036 if (Subtarget.hasXOP())
9037 return DAG.getBitcast(
9038 VT, DAG.getNode(X86ISD::VPERMIL2, DL, MVT::v8f32, LoLo, HiHi,
9039 IndicesVec, DAG.getTargetConstant(0, DL, MVT::i8)));
9040 // Permute Lo and Hi and then select based on index range.
9041 // This works as VPERMILPS only uses index bits[0:1] to permute elements.
9042 SDValue Res = DAG.getSelectCC(
9043 DL, IndicesVec, DAG.getConstant(3, DL, MVT::v8i32),
9044 DAG.getNode(X86ISD::VPERMILPV, DL, MVT::v8f32, HiHi, IndicesVec),
9045 DAG.getNode(X86ISD::VPERMILPV, DL, MVT::v8f32, LoLo, IndicesVec),
9046 ISD::CondCode::SETGT);
9047 return DAG.getBitcast(VT, Res);
9048 }
9049 break;
9050 case MVT::v4i64:
9051 case MVT::v4f64:
9052 if (Subtarget.hasAVX512()) {
9053 if (!Subtarget.hasVLX()) {
9054 MVT WidenSrcVT = MVT::getVectorVT(VT.getScalarType(), 8);
9055 SrcVec = widenSubVector(WidenSrcVT, SrcVec, false, Subtarget, DAG,
9056 SDLoc(SrcVec));
9057 IndicesVec = widenSubVector(MVT::v8i64, IndicesVec, false, Subtarget,
9058 DAG, SDLoc(IndicesVec));
9059 SDValue Res = createVariablePermute(WidenSrcVT, SrcVec, IndicesVec, DL,
9060 DAG, Subtarget);
9061 return extract256BitVector(Res, 0, DAG, DL);
9062 }
9063 Opcode = X86ISD::VPERMV;
9064 } else if (Subtarget.hasAVX()) {
9065 SrcVec = DAG.getBitcast(MVT::v4f64, SrcVec);
9066 SDValue LoLo =
9067 DAG.getVectorShuffle(MVT::v4f64, DL, SrcVec, SrcVec, {0, 1, 0, 1});
9068 SDValue HiHi =
9069 DAG.getVectorShuffle(MVT::v4f64, DL, SrcVec, SrcVec, {2, 3, 2, 3});
9070 // VPERMIL2PD selects with bit#1 of the index vector, so scale IndicesVec.
9071 IndicesVec = DAG.getNode(ISD::ADD, DL, IndicesVT, IndicesVec, IndicesVec);
9072 if (Subtarget.hasXOP())
9073 return DAG.getBitcast(
9074 VT, DAG.getNode(X86ISD::VPERMIL2, DL, MVT::v4f64, LoLo, HiHi,
9075 IndicesVec, DAG.getTargetConstant(0, DL, MVT::i8)));
9076 // Permute Lo and Hi and then select based on index range.
9077 // This works as VPERMILPD only uses index bit[1] to permute elements.
9078 SDValue Res = DAG.getSelectCC(
9079 DL, IndicesVec, DAG.getConstant(2, DL, MVT::v4i64),
9080 DAG.getNode(X86ISD::VPERMILPV, DL, MVT::v4f64, HiHi, IndicesVec),
9081 DAG.getNode(X86ISD::VPERMILPV, DL, MVT::v4f64, LoLo, IndicesVec),
9082 ISD::CondCode::SETGT);
9083 return DAG.getBitcast(VT, Res);
9084 }
9085 break;
9086 case MVT::v64i8:
9087 if (Subtarget.hasVBMI())
9088 Opcode = X86ISD::VPERMV;
9089 break;
9090 case MVT::v32i16:
9091 if (Subtarget.hasBWI())
9092 Opcode = X86ISD::VPERMV;
9093 break;
9094 case MVT::v16f32:
9095 case MVT::v16i32:
9096 case MVT::v8f64:
9097 case MVT::v8i64:
9098 if (Subtarget.hasAVX512())
9099 Opcode = X86ISD::VPERMV;
9100 break;
9101 }
9102 if (!Opcode)
9103 return SDValue();
9104
9105 assert((VT.getSizeInBits() == ShuffleVT.getSizeInBits()) &&
9106 (VT.getScalarSizeInBits() % ShuffleVT.getScalarSizeInBits()) == 0 &&
9107 "Illegal variable permute shuffle type");
9108
9109 uint64_t Scale = VT.getScalarSizeInBits() / ShuffleVT.getScalarSizeInBits();
9110 if (Scale > 1)
9111 IndicesVec = ScaleIndices(IndicesVec, Scale);
9112
9113 EVT ShuffleIdxVT = EVT(ShuffleVT).changeVectorElementTypeToInteger();
9114 IndicesVec = DAG.getBitcast(ShuffleIdxVT, IndicesVec);
9115
9116 SrcVec = DAG.getBitcast(ShuffleVT, SrcVec);
9117 SDValue Res = Opcode == X86ISD::VPERMV
9118 ? DAG.getNode(Opcode, DL, ShuffleVT, IndicesVec, SrcVec)
9119 : DAG.getNode(Opcode, DL, ShuffleVT, SrcVec, IndicesVec);
9120 return DAG.getBitcast(VT, Res);
9121}
9122
9123// Tries to lower a BUILD_VECTOR composed of extract-extract chains that can be
9124// reasoned to be a permutation of a vector by indices in a non-constant vector.
9125// (build_vector (extract_elt V, (extract_elt I, 0)),
9126// (extract_elt V, (extract_elt I, 1)),
9127// ...
9128// ->
9129// (vpermv I, V)
9130//
9131// TODO: Handle undefs
9132// TODO: Utilize pshufb and zero mask blending to support more efficient
9133// construction of vectors with constant-0 elements.
9134static SDValue
9135LowerBUILD_VECTORAsVariablePermute(SDValue V, const SDLoc &DL,
9136 SelectionDAG &DAG,
9137 const X86Subtarget &Subtarget) {
9138 SDValue SrcVec, IndicesVec;
9139
9140 auto PeekThroughFreeze = [](SDValue N) {
9141 if (N->getOpcode() == ISD::FREEZE && N.hasOneUse())
9142 return N->getOperand(0);
9143 return N;
9144 };
9145 // Check for a match of the permute source vector and permute index elements.
9146 // This is done by checking that the i-th build_vector operand is of the form:
9147 // (extract_elt SrcVec, (extract_elt IndicesVec, i)).
9148 for (unsigned Idx = 0, E = V.getNumOperands(); Idx != E; ++Idx) {
9149 SDValue Op = PeekThroughFreeze(V.getOperand(Idx));
9150 if (Op.getOpcode() != ISD::EXTRACT_VECTOR_ELT)
9151 return SDValue();
9152
9153 // If this is the first extract encountered in V, set the source vector,
9154 // otherwise verify the extract is from the previously defined source
9155 // vector.
9156 if (!SrcVec)
9157 SrcVec = Op.getOperand(0);
9158 else if (SrcVec != Op.getOperand(0))
9159 return SDValue();
9160 SDValue ExtractedIndex = Op->getOperand(1);
9161 // Peek through extends.
9162 if (ExtractedIndex.getOpcode() == ISD::ZERO_EXTEND ||
9163 ExtractedIndex.getOpcode() == ISD::SIGN_EXTEND)
9164 ExtractedIndex = ExtractedIndex.getOperand(0);
9165 if (ExtractedIndex.getOpcode() != ISD::EXTRACT_VECTOR_ELT)
9166 return SDValue();
9167
9168 // If this is the first extract from the index vector candidate, set the
9169 // indices vector, otherwise verify the extract is from the previously
9170 // defined indices vector.
9171 if (!IndicesVec)
9172 IndicesVec = ExtractedIndex.getOperand(0);
9173 else if (IndicesVec != ExtractedIndex.getOperand(0))
9174 return SDValue();
9175
9176 auto *PermIdx = dyn_cast<ConstantSDNode>(ExtractedIndex.getOperand(1));
9177 if (!PermIdx || PermIdx->getAPIntValue() != Idx)
9178 return SDValue();
9179 }
9180
9181 MVT VT = V.getSimpleValueType();
9182 return createVariablePermute(VT, SrcVec, IndicesVec, DL, DAG, Subtarget);
9183}
9184
9185SDValue
9186X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const {
9187 SDLoc dl(Op);
9188
9189 MVT VT = Op.getSimpleValueType();
9190 MVT EltVT = VT.getVectorElementType();
9191 MVT OpEltVT = Op.getOperand(0).getSimpleValueType();
9192 unsigned NumElems = Op.getNumOperands();
9193
9194 // Generate vectors for predicate vectors.
9195 if (VT.getVectorElementType() == MVT::i1 && Subtarget.hasAVX512())
9196 return LowerBUILD_VECTORvXi1(Op, dl, DAG, Subtarget);
9197
9198 if (VT.getVectorElementType() == MVT::bf16 &&
9199 (Subtarget.hasAVXNECONVERT() || Subtarget.hasBF16()))
9200 return LowerBUILD_VECTORvXbf16(Op, DAG, Subtarget);
9201
9202 if (SDValue VectorCst = materializeVectorConstant(Op, dl, DAG, Subtarget))
9203 return VectorCst;
9204
9205 unsigned EVTBits = EltVT.getSizeInBits();
9206 APInt UndefMask = APInt::getZero(NumElems);
9207 APInt FrozenUndefMask = APInt::getZero(NumElems);
9208 APInt ZeroMask = APInt::getZero(NumElems);
9209 APInt NonZeroMask = APInt::getZero(NumElems);
9210 bool IsAllConstants = true;
9211 bool OneUseFrozenUndefs = true;
9212 SmallSet<SDValue, 8> Values;
9213 unsigned NumConstants = NumElems;
9214 for (unsigned i = 0; i < NumElems; ++i) {
9215 SDValue Elt = Op.getOperand(i);
9216 if (Elt.isUndef()) {
9217 UndefMask.setBit(i);
9218 continue;
9219 }
9220 if (ISD::isFreezeUndef(Elt.getNode())) {
9221 OneUseFrozenUndefs = OneUseFrozenUndefs && Elt->hasOneUse();
9222 FrozenUndefMask.setBit(i);
9223 continue;
9224 }
9225 Values.insert(Elt);
9226 if (!isIntOrFPConstant(Elt)) {
9227 IsAllConstants = false;
9228 NumConstants--;
9229 }
9230 if (X86::isZeroNode(Elt)) {
9231 ZeroMask.setBit(i);
9232 } else {
9233 NonZeroMask.setBit(i);
9234 }
9235 }
9236
9237 // All undef vector. Return an UNDEF.
9238 if (UndefMask.isAllOnes())
9239 return DAG.getUNDEF(VT);
9240
9241 // All undef/freeze(undef) vector. Return a FREEZE UNDEF.
9242 if (OneUseFrozenUndefs && (UndefMask | FrozenUndefMask).isAllOnes())
9243 return DAG.getFreeze(DAG.getUNDEF(VT));
9244
9245 // All undef/freeze(undef)/zero vector. Return a zero vector.
9246 if ((UndefMask | FrozenUndefMask | ZeroMask).isAllOnes())
9247 return getZeroVector(VT, Subtarget, DAG, dl);
9248
9249 // If we have multiple FREEZE-UNDEF operands, we are likely going to end up
9250 // lowering into a suboptimal insertion sequence. Instead, thaw the UNDEF in
9251 // our source BUILD_VECTOR, create another FREEZE-UNDEF splat BUILD_VECTOR,
9252 // and blend the FREEZE-UNDEF operands back in.
9253 // FIXME: is this worthwhile even for a single FREEZE-UNDEF operand?
9254 if (unsigned NumFrozenUndefElts = FrozenUndefMask.popcount();
9255 NumFrozenUndefElts >= 2 && NumFrozenUndefElts < NumElems) {
9256 SmallVector<int, 16> BlendMask(NumElems, -1);
9257 SmallVector<SDValue, 16> Elts(NumElems, DAG.getUNDEF(OpEltVT));
9258 for (unsigned i = 0; i < NumElems; ++i) {
9259 if (UndefMask[i]) {
9260 BlendMask[i] = -1;
9261 continue;
9262 }
9263 BlendMask[i] = i;
9264 if (!FrozenUndefMask[i])
9265 Elts[i] = Op.getOperand(i);
9266 else
9267 BlendMask[i] += NumElems;
9268 }
9269 SDValue EltsBV = DAG.getBuildVector(VT, dl, Elts);
9270 SDValue FrozenUndefElt = DAG.getFreeze(DAG.getUNDEF(OpEltVT));
9271 SDValue FrozenUndefBV = DAG.getSplatBuildVector(VT, dl, FrozenUndefElt);
9272 return DAG.getVectorShuffle(VT, dl, EltsBV, FrozenUndefBV, BlendMask);
9273 }
9274
9275 BuildVectorSDNode *BV = cast<BuildVectorSDNode>(Op.getNode());
9276
9277 // If the upper elts of a ymm/zmm are undef/freeze(undef)/zero then we might
9278 // be better off lowering to a smaller build vector and padding with
9279 // undef/zero.
9280 if ((VT.is256BitVector() || VT.is512BitVector()) &&
9281 !isFoldableUseOfShuffle(BV)) {
9282 unsigned UpperElems = NumElems / 2;
9283 APInt UndefOrZeroMask = FrozenUndefMask | UndefMask | ZeroMask;
9284 unsigned NumUpperUndefsOrZeros = UndefOrZeroMask.countl_one();
9285 if (NumUpperUndefsOrZeros >= UpperElems) {
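// For 512-bit vectors, drop to a 128-bit quarter when the top three
// quarters of the elements are undef/zero.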
9286 if (VT.is512BitVector() &&
9287 NumUpperUndefsOrZeros >= (NumElems - (NumElems / 4)))
9288 UpperElems = NumElems - (NumElems / 4);
9289 // If freeze(undef) is in any upper elements, force to zero.
9290 bool UndefUpper = UndefMask.countl_one() >= UpperElems;
9291 MVT LowerVT = MVT::getVectorVT(EltVT, NumElems - UpperElems);
9292 SDValue NewBV =
9293 DAG.getBuildVector(LowerVT, dl, Op->ops().drop_back(UpperElems));
9294 return widenSubVector(VT, NewBV, !UndefUpper, Subtarget, DAG, dl);
9295 }
9296 }
9297
9298 if (SDValue AddSub = lowerToAddSubOrFMAddSub(BV, dl, Subtarget, DAG))
9299 return AddSub;
9300 if (SDValue HorizontalOp = LowerToHorizontalOp(BV, dl, Subtarget, DAG))
9301 return HorizontalOp;
9302 if (SDValue Broadcast = lowerBuildVectorAsBroadcast(BV, dl, Subtarget, DAG))
9303 return Broadcast;
9304 if (SDValue BitOp = lowerBuildVectorToBitOp(BV, dl, Subtarget, DAG))
9305 return BitOp;
9306 if (SDValue Blend = lowerBuildVectorAsBlend(BV, dl, Subtarget, DAG))
9307 return Blend;
9308
9309 unsigned NumZero = ZeroMask.popcount();
9310 unsigned NumNonZero = NonZeroMask.popcount();
9311
9312 // If we are inserting one variable into a vector of non-zero constants, try
9313 // to avoid loading each constant element as a scalar. Load the constants as a
9314 // vector and then insert the variable scalar element. If insertion is not
9315 // supported, fall back to a shuffle to get the scalar blended with the
9316 // constants. Insertion into a zero vector is handled as a special-case
9317 // somewhere below here.
9318 if (NumConstants == NumElems - 1 && NumNonZero != 1 &&
9319 FrozenUndefMask.isZero() &&
9320 (isOperationLegalOrCustom(ISD::INSERT_VECTOR_ELT, VT) ||
9321 isOperationLegalOrCustom(ISD::VECTOR_SHUFFLE, VT))) {
9322 // Create an all-constant vector. The variable element in the old
9323 // build vector is replaced by undef in the constant vector. Save the
9324 // variable scalar element and its index for use in the insertelement.
9325 LLVMContext &Context = *DAG.getContext();
9326 Type *EltType = Op.getValueType().getScalarType().getTypeForEVT(Context);
9327 SmallVector<Constant *, 16> ConstVecOps(NumElems, UndefValue::get(EltType));
9328 SDValue VarElt;
9329 SDValue InsIndex;
9330 for (unsigned i = 0; i != NumElems; ++i) {
9331 SDValue Elt = Op.getOperand(i);
9332 if (auto *C = dyn_cast<ConstantSDNode>(Elt))
9333 ConstVecOps[i] = ConstantInt::get(Context, C->getAPIntValue());
9334 else if (auto *C = dyn_cast<ConstantFPSDNode>(Elt))
9335 ConstVecOps[i] = ConstantFP::get(Context, C->getValueAPF());
9336 else if (!Elt.isUndef()) {
9337 assert(!VarElt.getNode() && !InsIndex.getNode() &&
9338 "Expected one variable element in this vector");
9339 VarElt = Elt;
9340 InsIndex = DAG.getVectorIdxConstant(i, dl);
9341 }
9342 }
9343 Constant *CV = ConstantVector::get(ConstVecOps);
9344 SDValue DAGConstVec = DAG.getConstantPool(CV, VT);
9345
9346 // The constants we just created may not be legal (e.g., floating point). We
9347 // must lower the vector right here because we cannot guarantee that we'll
9348 // legalize it before loading it. This is also why we could not just create
9349 // a new build vector here. If the build vector contains illegal constants,
9350 // it could get split back up into a series of insert elements.
9351 // TODO: Improve this by using shorter loads with broadcast/VZEXT_LOAD.
9352 SDValue LegalDAGConstVec = LowerConstantPool(DAGConstVec, DAG);
9353 MachineFunction &MF = DAG.getMachineFunction();
9354 MachinePointerInfo MPI = MachinePointerInfo::getConstantPool(MF);
9355 SDValue Ld = DAG.getLoad(VT, dl, DAG.getEntryNode(), LegalDAGConstVec, MPI);
9356 unsigned InsertC = InsIndex->getAsZExtVal();
9357 unsigned NumEltsInLow128Bits = 128 / VT.getScalarSizeInBits();
9358 if (InsertC < NumEltsInLow128Bits)
9359 return DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, Ld, VarElt, InsIndex);
9360
9361 // There's no good way to insert into the high elements of a >128-bit
9362 // vector, so use shuffles to avoid an extract/insert sequence.
9363 assert(VT.getSizeInBits() > 128 && "Invalid insertion index?");
9364 assert(Subtarget.hasAVX() && "Must have AVX with >16-byte vector");
9365 SmallVector<int, 8> ShuffleMask;
9366 unsigned NumElts = VT.getVectorNumElements();
9367 for (unsigned i = 0; i != NumElts; ++i)
9368 ShuffleMask.push_back(i == InsertC ? NumElts : i);
9369 SDValue S2V = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, VarElt);
9370 return DAG.getVectorShuffle(VT, dl, Ld, S2V, ShuffleMask);
9371 }
9372
9373 // Special case for single non-zero, non-undef, element.
9374 if (NumNonZero == 1) {
9375 unsigned Idx = NonZeroMask.countr_zero();
9376 SDValue Item = Op.getOperand(Idx);
9377
9378 // If we have a constant or non-constant insertion into the low element of
9379 // a vector, we can do this with SCALAR_TO_VECTOR + shuffle of zero into
9380 // the rest of the elements. This will be matched as movd/movq/movss/movsd
9381 // depending on what the source datatype is.
9382 if (Idx == 0) {
9383 if (NumZero == 0)
9384 return DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Item);
9385
9386 if (EltVT == MVT::i32 || EltVT == MVT::f16 || EltVT == MVT::f32 ||
9387 EltVT == MVT::f64 || (EltVT == MVT::i64 && Subtarget.is64Bit()) ||
9388 (EltVT == MVT::i16 && Subtarget.hasFP16())) {
9389 assert((VT.is128BitVector() || VT.is256BitVector() ||
9390 VT.is512BitVector()) &&
9391 "Expected an SSE value type!");
9392 Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Item);
9393 // Turn it into a MOVL (i.e. movsh, movss, movsd, movw or movd) to a
9394 // zero vector.
9395 return getShuffleVectorZeroOrUndef(Item, 0, true, Subtarget, DAG);
9396 }
9397
9398 // We can't directly insert an i8 or i16 into a vector, so zero extend
9399 // it to i32 first.
9400 if (EltVT == MVT::i16 || EltVT == MVT::i8) {
9401 Item = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, Item);
9402 MVT ShufVT = MVT::getVectorVT(MVT::i32, VT.getSizeInBits() / 32);
9403 Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, ShufVT, Item);
9404 Item = getShuffleVectorZeroOrUndef(Item, 0, true, Subtarget, DAG);
9405 return DAG.getBitcast(VT, Item);
9406 }
9407 }
9408
9409 // Is it a vector logical left shift?
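// e.g. build_vector <0, x> becomes (scalar_to_vector x) shifted left by
// half the vector width, leaving x in the upper half and zeros below it.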
9410 if (NumElems == 2 && Idx == 1 &&
9411 X86::isZeroNode(Op.getOperand(0)) &&
9412 !X86::isZeroNode(Op.getOperand(1))) {
9413 unsigned NumBits = VT.getSizeInBits();
9414 return getVShift(true, VT,
9415 DAG.getNode(ISD::SCALAR_TO_VECTOR, dl,
9416 VT, Op.getOperand(1)),
9417 NumBits/2, DAG, *this, dl);
9418 }
9419
9420 if (IsAllConstants) // Otherwise, it's better to do a constpool load.
9421 return SDValue();
9422
9423 // Otherwise, if this is a vector with i32 or f32 elements, and the element
9424 // is a non-constant being inserted into an element other than the low one,
9425 // we can't use a constant pool load. Instead, use SCALAR_TO_VECTOR (aka
9426 // movd/movss) to move this into the low element, then shuffle it into
9427 // place.
9428 if (EVTBits == 32) {
9429 Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Item);
9430 return getShuffleVectorZeroOrUndef(Item, Idx, NumZero > 0, Subtarget, DAG);
9431 }
9432 }
9433
9434 // Splat is obviously ok. Let legalizer expand it to a shuffle.
9435 if (Values.size() == 1) {
9436 if (EVTBits == 32) {
9437 // Instead of a shuffle like this:
9438 // shuffle (scalar_to_vector (load (ptr + 4))), undef, <0, 0, 0, 0>
9439 // Check if it's possible to issue this instead.
9440 // shuffle (vload ptr)), undef, <1, 1, 1, 1>
9441 unsigned Idx = NonZeroMask.countr_zero();
9442 SDValue Item = Op.getOperand(Idx);
9443 if (Op.getNode()->isOnlyUserOf(Item.getNode()))
9444 return LowerAsSplatVectorLoad(Item, VT, dl, DAG);
9445 }
9446 return SDValue();
9447 }
9448
9449 // A vector full of immediates; various special cases are already
9450 // handled, so this is best done with a single constant-pool load.
9451 if (IsAllConstants)
9452 return SDValue();
9453
9454 if (SDValue V = LowerBUILD_VECTORAsVariablePermute(Op, dl, DAG, Subtarget))
9455 return V;
9456
9457 // See if we can use a vector load to get all of the elements.
9458 {
9459 SmallVector<SDValue, 64> Ops(Op->ops().take_front(NumElems));
9460 if (SDValue LD =
9461 EltsFromConsecutiveLoads(VT, Ops, dl, DAG, Subtarget, false))
9462 return LD;
9463 }
9464
9465 // If this is a splat of pairs of 32-bit elements, we can use a narrower
9466 // build_vector and broadcast it.
9467 // TODO: We could probably generalize this more.
9468 if (Subtarget.hasAVX2() && EVTBits == 32 && Values.size() == 2) {
9469 SDValue Ops[4] = { Op.getOperand(0), Op.getOperand(1),
9470 DAG.getUNDEF(EltVT), DAG.getUNDEF(EltVT) };
9471 auto CanSplat = [](SDValue Op, unsigned NumElems, ArrayRef<SDValue> Ops) {
9472 // Make sure all the even/odd operands match.
9473 for (unsigned i = 2; i != NumElems; ++i)
9474 if (Ops[i % 2] != Op.getOperand(i))
9475 return false;
9476 return true;
9477 };
9478 if (CanSplat(Op, NumElems, Ops)) {
9479 MVT WideEltVT = VT.isFloatingPoint() ? MVT::f64 : MVT::i64;
9480 MVT NarrowVT = MVT::getVectorVT(EltVT, 4);
9481 // Create a new build vector and cast to v2i64/v2f64.
9482 SDValue NewBV = DAG.getBitcast(MVT::getVectorVT(WideEltVT, 2),
9483 DAG.getBuildVector(NarrowVT, dl, Ops));
9484 // Broadcast from v2i64/v2f64 and cast to final VT.
9485 MVT BcastVT = MVT::getVectorVT(WideEltVT, NumElems / 2);
9486 return DAG.getBitcast(VT, DAG.getNode(X86ISD::VBROADCAST, dl, BcastVT,
9487 NewBV));
9488 }
9489 }
9490
9491 // For AVX-length vectors, build the individual 128-bit pieces and use
9492 // shuffles to put them in place.
9493 if (VT.getSizeInBits() > 128) {
9494 MVT HVT = MVT::getVectorVT(EltVT, NumElems / 2);
9495
9496 // Build both the lower and upper subvector.
9497 SDValue Lower =
9498 DAG.getBuildVector(HVT, dl, Op->ops().slice(0, NumElems / 2));
9499 SDValue Upper = DAG.getBuildVector(
9500 HVT, dl, Op->ops().slice(NumElems / 2, NumElems / 2));
9501
9502 // Recreate the wider vector with the lower and upper part.
9503 return concatSubVectors(Lower, Upper, DAG, dl);
9504 }
9505
9506 // Let legalizer expand 2-wide build_vectors.
9507 if (EVTBits == 64) {
9508 if (NumNonZero == 1) {
9509 // One half is zero or undef.
9510 unsigned Idx = NonZeroMask.countr_zero();
9511 SDValue V2 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT,
9512 Op.getOperand(Idx));
9513 return getShuffleVectorZeroOrUndef(V2, Idx, true, Subtarget, DAG);
9514 }
9515 return SDValue();
9516 }
9517
9518 // If element VT is < 32 bits, convert it to inserts into a zero vector.
9519 if (EVTBits == 8 && NumElems == 16)
9520 if (SDValue V = LowerBuildVectorv16i8(Op, dl, NonZeroMask, NumNonZero,
9521 NumZero, DAG, Subtarget))
9522 return V;
9523
9524 if (EltVT == MVT::i16 && NumElems == 8)
9525 if (SDValue V = LowerBuildVectorv8i16(Op, dl, NonZeroMask, NumNonZero,
9526 NumZero, DAG, Subtarget))
9527 return V;
9528
9529 // If element VT is == 32 bits and has 4 elems, try to generate an INSERTPS
9530 if (EVTBits == 32 && NumElems == 4)
9531 if (SDValue V = LowerBuildVectorv4x32(Op, dl, DAG, Subtarget))
9532 return V;
9533
9534 // If element VT is == 32 bits, turn it into a number of shuffles.
9535 if (NumElems == 4 && NumZero > 0) {
9536 SmallVector<SDValue, 8> Ops(NumElems);
9537 for (unsigned i = 0; i < 4; ++i) {
9538 bool isZero = !NonZeroMask[i];
9539 if (isZero)
9540 Ops[i] = getZeroVector(VT, Subtarget, DAG, dl);
9541 else
9542 Ops[i] = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Op.getOperand(i));
9543 }
9544
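// Combine adjacent pairs. Each 2-bit chunk of NonZeroMask describes one
// pair: bit 0 is the low element, bit 1 the high element of that pair.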
9545 for (unsigned i = 0; i < 2; ++i) {
9546 switch (NonZeroMask.extractBitsAsZExtValue(2, i * 2)) {
9547 default: llvm_unreachable("Unexpected NonZero count");
9548 case 0:
9549 Ops[i] = Ops[i*2]; // Must be a zero vector.
9550 break;
9551 case 1:
9552 Ops[i] = getMOVL(DAG, dl, VT, Ops[i*2+1], Ops[i*2]);
9553 break;
9554 case 2:
9555 Ops[i] = getMOVL(DAG, dl, VT, Ops[i*2], Ops[i*2+1]);
9556 break;
9557 case 3:
9558 Ops[i] = getUnpackl(DAG, dl, VT, Ops[i*2], Ops[i*2+1]);
9559 break;
9560 }
9561 }
9562
9563 bool Reverse1 = NonZeroMask.extractBitsAsZExtValue(2, 0) == 2;
9564 bool Reverse2 = NonZeroMask.extractBitsAsZExtValue(2, 2) == 2;
9565 int MaskVec[] = {
9566 Reverse1 ? 1 : 0,
9567 Reverse1 ? 0 : 1,
9568 static_cast<int>(Reverse2 ? NumElems+1 : NumElems),
9569 static_cast<int>(Reverse2 ? NumElems : NumElems+1)
9570 };
9571 return DAG.getVectorShuffle(VT, dl, Ops[0], Ops[1], MaskVec);
9572 }
9573
9574 assert(Values.size() > 1 && "Expected non-undef and non-splat vector");
9575
9576 // Check for a build vector from mostly shuffle plus few inserting.
9577 if (SDValue Sh = buildFromShuffleMostly(Op, dl, DAG))
9578 return Sh;
9579
9580 // For SSE 4.1, use insertps to put the high elements into the low element.
9581 if (Subtarget.hasSSE41() && EltVT != MVT::f16) {
9582 SDValue Result;
9583 if (!Op.getOperand(0).isUndef())
9584 Result = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Op.getOperand(0));
9585 else
9586 Result = DAG.getUNDEF(VT);
9587
9588 for (unsigned i = 1; i < NumElems; ++i) {
9589 if (Op.getOperand(i).isUndef()) continue;
9590 Result = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, Result,
9591 Op.getOperand(i), DAG.getVectorIdxConstant(i, dl));
9592 }
9593 return Result;
9594 }
9595
9596 // Otherwise, expand into a number of unpckl*, start by extending each of
9597 // our (non-undef) elements to the full vector width with the element in the
9598 // bottom slot of the vector (which generates no code for SSE).
9599 SmallVector<SDValue, 8> Ops(NumElems);
9600 for (unsigned i = 0; i < NumElems; ++i) {
9601 if (!Op.getOperand(i).isUndef())
9602 Ops[i] = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Op.getOperand(i));
9603 else
9604 Ops[i] = DAG.getUNDEF(VT);
9605 }
9606
9607 // Next, we iteratively mix elements, e.g. for v4f32:
9608 // Step 1: unpcklps 0, 1 ==> X: <?, ?, 1, 0>
9609 // : unpcklps 2, 3 ==> Y: <?, ?, 3, 2>
9610 // Step 2: unpcklpd X, Y ==> <3, 2, 1, 0>
9611 for (unsigned Scale = 1; Scale < NumElems; Scale *= 2) {
9612 // Generate scaled UNPCKL shuffle mask.
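// e.g. for v4f32: Scale = 1 gives {0, 4, u, u}; Scale = 2 gives {0, 1, 4, 5}.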
9613 SmallVector<int, 16> Mask;
9614 for(unsigned i = 0; i != Scale; ++i)
9615 Mask.push_back(i);
9616 for (unsigned i = 0; i != Scale; ++i)
9617 Mask.push_back(NumElems+i);
9618 Mask.append(NumElems - Mask.size(), SM_SentinelUndef);
9619
9620 for (unsigned i = 0, e = NumElems / (2 * Scale); i != e; ++i)
9621 Ops[i] = DAG.getVectorShuffle(VT, dl, Ops[2*i], Ops[(2*i)+1], Mask);
9622 }
9623 return Ops[0];
9624}
9625
9626// 256-bit AVX can use the vinsertf128 instruction
9627// to create 256-bit vectors from two other 128-bit ones.
9628// TODO: Detect subvector broadcast here instead of DAG combine?
9629static SDValue LowerAVXCONCAT_VECTORS(SDValue Op, const SDLoc &dl,
9630 SelectionDAG &DAG,
9631 const X86Subtarget &Subtarget) {
9632 MVT ResVT = Op.getSimpleValueType();
9633 assert((ResVT.is256BitVector() || ResVT.is512BitVector()) &&
9634 "Value type must be 256-/512-bit wide");
9635
9636 unsigned NumOperands = Op.getNumOperands();
9637 unsigned NumFreezeUndef = 0;
9638 unsigned NumZero = 0;
9639 unsigned NumNonZero = 0;
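// Bitmask of the operand indices holding non-zero subvectors.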
9640 unsigned NonZeros = 0;
9641 SmallSet<SDValue, 4> Undefs;
9642 for (unsigned i = 0; i != NumOperands; ++i) {
9643 SDValue SubVec = Op.getOperand(i);
9644 if (SubVec.isUndef())
9645 continue;
9646 if (ISD::isFreezeUndef(SubVec.getNode())) {
9647 // If the freeze(undef) has multiple uses then we must fold to zero.
9648 if (SubVec.hasOneUse()) {
9649 ++NumFreezeUndef;
9650 } else {
9651 ++NumZero;
9652 Undefs.insert(SubVec);
9653 }
9654 }
9655 else if (ISD::isBuildVectorAllZeros(SubVec.getNode()))
9656 ++NumZero;
9657 else {
9658 assert(i < sizeof(NonZeros) * CHAR_BIT); // Ensure the shift is in range.
9659 NonZeros |= 1 << i;
9660 ++NumNonZero;
9661 }
9662 }
9663
9664 // If we have more than 2 non-zeros, build each half separately.
9665 if (NumNonZero > 2) {
9666 MVT HalfVT = ResVT.getHalfNumVectorElementsVT();
9667 ArrayRef<SDUse> Ops = Op->ops();
9668 SDValue Lo = DAG.getNode(ISD::CONCAT_VECTORS, dl, HalfVT,
9669 Ops.slice(0, NumOperands/2));
9670 SDValue Hi = DAG.getNode(ISD::CONCAT_VECTORS, dl, HalfVT,
9671 Ops.slice(NumOperands/2));
9672 return DAG.getNode(ISD::CONCAT_VECTORS, dl, ResVT, Lo, Hi);
9673 }
9674
9675 // Otherwise, build it up through insert_subvectors.
9676 SDValue Vec = NumZero ? getZeroVector(ResVT, Subtarget, DAG, dl)
9677 : (NumFreezeUndef ? DAG.getFreeze(DAG.getUNDEF(ResVT))
9678 : DAG.getUNDEF(ResVT));
9679
9680 // Replace Undef operands with ZeroVector.
9681 for (SDValue U : Undefs)
9682 DAG.ReplaceAllUsesOfValueWith(
9683 U, getZeroVector(U.getSimpleValueType(), Subtarget, DAG, dl));
9684
9685 MVT SubVT = Op.getOperand(0).getSimpleValueType();
9686 unsigned NumSubElems = SubVT.getVectorNumElements();
9687 for (unsigned i = 0; i != NumOperands; ++i) {
9688 if ((NonZeros & (1 << i)) == 0)
9689 continue;
9690
9691 Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResVT, Vec, Op.getOperand(i),
9692 DAG.getVectorIdxConstant(i * NumSubElems, dl));
9693 }
9694
9695 return Vec;
9696}
9697
9698// Returns true if the given node is a type promotion (by concatenating i1
9699// zeros) of the result of a node that already zeros all upper bits of
9700// k-register.
9701// TODO: Merge this with LowerAVXCONCAT_VECTORS?
9702static SDValue LowerCONCAT_VECTORSvXi1(SDValue Op, const SDLoc &dl,
9703 const X86Subtarget &Subtarget,
9704 SelectionDAG & DAG) {
9705 MVT ResVT = Op.getSimpleValueType();
9706 unsigned NumOperands = Op.getNumOperands();
9707 assert(NumOperands > 1 && isPowerOf2_32(NumOperands) &&
9708 "Unexpected number of operands in CONCAT_VECTORS");
9709
9710 uint64_t Zeros = 0;
9711 uint64_t NonZeros = 0;
9712 for (unsigned i = 0; i != NumOperands; ++i) {
9713 SDValue SubVec = Op.getOperand(i);
9714 if (SubVec.isUndef())
9715 continue;
9716 assert(i < sizeof(NonZeros) * CHAR_BIT); // Ensure the shift is in range.
9717 if (ISD::isBuildVectorAllZeros(SubVec.getNode()))
9718 Zeros |= (uint64_t)1 << i;
9719 else
9720 NonZeros |= (uint64_t)1 << i;
9721 }
9722
9723 unsigned NumElems = ResVT.getVectorNumElements();
9724
9725 // If we are inserting non-zero vector and there are zeros in LSBs and undef
9726 // in the MSBs we need to emit a KSHIFTL. The generic lowering to
9727 // insert_subvector will give us two kshifts.
9728 if (isPowerOf2_64(NonZeros) && Zeros != 0 && NonZeros > Zeros &&
9729 Log2_64(NonZeros) != NumOperands - 1) {
9730 unsigned Idx = Log2_64(NonZeros);
9731 SDValue SubVec = Op.getOperand(Idx);
9732 unsigned SubVecNumElts = SubVec.getSimpleValueType().getVectorNumElements();
9733 MVT ShiftVT = widenMaskVectorType(ResVT, Subtarget);
9734 Op = widenSubVector(ShiftVT, SubVec, false, Subtarget, DAG, dl);
9735 Op = DAG.getNode(X86ISD::KSHIFTL, dl, ShiftVT, Op,
9736 DAG.getTargetConstant(Idx * SubVecNumElts, dl, MVT::i8));
9737 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, ResVT, Op,
9738 DAG.getVectorIdxConstant(0, dl));
9739 }
9740
9741 // If there are zero or one non-zeros we can handle this very simply.
9742 if (NonZeros == 0 || isPowerOf2_64(NonZeros)) {
9743 SDValue Vec = Zeros ? DAG.getConstant(0, dl, ResVT) : DAG.getUNDEF(ResVT);
9744 if (!NonZeros)
9745 return Vec;
9746 unsigned Idx = Log2_64(NonZeros);
9747 SDValue SubVec = Op.getOperand(Idx);
9748 unsigned SubVecNumElts = SubVec.getSimpleValueType().getVectorNumElements();
9749 return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResVT, Vec, SubVec,
9750 DAG.getVectorIdxConstant(Idx * SubVecNumElts, dl));
9751 }
9752
9753 if (NumOperands > 2) {
9754 MVT HalfVT = ResVT.getHalfNumVectorElementsVT();
9755 ArrayRef<SDUse> Ops = Op->ops();
9756 SDValue Lo = DAG.getNode(ISD::CONCAT_VECTORS, dl, HalfVT,
9757 Ops.slice(0, NumOperands / 2));
9758 SDValue Hi = DAG.getNode(ISD::CONCAT_VECTORS, dl, HalfVT,
9759 Ops.slice(NumOperands / 2));
9760 return DAG.getNode(ISD::CONCAT_VECTORS, dl, ResVT, Lo, Hi);
9761 }
9762
9763 assert(llvm::popcount(NonZeros) == 2 && "Simple cases not handled?");
9764
9765 if (ResVT.getVectorNumElements() >= 16)
9766 return Op; // The operation is legal with KUNPCK
9767
9768 SDValue Vec =
9769 DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResVT, DAG.getUNDEF(ResVT),
9770 Op.getOperand(0), DAG.getVectorIdxConstant(0, dl));
9771 return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResVT, Vec, Op.getOperand(1),
9772 DAG.getVectorIdxConstant(NumElems / 2, dl));
9773}
9774
9775static SDValue LowerCONCAT_VECTORS(SDValue Op,
9776 const X86Subtarget &Subtarget,
9777 SelectionDAG &DAG) {
9778 SDLoc DL(Op);
9779 MVT VT = Op.getSimpleValueType();
9780 if (VT.getVectorElementType() == MVT::i1)
9781 return LowerCONCAT_VECTORSvXi1(Op, DL, Subtarget, DAG);
9782
9783 // AVX can use the vinsertf128 instruction to create 256-bit vectors
9784 // from two other 128-bit ones.
9785 // 512-bit vector may contain 2 256-bit vectors or 4 128-bit vectors
9786 assert((VT.is256BitVector() && Op.getNumOperands() == 2) ||
9787 (VT.is512BitVector() &&
9788 (Op.getNumOperands() == 2 || Op.getNumOperands() == 4)));
9789 return LowerAVXCONCAT_VECTORS(Op, DL, DAG, Subtarget);
9790}
9791
9792//===----------------------------------------------------------------------===//
9793// Vector shuffle lowering
9794//
9795// This is an experimental code path for lowering vector shuffles on x86. It is
9796// designed to handle arbitrary vector shuffles and blends, gracefully
9797// degrading performance as necessary. It works hard to recognize idiomatic
9798// shuffles and lower them to optimal instruction patterns without leaving
9799// a framework that allows reasonably efficient handling of all vector shuffle
9800// patterns.
9801//===----------------------------------------------------------------------===//
9802
9803/// Checks whether the vector elements referenced by two shuffle masks are
9804/// equivalent.
9805static bool IsElementEquivalent(int MaskSize, SDValue Op, SDValue ExpectedOp,
9806 int Idx, int ExpectedIdx) {
9807 assert(0 <= Idx && Idx < MaskSize && 0 <= ExpectedIdx &&
9808 ExpectedIdx < MaskSize && "Out of range element index");
9809 if (!Op || !ExpectedOp || Op.getOpcode() != ExpectedOp.getOpcode())
9810 return false;
9811
9812 EVT VT = Op.getValueType();
9813 EVT ExpectedVT = ExpectedOp.getValueType();
9814
9815 // Sources must be vectors and match the mask's element count.
9816 if (!VT.isVector() || !ExpectedVT.isVector() ||
9817 (int)VT.getVectorNumElements() != MaskSize ||
9818 (int)ExpectedVT.getVectorNumElements() != MaskSize)
9819 return false;
9820
9821 // Exact match.
9822 if (Idx == ExpectedIdx && Op == ExpectedOp)
9823 return true;
9824
9825 switch (Op.getOpcode()) {
9826 case ISD::BUILD_VECTOR:
9827 // If the values are build vectors, we can look through them to find
9828 // equivalent inputs that make the shuffles equivalent.
9829 return Op.getOperand(Idx) == ExpectedOp.getOperand(ExpectedIdx);
9830 case ISD::BITCAST: {
9831 SDValue Src = Op.getOperand(0);
9832 EVT SrcVT = Src.getValueType();
9833 if (Op == ExpectedOp && SrcVT.isVector()) {
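// A bitcast only regroups the same bits: with wider source scalars the
// indices must refer to the same sub-part of equivalent wide elements;
// with narrower source scalars every covered narrow element must be
// equivalent.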
9834 if ((SrcVT.getScalarSizeInBits() % VT.getScalarSizeInBits()) == 0) {
9835 unsigned Scale = SrcVT.getScalarSizeInBits() / VT.getScalarSizeInBits();
9836 return (Idx % Scale) == (ExpectedIdx % Scale) &&
9837 IsElementEquivalent(SrcVT.getVectorNumElements(), Src, Src,
9838 Idx / Scale, ExpectedIdx / Scale);
9839 }
9840 if ((VT.getScalarSizeInBits() % SrcVT.getScalarSizeInBits()) == 0) {
9841 unsigned Scale = VT.getScalarSizeInBits() / SrcVT.getScalarSizeInBits();
9842 for (unsigned I = 0; I != Scale; ++I)
9843 if (!IsElementEquivalent(SrcVT.getVectorNumElements(), Src, Src,
9844 (Idx * Scale) + I,
9845 (ExpectedIdx * Scale) + I))
9846 return false;
9847 return true;
9848 }
9849 }
9850 break;
9851 }
9852 case ISD::VECTOR_SHUFFLE: {
9853 auto *SVN = cast<ShuffleVectorSDNode>(Op);
9854 return Op == ExpectedOp &&
9855 SVN->getMaskElt(Idx) == SVN->getMaskElt(ExpectedIdx);
9856 }
9857 case X86ISD::VBROADCAST:
9858 case X86ISD::VBROADCAST_LOAD:
9859 return Op == ExpectedOp;
9860 case X86ISD::SUBV_BROADCAST_LOAD:
9861 if (Op == ExpectedOp) {
9862 auto *MemOp = cast<MemSDNode>(Op);
9863 unsigned NumMemElts = MemOp->getMemoryVT().getVectorNumElements();
9864 return (Idx % NumMemElts) == (ExpectedIdx % NumMemElts);
9865 }
9866 break;
9867 case X86ISD::VPERMI: {
9868 if (Op == ExpectedOp) {
9869 SmallVector<int, 8> Mask;
9870 DecodeVPERMMask(MaskSize, Op.getConstantOperandVal(1), Mask);
9871 SDValue Src = Op.getOperand(0);
9872 return IsElementEquivalent(MaskSize, Src, Src, Mask[Idx],
9873 Mask[ExpectedIdx]);
9874 }
9875 break;
9876 }
9877 case X86ISD::HADD:
9878 case X86ISD::HSUB:
9879 case X86ISD::FHADD:
9880 case X86ISD::FHSUB:
9881 case X86ISD::PACKSS:
9882 case X86ISD::PACKUS:
9883 // HOP(X,X) can refer to the elt from the lower/upper half of a lane.
9884 // TODO: Handle HOP(X,Y) vs HOP(Y,X) equivalence cases.
9885 if (Op == ExpectedOp && Op.getOperand(0) == Op.getOperand(1)) {
9886 int NumElts = VT.getVectorNumElements();
9887 int NumLanes = VT.getSizeInBits() / 128;
9888 int NumEltsPerLane = NumElts / NumLanes;
9889 int NumHalfEltsPerLane = NumEltsPerLane / 2;
9890 bool SameLane = (Idx / NumEltsPerLane) == (ExpectedIdx / NumEltsPerLane);
9891 bool SameElt =
9892 (Idx % NumHalfEltsPerLane) == (ExpectedIdx % NumHalfEltsPerLane);
9893 return SameLane && SameElt;
9894 }
9895 break;
9896 }
9897
9898 return false;
9899}
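// Example (derived from the BUILD_VECTOR case above): if Op and ExpectedOp are
// both the node (BUILD_VECTOR a, b, a, b), then
// IsElementEquivalent(4, Op, ExpectedOp, 0, 2) returns true, because operands
// 0 and 2 are the same scalar even though the element indices differ.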
9900
9901/// Tiny helper function to identify a no-op mask.
9902///
9903/// This is a somewhat boring predicate function. It checks whether the mask
9904/// array input, which is assumed to be a single-input shuffle mask of the kind
9905/// used by the X86 shuffle instructions (not a fully general
9906/// ShuffleVectorSDNode mask) requires any shuffles to occur. Both undef and an
9907/// in-place shuffle are 'no-op's.
9908static bool isNoopShuffleMask(ArrayRef<int> Mask) {
9909 for (int i = 0, Size = Mask.size(); i < Size; ++i) {
9910 assert(Mask[i] >= -1 && "Out of bound mask element!");
9911 if (Mask[i] >= 0 && Mask[i] != i)
9912 return false;
9913 }
9914 return true;
9915}
9916
9917/// Test whether there are elements crossing LaneSizeInBits lanes in this
9918/// shuffle mask.
9919///
9920/// X86 divides up its shuffles into in-lane and cross-lane shuffle operations
9921/// and we routinely test for these.
9922static bool isLaneCrossingShuffleMask(unsigned LaneSizeInBits,
9923 unsigned ScalarSizeInBits,
9924 ArrayRef<int> Mask) {
9925 assert(LaneSizeInBits && ScalarSizeInBits &&
9926 (LaneSizeInBits % ScalarSizeInBits) == 0 &&
9927 "Illegal shuffle lane size");
9928 int LaneSize = LaneSizeInBits / ScalarSizeInBits;
9929 int Size = Mask.size();
9930 for (int i = 0; i < Size; ++i)
9931 if (Mask[i] >= 0 && (Mask[i] % Size) / LaneSize != i / LaneSize)
9932 return true;
9933 return false;
9934}
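// Example: with 128-bit lanes and 32-bit elements (LaneSize == 4), the v8i32
// mask <0, 1, 2, 3, 4, 5, 6, 7> is in-lane, whereas <4, 5, 6, 7, 0, 1, 2, 3>
// crosses lanes for every defined element.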
9935
9936/// Test whether there are elements crossing 128-bit lanes in this
9937/// shuffle mask.
9938static bool is128BitLaneCrossingShuffleMask(MVT VT, ArrayRef<int> Mask) {
9939 return isLaneCrossingShuffleMask(128, VT.getScalarSizeInBits(), Mask);
9940}
9941
9942/// Test whether elements in each LaneSizeInBits lane in this shuffle mask come
9943/// from multiple lanes - this is different to isLaneCrossingShuffleMask to
9944/// better support 'repeated mask + lane permute' style shuffles.
9945static bool isMultiLaneShuffleMask(unsigned LaneSizeInBits,
9946 unsigned ScalarSizeInBits,
9947 ArrayRef<int> Mask) {
9948 assert(LaneSizeInBits && ScalarSizeInBits &&
9949 (LaneSizeInBits % ScalarSizeInBits) == 0 &&
9950 "Illegal shuffle lane size");
9951 int NumElts = Mask.size();
9952 int NumEltsPerLane = LaneSizeInBits / ScalarSizeInBits;
9953 int NumLanes = NumElts / NumEltsPerLane;
9954 if (NumLanes > 1) {
9955 for (int i = 0; i != NumLanes; ++i) {
9956 int SrcLane = -1;
9957 for (int j = 0; j != NumEltsPerLane; ++j) {
9958 int M = Mask[(i * NumEltsPerLane) + j];
9959 if (M < 0)
9960 continue;
9961 int Lane = (M % NumElts) / NumEltsPerLane;
9962 if (SrcLane >= 0 && SrcLane != Lane)
9963 return true;
9964 SrcLane = Lane;
9965 }
9966 }
9967 }
9968 return false;
9969}
9970
9971/// Test whether a shuffle mask is equivalent within each sub-lane.
9972///
9973/// This checks a shuffle mask to see if it is performing the same
9974/// lane-relative shuffle in each sub-lane. This trivially implies
9975/// that it is also not lane-crossing. It may however involve a blend from the
9976/// same lane of a second vector.
9977///
9978/// The specific repeated shuffle mask is populated in \p RepeatedMask, as it is
9979/// non-trivial to compute in the face of undef lanes. The representation is
9980/// suitable for use with existing 128-bit shuffles as entries from the second
9981/// vector have been remapped to [LaneSize, 2*LaneSize).
9982static bool isRepeatedShuffleMask(unsigned LaneSizeInBits, MVT VT,
9983 ArrayRef<int> Mask,
9984 SmallVectorImpl<int> &RepeatedMask) {
9985 auto LaneSize = LaneSizeInBits / VT.getScalarSizeInBits();
9986 RepeatedMask.assign(LaneSize, -1);
9987 int Size = Mask.size();
9988 for (int i = 0; i < Size; ++i) {
9989 assert(Mask[i] == SM_SentinelUndef || Mask[i] >= 0);
9990 if (Mask[i] < 0)
9991 continue;
9992 if ((Mask[i] % Size) / LaneSize != i / LaneSize)
9993 // This entry crosses lanes, so there is no way to model this shuffle.
9994 return false;
9995
9996 // Ok, handle the in-lane shuffles by detecting if and when they repeat.
9997 // Adjust second vector indices to start at LaneSize instead of Size.
9998 int LocalM = Mask[i] < Size ? Mask[i] % LaneSize
9999 : Mask[i] % LaneSize + LaneSize;
10000 if (RepeatedMask[i % LaneSize] < 0)
10001 // This is the first non-undef entry in this slot of a 128-bit lane.
10002 RepeatedMask[i % LaneSize] = LocalM;
10003 else if (RepeatedMask[i % LaneSize] != LocalM)
10004 // Found a mismatch with the repeated mask.
10005 return false;
10006 }
10007 return true;
10008}
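// Example: for v8f32 with 128-bit lanes, the mask <0, 9, 2, 11, 4, 13, 6, 15>
// repeats the same pattern in both lanes, so RepeatedMask becomes
// <0, 5, 2, 7> (second-vector entries remapped into [LaneSize, 2*LaneSize)).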
10009
10010/// Test whether a shuffle mask is equivalent within each 128-bit lane.
10011static bool
10012is128BitLaneRepeatedShuffleMask(MVT VT, ArrayRef<int> Mask,
10013 SmallVectorImpl<int> &RepeatedMask) {
10014 return isRepeatedShuffleMask(128, VT, Mask, RepeatedMask);
10015}
10016
10017static bool
10018is128BitLaneRepeatedShuffleMask(MVT VT, ArrayRef<int> Mask) {
10019 SmallVector<int, 32> RepeatedMask;
10020 return isRepeatedShuffleMask(128, VT, Mask, RepeatedMask);
10021}
10022
10023/// Test whether a shuffle mask is equivalent within each 256-bit lane.
10024static bool
10025is256BitLaneRepeatedShuffleMask(MVT VT, ArrayRef<int> Mask,
10026 SmallVectorImpl<int> &RepeatedMask) {
10027 return isRepeatedShuffleMask(256, VT, Mask, RepeatedMask);
10028}
10029
10030/// Test whether a target shuffle mask is equivalent within each sub-lane.
10031/// Unlike isRepeatedShuffleMask we must respect SM_SentinelZero.
10032static bool isRepeatedTargetShuffleMask(unsigned LaneSizeInBits,
10033 unsigned EltSizeInBits,
10034 ArrayRef<int> Mask,
10035 SmallVectorImpl<int> &RepeatedMask) {
10036 int LaneSize = LaneSizeInBits / EltSizeInBits;
10037 RepeatedMask.assign(LaneSize, SM_SentinelUndef);
10038 int Size = Mask.size();
10039 for (int i = 0; i < Size; ++i) {
10040 assert(isUndefOrZero(Mask[i]) || (Mask[i] >= 0));
10041 if (Mask[i] == SM_SentinelUndef)
10042 continue;
10043 if (Mask[i] == SM_SentinelZero) {
10044 if (!isUndefOrZero(RepeatedMask[i % LaneSize]))
10045 return false;
10046 RepeatedMask[i % LaneSize] = SM_SentinelZero;
10047 continue;
10048 }
10049 if ((Mask[i] % Size) / LaneSize != i / LaneSize)
10050 // This entry crosses lanes, so there is no way to model this shuffle.
10051 return false;
10052
10053 // Handle the in-lane shuffles by detecting if and when they repeat. Adjust
10054 // later vector indices to start at multiples of LaneSize instead of Size.
10055 int LaneM = Mask[i] / Size;
10056 int LocalM = (Mask[i] % LaneSize) + (LaneM * LaneSize);
10057 if (RepeatedMask[i % LaneSize] == SM_SentinelUndef)
10058 // This is the first non-undef entry in this slot of a 128-bit lane.
10059 RepeatedMask[i % LaneSize] = LocalM;
10060 else if (RepeatedMask[i % LaneSize] != LocalM)
10061 // Found a mismatch with the repeated mask.
10062 return false;
10063 }
10064 return true;
10065}
10066
10067/// Test whether a target shuffle mask is equivalent within each sub-lane.
10068/// Unlike isRepeatedShuffleMask we must respect SM_SentinelZero.
10069static bool isRepeatedTargetShuffleMask(unsigned LaneSizeInBits, MVT VT,
10070 ArrayRef<int> Mask,
10071 SmallVectorImpl<int> &RepeatedMask) {
10072 return isRepeatedTargetShuffleMask(LaneSizeInBits, VT.getScalarSizeInBits(),
10073 Mask, RepeatedMask);
10074}
10075
10076/// Checks whether a shuffle mask is equivalent to an explicit list of
10077/// arguments.
10078///
10079/// This is a fast way to test a shuffle mask against a fixed pattern:
10080///
10081/// if (isShuffleEquivalent(Mask, {3, 2, 1, 0})) { ... }
10082///
10083/// It returns true if the mask is exactly as wide as the argument list, and
10084/// each element of the mask is either -1 (signifying undef) or the value given
10085/// in the argument.
10086static bool isShuffleEquivalent(ArrayRef<int> Mask, ArrayRef<int> ExpectedMask,
10087 SDValue V1 = SDValue(),
10088 SDValue V2 = SDValue()) {
10089 int Size = Mask.size();
10090 if (Size != (int)ExpectedMask.size())
10091 return false;
10092
10093 for (int i = 0; i < Size; ++i) {
10094 assert(Mask[i] >= -1 && "Out of bound mask element!");
10095 int MaskIdx = Mask[i];
10096 int ExpectedIdx = ExpectedMask[i];
10097 if (0 <= MaskIdx && MaskIdx != ExpectedIdx) {
10098 SDValue MaskV = MaskIdx < Size ? V1 : V2;
10099 SDValue ExpectedV = ExpectedIdx < Size ? V1 : V2;
10100 MaskIdx = MaskIdx < Size ? MaskIdx : (MaskIdx - Size);
10101 ExpectedIdx = ExpectedIdx < Size ? ExpectedIdx : (ExpectedIdx - Size);
10102 if (!IsElementEquivalent(Size, MaskV, ExpectedV, MaskIdx, ExpectedIdx))
10103 return false;
10104 }
10105 }
10106 return true;
10107}
10108
10109/// Checks whether a target shuffle mask is equivalent to an explicit pattern.
10110///
10111/// The masks must be exactly the same width.
10112///
10113/// If an element in Mask matches SM_SentinelUndef (-1) then the corresponding
10114/// value in ExpectedMask is always accepted. Otherwise the indices must match.
10115///
10116/// SM_SentinelZero is accepted as a valid negative index but must match in
10117/// both, or via a known bits test.
10118static bool isTargetShuffleEquivalent(MVT VT, ArrayRef<int> Mask,
10119 ArrayRef<int> ExpectedMask,
10120 const SelectionDAG &DAG,
10121 SDValue V1 = SDValue(),
10122 SDValue V2 = SDValue()) {
10123 int Size = Mask.size();
10124 if (Size != (int)ExpectedMask.size())
10125 return false;
10126 assert(llvm::all_of(ExpectedMask,
10127 [Size](int M) {
10128 return M == SM_SentinelZero ||
10129 isInRange(M, 0, 2 * Size);
10130 }) &&
10131 "Illegal target shuffle mask");
10132
10133 // Check for out-of-range target shuffle mask indices.
10134 if (!isUndefOrZeroOrInRange(Mask, 0, 2 * Size))
10135 return false;
10136
10137 // Don't use V1/V2 if they're not the same size as the shuffle mask type.
10138 if (V1 && (V1.getValueSizeInBits() != VT.getSizeInBits() ||
10139 !V1.getValueType().isVector()))
10140 V1 = SDValue();
10141 if (V2 && (V2.getValueSizeInBits() != VT.getSizeInBits() ||
10142 !V2.getValueType().isVector()))
10143 V2 = SDValue();
10144
10145 APInt ZeroV1 = APInt::getZero(Size);
10146 APInt ZeroV2 = APInt::getZero(Size);
10147
10148 for (int i = 0; i < Size; ++i) {
10149 int MaskIdx = Mask[i];
10150 int ExpectedIdx = ExpectedMask[i];
10151 if (MaskIdx == SM_SentinelUndef || MaskIdx == ExpectedIdx)
10152 continue;
10153 // If we failed to match an expected SM_SentinelZero then early out.
10154 if (ExpectedIdx < 0)
10155 return false;
10156 if (MaskIdx == SM_SentinelZero) {
10157 // If we need this expected index to be a zero element, then update the
10158 // relevant zero mask and perform the known bits at the end to minimize
10159 // repeated computes.
10160 SDValue ExpectedV = ExpectedIdx < Size ? V1 : V2;
10161 if (ExpectedV &&
10162 Size == (int)ExpectedV.getValueType().getVectorNumElements()) {
10163 int BitIdx = ExpectedIdx < Size ? ExpectedIdx : (ExpectedIdx - Size);
10164 APInt &ZeroMask = ExpectedIdx < Size ? ZeroV1 : ZeroV2;
10165 ZeroMask.setBit(BitIdx);
10166 continue;
10167 }
10168 }
10169 if (MaskIdx >= 0) {
10170 SDValue MaskV = MaskIdx < Size ? V1 : V2;
10171 SDValue ExpectedV = ExpectedIdx < Size ? V1 : V2;
10172 MaskIdx = MaskIdx < Size ? MaskIdx : (MaskIdx - Size);
10173 ExpectedIdx = ExpectedIdx < Size ? ExpectedIdx : (ExpectedIdx - Size);
10174 if (IsElementEquivalent(Size, MaskV, ExpectedV, MaskIdx, ExpectedIdx))
10175 continue;
10176 }
10177 return false;
10178 }
10179 return (ZeroV1.isZero() || DAG.MaskedVectorIsZero(V1, ZeroV1)) &&
10180 (ZeroV2.isZero() || DAG.MaskedVectorIsZero(V2, ZeroV2));
10181}
10182
10183// Check if the shuffle mask is suitable for the AVX vpunpcklwd or vpunpckhwd
10184// instructions.
10185static bool isUnpackWdShuffleMask(ArrayRef<int> Mask, MVT VT,
10186 const SelectionDAG &DAG) {
10187 if (VT != MVT::v8i32 && VT != MVT::v8f32)
10188 return false;
10189
10190 SmallVector<int, 8> Unpcklwd;
10191 createUnpackShuffleMask(MVT::v8i16, Unpcklwd, /* Lo = */ true,
10192 /* Unary = */ false);
10193 SmallVector<int, 8> Unpckhwd;
10194 createUnpackShuffleMask(MVT::v8i16, Unpckhwd, /* Lo = */ false,
10195 /* Unary = */ false);
10196 bool IsUnpackwdMask = (isTargetShuffleEquivalent(VT, Mask, Unpcklwd, DAG) ||
10197 isTargetShuffleEquivalent(VT, Mask, Unpckhwd, DAG));
10198 return IsUnpackwdMask;
10199}
10200
10201static bool is128BitUnpackShuffleMask(ArrayRef<int> Mask,
10202 const SelectionDAG &DAG) {
10203 // Create 128-bit vector type based on mask size.
10204 MVT EltVT = MVT::getIntegerVT(128 / Mask.size());
10205 MVT VT = MVT::getVectorVT(EltVT, Mask.size());
10206
10207 // We can't assume a canonical shuffle mask, so try the commuted version too.
10208 SmallVector<int, 4> CommutedMask(Mask);
10209  ShuffleVectorSDNode::commuteMask(CommutedMask);
10210
10211 // Match any of unary/binary or low/high.
10212 for (unsigned i = 0; i != 4; ++i) {
10213 SmallVector<int, 16> UnpackMask;
10214 createUnpackShuffleMask(VT, UnpackMask, (i >> 1) % 2, i % 2);
10215 if (isTargetShuffleEquivalent(VT, Mask, UnpackMask, DAG) ||
10216 isTargetShuffleEquivalent(VT, CommutedMask, UnpackMask, DAG))
10217 return true;
10218 }
10219 return false;
10220}
10221
10222/// Return true if a shuffle mask chooses elements identically in its top and
10223/// bottom halves. For example, any splat mask has the same top and bottom
10224/// halves. If an element is undefined in only one half of the mask, the halves
10225/// are not considered identical.
10226static bool hasIdenticalHalvesShuffleMask(ArrayRef<int> Mask) {
10227 assert(Mask.size() % 2 == 0 && "Expecting even number of elements in mask");
10228 unsigned HalfSize = Mask.size() / 2;
10229 for (unsigned i = 0; i != HalfSize; ++i) {
10230 if (Mask[i] != Mask[i + HalfSize])
10231 return false;
10232 }
10233 return true;
10234}
10235
10236/// Get a 4-lane 8-bit shuffle immediate for a mask.
10237///
10238/// This helper function produces an 8-bit shuffle immediate corresponding to
10239/// the ubiquitous shuffle encoding scheme used in x86 instructions for
10240/// shuffling 4 lanes. It can be used with most of the PSHUF instructions for
10241/// example.
10242///
10243/// NB: We rely heavily on "undef" masks preserving the input lane.
10244static unsigned getV4X86ShuffleImm(ArrayRef<int> Mask) {
10245 assert(Mask.size() == 4 && "Only 4-lane shuffle masks");
10246 assert(Mask[0] >= -1 && Mask[0] < 4 && "Out of bound mask element!");
10247 assert(Mask[1] >= -1 && Mask[1] < 4 && "Out of bound mask element!");
10248 assert(Mask[2] >= -1 && Mask[2] < 4 && "Out of bound mask element!");
10249 assert(Mask[3] >= -1 && Mask[3] < 4 && "Out of bound mask element!");
10250
10251 // If the mask only uses one non-undef element, then fully 'splat' it to
10252 // improve later broadcast matching.
10253 int FirstIndex = find_if(Mask, [](int M) { return M >= 0; }) - Mask.begin();
10254 assert(0 <= FirstIndex && FirstIndex < 4 && "All undef shuffle mask");
10255
10256 int FirstElt = Mask[FirstIndex];
10257 if (all_of(Mask, [FirstElt](int M) { return M < 0 || M == FirstElt; }))
10258 return (FirstElt << 6) | (FirstElt << 4) | (FirstElt << 2) | FirstElt;
10259
10260 unsigned Imm = 0;
10261 Imm |= (Mask[0] < 0 ? 0 : Mask[0]) << 0;
10262 Imm |= (Mask[1] < 0 ? 1 : Mask[1]) << 2;
10263 Imm |= (Mask[2] < 0 ? 2 : Mask[2]) << 4;
10264 Imm |= (Mask[3] < 0 ? 3 : Mask[3]) << 6;
10265 return Imm;
10266}
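// Example: the mask <1, 0, 3, 2> encodes as 0xB1 (0b10110001); result element
// i is taken from source element (Imm >> (2 * i)) & 0x3. A mask with a single
// non-undef element is splatted, e.g. <-1, 2, -1, -1> encodes as 0xAA.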
10267
10268static SDValue getV4X86ShuffleImm8ForMask(ArrayRef<int> Mask, const SDLoc &DL,
10269 SelectionDAG &DAG) {
10270 return DAG.getTargetConstant(getV4X86ShuffleImm(Mask), DL, MVT::i8);
10271}
10272
10273// Canonicalize SHUFPD mask to improve chances of further folding.
10274// Mask elements are assumed to be -1, 0 or 1 to match the SHUFPD lo/hi pattern.
10275static unsigned getSHUFPDImm(ArrayRef<int> Mask) {
10276 assert((Mask.size() == 2 || Mask.size() == 4 || Mask.size() == 8) &&
10277 "Unexpected SHUFPD mask size");
10278 assert(all_of(Mask, [](int M) { return -1 <= M && M <= 1; }) &&
10279 "Unexpected SHUFPD mask elements");
10280
10281 // If the mask only uses one non-undef element, then fully 'splat' it to
10282 // improve later broadcast matching.
10283 int FirstIndex = find_if(Mask, [](int M) { return M >= 0; }) - Mask.begin();
10284 assert(0 <= FirstIndex && FirstIndex < (int)Mask.size() &&
10285 "All undef shuffle mask");
10286
10287 int FirstElt = Mask[FirstIndex];
10288 if (all_of(Mask, [FirstElt](int M) { return M < 0 || M == FirstElt; }) &&
10289 count_if(Mask, [FirstElt](int M) { return M == FirstElt; }) > 1) {
10290 unsigned Imm = 0;
10291 for (unsigned I = 0, E = Mask.size(); I != E; ++I)
10292 Imm |= FirstElt << I;
10293 return Imm;
10294 }
10295
10296 // Attempt to keep any undef elements in place to improve chances of the
10297 // shuffle becoming a (commutative) blend.
10298 unsigned Imm = 0;
10299 for (unsigned I = 0, E = Mask.size(); I != E; ++I)
10300 Imm |= (Mask[I] < 0 ? (I & 1) : Mask[I]) << I;
10301
10302 return Imm;
10303}
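// Example: the v4f64 mask <-1, 1, 0, -1> encodes as 0b1010; the undef elements
// keep their in-place bit (I & 1), which keeps the immediate compatible with a
// commutative blend.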
10304
10305static SDValue getSHUFPDImmForMask(ArrayRef<int> Mask, const SDLoc &DL,
10306 SelectionDAG &DAG) {
10307 return DAG.getTargetConstant(getSHUFPDImm(Mask), DL, MVT::i8);
10308}
10309
10310// The shuffle result has the form:
10311//   0*a[0], 0*a[1], ..., 0*a[n] (n >= 0), where '0*' denotes any number of
10312//   zero elements and the a[] elements appear in ascending order.
10313// Each Zeroable element corresponds to a particular Mask element, as
10314// described in the computeZeroableShuffleElements function.
10315// The function looks for a sub-mask whose nonzero elements are in
10316// increasing order. If such a sub-mask exists, the function returns true.
10317static bool isNonZeroElementsInOrder(const APInt &Zeroable,
10318 ArrayRef<int> Mask, const EVT &VectorType,
10319 bool &IsZeroSideLeft) {
10320 int NextElement = -1;
10321 // Check if the Mask's nonzero elements are in increasing order.
10322 for (int i = 0, e = Mask.size(); i < e; i++) {
10323 // Checks if the mask's zeros elements are built from only zeros.
10324 assert(Mask[i] >= -1 && "Out of bound mask element!");
10325 if (Mask[i] < 0)
10326 return false;
10327 if (Zeroable[i])
10328 continue;
10329 // Find the lowest non zero element
10330 if (NextElement < 0) {
10331 NextElement = Mask[i] != 0 ? VectorType.getVectorNumElements() : 0;
10332 IsZeroSideLeft = NextElement != 0;
10333 }
10334 // Exit if the mask's non zero elements are not in increasing order.
10335 if (NextElement != Mask[i])
10336 return false;
10337 NextElement++;
10338 }
10339 return true;
10340}
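// Example: for v4i32 with Zeroable = <1, 0, 1, 0> (elements 0 and 2 are known
// zero), the mask <4, 0, 6, 1> succeeds: the nonzero elements reference V1's
// elements 0 and 1 in increasing order, so the shuffle can become a VEXPAND of
// V1 with write mask 0b1010 and IsZeroSideLeft == false.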
10341
10342static SDValue combineConcatVectorOps(const SDLoc &DL, MVT VT,
10343                                      ArrayRef<SDValue> Ops, SelectionDAG &DAG,
10344 const X86Subtarget &Subtarget,
10345 unsigned Depth = 0);
10346
10347/// Try to lower a shuffle with a single PSHUFB of V1 or V2.
10348static SDValue lowerShuffleWithPSHUFB(const SDLoc &DL, MVT VT,
10349 ArrayRef<int> Mask, SDValue V1,
10350 SDValue V2, const APInt &Zeroable,
10351 const X86Subtarget &Subtarget,
10352 SelectionDAG &DAG) {
10353 int Size = Mask.size();
10354 int LaneSize = 128 / VT.getScalarSizeInBits();
10355 const int NumBytes = VT.getSizeInBits() / 8;
10356 const int NumEltBytes = VT.getScalarSizeInBits() / 8;
10357
10358 assert((Subtarget.hasSSSE3() && VT.is128BitVector()) ||
10359 (Subtarget.hasAVX2() && VT.is256BitVector()) ||
10360 (Subtarget.hasBWI() && VT.is512BitVector()));
10361
10362 SmallVector<SDValue, 64> PSHUFBMask(NumBytes);
10363 // Sign bit set in i8 mask means zero element.
10364 SDValue ZeroMask = DAG.getConstant(0x80, DL, MVT::i8);
10365
10366 SDValue V;
10367 for (int i = 0; i < NumBytes; ++i) {
10368 int M = Mask[i / NumEltBytes];
10369 if (M < 0) {
10370 PSHUFBMask[i] = DAG.getUNDEF(MVT::i8);
10371 continue;
10372 }
10373 if (Zeroable[i / NumEltBytes]) {
10374 PSHUFBMask[i] = ZeroMask;
10375 continue;
10376 }
10377
10378 // We can only use a single input of V1 or V2.
10379 SDValue SrcV = (M >= Size ? V2 : V1);
10380 if (V && V != SrcV)
10381 return SDValue();
10382 V = SrcV;
10383 M %= Size;
10384
10385 // PSHUFB can't cross lanes, ensure this doesn't happen.
10386 if ((M / LaneSize) != ((i / NumEltBytes) / LaneSize))
10387 return SDValue();
10388
10389 M = M % LaneSize;
10390 M = M * NumEltBytes + (i % NumEltBytes);
10391 PSHUFBMask[i] = DAG.getConstant(M, DL, MVT::i8);
10392 }
10393 assert(V && "Failed to find a source input");
10394
10395 MVT I8VT = MVT::getVectorVT(MVT::i8, NumBytes);
10396 return DAG.getBitcast(
10397 VT, DAG.getNode(X86ISD::PSHUFB, DL, I8VT, DAG.getBitcast(I8VT, V),
10398 DAG.getBuildVector(I8VT, DL, PSHUFBMask)));
10399}
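// Example: the single-input v8i16 shuffle <4, 4, 4, 4, 4, 4, 4, 4> produces
// the byte mask <8, 9, 8, 9, 8, 9, 8, 9, 8, 9, 8, 9, 8, 9, 8, 9>, i.e. every
// result element copies bytes 8 and 9 (element 4) of the source.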
10400
10401static SDValue getMaskNode(SDValue Mask, MVT MaskVT,
10402 const X86Subtarget &Subtarget, SelectionDAG &DAG,
10403 const SDLoc &dl);
10404
10405// Certain shuffles can be lowered directly to the dedicated X86 VEXPAND
10406static SDValue lowerShuffleToEXPAND(const SDLoc &DL, MVT VT, SDValue V1,
10407 SDValue V2, ArrayRef<int> Mask,
10408 const APInt &Zeroable,
10409 const X86Subtarget &Subtarget,
10410 SelectionDAG &DAG) {
10411 bool IsLeftZeroSide = true;
10412 if (!isNonZeroElementsInOrder(Zeroable, Mask, V1.getValueType(),
10413 IsLeftZeroSide))
10414 return SDValue();
10415 unsigned VEXPANDMask = (~Zeroable).getZExtValue();
10416  MVT IntegerType =
10417 MVT::getIntegerVT(std::max((int)VT.getVectorNumElements(), 8));
10418 SDValue MaskNode = DAG.getConstant(VEXPANDMask, DL, IntegerType);
10419 unsigned NumElts = VT.getVectorNumElements();
10420 assert((NumElts == 4 || NumElts == 8 || NumElts == 16) &&
10421 "Unexpected number of vector elements");
10422 SDValue VMask = getMaskNode(MaskNode, MVT::getVectorVT(MVT::i1, NumElts),
10423 Subtarget, DAG, DL);
10424 SDValue ZeroVector = getZeroVector(VT, Subtarget, DAG, DL);
10425 SDValue ExpandedVector = IsLeftZeroSide ? V2 : V1;
10426 return DAG.getNode(X86ISD::EXPAND, DL, VT, ExpandedVector, ZeroVector, VMask);
10427}
10428
10429static bool matchShuffleWithUNPCK(MVT VT, SDValue &V1, SDValue &V2,
10430 unsigned &UnpackOpcode, bool IsUnary,
10431 ArrayRef<int> TargetMask, const SDLoc &DL,
10432 SelectionDAG &DAG,
10433 const X86Subtarget &Subtarget) {
10434 int NumElts = VT.getVectorNumElements();
10435
10436 bool Undef1 = true, Undef2 = true, Zero1 = true, Zero2 = true;
10437 for (int i = 0; i != NumElts; i += 2) {
10438 int M1 = TargetMask[i + 0];
10439 int M2 = TargetMask[i + 1];
10440 Undef1 &= (SM_SentinelUndef == M1);
10441 Undef2 &= (SM_SentinelUndef == M2);
10442 Zero1 &= isUndefOrZero(M1);
10443 Zero2 &= isUndefOrZero(M2);
10444 }
10445 assert(!((Undef1 || Zero1) && (Undef2 || Zero2)) &&
10446 "Zeroable shuffle detected");
10447
10448 // Attempt to match the target mask against the unpack lo/hi mask patterns.
10449 SmallVector<int, 64> Unpckl, Unpckh;
10450 createUnpackShuffleMask(VT, Unpckl, /* Lo = */ true, IsUnary);
10451 if (isTargetShuffleEquivalent(VT, TargetMask, Unpckl, DAG, V1,
10452 (IsUnary ? V1 : V2))) {
10453 UnpackOpcode = X86ISD::UNPCKL;
10454 V2 = (Undef2 ? DAG.getUNDEF(VT) : (IsUnary ? V1 : V2));
10455 V1 = (Undef1 ? DAG.getUNDEF(VT) : V1);
10456 return true;
10457 }
10458
10459 createUnpackShuffleMask(VT, Unpckh, /* Lo = */ false, IsUnary);
10460 if (isTargetShuffleEquivalent(VT, TargetMask, Unpckh, DAG, V1,
10461 (IsUnary ? V1 : V2))) {
10462 UnpackOpcode = X86ISD::UNPCKH;
10463 V2 = (Undef2 ? DAG.getUNDEF(VT) : (IsUnary ? V1 : V2));
10464 V1 = (Undef1 ? DAG.getUNDEF(VT) : V1);
10465 return true;
10466 }
10467
10468 // If an unary shuffle, attempt to match as an unpack lo/hi with zero.
10469 if (IsUnary && (Zero1 || Zero2)) {
10470 // Don't bother if we can blend instead.
10471 if ((Subtarget.hasSSE41() || VT == MVT::v2i64 || VT == MVT::v2f64) &&
10472 isSequentialOrUndefOrZeroInRange(TargetMask, 0, NumElts, 0))
10473 return false;
10474
10475 bool MatchLo = true, MatchHi = true;
10476 for (int i = 0; (i != NumElts) && (MatchLo || MatchHi); ++i) {
10477 int M = TargetMask[i];
10478
10479 // Ignore if the input is known to be zero or the index is undef.
10480 if ((((i & 1) == 0) && Zero1) || (((i & 1) == 1) && Zero2) ||
10481 (M == SM_SentinelUndef))
10482 continue;
10483
10484 MatchLo &= (M == Unpckl[i]);
10485 MatchHi &= (M == Unpckh[i]);
10486 }
10487
10488 if (MatchLo || MatchHi) {
10489 UnpackOpcode = MatchLo ? X86ISD::UNPCKL : X86ISD::UNPCKH;
10490 V2 = Zero2 ? getZeroVector(VT, Subtarget, DAG, DL) : V1;
10491 V1 = Zero1 ? getZeroVector(VT, Subtarget, DAG, DL) : V1;
10492 return true;
10493 }
10494 }
10495
10496 // If a binary shuffle, commute and try again.
10497 if (!IsUnary) {
10498    ShuffleVectorSDNode::commuteMask(Unpckl);
10499 if (isTargetShuffleEquivalent(VT, TargetMask, Unpckl, DAG)) {
10500 UnpackOpcode = X86ISD::UNPCKL;
10501 std::swap(V1, V2);
10502 return true;
10503 }
10504
10505    ShuffleVectorSDNode::commuteMask(Unpckh);
10506 if (isTargetShuffleEquivalent(VT, TargetMask, Unpckh, DAG)) {
10507 UnpackOpcode = X86ISD::UNPCKH;
10508 std::swap(V1, V2);
10509 return true;
10510 }
10511 }
10512
10513 return false;
10514}
10515
10516// X86 has dedicated unpack instructions that can handle specific blend
10517// operations: UNPCKH and UNPCKL.
10518static SDValue lowerShuffleWithUNPCK(const SDLoc &DL, MVT VT, SDValue V1,
10519 SDValue V2, ArrayRef<int> Mask,
10520 SelectionDAG &DAG) {
10521 SmallVector<int, 8> Unpckl;
10522 createUnpackShuffleMask(VT, Unpckl, /* Lo = */ true, /* Unary = */ false);
10523 if (isShuffleEquivalent(Mask, Unpckl, V1, V2))
10524 return DAG.getNode(X86ISD::UNPCKL, DL, VT, V1, V2);
10525
10526 SmallVector<int, 8> Unpckh;
10527 createUnpackShuffleMask(VT, Unpckh, /* Lo = */ false, /* Unary = */ false);
10528 if (isShuffleEquivalent(Mask, Unpckh, V1, V2))
10529 return DAG.getNode(X86ISD::UNPCKH, DL, VT, V1, V2);
10530
10531 // Commute and try again.
10532  ShuffleVectorSDNode::commuteMask(Unpckl);
10533 if (isShuffleEquivalent(Mask, Unpckl, V1, V2))
10534 return DAG.getNode(X86ISD::UNPCKL, DL, VT, V2, V1);
10535
10536  ShuffleVectorSDNode::commuteMask(Unpckh);
10537 if (isShuffleEquivalent(Mask, Unpckh, V1, V2))
10538 return DAG.getNode(X86ISD::UNPCKH, DL, VT, V2, V1);
10539
10540 return SDValue();
10541}
10542
10543/// Check if the mask can be mapped to a preliminary shuffle (vperm 64-bit)
10544/// followed by unpack 256-bit.
10545static SDValue lowerShuffleWithUNPCK256(const SDLoc &DL, MVT VT, SDValue V1,
10546 SDValue V2, ArrayRef<int> Mask,
10547 SelectionDAG &DAG) {
10548 SmallVector<int, 32> Unpckl, Unpckh;
10549 createSplat2ShuffleMask(VT, Unpckl, /* Lo */ true);
10550 createSplat2ShuffleMask(VT, Unpckh, /* Lo */ false);
10551
10552 unsigned UnpackOpcode;
10553 if (isShuffleEquivalent(Mask, Unpckl, V1, V2))
10554 UnpackOpcode = X86ISD::UNPCKL;
10555 else if (isShuffleEquivalent(Mask, Unpckh, V1, V2))
10556 UnpackOpcode = X86ISD::UNPCKH;
10557 else
10558 return SDValue();
10559
10560 // This is a "natural" unpack operation (rather than the 128-bit sectored
10561 // operation implemented by AVX). We need to rearrange 64-bit chunks of the
10562 // input in order to use the x86 instruction.
10563 V1 = DAG.getVectorShuffle(MVT::v4f64, DL, DAG.getBitcast(MVT::v4f64, V1),
10564 DAG.getUNDEF(MVT::v4f64), {0, 2, 1, 3});
10565 V1 = DAG.getBitcast(VT, V1);
10566 return DAG.getNode(UnpackOpcode, DL, VT, V1, V1);
10567}
10568
10569// Check if the mask can be mapped to a TRUNCATE or VTRUNC, truncating the
10570// source into the lower elements and zeroing the upper elements.
10571static bool matchShuffleAsVTRUNC(MVT &SrcVT, MVT &DstVT, MVT VT,
10572 ArrayRef<int> Mask, const APInt &Zeroable,
10573 const X86Subtarget &Subtarget) {
10574 if (!VT.is512BitVector() && !Subtarget.hasVLX())
10575 return false;
10576
10577 unsigned NumElts = Mask.size();
10578 unsigned EltSizeInBits = VT.getScalarSizeInBits();
10579 unsigned MaxScale = 64 / EltSizeInBits;
10580
10581 for (unsigned Scale = 2; Scale <= MaxScale; Scale += Scale) {
10582 unsigned SrcEltBits = EltSizeInBits * Scale;
10583 if (SrcEltBits < 32 && !Subtarget.hasBWI())
10584 continue;
10585 unsigned NumSrcElts = NumElts / Scale;
10586 if (!isSequentialOrUndefInRange(Mask, 0, NumSrcElts, 0, Scale))
10587 continue;
10588 unsigned UpperElts = NumElts - NumSrcElts;
10589 if (!Zeroable.extractBits(UpperElts, NumSrcElts).isAllOnes())
10590 continue;
10591 SrcVT = MVT::getIntegerVT(EltSizeInBits * Scale);
10592 SrcVT = MVT::getVectorVT(SrcVT, NumSrcElts);
10593 DstVT = MVT::getIntegerVT(EltSizeInBits);
10594 if ((NumSrcElts * EltSizeInBits) >= 128) {
10595 // ISD::TRUNCATE
10596 DstVT = MVT::getVectorVT(DstVT, NumSrcElts);
10597 } else {
10598 // X86ISD::VTRUNC
10599 DstVT = MVT::getVectorVT(DstVT, 128 / EltSizeInBits);
10600 }
10601 return true;
10602 }
10603
10604 return false;
10605}
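// Example: a v16i8 shuffle whose mask begins <0, 2, 4, 6, 8, 10, 12, 14> and
// whose upper 8 elements are all zeroable matches at Scale == 2 (given BWI for
// the 16-bit source elements), yielding SrcVT == v8i16 and DstVT == v16i8 via
// the X86ISD::VTRUNC path, since 8 x i8 is narrower than 128 bits.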
10606
10607// Helper to create TRUNCATE/VTRUNC nodes, optionally with zero/undef upper
10608// element padding to the final DstVT.
10609static SDValue getAVX512TruncNode(const SDLoc &DL, MVT DstVT, SDValue Src,
10610 const X86Subtarget &Subtarget,
10611 SelectionDAG &DAG, bool ZeroUppers) {
10612 MVT SrcVT = Src.getSimpleValueType();
10613 MVT DstSVT = DstVT.getScalarType();
10614 unsigned NumDstElts = DstVT.getVectorNumElements();
10615 unsigned NumSrcElts = SrcVT.getVectorNumElements();
10616 unsigned DstEltSizeInBits = DstVT.getScalarSizeInBits();
10617
10618 if (!DAG.getTargetLoweringInfo().isTypeLegal(SrcVT))
10619 return SDValue();
10620
10621 // Perform a direct ISD::TRUNCATE if possible.
10622 if (NumSrcElts == NumDstElts)
10623 return DAG.getNode(ISD::TRUNCATE, DL, DstVT, Src);
10624
10625 if (NumSrcElts > NumDstElts) {
10626 MVT TruncVT = MVT::getVectorVT(DstSVT, NumSrcElts);
10627 SDValue Trunc = DAG.getNode(ISD::TRUNCATE, DL, TruncVT, Src);
10628 return extractSubVector(Trunc, 0, DAG, DL, DstVT.getSizeInBits());
10629 }
10630
10631 if ((NumSrcElts * DstEltSizeInBits) >= 128) {
10632 MVT TruncVT = MVT::getVectorVT(DstSVT, NumSrcElts);
10633 SDValue Trunc = DAG.getNode(ISD::TRUNCATE, DL, TruncVT, Src);
10634 return widenSubVector(Trunc, ZeroUppers, Subtarget, DAG, DL,
10635 DstVT.getSizeInBits());
10636 }
10637
10638 // Non-VLX targets must truncate from a 512-bit type, so we need to
10639 // widen, truncate and then possibly extract the original subvector.
10640 if (!Subtarget.hasVLX() && !SrcVT.is512BitVector()) {
10641 SDValue NewSrc = widenSubVector(Src, ZeroUppers, Subtarget, DAG, DL, 512);
10642 return getAVX512TruncNode(DL, DstVT, NewSrc, Subtarget, DAG, ZeroUppers);
10643 }
10644
10645 // Fallback to a X86ISD::VTRUNC, padding if necessary.
10646 MVT TruncVT = MVT::getVectorVT(DstSVT, 128 / DstEltSizeInBits);
10647 SDValue Trunc = DAG.getNode(X86ISD::VTRUNC, DL, TruncVT, Src);
10648 if (DstVT != TruncVT)
10649 Trunc = widenSubVector(Trunc, ZeroUppers, Subtarget, DAG, DL,
10650 DstVT.getSizeInBits());
10651 return Trunc;
10652}
10653
10654// Try to lower trunc+vector_shuffle to a vpmovdb or a vpmovdw instruction.
10655//
10656// An example is the following:
10657//
10658// t0: ch = EntryToken
10659// t2: v4i64,ch = CopyFromReg t0, Register:v4i64 %0
10660// t25: v4i32 = truncate t2
10661// t41: v8i16 = bitcast t25
10662// t21: v8i16 = BUILD_VECTOR undef:i16, undef:i16, undef:i16, undef:i16,
10663// Constant:i16<0>, Constant:i16<0>, Constant:i16<0>, Constant:i16<0>
10664// t51: v8i16 = vector_shuffle<0,2,4,6,12,13,14,15> t41, t21
10665// t18: v2i64 = bitcast t51
10666//
10667// One can just use a single vpmovdw instruction; without avx512vl we need to
10668// use the zmm variant and extract the lower subvector, padding with zeroes.
10669// TODO: Merge with lowerShuffleAsVTRUNC.
10670static SDValue lowerShuffleWithVPMOV(const SDLoc &DL, MVT VT, SDValue V1,
10671 SDValue V2, ArrayRef<int> Mask,
10672 const APInt &Zeroable,
10673 const X86Subtarget &Subtarget,
10674 SelectionDAG &DAG) {
10675 assert((VT == MVT::v16i8 || VT == MVT::v8i16) && "Unexpected VTRUNC type");
10676 if (!Subtarget.hasAVX512())
10677 return SDValue();
10678
10679 unsigned NumElts = VT.getVectorNumElements();
10680 unsigned EltSizeInBits = VT.getScalarSizeInBits();
10681 unsigned MaxScale = 64 / EltSizeInBits;
10682 for (unsigned Scale = 2; Scale <= MaxScale; Scale += Scale) {
10683 unsigned SrcEltBits = EltSizeInBits * Scale;
10684 unsigned NumSrcElts = NumElts / Scale;
10685 unsigned UpperElts = NumElts - NumSrcElts;
10686 if (!isSequentialOrUndefInRange(Mask, 0, NumSrcElts, 0, Scale) ||
10687 !Zeroable.extractBits(UpperElts, NumSrcElts).isAllOnes())
10688 continue;
10689
10690 // Attempt to find a matching source truncation, but as a fall back VLX
10691 // cases can use the VPMOV directly.
10692 SDValue Src = peekThroughBitcasts(V1);
10693 if (Src.getOpcode() == ISD::TRUNCATE &&
10694 Src.getScalarValueSizeInBits() == SrcEltBits) {
10695 Src = Src.getOperand(0);
10696 } else if (Subtarget.hasVLX()) {
10697 MVT SrcSVT = MVT::getIntegerVT(SrcEltBits);
10698 MVT SrcVT = MVT::getVectorVT(SrcSVT, NumSrcElts);
10699 Src = DAG.getBitcast(SrcVT, Src);
10700 // Don't do this if PACKSS/PACKUS could perform it cheaper.
10701 if (Scale == 2 &&
10702 ((DAG.ComputeNumSignBits(Src) > EltSizeInBits) ||
10703 (DAG.computeKnownBits(Src).countMinLeadingZeros() >= EltSizeInBits)))
10704 return SDValue();
10705 } else
10706 return SDValue();
10707
10708 // VPMOVWB is only available with avx512bw.
10709 if (!Subtarget.hasBWI() && Src.getScalarValueSizeInBits() < 32)
10710 return SDValue();
10711
10712 bool UndefUppers = isUndefInRange(Mask, NumSrcElts, UpperElts);
10713 return getAVX512TruncNode(DL, VT, Src, Subtarget, DAG, !UndefUppers);
10714 }
10715
10716 return SDValue();
10717}
10718
10719// Attempt to match binary shuffle patterns as a truncate.
10720static SDValue lowerShuffleAsVTRUNC(const SDLoc &DL, MVT VT, SDValue V1,
10721 SDValue V2, ArrayRef<int> Mask,
10722 const APInt &Zeroable,
10723 const X86Subtarget &Subtarget,
10724 SelectionDAG &DAG) {
10725 assert((VT.is128BitVector() || VT.is256BitVector()) &&
10726 "Unexpected VTRUNC type");
10727 if (!Subtarget.hasAVX512() ||
10728 (VT.is256BitVector() && !Subtarget.useAVX512Regs()))
10729 return SDValue();
10730
10731 unsigned NumElts = VT.getVectorNumElements();
10732 unsigned EltSizeInBits = VT.getScalarSizeInBits();
10733 unsigned MaxScale = 64 / EltSizeInBits;
10734 for (unsigned Scale = 2; Scale <= MaxScale; Scale += Scale) {
10735 // TODO: Support non-BWI VPMOVWB truncations?
10736 unsigned SrcEltBits = EltSizeInBits * Scale;
10737 if (SrcEltBits < 32 && !Subtarget.hasBWI())
10738 continue;
10739
10740 // Match shuffle <Ofs,Ofs+Scale,Ofs+2*Scale,..,undef_or_zero,undef_or_zero>
10741 // Bail if the V2 elements are undef.
10742 unsigned NumHalfSrcElts = NumElts / Scale;
10743 unsigned NumSrcElts = 2 * NumHalfSrcElts;
10744 for (unsigned Offset = 0; Offset != Scale; ++Offset) {
10745 if (!isSequentialOrUndefInRange(Mask, 0, NumSrcElts, Offset, Scale) ||
10746 isUndefInRange(Mask, NumHalfSrcElts, NumHalfSrcElts))
10747 continue;
10748
10749 // The elements beyond the truncation must be undef/zero.
10750 unsigned UpperElts = NumElts - NumSrcElts;
10751 if (UpperElts > 0 &&
10752 !Zeroable.extractBits(UpperElts, NumSrcElts).isAllOnes())
10753 continue;
10754 bool UndefUppers =
10755 UpperElts > 0 && isUndefInRange(Mask, NumSrcElts, UpperElts);
10756
10757 // As we're using both sources then we need to concat them together
10758 // and truncate from the double-sized src.
10759 MVT ConcatVT = VT.getDoubleNumVectorElementsVT();
10760
10761 // For offset truncations, ensure that the concat is cheap.
10762 SDValue Src =
10763 combineConcatVectorOps(DL, ConcatVT, {V1, V2}, DAG, Subtarget);
10764 if (!Src) {
10765 if (Offset)
10766 continue;
10767 Src = DAG.getNode(ISD::CONCAT_VECTORS, DL, ConcatVT, V1, V2);
10768 }
10769
10770 MVT SrcSVT = MVT::getIntegerVT(SrcEltBits);
10771 MVT SrcVT = MVT::getVectorVT(SrcSVT, NumSrcElts);
10772 Src = DAG.getBitcast(SrcVT, Src);
10773
10774 // Shift the offset'd elements into place for the truncation.
10775 // TODO: Use getTargetVShiftByConstNode.
10776 if (Offset)
10777 Src = DAG.getNode(
10778 X86ISD::VSRLI, DL, SrcVT, Src,
10779 DAG.getTargetConstant(Offset * EltSizeInBits, DL, MVT::i8));
10780
10781 return getAVX512TruncNode(DL, VT, Src, Subtarget, DAG, !UndefUppers);
10782 }
10783 }
10784
10785 return SDValue();
10786}
10787
10788/// Check whether a compaction lowering can be done by dropping even/odd
10789/// elements and compute how many times even/odd elements must be dropped.
10790///
10791/// This handles shuffles which take every Nth element where N is a power of
10792/// two. Example shuffle masks:
10793///
10794/// (even)
10795/// N = 1: 0, 2, 4, 6, 8, 10, 12, 14, 0, 2, 4, 6, 8, 10, 12, 14
10796/// N = 1: 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30
10797/// N = 2: 0, 4, 8, 12, 0, 4, 8, 12, 0, 4, 8, 12, 0, 4, 8, 12
10798/// N = 2: 0, 4, 8, 12, 16, 20, 24, 28, 0, 4, 8, 12, 16, 20, 24, 28
10799/// N = 3: 0, 8, 0, 8, 0, 8, 0, 8, 0, 8, 0, 8, 0, 8, 0, 8
10800/// N = 3: 0, 8, 16, 24, 0, 8, 16, 24, 0, 8, 16, 24, 0, 8, 16, 24
10801///
10802/// (odd)
10803/// N = 1: 1, 3, 5, 7, 9, 11, 13, 15, 0, 2, 4, 6, 8, 10, 12, 14
10804/// N = 1: 1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31
10805///
10806/// Any of these lanes can of course be undef.
10807///
10808/// This routine only supports N <= 3.
10809/// FIXME: Evaluate whether either AVX or AVX-512 have any opportunities here
10810/// for larger N.
10811///
10812/// \returns N above, or the number of times even/odd elements must be dropped
10813/// if there is such a number. Otherwise returns zero.
10814static int canLowerByDroppingElements(ArrayRef<int> Mask, bool MatchEven,
10815 bool IsSingleInput) {
10816 // The modulus for the shuffle vector entries is based on whether this is
10817 // a single input or not.
10818 int ShuffleModulus = Mask.size() * (IsSingleInput ? 1 : 2);
10819 assert(isPowerOf2_32((uint32_t)ShuffleModulus) &&
10820 "We should only be called with masks with a power-of-2 size!");
10821
10822 uint64_t ModMask = (uint64_t)ShuffleModulus - 1;
10823 int Offset = MatchEven ? 0 : 1;
10824
10825 // We track whether the input is viable for all power-of-2 strides 2^1, 2^2,
10826 // and 2^3 simultaneously. This is because we may have ambiguity with
10827 // partially undef inputs.
10828 bool ViableForN[3] = {true, true, true};
10829
10830 for (int i = 0, e = Mask.size(); i < e; ++i) {
10831 // Ignore undef lanes, we'll optimistically collapse them to the pattern we
10832 // want.
10833 if (Mask[i] < 0)
10834 continue;
10835
10836 bool IsAnyViable = false;
10837 for (unsigned j = 0; j != std::size(ViableForN); ++j)
10838 if (ViableForN[j]) {
10839 uint64_t N = j + 1;
10840
10841 // The shuffle mask must be equal to (i * 2^N) % M.
10842 if ((uint64_t)(Mask[i] - Offset) == (((uint64_t)i << N) & ModMask))
10843 IsAnyViable = true;
10844 else
10845 ViableForN[j] = false;
10846 }
10847 // Early exit if we exhaust the possible powers of two.
10848 if (!IsAnyViable)
10849 break;
10850 }
10851
10852 for (unsigned j = 0; j != std::size(ViableForN); ++j)
10853 if (ViableForN[j])
10854 return j + 1;
10855
10856 // Return 0 as there is no viable power of two.
10857 return 0;
10858}
10859
10860// X86 has dedicated pack instructions that can handle specific truncation
10861// operations: PACKSS and PACKUS.
10862// Checks for compaction shuffle masks if MaxStages > 1.
10863// TODO: Add support for matching multiple PACKSS/PACKUS stages.
10864static bool matchShuffleWithPACK(MVT VT, MVT &SrcVT, SDValue &V1, SDValue &V2,
10865 unsigned &PackOpcode, ArrayRef<int> TargetMask,
10866 const SelectionDAG &DAG,
10867 const X86Subtarget &Subtarget,
10868 unsigned MaxStages = 1) {
10869 unsigned NumElts = VT.getVectorNumElements();
10870 unsigned BitSize = VT.getScalarSizeInBits();
10871 assert(0 < MaxStages && MaxStages <= 3 && (BitSize << MaxStages) <= 64 &&
10872 "Illegal maximum compaction");
10873
10874 auto MatchPACK = [&](SDValue N1, SDValue N2, MVT PackVT) {
10875 unsigned NumSrcBits = PackVT.getScalarSizeInBits();
10876 unsigned NumPackedBits = NumSrcBits - BitSize;
10877 N1 = peekThroughBitcasts(N1);
10878 N2 = peekThroughBitcasts(N2);
10879 unsigned NumBits1 = N1.getScalarValueSizeInBits();
10880 unsigned NumBits2 = N2.getScalarValueSizeInBits();
10881 bool IsZero1 = llvm::isNullOrNullSplat(N1, /*AllowUndefs*/ false);
10882 bool IsZero2 = llvm::isNullOrNullSplat(N2, /*AllowUndefs*/ false);
10883 if ((!N1.isUndef() && !IsZero1 && NumBits1 != NumSrcBits) ||
10884 (!N2.isUndef() && !IsZero2 && NumBits2 != NumSrcBits))
10885 return false;
10886 if (Subtarget.hasSSE41() || BitSize == 8) {
10887 APInt ZeroMask = APInt::getHighBitsSet(NumSrcBits, NumPackedBits);
10888 if ((N1.isUndef() || IsZero1 || DAG.MaskedValueIsZero(N1, ZeroMask)) &&
10889 (N2.isUndef() || IsZero2 || DAG.MaskedValueIsZero(N2, ZeroMask))) {
10890 V1 = N1;
10891 V2 = N2;
10892 SrcVT = PackVT;
10893 PackOpcode = X86ISD::PACKUS;
10894 return true;
10895 }
10896 }
10897 bool IsAllOnes1 = llvm::isAllOnesOrAllOnesSplat(N1, /*AllowUndefs*/ false);
10898 bool IsAllOnes2 = llvm::isAllOnesOrAllOnesSplat(N2, /*AllowUndefs*/ false);
10899 if ((N1.isUndef() || IsZero1 || IsAllOnes1 ||
10900 DAG.ComputeNumSignBits(N1) > NumPackedBits) &&
10901 (N2.isUndef() || IsZero2 || IsAllOnes2 ||
10902 DAG.ComputeNumSignBits(N2) > NumPackedBits)) {
10903 V1 = N1;
10904 V2 = N2;
10905 SrcVT = PackVT;
10906 PackOpcode = X86ISD::PACKSS;
10907 return true;
10908 }
10909 return false;
10910 };
10911
10912 // Attempt to match against wider and wider compaction patterns.
10913 for (unsigned NumStages = 1; NumStages <= MaxStages; ++NumStages) {
10914 MVT PackSVT = MVT::getIntegerVT(BitSize << NumStages);
10915 MVT PackVT = MVT::getVectorVT(PackSVT, NumElts >> NumStages);
10916
10917 // Try binary shuffle.
10918 SmallVector<int, 32> BinaryMask;
10919 createPackShuffleMask(VT, BinaryMask, false, NumStages);
10920 if (isTargetShuffleEquivalent(VT, TargetMask, BinaryMask, DAG, V1, V2))
10921 if (MatchPACK(V1, V2, PackVT))
10922 return true;
10923
10924 // Try unary shuffle.
10925 SmallVector<int, 32> UnaryMask;
10926 createPackShuffleMask(VT, UnaryMask, true, NumStages);
10927 if (isTargetShuffleEquivalent(VT, TargetMask, UnaryMask, DAG, V1))
10928 if (MatchPACK(V1, V1, PackVT))
10929 return true;
10930 }
10931
10932 return false;
10933}
10934
10935static SDValue lowerShuffleWithPACK(const SDLoc &DL, MVT VT, SDValue V1,
10936 SDValue V2, ArrayRef<int> Mask,
10937 const X86Subtarget &Subtarget,
10938 SelectionDAG &DAG) {
10939 MVT PackVT;
10940 unsigned PackOpcode;
10941 unsigned SizeBits = VT.getSizeInBits();
10942 unsigned EltBits = VT.getScalarSizeInBits();
10943 unsigned MaxStages = Log2_32(64 / EltBits);
10944 if (!matchShuffleWithPACK(VT, PackVT, V1, V2, PackOpcode, Mask, DAG,
10945 Subtarget, MaxStages))
10946 return SDValue();
10947
10948 unsigned CurrentEltBits = PackVT.getScalarSizeInBits();
10949 unsigned NumStages = Log2_32(CurrentEltBits / EltBits);
10950
10951 // Don't lower multi-stage packs on AVX512, truncation is better.
10952 if (NumStages != 1 && SizeBits == 128 && Subtarget.hasVLX())
10953 return SDValue();
10954
10955 // Pack to the largest type possible:
10956 // vXi64/vXi32 -> PACK*SDW and vXi16 -> PACK*SWB.
10957 unsigned MaxPackBits = 16;
10958 if (CurrentEltBits > 16 &&
10959 (PackOpcode == X86ISD::PACKSS || Subtarget.hasSSE41()))
10960 MaxPackBits = 32;
10961
10962 // Repeatedly pack down to the target size.
10963 SDValue Res;
10964 for (unsigned i = 0; i != NumStages; ++i) {
10965 unsigned SrcEltBits = std::min(MaxPackBits, CurrentEltBits);
10966 unsigned NumSrcElts = SizeBits / SrcEltBits;
10967 MVT SrcSVT = MVT::getIntegerVT(SrcEltBits);
10968 MVT DstSVT = MVT::getIntegerVT(SrcEltBits / 2);
10969 MVT SrcVT = MVT::getVectorVT(SrcSVT, NumSrcElts);
10970 MVT DstVT = MVT::getVectorVT(DstSVT, NumSrcElts * 2);
10971 Res = DAG.getNode(PackOpcode, DL, DstVT, DAG.getBitcast(SrcVT, V1),
10972 DAG.getBitcast(SrcVT, V2));
10973 V1 = V2 = Res;
10974 CurrentEltBits /= 2;
10975 }
10976 assert(Res && Res.getValueType() == VT &&
10977 "Failed to lower compaction shuffle");
10978 return Res;
10979}
10980
10981/// Try to emit a bitmask instruction for a shuffle.
10982///
10983/// This handles cases where we can model a blend exactly as a bitmask due to
10984/// one of the inputs being zeroable.
10985static SDValue lowerShuffleAsBitMask(const SDLoc &DL, MVT VT, SDValue V1,
10986 SDValue V2, ArrayRef<int> Mask,
10987 const APInt &Zeroable,
10988 const X86Subtarget &Subtarget,
10989 SelectionDAG &DAG) {
10990 MVT MaskVT = VT;
10991 MVT EltVT = VT.getVectorElementType();
10992 SDValue Zero, AllOnes;
10993 // Use f64 if i64 isn't legal.
10994 if (EltVT == MVT::i64 && !Subtarget.is64Bit()) {
10995 EltVT = MVT::f64;
10996 MaskVT = MVT::getVectorVT(EltVT, Mask.size());
10997 }
10998
10999 MVT LogicVT = VT;
11000 if (EltVT.isFloatingPoint()) {
11001 Zero = DAG.getConstantFP(0.0, DL, EltVT);
11002 APFloat AllOnesValue = APFloat::getAllOnesValue(EltVT.getFltSemantics());
11003 AllOnes = DAG.getConstantFP(AllOnesValue, DL, EltVT);
11004 LogicVT = MVT::getVectorVT(EltVT.changeTypeToInteger(), Mask.size());
11005 } else {
11006 Zero = DAG.getConstant(0, DL, EltVT);
11007 AllOnes = DAG.getAllOnesConstant(DL, EltVT);
11008 }
11009
11010 SmallVector<SDValue, 16> VMaskOps(Mask.size(), Zero);
11011 SDValue V;
11012 for (int i = 0, Size = Mask.size(); i < Size; ++i) {
11013 if (Zeroable[i])
11014 continue;
11015 if (Mask[i] % Size != i)
11016 return SDValue(); // Not a blend.
11017 if (!V)
11018 V = Mask[i] < Size ? V1 : V2;
11019 else if (V != (Mask[i] < Size ? V1 : V2))
11020 return SDValue(); // Can only let one input through the mask.
11021
11022 VMaskOps[i] = AllOnes;
11023 }
11024 if (!V)
11025 return SDValue(); // No non-zeroable elements!
11026
11027 SDValue VMask = DAG.getBuildVector(MaskVT, DL, VMaskOps);
11028 VMask = DAG.getBitcast(LogicVT, VMask);
11029 V = DAG.getBitcast(LogicVT, V);
11030 SDValue And = DAG.getNode(ISD::AND, DL, LogicVT, V, VMask);
11031 return DAG.getBitcast(VT, And);
11032}
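// Example: a v4i32 shuffle that keeps elements 0 and 1 of V1 and whose
// elements 2 and 3 are zeroable is lowered to the bitmask
// V1 & <-1, -1, 0, 0>.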
11033
11034/// Try to emit a blend instruction for a shuffle using bit math.
11035///
11036/// This is used as a fallback approach when first class blend instructions are
11037/// unavailable. Currently it is only suitable for integer vectors, but could
11038/// be generalized for floating point vectors if desirable.
11039static SDValue lowerShuffleAsBitBlend(const SDLoc &DL, MVT VT, SDValue V1,
11040 SDValue V2, ArrayRef<int> Mask,
11041 SelectionDAG &DAG) {
11042 assert(VT.isInteger() && "Only supports integer vector types!");
11043 MVT EltVT = VT.getVectorElementType();
11044 SDValue Zero = DAG.getConstant(0, DL, EltVT);
11045 SDValue AllOnes = DAG.getAllOnesConstant(DL, EltVT);
11046  SmallVector<SDValue, 16> MaskOps;
11047 for (int i = 0, Size = Mask.size(); i < Size; ++i) {
11048 if (Mask[i] >= 0 && Mask[i] != i && Mask[i] != i + Size)
11049 return SDValue(); // Shuffled input!
11050 MaskOps.push_back(Mask[i] < Size ? AllOnes : Zero);
11051 }
11052
11053 SDValue V1Mask = DAG.getBuildVector(VT, DL, MaskOps);
11054 return getBitSelect(DL, VT, V1, V2, V1Mask, DAG);
11055}
11056
11057static SDValue getVectorMaskingNode(SDValue Op, SDValue Mask,
11058 SDValue PreservedSrc,
11059 const X86Subtarget &Subtarget,
11060 SelectionDAG &DAG);
11061
11062static bool matchShuffleAsBlend(MVT VT, SDValue V1, SDValue V2,
11063                                MutableArrayRef<int> Mask,
11064 const APInt &Zeroable, bool &ForceV1Zero,
11065 bool &ForceV2Zero, uint64_t &BlendMask) {
11066 bool V1IsZeroOrUndef =
11067      V1.isUndef() || ISD::isBuildVectorAllZeros(V1.getNode());
11068 bool V2IsZeroOrUndef =
11069      V2.isUndef() || ISD::isBuildVectorAllZeros(V2.getNode());
11070
11071 BlendMask = 0;
11072 ForceV1Zero = false, ForceV2Zero = false;
11073 assert(Mask.size() <= 64 && "Shuffle mask too big for blend mask");
11074
11075 int NumElts = Mask.size();
11076 int NumLanes = VT.getSizeInBits() / 128;
11077 int NumEltsPerLane = NumElts / NumLanes;
11078 assert((NumLanes * NumEltsPerLane) == NumElts && "Value type mismatch");
11079
11080 // For 32/64-bit elements, if we only reference one input (plus any undefs),
11081 // then ensure the blend mask part for that lane just references that input.
11082 bool ForceWholeLaneMasks =
11083 VT.is256BitVector() && VT.getScalarSizeInBits() >= 32;
11084
11085 // Attempt to generate the binary blend mask. If an input is zero then
11086 // we can use any lane.
11087 for (int Lane = 0; Lane != NumLanes; ++Lane) {
11088 // Keep track of the inputs used per lane.
11089 bool LaneV1InUse = false;
11090 bool LaneV2InUse = false;
11091 uint64_t LaneBlendMask = 0;
11092 for (int LaneElt = 0; LaneElt != NumEltsPerLane; ++LaneElt) {
11093 int Elt = (Lane * NumEltsPerLane) + LaneElt;
11094 int M = Mask[Elt];
11095 if (M == SM_SentinelUndef)
11096 continue;
11097 if (M == Elt || (0 <= M && M < NumElts &&
11098 IsElementEquivalent(NumElts, V1, V1, M, Elt))) {
11099 Mask[Elt] = Elt;
11100 LaneV1InUse = true;
11101 continue;
11102 }
11103 if (M == (Elt + NumElts) ||
11104 (NumElts <= M &&
11105 IsElementEquivalent(NumElts, V2, V2, M - NumElts, Elt))) {
11106 LaneBlendMask |= 1ull << LaneElt;
11107 Mask[Elt] = Elt + NumElts;
11108 LaneV2InUse = true;
11109 continue;
11110 }
11111 if (Zeroable[Elt]) {
11112 if (V1IsZeroOrUndef) {
11113 ForceV1Zero = true;
11114 Mask[Elt] = Elt;
11115 LaneV1InUse = true;
11116 continue;
11117 }
11118 if (V2IsZeroOrUndef) {
11119 ForceV2Zero = true;
11120 LaneBlendMask |= 1ull << LaneElt;
11121 Mask[Elt] = Elt + NumElts;
11122 LaneV2InUse = true;
11123 continue;
11124 }
11125 }
11126 return false;
11127 }
11128
11129 // If we only used V2 then splat the lane blend mask to avoid any demanded
11130 // elts from V1 in this lane (the V1 equivalent is implicit with a zero
11131 // blend mask bit).
11132 if (ForceWholeLaneMasks && LaneV2InUse && !LaneV1InUse)
11133 LaneBlendMask = (1ull << NumEltsPerLane) - 1;
11134
11135 BlendMask |= LaneBlendMask << (Lane * NumEltsPerLane);
11136 }
11137 return true;
11138}
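// Example: the v4i32 mask <0, 5, 2, 7> is a blend that takes elements 0 and 2
// from V1 and elements 1 and 3 from V2, producing BlendMask == 0b1010.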
11139
11140/// Try to emit a blend instruction for a shuffle.
11141///
11142/// This doesn't do any checks for the availability of instructions for blending
11143/// these values. It relies on the availability of the X86ISD::BLENDI pattern to
11144/// be matched in the backend with the type given. What it does check for is
11145/// that the shuffle mask is a blend, or convertible into a blend with zero.
11146static SDValue lowerShuffleAsBlend(const SDLoc &DL, MVT VT, SDValue V1,
11147 SDValue V2, ArrayRef<int> Original,
11148 const APInt &Zeroable,
11149 const X86Subtarget &Subtarget,
11150 SelectionDAG &DAG) {
11151 uint64_t BlendMask = 0;
11152 bool ForceV1Zero = false, ForceV2Zero = false;
11153 SmallVector<int, 64> Mask(Original);
11154 if (!matchShuffleAsBlend(VT, V1, V2, Mask, Zeroable, ForceV1Zero, ForceV2Zero,
11155 BlendMask))
11156 return SDValue();
11157
11158 // Create a REAL zero vector - ISD::isBuildVectorAllZeros allows UNDEFs.
11159 if (ForceV1Zero)
11160 V1 = getZeroVector(VT, Subtarget, DAG, DL);
11161 if (ForceV2Zero)
11162 V2 = getZeroVector(VT, Subtarget, DAG, DL);
11163
11164 unsigned NumElts = VT.getVectorNumElements();
11165
11166 switch (VT.SimpleTy) {
11167 case MVT::v4i64:
11168 case MVT::v8i32:
11169 assert(Subtarget.hasAVX2() && "256-bit integer blends require AVX2!");
11170 [[fallthrough]];
11171 case MVT::v4f64:
11172 case MVT::v8f32:
11173 assert(Subtarget.hasAVX() && "256-bit float blends require AVX!");
11174 [[fallthrough]];
11175 case MVT::v2f64:
11176 case MVT::v2i64:
11177 case MVT::v4f32:
11178 case MVT::v4i32:
11179 case MVT::v8i16:
11180 assert(Subtarget.hasSSE41() && "128-bit blends require SSE41!");
11181 return DAG.getNode(X86ISD::BLENDI, DL, VT, V1, V2,
11182 DAG.getTargetConstant(BlendMask, DL, MVT::i8));
11183 case MVT::v16i16: {
11184 assert(Subtarget.hasAVX2() && "v16i16 blends require AVX2!");
11185 SmallVector<int, 8> RepeatedMask;
11186 if (is128BitLaneRepeatedShuffleMask(MVT::v16i16, Mask, RepeatedMask)) {
11187 // We can lower these with PBLENDW which is mirrored across 128-bit lanes.
11188 assert(RepeatedMask.size() == 8 && "Repeated mask size doesn't match!");
11189 BlendMask = 0;
11190 for (int i = 0; i < 8; ++i)
11191 if (RepeatedMask[i] >= 8)
11192 BlendMask |= 1ull << i;
11193 return DAG.getNode(X86ISD::BLENDI, DL, MVT::v16i16, V1, V2,
11194 DAG.getTargetConstant(BlendMask, DL, MVT::i8));
11195 }
11196 // Use PBLENDW for lower/upper lanes and then blend lanes.
11197 // TODO - we should allow 2 PBLENDW here and leave shuffle combine to
11198 // merge to VSELECT where useful.
11199 uint64_t LoMask = BlendMask & 0xFF;
11200 uint64_t HiMask = (BlendMask >> 8) & 0xFF;
11201 if (LoMask == 0 || LoMask == 255 || HiMask == 0 || HiMask == 255) {
11202 SDValue Lo = DAG.getNode(X86ISD::BLENDI, DL, MVT::v16i16, V1, V2,
11203 DAG.getTargetConstant(LoMask, DL, MVT::i8));
11204 SDValue Hi = DAG.getNode(X86ISD::BLENDI, DL, MVT::v16i16, V1, V2,
11205 DAG.getTargetConstant(HiMask, DL, MVT::i8));
11206 return DAG.getVectorShuffle(
11207 MVT::v16i16, DL, Lo, Hi,
11208 {0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31});
11209 }
11210 [[fallthrough]];
11211 }
11212 case MVT::v32i8:
11213 assert(Subtarget.hasAVX2() && "256-bit byte-blends require AVX2!");
11214 [[fallthrough]];
11215 case MVT::v16i8: {
11216 assert(Subtarget.hasSSE41() && "128-bit byte-blends require SSE41!");
11217
11218 // Attempt to lower to a bitmask if we can. VPAND is faster than VPBLENDVB.
11219 if (SDValue Masked = lowerShuffleAsBitMask(DL, VT, V1, V2, Mask, Zeroable,
11220 Subtarget, DAG))
11221 return Masked;
11222
11223 if (Subtarget.hasBWI() && Subtarget.hasVLX()) {
11224 MVT IntegerType = MVT::getIntegerVT(std::max<unsigned>(NumElts, 8));
11225 SDValue MaskNode = DAG.getConstant(BlendMask, DL, IntegerType);
11226 return getVectorMaskingNode(V2, MaskNode, V1, Subtarget, DAG);
11227 }
11228
11229 // If we have VPTERNLOG, we can use that as a bit blend.
11230 if (Subtarget.hasVLX())
11231 if (SDValue BitBlend =
11232 lowerShuffleAsBitBlend(DL, VT, V1, V2, Mask, DAG))
11233 return BitBlend;
11234
11235 // Scale the blend by the number of bytes per element.
11236 int Scale = VT.getScalarSizeInBits() / 8;
11237
11238 // This form of blend is always done on bytes. Compute the byte vector
11239 // type.
11240 MVT BlendVT = MVT::getVectorVT(MVT::i8, VT.getSizeInBits() / 8);
11241
11242 // x86 allows load folding with blendvb from the 2nd source operand. But
11243 // we are still using LLVM select here (see comment below), so that's V1.
11244 // If V2 can be load-folded and V1 cannot be load-folded, then commute to
11245 // allow that load-folding possibility.
11246 if (!ISD::isNormalLoad(V1.getNode()) && ISD::isNormalLoad(V2.getNode())) {
11247      ShuffleVectorSDNode::commuteMask(Mask);
11248 std::swap(V1, V2);
11249 }
11250
11251 // Compute the VSELECT mask. Note that VSELECT is really confusing in the
11252 // mix of LLVM's code generator and the x86 backend. We tell the code
11253 // generator that boolean values in the elements of an x86 vector register
11254 // are -1 for true and 0 for false. We then use the LLVM semantics of 'true'
11255 // mapping a select to operand #1, and 'false' mapping to operand #2. The
11256 // reality in x86 is that vector masks (pre-AVX-512) use only the high bit
11257 // of the element (the remaining are ignored) and 0 in that high bit would
11258 // mean operand #1 while 1 in the high bit would mean operand #2. So while
11259 // the LLVM model for boolean values in vector elements gets the relevant
11260 // bit set, it is set backwards and over constrained relative to x86's
11261 // actual model.
11262 SmallVector<SDValue, 32> VSELECTMask;
11263 for (int i = 0, Size = Mask.size(); i < Size; ++i)
11264 for (int j = 0; j < Scale; ++j)
11265 VSELECTMask.push_back(
11266 Mask[i] < 0
11267 ? DAG.getUNDEF(MVT::i8)
11268 : DAG.getSignedConstant(Mask[i] < Size ? -1 : 0, DL, MVT::i8));
11269
11270 V1 = DAG.getBitcast(BlendVT, V1);
11271 V2 = DAG.getBitcast(BlendVT, V2);
11272 return DAG.getBitcast(
11273 VT,
11274 DAG.getSelect(DL, BlendVT, DAG.getBuildVector(BlendVT, DL, VSELECTMask),
11275 V1, V2));
11276 }
11277 case MVT::v16f32:
11278 case MVT::v8f64:
11279 case MVT::v8i64:
11280 case MVT::v16i32:
11281 case MVT::v32i16:
11282 case MVT::v64i8: {
11283 // Attempt to lower to a bitmask if we can. Only if not optimizing for size.
11284 bool OptForSize = DAG.shouldOptForSize();
11285 if (!OptForSize) {
11286 if (SDValue Masked = lowerShuffleAsBitMask(DL, VT, V1, V2, Mask, Zeroable,
11287 Subtarget, DAG))
11288 return Masked;
11289 }
11290
11291 // Otherwise load an immediate into a GPR, cast to k-register, and use a
11292 // masked move.
11293 MVT IntegerType = MVT::getIntegerVT(std::max<unsigned>(NumElts, 8));
11294 SDValue MaskNode = DAG.getConstant(BlendMask, DL, IntegerType);
11295 return getVectorMaskingNode(V2, MaskNode, V1, Subtarget, DAG);
11296 }
11297 default:
11298 llvm_unreachable("Not a supported integer vector type!");
11299 }
11300}
11301
11302/// Try to lower as a blend of elements from two inputs followed by
11303/// a single-input permutation.
11304///
11305/// This matches the pattern where we can blend elements from two inputs and
11306/// then reduce the shuffle to a single-input permutation.
11307static SDValue lowerShuffleAsBlendAndPermute(const SDLoc &DL, MVT VT,
11308 SDValue V1, SDValue V2,
11309 ArrayRef<int> Mask,
11310 SelectionDAG &DAG,
11311 bool ImmBlends = false) {
11312 // We build up the blend mask while checking whether a blend is a viable way
11313 // to reduce the shuffle.
11314 SmallVector<int, 32> BlendMask(Mask.size(), -1);
11315 SmallVector<int, 32> PermuteMask(Mask.size(), -1);
11316
11317 for (int i = 0, Size = Mask.size(); i < Size; ++i) {
11318 if (Mask[i] < 0)
11319 continue;
11320
11321 assert(Mask[i] < Size * 2 && "Shuffle input is out of bounds.");
11322
11323 if (BlendMask[Mask[i] % Size] < 0)
11324 BlendMask[Mask[i] % Size] = Mask[i];
11325 else if (BlendMask[Mask[i] % Size] != Mask[i])
11326 return SDValue(); // Can't blend in the needed input!
11327
11328 PermuteMask[i] = Mask[i] % Size;
11329 }
11330
11331 // If only immediate blends, then bail if the blend mask can't be widened to
11332 // i16.
11333 unsigned EltSize = VT.getScalarSizeInBits();
11334 if (ImmBlends && EltSize == 8 && !canWidenShuffleElements(BlendMask))
11335 return SDValue();
11336
11337 SDValue V = DAG.getVectorShuffle(VT, DL, V1, V2, BlendMask);
11338 return DAG.getVectorShuffle(VT, DL, V, DAG.getUNDEF(VT), PermuteMask);
11339}
11340
11341/// Try to lower as an unpack of elements from two inputs followed by
11342/// a single-input permutation.
11343///
11344/// This matches the pattern where we can unpack elements from two inputs and
11345/// then reduce the shuffle to a single-input (wider) permutation.
11346static SDValue lowerShuffleAsUNPCKAndPermute(const SDLoc &DL, MVT VT,
11347 SDValue V1, SDValue V2,
11348 ArrayRef<int> Mask,
11349 SelectionDAG &DAG) {
11350 int NumElts = Mask.size();
11351 int NumLanes = VT.getSizeInBits() / 128;
11352 int NumLaneElts = NumElts / NumLanes;
11353 int NumHalfLaneElts = NumLaneElts / 2;
11354
11355 bool MatchLo = true, MatchHi = true;
11356 SDValue Ops[2] = {DAG.getUNDEF(VT), DAG.getUNDEF(VT)};
11357
11358 // Determine UNPCKL/UNPCKH type and operand order.
11359 for (int Elt = 0; Elt != NumElts; ++Elt) {
11360 int M = Mask[Elt];
11361 if (M < 0)
11362 continue;
11363
11364 // Normalize the mask value depending on whether it's V1 or V2.
11365 int NormM = M;
11366 SDValue &Op = Ops[Elt & 1];
11367 if (M < NumElts && (Op.isUndef() || Op == V1))
11368 Op = V1;
11369 else if (NumElts <= M && (Op.isUndef() || Op == V2)) {
11370 Op = V2;
11371 NormM -= NumElts;
11372 } else
11373 return SDValue();
11374
11375 bool MatchLoAnyLane = false, MatchHiAnyLane = false;
11376 for (int Lane = 0; Lane != NumElts; Lane += NumLaneElts) {
11377 int Lo = Lane, Mid = Lane + NumHalfLaneElts, Hi = Lane + NumLaneElts;
11378 MatchLoAnyLane |= isUndefOrInRange(NormM, Lo, Mid);
11379 MatchHiAnyLane |= isUndefOrInRange(NormM, Mid, Hi);
11380 if (MatchLoAnyLane || MatchHiAnyLane) {
11381 assert((MatchLoAnyLane ^ MatchHiAnyLane) &&
11382 "Failed to match UNPCKLO/UNPCKHI");
11383 break;
11384 }
11385 }
11386 MatchLo &= MatchLoAnyLane;
11387 MatchHi &= MatchHiAnyLane;
11388 if (!MatchLo && !MatchHi)
11389 return SDValue();
11390 }
11391 assert((MatchLo ^ MatchHi) && "Failed to match UNPCKLO/UNPCKHI");
11392
11393 // Element indices have changed after unpacking. Calculate permute mask
11394 // so that they will be put back to the position as dictated by the
11395 // original shuffle mask indices.
11396 SmallVector<int, 32> PermuteMask(NumElts, -1);
11397 for (int Elt = 0; Elt != NumElts; ++Elt) {
11398 int M = Mask[Elt];
11399 if (M < 0)
11400 continue;
11401 int NormM = M;
11402 if (NumElts <= M)
11403 NormM -= NumElts;
11404 bool IsFirstOp = M < NumElts;
11405 int BaseMaskElt =
11406 NumLaneElts * (NormM / NumLaneElts) + (2 * (NormM % NumHalfLaneElts));
11407 if ((IsFirstOp && V1 == Ops[0]) || (!IsFirstOp && V2 == Ops[0]))
11408 PermuteMask[Elt] = BaseMaskElt;
11409 else if ((IsFirstOp && V1 == Ops[1]) || (!IsFirstOp && V2 == Ops[1]))
11410 PermuteMask[Elt] = BaseMaskElt + 1;
11411 assert(PermuteMask[Elt] != -1 &&
11412 "Input mask element is defined but failed to assign permute mask");
11413 }
11414
11415 unsigned UnpckOp = MatchLo ? X86ISD::UNPCKL : X86ISD::UNPCKH;
11416 SDValue Unpck = DAG.getNode(UnpckOp, DL, VT, Ops);
11417 return DAG.getVectorShuffle(VT, DL, Unpck, DAG.getUNDEF(VT), PermuteMask);
11418}
11419
11420/// Try to lower a shuffle as a permute of the inputs followed by an
11421/// UNPCK instruction.
11422///
11423/// This specifically targets cases where we end up with alternating between
11424/// the two inputs, and so can permute them into something that feeds a single
11425/// UNPCK instruction. Note that this routine only targets integer vectors
11426/// because for floating point vectors we have a generalized SHUFPS lowering
11427/// strategy that handles everything that doesn't *exactly* match an unpack,
11428/// making this clever lowering unnecessary.
11429 static SDValue lowerShuffleAsPermuteAndUnpack(const SDLoc &DL, MVT VT,
11430 SDValue V1, SDValue V2,
11431 ArrayRef<int> Mask,
11432 const X86Subtarget &Subtarget,
11433 SelectionDAG &DAG) {
11434 int Size = Mask.size();
11435 assert(Mask.size() >= 2 && "Single element masks are invalid.");
11436
11437 // This routine only supports 128-bit integer dual input vectors.
11438 if (VT.isFloatingPoint() || !VT.is128BitVector() || V2.isUndef())
11439 return SDValue();
11440
11441 int NumLoInputs =
11442 count_if(Mask, [Size](int M) { return M >= 0 && M % Size < Size / 2; });
11443 int NumHiInputs =
11444 count_if(Mask, [Size](int M) { return M % Size >= Size / 2; });
11445
11446 bool UnpackLo = NumLoInputs >= NumHiInputs;
11447
11448 auto TryUnpack = [&](int ScalarSize, int Scale) {
11449 SmallVector<int, 16> V1Mask((unsigned)Size, -1);
11450 SmallVector<int, 16> V2Mask((unsigned)Size, -1);
11451
11452 for (int i = 0; i < Size; ++i) {
11453 if (Mask[i] < 0)
11454 continue;
11455
11456 // Each element of the unpack contains Scale elements from this mask.
11457 int UnpackIdx = i / Scale;
11458
11459 // We only handle the case where V1 feeds the first slots of the unpack.
11460 // We rely on canonicalization to ensure this is the case.
11461 if ((UnpackIdx % 2 == 0) != (Mask[i] < Size))
11462 return SDValue();
11463
11464 // Setup the mask for this input. The indexing is tricky as we have to
11465 // handle the unpack stride.
11466 SmallVectorImpl<int> &VMask = (UnpackIdx % 2 == 0) ? V1Mask : V2Mask;
11467 VMask[(UnpackIdx / 2) * Scale + i % Scale + (UnpackLo ? 0 : Size / 2)] =
11468 Mask[i] % Size;
11469 }
11470
11471 // If we will have to shuffle both inputs to use the unpack, check whether
11472 // we can just unpack first and shuffle the result. If so, skip this unpack.
11473 if ((NumLoInputs == 0 || NumHiInputs == 0) && !isNoopShuffleMask(V1Mask) &&
11474 !isNoopShuffleMask(V2Mask))
11475 return SDValue();
11476
11477 // Shuffle the inputs into place.
11478 V1 = DAG.getVectorShuffle(VT, DL, V1, DAG.getUNDEF(VT), V1Mask);
11479 V2 = DAG.getVectorShuffle(VT, DL, V2, DAG.getUNDEF(VT), V2Mask);
11480
11481 // Cast the inputs to the type we will use to unpack them.
11482 MVT UnpackVT =
11483 MVT::getVectorVT(MVT::getIntegerVT(ScalarSize), Size / Scale);
11484 V1 = DAG.getBitcast(UnpackVT, V1);
11485 V2 = DAG.getBitcast(UnpackVT, V2);
11486
11487 // Unpack the inputs and cast the result back to the desired type.
11488 return DAG.getBitcast(
11489 VT, DAG.getNode(UnpackLo ? X86ISD::UNPCKL : X86ISD::UNPCKH, DL,
11490 UnpackVT, V1, V2));
11491 };
11492
11493 // We try each unpack from the largest to the smallest to try and find one
11494 // that fits this mask.
11495 int OrigScalarSize = VT.getScalarSizeInBits();
11496 for (int ScalarSize = 64; ScalarSize >= OrigScalarSize; ScalarSize /= 2)
11497 if (SDValue Unpack = TryUnpack(ScalarSize, ScalarSize / OrigScalarSize))
11498 return Unpack;
11499
11500 // If we're shuffling with a zero vector then we're better off not doing
11501 // VECTOR_SHUFFLE(UNPCK()) as we lose track of those zero elements.
11502 if (ISD::isBuildVectorAllZeros(V1.getNode()) ||
11503 ISD::isBuildVectorAllZeros(V2.getNode()))
11504 return SDValue();
11505
11506 // If none of the unpack-rooted lowerings worked (or were profitable) try an
11507 // initial unpack.
11508 if (NumLoInputs == 0 || NumHiInputs == 0) {
11509 assert((NumLoInputs > 0 || NumHiInputs > 0) &&
11510 "We have to have *some* inputs!");
11511 int HalfOffset = NumLoInputs == 0 ? Size / 2 : 0;
11512
11513 // FIXME: We could consider the total complexity of the permute of each
11514 // possible unpacking. Or at the least we should consider how many
11515 // half-crossings are created.
11516 // FIXME: We could consider commuting the unpacks.
11517
11518 SmallVector<int, 32> PermMask((unsigned)Size, -1);
11519 for (int i = 0; i < Size; ++i) {
11520 if (Mask[i] < 0)
11521 continue;
11522
11523 assert(Mask[i] % Size >= HalfOffset && "Found input from wrong half!");
11524
11525 PermMask[i] =
11526 2 * ((Mask[i] % Size) - HalfOffset) + (Mask[i] < Size ? 0 : 1);
11527 }
11528 return DAG.getVectorShuffle(
11529 VT, DL,
11530 DAG.getNode(NumLoInputs == 0 ? X86ISD::UNPCKH : X86ISD::UNPCKL, DL, VT,
11531 V1, V2),
11532 DAG.getUNDEF(VT), PermMask);
11533 }
11534
11535 return SDValue();
11536}
11537
11538/// Helper to form a PALIGNR-based rotate+permute, merging 2 inputs and then
11539/// permuting the elements of the result in place.
11540 static SDValue lowerShuffleAsByteRotateAndPermute(
11541 const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
11542 const X86Subtarget &Subtarget, SelectionDAG &DAG) {
11543 if ((VT.is128BitVector() && !Subtarget.hasSSSE3()) ||
11544 (VT.is256BitVector() && !Subtarget.hasAVX2()) ||
11545 (VT.is512BitVector() && !Subtarget.hasBWI()))
11546 return SDValue();
11547
11548 // We don't currently support lane crossing permutes.
11549 if (is128BitLaneCrossingShuffleMask(VT, Mask))
11550 return SDValue();
11551
11552 int Scale = VT.getScalarSizeInBits() / 8;
11553 int NumLanes = VT.getSizeInBits() / 128;
11554 int NumElts = VT.getVectorNumElements();
11555 int NumEltsPerLane = NumElts / NumLanes;
11556
11557 // Determine range of mask elts.
11558 bool Blend1 = true;
11559 bool Blend2 = true;
11560 std::pair<int, int> Range1 = std::make_pair(INT_MAX, INT_MIN);
11561 std::pair<int, int> Range2 = std::make_pair(INT_MAX, INT_MIN);
11562 for (int Lane = 0; Lane != NumElts; Lane += NumEltsPerLane) {
11563 for (int Elt = 0; Elt != NumEltsPerLane; ++Elt) {
11564 int M = Mask[Lane + Elt];
11565 if (M < 0)
11566 continue;
11567 if (M < NumElts) {
11568 Blend1 &= (M == (Lane + Elt));
11569 assert(Lane <= M && M < (Lane + NumEltsPerLane) && "Out of range mask");
11570 M = M % NumEltsPerLane;
11571 Range1.first = std::min(Range1.first, M);
11572 Range1.second = std::max(Range1.second, M);
11573 } else {
11574 M -= NumElts;
11575 Blend2 &= (M == (Lane + Elt));
11576 assert(Lane <= M && M < (Lane + NumEltsPerLane) && "Out of range mask");
11577 M = M % NumEltsPerLane;
11578 Range2.first = std::min(Range2.first, M);
11579 Range2.second = std::max(Range2.second, M);
11580 }
11581 }
11582 }
11583
11584 // Bail if we don't need both elements.
11585 // TODO - it might be worth doing this for unary shuffles if the permute
11586 // can be widened.
11587 if (!(0 <= Range1.first && Range1.second < NumEltsPerLane) ||
11588 !(0 <= Range2.first && Range2.second < NumEltsPerLane))
11589 return SDValue();
11590
11591 if (VT.getSizeInBits() > 128 && (Blend1 || Blend2))
11592 return SDValue();
11593
11594 // Rotate the 2 ops so we can access both ranges, then permute the result.
11595 auto RotateAndPermute = [&](SDValue Lo, SDValue Hi, int RotAmt, int Ofs) {
11596 MVT ByteVT = MVT::getVectorVT(MVT::i8, VT.getSizeInBits() / 8);
11597 SDValue Rotate = DAG.getBitcast(
11598 VT, DAG.getNode(X86ISD::PALIGNR, DL, ByteVT, DAG.getBitcast(ByteVT, Hi),
11599 DAG.getBitcast(ByteVT, Lo),
11600 DAG.getTargetConstant(Scale * RotAmt, DL, MVT::i8)));
11601 SmallVector<int, 64> PermMask(NumElts, SM_SentinelUndef);
11602 for (int Lane = 0; Lane != NumElts; Lane += NumEltsPerLane) {
11603 for (int Elt = 0; Elt != NumEltsPerLane; ++Elt) {
11604 int M = Mask[Lane + Elt];
11605 if (M < 0)
11606 continue;
11607 if (M < NumElts)
11608 PermMask[Lane + Elt] = Lane + ((M + Ofs - RotAmt) % NumEltsPerLane);
11609 else
11610 PermMask[Lane + Elt] = Lane + ((M - Ofs - RotAmt) % NumEltsPerLane);
11611 }
11612 }
11613 return DAG.getVectorShuffle(VT, DL, Rotate, DAG.getUNDEF(VT), PermMask);
11614 };
11615
11616 // Check if the ranges are small enough to rotate from either direction.
11617 if (Range2.second < Range1.first)
11618 return RotateAndPermute(V1, V2, Range1.first, 0);
11619 if (Range1.second < Range2.first)
11620 return RotateAndPermute(V2, V1, Range2.first, NumElts);
11621 return SDValue();
11622}
11623
11624 static bool isBroadcastShuffleMask(ArrayRef<int> Mask) {
11625 return isUndefOrEqual(Mask, 0);
11626}
11627
11628 static bool isNoopOrBroadcastShuffleMask(ArrayRef<int> Mask) {
11629 return isNoopShuffleMask(Mask) || isBroadcastShuffleMask(Mask);
11630}
11631
11632/// Check if the Mask consists of the same element repeated multiple times.
11633 static bool isSingleElementRepeatedMask(ArrayRef<int> Mask) {
11634 size_t NumUndefs = 0;
11635 std::optional<int> UniqueElt;
11636 for (int Elt : Mask) {
11637 if (Elt == SM_SentinelUndef) {
11638 NumUndefs++;
11639 continue;
11640 }
11641 if (UniqueElt.has_value() && UniqueElt.value() != Elt)
11642 return false;
11643 UniqueElt = Elt;
11644 }
11645 // Make sure the element is repeated enough times by checking that the number of
11646 // undefs is small.
11647 return NumUndefs <= Mask.size() / 2 && UniqueElt.has_value();
11648}
11649
11650/// Generic routine to decompose a shuffle and blend into independent
11651/// blends and permutes.
11652///
11653/// This matches the extremely common pattern for handling combined
11654/// shuffle+blend operations on newer X86 ISAs where we have very fast blend
11655/// operations. It will try to pick the best arrangement of shuffles and
11656/// blends. For vXi8/vXi16 shuffles we may use unpack instead of blend.
11657 static SDValue lowerShuffleAsDecomposedShuffleMerge(
11658 const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
11659 const APInt &Zeroable, const X86Subtarget &Subtarget, SelectionDAG &DAG) {
11660 int NumElts = Mask.size();
11661 int NumLanes = VT.getSizeInBits() / 128;
11662 int NumEltsPerLane = NumElts / NumLanes;
11663
11664 // Shuffle the input elements into the desired positions in V1 and V2 and
11665 // unpack/blend them together.
11666 bool IsAlternating = true;
11667 bool V1Zero = true, V2Zero = true;
11668 SmallVector<int, 32> V1Mask(NumElts, -1);
11669 SmallVector<int, 32> V2Mask(NumElts, -1);
11670 SmallVector<int, 32> FinalMask(NumElts, -1);
11671 for (int i = 0; i < NumElts; ++i) {
11672 int M = Mask[i];
11673 if (M >= 0 && M < NumElts) {
11674 V1Mask[i] = M;
11675 FinalMask[i] = i;
11676 V1Zero &= Zeroable[i];
11677 IsAlternating &= (i & 1) == 0;
11678 } else if (M >= NumElts) {
11679 V2Mask[i] = M - NumElts;
11680 FinalMask[i] = i + NumElts;
11681 V2Zero &= Zeroable[i];
11682 IsAlternating &= (i & 1) == 1;
11683 }
11684 }
11685
11686 // If we effectively only demand the 0'th element of \p Input (though not
11687 // necessarily only in the 0'th position), then broadcast said input,
11688 // and change \p InputMask to be a no-op (identity) mask.
11689 auto canonicalizeBroadcastableInput = [DL, VT, &Subtarget,
11690 &DAG](SDValue &Input,
11691 MutableArrayRef<int> InputMask) {
11692 unsigned EltSizeInBits = Input.getScalarValueSizeInBits();
11693 if (!Subtarget.hasAVX2() && (!Subtarget.hasAVX() || EltSizeInBits < 32 ||
11694 !X86::mayFoldLoad(Input, Subtarget)))
11695 return;
11696 if (isNoopShuffleMask(InputMask))
11697 return;
11698 assert(isBroadcastShuffleMask(InputMask) &&
11699 "Expected to demand only the 0'th element.");
11701 for (auto I : enumerate(InputMask)) {
11702 int &InputMaskElt = I.value();
11703 if (InputMaskElt >= 0)
11704 InputMaskElt = I.index();
11705 }
11706 };
11707
11708 // Currently, we may need to produce one shuffle per input, and blend results.
11709 // It is possible that the shuffle for one of the inputs is already a no-op.
11710 // See if we can simplify non-no-op shuffles into broadcasts,
11711 // which we consider to be strictly better than an arbitrary shuffle.
11712 if (isNoopOrBroadcastShuffleMask(V1Mask) &&
11713 isNoopOrBroadcastShuffleMask(V2Mask)) {
11714 canonicalizeBroadcastableInput(V1, V1Mask);
11715 canonicalizeBroadcastableInput(V2, V2Mask);
11716 }
11717
11718 // Try to lower with the simpler initial blend/unpack/rotate strategies unless
11719 // one of the input shuffles would be a no-op. We prefer to shuffle inputs as
11720 // the shuffle may be able to fold with a load or other benefit. However, when
11721 // we'll have to do 2x as many shuffles in order to achieve this, a 2-input
11722 // pre-shuffle first is a better strategy.
11723 if (!isNoopShuffleMask(V1Mask) && !isNoopShuffleMask(V2Mask)) {
11724 // Only prefer immediate blends to unpack/rotate.
11725 if (SDValue BlendPerm = lowerShuffleAsBlendAndPermute(DL, VT, V1, V2, Mask,
11726 DAG, true))
11727 return BlendPerm;
11728 // If either input vector provides only a single element which is repeated
11729 // multiple times, unpacking from both input vectors would generate worse
11730 // code. e.g. for
11731 // t5: v16i8 = vector_shuffle<16,0,16,1,16,2,16,3,16,4,16,5,16,6,16,7> t2, t4
11732 // it is better to process t4 first to create a vector of t4[0], then unpack
11733 // that vector with t2.
11734 if (!V1Zero && !V2Zero && !isSingleElementRepeatedMask(V1Mask) &&
11735 !isSingleElementRepeatedMask(V2Mask))
11736 if (SDValue UnpackPerm =
11737 lowerShuffleAsUNPCKAndPermute(DL, VT, V1, V2, Mask, DAG))
11738 return UnpackPerm;
11739 if (SDValue RotatePerm = lowerShuffleAsByteRotateAndPermute(
11740 DL, VT, V1, V2, Mask, Subtarget, DAG))
11741 return RotatePerm;
11742 // Unpack/rotate failed - try again with variable blends.
11743 if (SDValue BlendPerm = lowerShuffleAsBlendAndPermute(DL, VT, V1, V2, Mask,
11744 DAG))
11745 return BlendPerm;
11746 if (VT.getScalarSizeInBits() >= 32)
11747 if (SDValue PermUnpack = lowerShuffleAsPermuteAndUnpack(
11748 DL, VT, V1, V2, Mask, Subtarget, DAG))
11749 return PermUnpack;
11750 }
11751
11752 // If the final mask is an alternating blend of vXi8/vXi16, convert to an
11753 // UNPCKL(SHUFFLE, SHUFFLE) pattern.
11754 // TODO: It doesn't have to be alternating - but each lane mustn't have more
11755 // than half the elements coming from each source.
11756 if (IsAlternating && VT.getScalarSizeInBits() < 32) {
11757 V1Mask.assign(NumElts, -1);
11758 V2Mask.assign(NumElts, -1);
11759 FinalMask.assign(NumElts, -1);
11760 for (int i = 0; i != NumElts; i += NumEltsPerLane)
11761 for (int j = 0; j != NumEltsPerLane; ++j) {
11762 int M = Mask[i + j];
11763 if (M >= 0 && M < NumElts) {
11764 V1Mask[i + (j / 2)] = M;
11765 FinalMask[i + j] = i + (j / 2);
11766 } else if (M >= NumElts) {
11767 V2Mask[i + (j / 2)] = M - NumElts;
11768 FinalMask[i + j] = i + (j / 2) + NumElts;
11769 }
11770 }
11771 }
11772
11773 V1 = DAG.getVectorShuffle(VT, DL, V1, DAG.getUNDEF(VT), V1Mask);
11774 V2 = DAG.getVectorShuffle(VT, DL, V2, DAG.getUNDEF(VT), V2Mask);
11775 return DAG.getVectorShuffle(VT, DL, V1, V2, FinalMask);
11776}
11777
11778static int matchShuffleAsBitRotate(MVT &RotateVT, int EltSizeInBits,
11779 const X86Subtarget &Subtarget,
11780 ArrayRef<int> Mask) {
11781 assert(!isNoopShuffleMask(Mask) && "We shouldn't lower no-op shuffles!");
11782 assert(EltSizeInBits < 64 && "Can't rotate 64-bit integers");
11783
11784 // AVX512 only has vXi32/vXi64 rotates, so limit the rotation sub group size.
11785 int MinSubElts = Subtarget.hasAVX512() ? std::max(32 / EltSizeInBits, 2) : 2;
11786 int MaxSubElts = 64 / EltSizeInBits;
11787 unsigned RotateAmt, NumSubElts;
11788 if (!ShuffleVectorInst::isBitRotateMask(Mask, EltSizeInBits, MinSubElts,
11789 MaxSubElts, NumSubElts, RotateAmt))
11790 return -1;
11791 unsigned NumElts = Mask.size();
11792 MVT RotateSVT = MVT::getIntegerVT(EltSizeInBits * NumSubElts);
11793 RotateVT = MVT::getVectorVT(RotateSVT, NumElts / NumSubElts);
11794 return RotateAmt;
11795}
11796
11797/// Lower shuffle using X86ISD::VROTLI rotations.
11798 static SDValue lowerShuffleAsBitRotate(const SDLoc &DL, MVT VT, SDValue V1,
11799 ArrayRef<int> Mask,
11800 const X86Subtarget &Subtarget,
11801 SelectionDAG &DAG) {
11802 // Only XOP + AVX512 targets have bit rotation instructions.
11803 // If we at least have SSSE3 (PSHUFB) then we shouldn't attempt to use this.
11804 bool IsLegal =
11805 (VT.is128BitVector() && Subtarget.hasXOP()) || Subtarget.hasAVX512();
11806 if (!IsLegal && Subtarget.hasSSE3())
11807 return SDValue();
11808
11809 MVT RotateVT;
11810 int RotateAmt = matchShuffleAsBitRotate(RotateVT, VT.getScalarSizeInBits(),
11811 Subtarget, Mask);
11812 if (RotateAmt < 0)
11813 return SDValue();
11814
11815 // For pre-SSSE3 targets, if we are shuffling vXi8 elts then ISD::ROTL,
11816 // expanded to OR(SRL,SHL), will be more efficient, but if they can
11817 // widen to vXi16 or more then the existing lowering will be better.
11818 if (!IsLegal) {
11819 if ((RotateAmt % 16) == 0)
11820 return SDValue();
11821 // TODO: Use getTargetVShiftByConstNode.
11822 unsigned ShlAmt = RotateAmt;
11823 unsigned SrlAmt = RotateVT.getScalarSizeInBits() - RotateAmt;
11824 V1 = DAG.getBitcast(RotateVT, V1);
11825 SDValue SHL = DAG.getNode(X86ISD::VSHLI, DL, RotateVT, V1,
11826 DAG.getTargetConstant(ShlAmt, DL, MVT::i8));
11827 SDValue SRL = DAG.getNode(X86ISD::VSRLI, DL, RotateVT, V1,
11828 DAG.getTargetConstant(SrlAmt, DL, MVT::i8));
11829 SDValue Rot = DAG.getNode(ISD::OR, DL, RotateVT, SHL, SRL);
11830 return DAG.getBitcast(VT, Rot);
11831 }
11832
11833 SDValue Rot =
11834 DAG.getNode(X86ISD::VROTLI, DL, RotateVT, DAG.getBitcast(RotateVT, V1),
11835 DAG.getTargetConstant(RotateAmt, DL, MVT::i8));
11836 return DAG.getBitcast(VT, Rot);
11837}
11838
11839/// Try to match a vector shuffle as an element rotation.
11840///
11841 /// This is used to support PALIGNR for SSSE3 or VALIGND/Q for AVX512.
11842 static int matchShuffleAsElementRotate(SDValue &V1, SDValue &V2,
11843 ArrayRef<int> Mask) {
11844 int NumElts = Mask.size();
11845
11846 // We need to detect various ways of spelling a rotation:
11847 // [11, 12, 13, 14, 15, 0, 1, 2]
11848 // [-1, 12, 13, 14, -1, -1, 1, -1]
11849 // [-1, -1, -1, -1, -1, -1, 1, 2]
11850 // [ 3, 4, 5, 6, 7, 8, 9, 10]
11851 // [-1, 4, 5, 6, -1, -1, 9, -1]
11852 // [-1, 4, 5, 6, -1, -1, -1, -1]
11853 int Rotation = 0;
11854 SDValue Lo, Hi;
11855 for (int i = 0; i < NumElts; ++i) {
11856 int M = Mask[i];
11857 assert((M == SM_SentinelUndef || (0 <= M && M < (2*NumElts))) &&
11858 "Unexpected mask index.");
11859 if (M < 0)
11860 continue;
11861
11862 // Determine where a rotated vector would have started.
11863 int StartIdx = i - (M % NumElts);
11864 if (StartIdx == 0)
11865 // The identity rotation isn't interesting, stop.
11866 return -1;
11867
11868 // If we found the tail of a vector the rotation must be the missing
11869 // front. If we found the head of a vector, it must be how much of the
11870 // head.
11871 int CandidateRotation = StartIdx < 0 ? -StartIdx : NumElts - StartIdx;
11872
11873 if (Rotation == 0)
11874 Rotation = CandidateRotation;
11875 else if (Rotation != CandidateRotation)
11876 // The rotations don't match, so we can't match this mask.
11877 return -1;
11878
11879 // Compute which value this mask is pointing at.
11880 SDValue MaskV = M < NumElts ? V1 : V2;
11881
11882 // Compute which of the two target values this index should be assigned
11883 // to. This reflects whether the high elements are remaining or the low
11884 // elements are remaining.
11885 SDValue &TargetV = StartIdx < 0 ? Hi : Lo;
11886
11887 // Either set up this value if we've not encountered it before, or check
11888 // that it remains consistent.
11889 if (!TargetV)
11890 TargetV = MaskV;
11891 else if (TargetV != MaskV)
11892 // This may be a rotation, but it pulls from the inputs in some
11893 // unsupported interleaving.
11894 return -1;
11895 }
11896
11897 // Check that we successfully analyzed the mask, and normalize the results.
11898 assert(Rotation != 0 && "Failed to locate a viable rotation!");
11899 assert((Lo || Hi) && "Failed to find a rotated input vector!");
11900 if (!Lo)
11901 Lo = Hi;
11902 else if (!Hi)
11903 Hi = Lo;
11904
11905 V1 = Lo;
11906 V2 = Hi;
11907
11908 return Rotation;
11909}
11910
11911/// Try to lower a vector shuffle as a byte rotation.
11912///
11913/// SSSE3 has a generic PALIGNR instruction in x86 that will do an arbitrary
11914/// byte-rotation of the concatenation of two vectors; pre-SSSE3 can use
11915/// a PSRLDQ/PSLLDQ/POR pattern to get a similar effect. This routine will
11916 /// try to generically lower a vector shuffle through such a pattern. It
11917/// does not check for the profitability of lowering either as PALIGNR or
11918/// PSRLDQ/PSLLDQ/POR, only whether the mask is valid to lower in that form.
11919/// This matches shuffle vectors that look like:
11920///
11921/// v8i16 [11, 12, 13, 14, 15, 0, 1, 2]
11922///
11923/// Essentially it concatenates V1 and V2, shifts right by some number of
11924/// elements, and takes the low elements as the result. Note that while this is
11925/// specified as a *right shift* because x86 is little-endian, it is a *left
11926/// rotate* of the vector lanes.
11927 static int matchShuffleAsByteRotate(MVT VT, SDValue &V1, SDValue &V2,
11928 ArrayRef<int> Mask) {
11929 // Don't accept any shuffles with zero elements.
11930 if (isAnyZero(Mask))
11931 return -1;
11932
11933 // PALIGNR works on 128-bit lanes.
11934 SmallVector<int, 16> RepeatedMask;
11935 if (!is128BitLaneRepeatedShuffleMask(VT, Mask, RepeatedMask))
11936 return -1;
11937
11938 int Rotation = matchShuffleAsElementRotate(V1, V2, RepeatedMask);
11939 if (Rotation <= 0)
11940 return -1;
11941
11942 // PALIGNR rotates bytes, so we need to scale the
11943 // rotation based on how many bytes are in the vector lane.
11944 int NumElts = RepeatedMask.size();
11945 int Scale = 16 / NumElts;
11946 return Rotation * Scale;
11947}
11948
11949 static SDValue lowerShuffleAsByteRotate(const SDLoc &DL, MVT VT, SDValue V1,
11950 SDValue V2, ArrayRef<int> Mask,
11951 const X86Subtarget &Subtarget,
11952 SelectionDAG &DAG) {
11953 assert(!isNoopShuffleMask(Mask) && "We shouldn't lower no-op shuffles!");
11954
11955 SDValue Lo = V1, Hi = V2;
11956 int ByteRotation = matchShuffleAsByteRotate(VT, Lo, Hi, Mask);
11957 if (ByteRotation <= 0)
11958 return SDValue();
11959
11960 // Cast the inputs to i8 vector of correct length to match PALIGNR or
11961 // PSLLDQ/PSRLDQ.
11962 MVT ByteVT = MVT::getVectorVT(MVT::i8, VT.getSizeInBits() / 8);
11963 Lo = DAG.getBitcast(ByteVT, Lo);
11964 Hi = DAG.getBitcast(ByteVT, Hi);
11965
11966 // SSSE3 targets can use the palignr instruction.
11967 if (Subtarget.hasSSSE3()) {
11968 assert((!VT.is512BitVector() || Subtarget.hasBWI()) &&
11969 "512-bit PALIGNR requires BWI instructions");
11970 return DAG.getBitcast(
11971 VT, DAG.getNode(X86ISD::PALIGNR, DL, ByteVT, Lo, Hi,
11972 DAG.getTargetConstant(ByteRotation, DL, MVT::i8)));
11973 }
11974
11975 assert(VT.is128BitVector() &&
11976 "Rotate-based lowering only supports 128-bit lowering!");
11977 assert(Mask.size() <= 16 &&
11978 "Can shuffle at most 16 bytes in a 128-bit vector!");
11979 assert(ByteVT == MVT::v16i8 &&
11980 "SSE2 rotate lowering only needed for v16i8!");
11981
11982 // Default SSE2 implementation
11983 int LoByteShift = 16 - ByteRotation;
11984 int HiByteShift = ByteRotation;
11985
11986 SDValue LoShift =
11987 DAG.getNode(X86ISD::VSHLDQ, DL, MVT::v16i8, Lo,
11988 DAG.getTargetConstant(LoByteShift, DL, MVT::i8));
11989 SDValue HiShift =
11990 DAG.getNode(X86ISD::VSRLDQ, DL, MVT::v16i8, Hi,
11991 DAG.getTargetConstant(HiByteShift, DL, MVT::i8));
11992 return DAG.getBitcast(VT,
11993 DAG.getNode(ISD::OR, DL, MVT::v16i8, LoShift, HiShift));
11994}
11995
11996/// Try to lower a vector shuffle as a dword/qword rotation.
11997///
11998 /// AVX512 has VALIGND/VALIGNQ instructions that will do an arbitrary
11999/// rotation of the concatenation of two vectors; This routine will
12000 /// try to generically lower a vector shuffle through such a pattern.
12001///
12002/// Essentially it concatenates V1 and V2, shifts right by some number of
12003/// elements, and takes the low elements as the result. Note that while this is
12004/// specified as a *right shift* because x86 is little-endian, it is a *left
12005/// rotate* of the vector lanes.
12006 static SDValue lowerShuffleAsVALIGN(const SDLoc &DL, MVT VT, SDValue V1,
12007 SDValue V2, ArrayRef<int> Mask,
12008 const APInt &Zeroable,
12009 const X86Subtarget &Subtarget,
12010 SelectionDAG &DAG) {
12011 assert((VT.getScalarType() == MVT::i32 || VT.getScalarType() == MVT::i64) &&
12012 "Only 32-bit and 64-bit elements are supported!");
12013
12014 // 128/256-bit vectors are only supported with VLX.
12015 assert((Subtarget.hasVLX() || (!VT.is128BitVector() && !VT.is256BitVector()))
12016 && "VLX required for 128/256-bit vectors");
12017
12018 SDValue Lo = V1, Hi = V2;
12019 int Rotation = matchShuffleAsElementRotate(Lo, Hi, Mask);
12020 if (0 < Rotation)
12021 return DAG.getNode(X86ISD::VALIGN, DL, VT, Lo, Hi,
12022 DAG.getTargetConstant(Rotation, DL, MVT::i8));
12023
12024 // See if we can use VALIGN as a cross-lane version of VSHLDQ/VSRLDQ.
12025 // TODO: Pull this out as a matchShuffleAsElementShift helper?
12026 // TODO: We can probably make this more aggressive and use shift-pairs like
12027 // lowerShuffleAsByteShiftMask.
12028 unsigned NumElts = Mask.size();
12029 unsigned ZeroLo = Zeroable.countr_one();
12030 unsigned ZeroHi = Zeroable.countl_one();
12031 assert((ZeroLo + ZeroHi) < NumElts && "Zeroable shuffle detected");
12032 if (!ZeroLo && !ZeroHi)
12033 return SDValue();
12034
12035 if (ZeroLo) {
12036 SDValue Src = Mask[ZeroLo] < (int)NumElts ? V1 : V2;
12037 int Low = Mask[ZeroLo] < (int)NumElts ? 0 : NumElts;
12038 if (isSequentialOrUndefInRange(Mask, ZeroLo, NumElts - ZeroLo, Low))
12039 return DAG.getNode(X86ISD::VALIGN, DL, VT, Src,
12040 getZeroVector(VT, Subtarget, DAG, DL),
12041 DAG.getTargetConstant(NumElts - ZeroLo, DL, MVT::i8));
12042 }
12043
12044 if (ZeroHi) {
12045 SDValue Src = Mask[0] < (int)NumElts ? V1 : V2;
12046 int Low = Mask[0] < (int)NumElts ? 0 : NumElts;
12047 if (isSequentialOrUndefInRange(Mask, 0, NumElts - ZeroHi, Low + ZeroHi))
12048 return DAG.getNode(X86ISD::VALIGN, DL, VT,
12049 getZeroVector(VT, Subtarget, DAG, DL), Src,
12050 DAG.getTargetConstant(ZeroHi, DL, MVT::i8));
12051 }
12052
12053 return SDValue();
12054}
12055
12056/// Try to lower a vector shuffle as a byte shift sequence.
12057 static SDValue lowerShuffleAsByteShiftMask(const SDLoc &DL, MVT VT, SDValue V1,
12058 SDValue V2, ArrayRef<int> Mask,
12059 const APInt &Zeroable,
12060 const X86Subtarget &Subtarget,
12061 SelectionDAG &DAG) {
12062 assert(!isNoopShuffleMask(Mask) && "We shouldn't lower no-op shuffles!");
12063 assert(VT.is128BitVector() && "Only 128-bit vectors supported");
12064
12065 // We need a shuffle that has zeros at one/both ends and a sequential
12066 // shuffle from one source within.
12067 unsigned ZeroLo = Zeroable.countr_one();
12068 unsigned ZeroHi = Zeroable.countl_one();
12069 if (!ZeroLo && !ZeroHi)
12070 return SDValue();
12071
12072 unsigned NumElts = Mask.size();
12073 unsigned Len = NumElts - (ZeroLo + ZeroHi);
12074 if (!isSequentialOrUndefInRange(Mask, ZeroLo, Len, Mask[ZeroLo]))
12075 return SDValue();
12076
12077 unsigned Scale = VT.getScalarSizeInBits() / 8;
12078 ArrayRef<int> StubMask = Mask.slice(ZeroLo, Len);
12079 if (!isUndefOrInRange(StubMask, 0, NumElts) &&
12080 !isUndefOrInRange(StubMask, NumElts, 2 * NumElts))
12081 return SDValue();
12082
12083 SDValue Res = Mask[ZeroLo] < (int)NumElts ? V1 : V2;
12084 Res = DAG.getBitcast(MVT::v16i8, Res);
12085
12086 // Use VSHLDQ/VSRLDQ ops to zero the ends of a vector and leave an
12087 // inner sequential set of elements, possibly offset:
12088 // 01234567 --> zzzzzz01 --> 1zzzzzzz
12089 // 01234567 --> 4567zzzz --> zzzzz456
12090 // 01234567 --> z0123456 --> 3456zzzz --> zz3456zz
12091 if (ZeroLo == 0) {
12092 unsigned Shift = (NumElts - 1) - (Mask[ZeroLo + Len - 1] % NumElts);
12093 Res = DAG.getNode(X86ISD::VSHLDQ, DL, MVT::v16i8, Res,
12094 DAG.getTargetConstant(Scale * Shift, DL, MVT::i8));
12095 Res = DAG.getNode(X86ISD::VSRLDQ, DL, MVT::v16i8, Res,
12096 DAG.getTargetConstant(Scale * ZeroHi, DL, MVT::i8));
12097 } else if (ZeroHi == 0) {
12098 unsigned Shift = Mask[ZeroLo] % NumElts;
12099 Res = DAG.getNode(X86ISD::VSRLDQ, DL, MVT::v16i8, Res,
12100 DAG.getTargetConstant(Scale * Shift, DL, MVT::i8));
12101 Res = DAG.getNode(X86ISD::VSHLDQ, DL, MVT::v16i8, Res,
12102 DAG.getTargetConstant(Scale * ZeroLo, DL, MVT::i8));
12103 } else if (!Subtarget.hasSSSE3()) {
12104 // If we don't have PSHUFB then it's worth avoiding an AND constant mask
12105 // by performing 3 byte shifts. Shuffle combining can kick in above that.
12106 // TODO: There may be some cases where VSH{LR}DQ+PAND is still better.
12107 unsigned Shift = (NumElts - 1) - (Mask[ZeroLo + Len - 1] % NumElts);
12108 Res = DAG.getNode(X86ISD::VSHLDQ, DL, MVT::v16i8, Res,
12109 DAG.getTargetConstant(Scale * Shift, DL, MVT::i8));
12110 Shift += Mask[ZeroLo] % NumElts;
12111 Res = DAG.getNode(X86ISD::VSRLDQ, DL, MVT::v16i8, Res,
12112 DAG.getTargetConstant(Scale * Shift, DL, MVT::i8));
12113 Res = DAG.getNode(X86ISD::VSHLDQ, DL, MVT::v16i8, Res,
12114 DAG.getTargetConstant(Scale * ZeroLo, DL, MVT::i8));
12115 } else
12116 return SDValue();
12117
12118 return DAG.getBitcast(VT, Res);
12119}
12120
12121/// Try to lower a vector shuffle as a bit shift (shifts in zeros).
12122///
12123/// Attempts to match a shuffle mask against the PSLL(W/D/Q/DQ) and
12124/// PSRL(W/D/Q/DQ) SSE2 and AVX2 logical bit-shift instructions. The function
12125/// matches elements from one of the input vectors shuffled to the left or
12126/// right with zeroable elements 'shifted in'. It handles both the strictly
12127/// bit-wise element shifts and the byte shift across an entire 128-bit double
12128/// quad word lane.
12129///
12130/// PSHL : (little-endian) left bit shift.
12131/// [ zz, 0, zz, 2 ]
12132/// [ -1, 4, zz, -1 ]
12133/// PSRL : (little-endian) right bit shift.
12134/// [ 1, zz, 3, zz]
12135/// [ -1, -1, 7, zz]
12136/// PSLLDQ : (little-endian) left byte shift
12137/// [ zz, 0, 1, 2, 3, 4, 5, 6]
12138/// [ zz, zz, -1, -1, 2, 3, 4, -1]
12139/// [ zz, zz, zz, zz, zz, zz, -1, 1]
12140/// PSRLDQ : (little-endian) right byte shift
12141/// [ 5, 6, 7, zz, zz, zz, zz, zz]
12142/// [ -1, 5, 6, 7, zz, zz, zz, zz]
12143/// [ 1, 2, -1, -1, -1, -1, zz, zz]
12144static int matchShuffleAsShift(MVT &ShiftVT, unsigned &Opcode,
12145 unsigned ScalarSizeInBits, ArrayRef<int> Mask,
12146 int MaskOffset, const APInt &Zeroable,
12147 const X86Subtarget &Subtarget) {
12148 int Size = Mask.size();
12149 unsigned SizeInBits = Size * ScalarSizeInBits;
12150
12151 auto CheckZeros = [&](int Shift, int Scale, bool Left) {
12152 for (int i = 0; i < Size; i += Scale)
12153 for (int j = 0; j < Shift; ++j)
12154 if (!Zeroable[i + j + (Left ? 0 : (Scale - Shift))])
12155 return false;
12156
12157 return true;
12158 };
12159
12160 auto MatchShift = [&](int Shift, int Scale, bool Left) {
12161 for (int i = 0; i != Size; i += Scale) {
12162 unsigned Pos = Left ? i + Shift : i;
12163 unsigned Low = Left ? i : i + Shift;
12164 unsigned Len = Scale - Shift;
12165 if (!isSequentialOrUndefInRange(Mask, Pos, Len, Low + MaskOffset))
12166 return -1;
12167 }
12168
12169 int ShiftEltBits = ScalarSizeInBits * Scale;
12170 bool ByteShift = ShiftEltBits > 64;
12171 Opcode = Left ? (ByteShift ? X86ISD::VSHLDQ : X86ISD::VSHLI)
12172 : (ByteShift ? X86ISD::VSRLDQ : X86ISD::VSRLI);
12173 int ShiftAmt = Shift * ScalarSizeInBits / (ByteShift ? 8 : 1);
12174
12175 // Normalize the scale for byte shifts to still produce an i64 element
12176 // type.
12177 Scale = ByteShift ? Scale / 2 : Scale;
12178
12179 // We need to round trip through the appropriate type for the shift.
12180 MVT ShiftSVT = MVT::getIntegerVT(ScalarSizeInBits * Scale);
12181 ShiftVT = ByteShift ? MVT::getVectorVT(MVT::i8, SizeInBits / 8)
12182 : MVT::getVectorVT(ShiftSVT, Size / Scale);
12183 return (int)ShiftAmt;
12184 };
12185
12186 // SSE/AVX supports logical shifts up to 64-bit integers - so we can just
12187 // keep doubling the size of the integer elements up to that. We can
12188 // then shift the elements of the integer vector by whole multiples of
12189 // their width within the elements of the larger integer vector. Test each
12190 // multiple to see if we can find a match with the moved element indices
12191 // and that the shifted in elements are all zeroable.
12192 unsigned MaxWidth = ((SizeInBits == 512) && !Subtarget.hasBWI() ? 64 : 128);
12193 for (int Scale = 2; Scale * ScalarSizeInBits <= MaxWidth; Scale *= 2)
12194 for (int Shift = 1; Shift != Scale; ++Shift)
12195 for (bool Left : {true, false})
12196 if (CheckZeros(Shift, Scale, Left)) {
12197 int ShiftAmt = MatchShift(Shift, Scale, Left);
12198 if (0 < ShiftAmt)
12199 return ShiftAmt;
12200 }
12201
12202 // no match
12203 return -1;
12204}
12205
12206 static SDValue lowerShuffleAsShift(const SDLoc &DL, MVT VT, SDValue V1,
12207 SDValue V2, ArrayRef<int> Mask,
12208 const APInt &Zeroable,
12209 const X86Subtarget &Subtarget,
12210 SelectionDAG &DAG, bool BitwiseOnly) {
12211 int Size = Mask.size();
12212 assert(Size == (int)VT.getVectorNumElements() && "Unexpected mask size");
12213
12214 MVT ShiftVT;
12215 SDValue V = V1;
12216 unsigned Opcode;
12217
12218 // Try to match shuffle against V1 shift.
12219 int ShiftAmt = matchShuffleAsShift(ShiftVT, Opcode, VT.getScalarSizeInBits(),
12220 Mask, 0, Zeroable, Subtarget);
12221
12222 // If V1 failed, try to match shuffle against V2 shift.
12223 if (ShiftAmt < 0) {
12224 ShiftAmt = matchShuffleAsShift(ShiftVT, Opcode, VT.getScalarSizeInBits(),
12225 Mask, Size, Zeroable, Subtarget);
12226 V = V2;
12227 }
12228
12229 if (ShiftAmt < 0)
12230 return SDValue();
12231
12232 if (BitwiseOnly && (Opcode == X86ISD::VSHLDQ || Opcode == X86ISD::VSRLDQ))
12233 return SDValue();
12234
12235 assert(DAG.getTargetLoweringInfo().isTypeLegal(ShiftVT) &&
12236 "Illegal integer vector type");
12237 V = DAG.getBitcast(ShiftVT, V);
12238 V = DAG.getNode(Opcode, DL, ShiftVT, V,
12239 DAG.getTargetConstant(ShiftAmt, DL, MVT::i8));
12240 return DAG.getBitcast(VT, V);
12241}
12242
12243// EXTRQ: Extract Len elements from lower half of source, starting at Idx.
12244// Remainder of lower half result is zero and upper half is all undef.
12245static bool matchShuffleAsEXTRQ(MVT VT, SDValue &V1, SDValue &V2,
12246 ArrayRef<int> Mask, uint64_t &BitLen,
12247 uint64_t &BitIdx, const APInt &Zeroable) {
12248 int Size = Mask.size();
12249 int HalfSize = Size / 2;
12250 assert(Size == (int)VT.getVectorNumElements() && "Unexpected mask size");
12251 assert(!Zeroable.isAllOnes() && "Fully zeroable shuffle mask");
12252
12253 // Upper half must be undefined.
12254 if (!isUndefUpperHalf(Mask))
12255 return false;
12256
12257 // Determine the extraction length from the part of the
12258 // lower half that isn't zeroable.
12259 int Len = HalfSize;
12260 for (; Len > 0; --Len)
12261 if (!Zeroable[Len - 1])
12262 break;
12263 assert(Len > 0 && "Zeroable shuffle mask");
12264
12265 // Attempt to match first Len sequential elements from the lower half.
12266 SDValue Src;
12267 int Idx = -1;
12268 for (int i = 0; i != Len; ++i) {
12269 int M = Mask[i];
12270 if (M == SM_SentinelUndef)
12271 continue;
12272 SDValue &V = (M < Size ? V1 : V2);
12273 M = M % Size;
12274
12275 // The extracted elements must start at a valid index and all mask
12276 // elements must be in the lower half.
12277 if (i > M || M >= HalfSize)
12278 return false;
12279
12280 if (Idx < 0 || (Src == V && Idx == (M - i))) {
12281 Src = V;
12282 Idx = M - i;
12283 continue;
12284 }
12285 return false;
12286 }
12287
12288 if (!Src || Idx < 0)
12289 return false;
12290
12291 assert((Idx + Len) <= HalfSize && "Illegal extraction mask");
12292 BitLen = (Len * VT.getScalarSizeInBits()) & 0x3f;
12293 BitIdx = (Idx * VT.getScalarSizeInBits()) & 0x3f;
12294 V1 = Src;
12295 return true;
12296}
12297
12298// INSERTQ: Extract lowest Len elements from lower half of second source and
12299// insert over first source, starting at Idx.
12300// { A[0], .., A[Idx-1], B[0], .., B[Len-1], A[Idx+Len], .., UNDEF, ... }
12301static bool matchShuffleAsINSERTQ(MVT VT, SDValue &V1, SDValue &V2,
12302 ArrayRef<int> Mask, uint64_t &BitLen,
12303 uint64_t &BitIdx) {
12304 int Size = Mask.size();
12305 int HalfSize = Size / 2;
12306 assert(Size == (int)VT.getVectorNumElements() && "Unexpected mask size");
12307
12308 // Upper half must be undefined.
12309 if (!isUndefUpperHalf(Mask))
12310 return false;
12311
12312 for (int Idx = 0; Idx != HalfSize; ++Idx) {
12313 SDValue Base;
12314
12315 // Attempt to match first source from mask before insertion point.
12316 if (isUndefInRange(Mask, 0, Idx)) {
12317 /* EMPTY */
12318 } else if (isSequentialOrUndefInRange(Mask, 0, Idx, 0)) {
12319 Base = V1;
12320 } else if (isSequentialOrUndefInRange(Mask, 0, Idx, Size)) {
12321 Base = V2;
12322 } else {
12323 continue;
12324 }
12325
12326 // Extend the extraction length looking to match both the insertion of
12327 // the second source and the remaining elements of the first.
12328 for (int Hi = Idx + 1; Hi <= HalfSize; ++Hi) {
12329 SDValue Insert;
12330 int Len = Hi - Idx;
12331
12332 // Match insertion.
12333 if (isSequentialOrUndefInRange(Mask, Idx, Len, 0)) {
12334 Insert = V1;
12335 } else if (isSequentialOrUndefInRange(Mask, Idx, Len, Size)) {
12336 Insert = V2;
12337 } else {
12338 continue;
12339 }
12340
12341 // Match the remaining elements of the lower half.
12342 if (isUndefInRange(Mask, Hi, HalfSize - Hi)) {
12343 /* EMPTY */
12344 } else if ((!Base || (Base == V1)) &&
12345 isSequentialOrUndefInRange(Mask, Hi, HalfSize - Hi, Hi)) {
12346 Base = V1;
12347 } else if ((!Base || (Base == V2)) &&
12348 isSequentialOrUndefInRange(Mask, Hi, HalfSize - Hi,
12349 Size + Hi)) {
12350 Base = V2;
12351 } else {
12352 continue;
12353 }
12354
12355 BitLen = (Len * VT.getScalarSizeInBits()) & 0x3f;
12356 BitIdx = (Idx * VT.getScalarSizeInBits()) & 0x3f;
12357 V1 = Base;
12358 V2 = Insert;
12359 return true;
12360 }
12361 }
12362
12363 return false;
12364}
12365
12366/// Try to lower a vector shuffle using SSE4a EXTRQ/INSERTQ.
12367 static SDValue lowerShuffleWithSSE4A(const SDLoc &DL, MVT VT, SDValue V1,
12368 SDValue V2, ArrayRef<int> Mask,
12369 const APInt &Zeroable, SelectionDAG &DAG) {
12370 uint64_t BitLen, BitIdx;
12371 if (matchShuffleAsEXTRQ(VT, V1, V2, Mask, BitLen, BitIdx, Zeroable))
12372 return DAG.getNode(X86ISD::EXTRQI, DL, VT, V1,
12373 DAG.getTargetConstant(BitLen, DL, MVT::i8),
12374 DAG.getTargetConstant(BitIdx, DL, MVT::i8));
12375
12376 if (matchShuffleAsINSERTQ(VT, V1, V2, Mask, BitLen, BitIdx))
12377 return DAG.getNode(X86ISD::INSERTQI, DL, VT, V1 ? V1 : DAG.getUNDEF(VT),
12378 V2 ? V2 : DAG.getUNDEF(VT),
12379 DAG.getTargetConstant(BitLen, DL, MVT::i8),
12380 DAG.getTargetConstant(BitIdx, DL, MVT::i8));
12381
12382 return SDValue();
12383}
12384
12385/// Lower a vector shuffle as an any/signed/zero extension.
12386///
12387/// Given a specific number of elements, element bit width, and extension
12388 /// stride, produce an extension based on the available
12389 /// features of the subtarget. The extended elements are consecutive and
12390 /// can start from an offsetted element index in the input; to
12391 /// avoid excess shuffling the offset must either be in the bottom lane
12392/// or at the start of a higher lane. All extended elements must be from
12393/// the same lane.
12394 static SDValue lowerShuffleAsSpecificExtension(const SDLoc &DL, MVT VT,
12395 int Scale, int Offset,
12396 unsigned ExtOpc, SDValue InputV,
12397 ArrayRef<int> Mask,
12398 const X86Subtarget &Subtarget,
12399 SelectionDAG &DAG) {
12400 assert(Scale > 1 && "Need a scale to extend.");
12401 assert(ISD::isExtOpcode(ExtOpc) && "Unsupported extension");
12402 int EltBits = VT.getScalarSizeInBits();
12403 int NumElements = VT.getVectorNumElements();
12404 int NumEltsPerLane = 128 / EltBits;
12405 int OffsetLane = Offset / NumEltsPerLane;
12406 assert((EltBits == 8 || EltBits == 16 || EltBits == 32) &&
12407 "Only 8, 16, and 32 bit elements can be extended.");
12408 assert(Scale * EltBits <= 64 && "Cannot zero extend past 64 bits.");
12409 assert(0 <= Offset && "Extension offset must be positive.");
12410 assert((Offset < NumEltsPerLane || Offset % NumEltsPerLane == 0) &&
12411 "Extension offset must be in the first lane or start an upper lane.");
12412
12413 // Check that an index is in same lane as the base offset.
12414 auto SafeOffset = [&](int Idx) {
12415 return OffsetLane == (Idx / NumEltsPerLane);
12416 };
12417
12418 // Shift along an input so that the offset base moves to the first element.
12419 auto ShuffleOffset = [&](SDValue V) {
12420 if (!Offset)
12421 return V;
12422
12423 SmallVector<int, 8> ShMask((unsigned)NumElements, -1);
12424 for (int i = 0; i * Scale < NumElements; ++i) {
12425 int SrcIdx = i + Offset;
12426 ShMask[i] = SafeOffset(SrcIdx) ? SrcIdx : -1;
12427 }
12428 return DAG.getVectorShuffle(VT, DL, V, DAG.getUNDEF(VT), ShMask);
12429 };
12430
12431 // Found a valid a/zext mask! Try various lowering strategies based on the
12432 // input type and available ISA extensions.
12433 if (Subtarget.hasSSE41()) {
12434 // Not worth offsetting 128-bit vectors if scale == 2, a pattern using
12435 // PUNPCK will catch this in a later shuffle match.
12436 if (Offset && Scale == 2 && VT.is128BitVector())
12437 return SDValue();
12438 MVT ExtVT = MVT::getVectorVT(MVT::getIntegerVT(EltBits * Scale),
12439 NumElements / Scale);
12440 InputV = DAG.getBitcast(VT, InputV);
12441 InputV = ShuffleOffset(InputV);
12442 InputV = getEXTEND_VECTOR_INREG(ExtOpc, DL, ExtVT, InputV, DAG);
12443 return DAG.getBitcast(VT, InputV);
12444 }
12445
12446 assert(VT.is128BitVector() && "Only 128-bit vectors can be extended.");
12447 InputV = DAG.getBitcast(VT, InputV);
12448 bool AnyExt = ExtOpc == ISD::ANY_EXTEND;
12449
12450 // TODO: Add pre-SSE41 SIGN_EXTEND_VECTOR_INREG handling.
12451 if (ExtOpc == ISD::SIGN_EXTEND)
12452 return SDValue();
12453
12454 // For any extends we can cheat for larger element sizes and use shuffle
12455 // instructions that can fold with a load and/or copy.
12456 if (AnyExt && EltBits == 32) {
12457 int PSHUFDMask[4] = {Offset, -1, SafeOffset(Offset + 1) ? Offset + 1 : -1,
12458 -1};
12459 return DAG.getBitcast(
12460 VT, DAG.getNode(X86ISD::PSHUFD, DL, MVT::v4i32,
12461 DAG.getBitcast(MVT::v4i32, InputV),
12462 getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG)));
12463 }
12464 if (AnyExt && EltBits == 16 && Scale > 2) {
12465 int PSHUFDMask[4] = {Offset / 2, -1,
12466 SafeOffset(Offset + 1) ? (Offset + 1) / 2 : -1, -1};
12467 InputV = DAG.getNode(X86ISD::PSHUFD, DL, MVT::v4i32,
12468 DAG.getBitcast(MVT::v4i32, InputV),
12469 getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG));
12470 int PSHUFWMask[4] = {1, -1, -1, -1};
12471 unsigned OddEvenOp = (Offset & 1) ? X86ISD::PSHUFLW : X86ISD::PSHUFHW;
12472 return DAG.getBitcast(
12473 VT, DAG.getNode(OddEvenOp, DL, MVT::v8i16,
12474 DAG.getBitcast(MVT::v8i16, InputV),
12475 getV4X86ShuffleImm8ForMask(PSHUFWMask, DL, DAG)));
12476 }
12477
12478 // The SSE4A EXTRQ instruction can efficiently extend the first 2 lanes
12479 // to 64-bits.
12480 if ((Scale * EltBits) == 64 && EltBits < 32 && Subtarget.hasSSE4A()) {
12481 assert(NumElements == (int)Mask.size() && "Unexpected shuffle mask size!");
12482 assert(VT.is128BitVector() && "Unexpected vector width!");
12483
12484 int LoIdx = Offset * EltBits;
12485 SDValue Lo = DAG.getBitcast(
12486 MVT::v2i64, DAG.getNode(X86ISD::EXTRQI, DL, VT, InputV,
12487 DAG.getTargetConstant(EltBits, DL, MVT::i8),
12488 DAG.getTargetConstant(LoIdx, DL, MVT::i8)));
12489
12490 if (isUndefUpperHalf(Mask) || !SafeOffset(Offset + 1))
12491 return DAG.getBitcast(VT, Lo);
12492
12493 int HiIdx = (Offset + 1) * EltBits;
12494 SDValue Hi = DAG.getBitcast(
12495 MVT::v2i64, DAG.getNode(X86ISD::EXTRQI, DL, VT, InputV,
12496 DAG.getTargetConstant(EltBits, DL, MVT::i8),
12497 DAG.getTargetConstant(HiIdx, DL, MVT::i8)));
12498 return DAG.getBitcast(VT,
12499 DAG.getNode(X86ISD::UNPCKL, DL, MVT::v2i64, Lo, Hi));
12500 }
12501
12502 // If this would require more than 2 unpack instructions to expand, use
12503 // pshufb when available. We can only use more than 2 unpack instructions
12504 // when zero extending i8 elements which also makes it easier to use pshufb.
12505 if (Scale > 4 && EltBits == 8 && Subtarget.hasSSSE3()) {
12506 assert(NumElements == 16 && "Unexpected byte vector width!");
12507 SDValue PSHUFBMask[16];
12508 for (int i = 0; i < 16; ++i) {
12509 int Idx = Offset + (i / Scale);
12510 if ((i % Scale == 0 && SafeOffset(Idx))) {
12511 PSHUFBMask[i] = DAG.getConstant(Idx, DL, MVT::i8);
12512 continue;
12513 }
12514 PSHUFBMask[i] =
12515 AnyExt ? DAG.getUNDEF(MVT::i8) : DAG.getConstant(0x80, DL, MVT::i8);
12516 }
12517 InputV = DAG.getBitcast(MVT::v16i8, InputV);
12518 return DAG.getBitcast(
12519 VT, DAG.getNode(X86ISD::PSHUFB, DL, MVT::v16i8, InputV,
12520 DAG.getBuildVector(MVT::v16i8, DL, PSHUFBMask)));
12521 }
12522
12523 // If we are extending from an offset, ensure we start on a boundary that
12524 // we can unpack from.
12525 int AlignToUnpack = Offset % (NumElements / Scale);
12526 if (AlignToUnpack) {
12527 SmallVector<int, 8> ShMask((unsigned)NumElements, -1);
12528 for (int i = AlignToUnpack; i < NumElements; ++i)
12529 ShMask[i - AlignToUnpack] = i;
12530 InputV = DAG.getVectorShuffle(VT, DL, InputV, DAG.getUNDEF(VT), ShMask);
12531 Offset -= AlignToUnpack;
12532 }
12533
12534 // Otherwise emit a sequence of unpacks.
12535 do {
12536 unsigned UnpackLoHi = X86ISD::UNPCKL;
12537 if (Offset >= (NumElements / 2)) {
12538 UnpackLoHi = X86ISD::UNPCKH;
12539 Offset -= (NumElements / 2);
12540 }
12541
12542 MVT InputVT = MVT::getVectorVT(MVT::getIntegerVT(EltBits), NumElements);
12543 SDValue Ext = AnyExt ? DAG.getUNDEF(InputVT)
12544 : getZeroVector(InputVT, Subtarget, DAG, DL);
12545 InputV = DAG.getBitcast(InputVT, InputV);
12546 InputV = DAG.getNode(UnpackLoHi, DL, InputVT, InputV, Ext);
12547 Scale /= 2;
12548 EltBits *= 2;
12549 NumElements /= 2;
12550 } while (Scale > 1);
12551 return DAG.getBitcast(VT, InputV);
12552}
12553
12554/// Try to lower a vector shuffle as a zero extension on any microarch.
12555///
12556/// This routine will try to do everything in its power to cleverly lower
12557/// a shuffle which happens to match the pattern of a zero extend. It doesn't
12558/// check for the profitability of this lowering, it tries to aggressively
12559/// match this pattern. It will use all of the micro-architectural details it
12560/// can to emit an efficient lowering. It handles both blends with all-zero
12561/// inputs to explicitly zero-extend and undef-lanes (sometimes undef due to
12562/// masking out later).
12563///
12564/// The reason we have dedicated lowering for zext-style shuffles is that they
12565/// are both incredibly common and often quite performance sensitive.
12566 static SDValue lowerShuffleAsZeroOrAnyExtend(
12567 const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
12568 const APInt &Zeroable, const X86Subtarget &Subtarget,
12569 SelectionDAG &DAG) {
12570 int Bits = VT.getSizeInBits();
12571 int NumLanes = Bits / 128;
12572 int NumElements = VT.getVectorNumElements();
12573 int NumEltsPerLane = NumElements / NumLanes;
12574 assert(VT.getScalarSizeInBits() <= 32 &&
12575 "Exceeds 32-bit integer zero extension limit");
12576 assert((int)Mask.size() == NumElements && "Unexpected shuffle mask size");
12577
12578 // Define a helper function to check a particular ext-scale and lower to it if
12579 // valid.
12580 auto Lower = [&](int Scale) -> SDValue {
12581 SDValue InputV;
12582 bool AnyExt = true;
12583 int Offset = 0;
12584 int Matches = 0;
12585 for (int i = 0; i < NumElements; ++i) {
12586 int M = Mask[i];
12587 if (M < 0)
12588 continue; // Valid anywhere but doesn't tell us anything.
12589 if (i % Scale != 0) {
12590 // Each of the extended elements need to be zeroable.
12591 if (!Zeroable[i])
12592 return SDValue();
12593
12594 // We no longer are in the anyext case.
12595 AnyExt = false;
12596 continue;
12597 }
12598
12599 // Each of the base elements needs to be consecutive indices into the
12600 // same input vector.
12601 SDValue V = M < NumElements ? V1 : V2;
12602 M = M % NumElements;
12603 if (!InputV) {
12604 InputV = V;
12605 Offset = M - (i / Scale);
12606 } else if (InputV != V)
12607 return SDValue(); // Flip-flopping inputs.
12608
12609 // Offset must start in the lowest 128-bit lane or at the start of an
12610 // upper lane.
12611 // FIXME: Is it ever worth allowing a negative base offset?
12612 if (!((0 <= Offset && Offset < NumEltsPerLane) ||
12613 (Offset % NumEltsPerLane) == 0))
12614 return SDValue();
12615
12616 // If we are offsetting, all referenced entries must come from the same
12617 // lane.
12618 if (Offset && (Offset / NumEltsPerLane) != (M / NumEltsPerLane))
12619 return SDValue();
12620
12621 if ((M % NumElements) != (Offset + (i / Scale)))
12622 return SDValue(); // Non-consecutive strided elements.
12623 Matches++;
12624 }
12625
12626 // If we fail to find an input, we have a zero-shuffle which should always
12627 // have already been handled.
12628 // FIXME: Maybe handle this here in case during blending we end up with one?
12629 if (!InputV)
12630 return SDValue();
12631
12632 // If we are offsetting, don't extend if we only match a single input, we
12633 // can always do better by using a basic PSHUF or PUNPCK.
12634 if (Offset != 0 && Matches < 2)
12635 return SDValue();
12636
12637 unsigned ExtOpc = AnyExt ? ISD::ANY_EXTEND : ISD::ZERO_EXTEND;
12638 return lowerShuffleAsSpecificExtension(DL, VT, Scale, Offset, ExtOpc,
12639 InputV, Mask, Subtarget, DAG);
12640 };
12641
12642 // The widest scale possible for extending is to a 64-bit integer.
12643 assert(Bits % 64 == 0 &&
12644 "The number of bits in a vector must be divisible by 64 on x86!");
12645 int NumExtElements = Bits / 64;
12646
12647 // Each iteration, try extending the elements half as much, but into twice as
12648 // many elements.
12649 for (; NumExtElements < NumElements; NumExtElements *= 2) {
12650 assert(NumElements % NumExtElements == 0 &&
12651 "The input vector size must be divisible by the extended size.");
12652 if (SDValue V = Lower(NumElements / NumExtElements))
12653 return V;
12654 }
12655
12656 // General extends failed, but 128-bit vectors may be able to use MOVQ.
12657 if (Bits != 128)
12658 return SDValue();
12659
12660 // Returns one of the source operands if the shuffle can be reduced to a
12661 // MOVQ, copying the lower 64-bits and zero-extending to the upper 64-bits.
12662 auto CanZExtLowHalf = [&]() {
12663 for (int i = NumElements / 2; i != NumElements; ++i)
12664 if (!Zeroable[i])
12665 return SDValue();
12666 if (isSequentialOrUndefInRange(Mask, 0, NumElements / 2, 0))
12667 return V1;
12668 if (isSequentialOrUndefInRange(Mask, 0, NumElements / 2, NumElements))
12669 return V2;
12670 return SDValue();
12671 };
12672
12673 if (SDValue V = CanZExtLowHalf()) {
12674 V = DAG.getBitcast(MVT::v2i64, V);
12675 V = DAG.getNode(X86ISD::VZEXT_MOVL, DL, MVT::v2i64, V);
12676 return DAG.getBitcast(VT, V);
12677 }
12678
12679 // No viable ext lowering found.
12680 return SDValue();
12681}
12682
12683/// Try to get a scalar value for a specific element of a vector.
12684///
12685/// Looks through BUILD_VECTOR and SCALAR_TO_VECTOR nodes to find a scalar.
12686 static SDValue getScalarValueForVectorElement(SDValue V, int Idx,
12687 SelectionDAG &DAG) {
12688 MVT VT = V.getSimpleValueType();
12689 MVT EltVT = VT.getVectorElementType();
12690 V = peekThroughBitcasts(V);
12691
12692 // If the bitcasts shift the element size, we can't extract an equivalent
12693 // element from it.
12694 MVT NewVT = V.getSimpleValueType();
12695 if (!NewVT.isVector() || NewVT.getScalarSizeInBits() != VT.getScalarSizeInBits())
12696 return SDValue();
12697
12698 if (V.getOpcode() == ISD::BUILD_VECTOR ||
12699 (Idx == 0 && V.getOpcode() == ISD::SCALAR_TO_VECTOR)) {
12700 // Ensure the scalar operand is the same size as the destination.
12701 // FIXME: Add support for scalar truncation where possible.
12702 SDValue S = V.getOperand(Idx);
12703 if (EltVT.getSizeInBits() == S.getSimpleValueType().getSizeInBits())
12704 return DAG.getBitcast(EltVT, S);
12705 }
12706
12707 return SDValue();
12708}
12709
12710/// Helper to test for a load that can be folded with x86 shuffles.
12711///
12712/// This is particularly important because the set of instructions varies
12713/// significantly based on whether the operand is a load or not.
12714 static bool isShuffleFoldableLoad(SDValue V) {
12715 return V.hasOneUse() &&
12716 ISD::isNON_EXTLoad(peekThroughOneUseBitcasts(V).getNode());
12717}
12718
12719template<typename T>
12720static bool isSoftF16(T VT, const X86Subtarget &Subtarget) {
12721 T EltVT = VT.getScalarType();
12722 return (EltVT == MVT::bf16 && !Subtarget.hasAVX10_2()) ||
12723 (EltVT == MVT::f16 && !Subtarget.hasFP16());
12724}
12725
12726/// Try to lower insertion of a single element into a zero vector.
12727///
12728/// This is a common pattern that we have especially efficient patterns to lower
12729/// across all subtarget feature sets.
12730 static SDValue lowerShuffleAsElementInsertion(
12731 const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
12732 const APInt &Zeroable, const X86Subtarget &Subtarget,
12733 SelectionDAG &DAG) {
12734 MVT ExtVT = VT;
12735 MVT EltVT = VT.getVectorElementType();
12736 unsigned NumElts = VT.getVectorNumElements();
12737 unsigned EltBits = VT.getScalarSizeInBits();
12738
12739 if (isSoftF16(EltVT, Subtarget))
12740 return SDValue();
12741
12742 int V2Index =
12743 find_if(Mask, [&Mask](int M) { return M >= (int)Mask.size(); }) -
12744 Mask.begin();
12745 bool IsV1Constant = getTargetConstantFromNode(V1) != nullptr;
12746 bool IsV1Zeroable = true;
12747 for (int i = 0, Size = Mask.size(); i < Size; ++i)
12748 if (i != V2Index && !Zeroable[i]) {
12749 IsV1Zeroable = false;
12750 break;
12751 }
12752
12753 // Bail if a non-zero V1 isn't used in place.
12754 if (!IsV1Zeroable) {
12755 SmallVector<int, 8> V1Mask(Mask);
12756 V1Mask[V2Index] = -1;
12757 if (!isNoopShuffleMask(V1Mask))
12758 return SDValue();
12759 }
12760
12761 // Check for a single input from a SCALAR_TO_VECTOR node.
12762 // FIXME: All of this should be canonicalized into INSERT_VECTOR_ELT and
12763 // all the smarts here sunk into that routine. However, the current
12764 // lowering of BUILD_VECTOR makes that nearly impossible until the old
12765 // vector shuffle lowering is dead.
12766 SDValue V2S = getScalarValueForVectorElement(V2, Mask[V2Index] - Mask.size(),
12767 DAG);
12768 if (V2S && DAG.getTargetLoweringInfo().isTypeLegal(V2S.getValueType())) {
12769 // We need to zext the scalar if it is smaller than an i32.
12770 V2S = DAG.getBitcast(EltVT, V2S);
12771 if (EltVT == MVT::i8 || (EltVT == MVT::i16 && !Subtarget.hasFP16())) {
12772 // Using zext to expand a narrow element won't work for non-zero
12773 // insertions. But we can use a masked constant vector if we're
12774 // inserting V2 into the bottom of V1.
12775 if (!IsV1Zeroable && !(IsV1Constant && V2Index == 0))
12776 return SDValue();
12777
12778 // Zero-extend directly to i32.
12779 ExtVT = MVT::getVectorVT(MVT::i32, ExtVT.getSizeInBits() / 32);
12780 V2S = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i32, V2S);
12781
12782 // If we're inserting into a constant, mask off the inserted index
12783 // and OR with the zero-extended scalar.
12784 if (!IsV1Zeroable) {
12785 SmallVector<APInt> Bits(NumElts, APInt::getAllOnes(EltBits));
12786 Bits[V2Index] = APInt::getZero(EltBits);
12787 SDValue BitMask = getConstVector(Bits, VT, DAG, DL);
12788 V1 = DAG.getNode(ISD::AND, DL, VT, V1, BitMask);
12789 V2 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, ExtVT, V2S);
12790 V2 = DAG.getBitcast(VT, DAG.getNode(X86ISD::VZEXT_MOVL, DL, ExtVT, V2));
12791 return DAG.getNode(ISD::OR, DL, VT, V1, V2);
12792 }
12793 }
12794 V2 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, ExtVT, V2S);
12795 } else if (Mask[V2Index] != (int)Mask.size() || EltVT == MVT::i8 ||
12796 (EltVT == MVT::i16 && !Subtarget.hasAVX10_2())) {
12797 // Either not inserting from the low element of the input or the input
12798 // element size is too small to use VZEXT_MOVL to clear the high bits.
12799 return SDValue();
12800 }
12801
12802 if (!IsV1Zeroable) {
12803 // If V1 can't be treated as a zero vector we have fewer options to lower
12804 // this. We can't support integer vectors or non-zero targets cheaply.
12805 assert(VT == ExtVT && "Cannot change extended type when non-zeroable!");
12806 if (!VT.isFloatingPoint() || V2Index != 0)
12807 return SDValue();
12808 if (!VT.is128BitVector())
12809 return SDValue();
12810
12811 // Otherwise, use MOVSD, MOVSS or MOVSH.
12812 unsigned MovOpc = 0;
12813 if (EltVT == MVT::f16)
12814 MovOpc = X86ISD::MOVSH;
12815 else if (EltVT == MVT::f32)
12816 MovOpc = X86ISD::MOVSS;
12817 else if (EltVT == MVT::f64)
12818 MovOpc = X86ISD::MOVSD;
12819 else
12820 llvm_unreachable("Unsupported floating point element type to handle!");
12821 return DAG.getNode(MovOpc, DL, ExtVT, V1, V2);
12822 }
12823
12824 // This lowering only works for the low element with floating point vectors.
12825 if (VT.isFloatingPoint() && V2Index != 0)
12826 return SDValue();
12827
12828 V2 = DAG.getNode(X86ISD::VZEXT_MOVL, DL, ExtVT, V2);
12829 if (ExtVT != VT)
12830 V2 = DAG.getBitcast(VT, V2);
12831
12832 if (V2Index != 0) {
12833 // If we have 4 or fewer lanes we can cheaply shuffle the element into
12834 // the desired position. Otherwise it is more efficient to do a vector
12835 // shift left. We know that we can do a vector shift left because all
12836 // the inputs are zero.
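// For example, inserting a 16-bit element at index 5 would use a byte shift
// left by 5 * 16 / 8 = 10 bytes (PSLLDQ).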
12837 if (VT.isFloatingPoint() || NumElts <= 4) {
12838 SmallVector<int, 4> V2Shuffle(Mask.size(), 1);
12839 V2Shuffle[V2Index] = 0;
12840 V2 = DAG.getVectorShuffle(VT, DL, V2, DAG.getUNDEF(VT), V2Shuffle);
12841 } else {
12842 V2 = DAG.getBitcast(MVT::v16i8, V2);
12843 V2 = DAG.getNode(
12844 X86ISD::VSHLDQ, DL, MVT::v16i8, V2,
12845 DAG.getTargetConstant(V2Index * EltBits / 8, DL, MVT::i8));
12846 V2 = DAG.getBitcast(VT, V2);
12847 }
12848 }
12849 return V2;
12850}
12851
12852/// Try to lower broadcast of a single - truncated - integer element,
12853/// coming from a scalar_to_vector/build_vector node \p V0 with larger elements.
12854///
12855/// This assumes we have AVX2.
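/// For example, broadcasting i8 element 5 of a v4i32 source selects dword 1
/// (5 / 4), shifts it right by 8 bits (element 5 is byte 1 of that dword) and
/// truncates to i8 before the VBROADCAST.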
12856static SDValue lowerShuffleAsTruncBroadcast(const SDLoc &DL, MVT VT, SDValue V0,
12857 int BroadcastIdx,
12858 const X86Subtarget &Subtarget,
12859 SelectionDAG &DAG) {
12860 assert(Subtarget.hasAVX2() &&
12861 "We can only lower integer broadcasts with AVX2!");
12862
12863 MVT EltVT = VT.getVectorElementType();
12864 MVT V0VT = V0.getSimpleValueType();
12865
12866 assert(VT.isInteger() && "Unexpected non-integer trunc broadcast!");
12867 assert(V0VT.isVector() && "Unexpected non-vector vector-sized value!");
12868
12869 MVT V0EltVT = V0VT.getVectorElementType();
12870 if (!V0EltVT.isInteger())
12871 return SDValue();
12872
12873 const unsigned EltSize = EltVT.getSizeInBits();
12874 const unsigned V0EltSize = V0EltVT.getSizeInBits();
12875
12876 // This is only a truncation if the original element type is larger.
12877 if (V0EltSize <= EltSize)
12878 return SDValue();
12879
12880 assert(((V0EltSize % EltSize) == 0) &&
12881 "Scalar type sizes must all be powers of 2 on x86!");
12882
12883 const unsigned V0Opc = V0.getOpcode();
12884 const unsigned Scale = V0EltSize / EltSize;
12885 const unsigned V0BroadcastIdx = BroadcastIdx / Scale;
12886
12887 if ((V0Opc != ISD::SCALAR_TO_VECTOR || V0BroadcastIdx != 0) &&
12888 V0Opc != ISD::BUILD_VECTOR)
12889 return SDValue();
12890
12891 SDValue Scalar = V0.getOperand(V0BroadcastIdx);
12892
12893 // If we're extracting non-least-significant bits, shift so we can truncate.
12894 // Hopefully, we can fold away the trunc/srl/load into the broadcast.
12895 // Even if we can't (and !isShuffleFoldableLoad(Scalar)), prefer
12896 // vpbroadcast+vmovd+shr to vpshufb(m)+vmovd.
12897 if (const int OffsetIdx = BroadcastIdx % Scale)
12898 Scalar = DAG.getNode(ISD::SRL, DL, Scalar.getValueType(), Scalar,
12899 DAG.getConstant(OffsetIdx * EltSize, DL, MVT::i8));
12900
12901 return DAG.getNode(X86ISD::VBROADCAST, DL, VT,
12902 DAG.getNode(ISD::TRUNCATE, DL, EltVT, Scalar));
12903}
12904
12905/// Test whether this can be lowered with a single SHUFPS instruction.
12906///
12907/// This is used to disable more specialized lowerings when the shufps lowering
12908/// will happen to be efficient.
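/// For example, mask <0, 5, 2, 7> mixes both inputs within each half and cannot
/// be a single SHUFPS, whereas <1, 0, 6, 7> can (low half from V1, high half
/// from V2).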
12909static bool isSingleSHUFPSMask(ArrayRef<int> Mask) {
12910 // This routine only handles 128-bit shufps.
12911 assert(Mask.size() == 4 && "Unsupported mask size!");
12912 assert(Mask[0] >= -1 && Mask[0] < 8 && "Out of bound mask element!");
12913 assert(Mask[1] >= -1 && Mask[1] < 8 && "Out of bound mask element!");
12914 assert(Mask[2] >= -1 && Mask[2] < 8 && "Out of bound mask element!");
12915 assert(Mask[3] >= -1 && Mask[3] < 8 && "Out of bound mask element!");
12916
12917 // To lower with a single SHUFPS we need to have the low half and high half
12918 // each requiring a single input.
12919 if (Mask[0] >= 0 && Mask[1] >= 0 && (Mask[0] < 4) != (Mask[1] < 4))
12920 return false;
12921 if (Mask[2] >= 0 && Mask[3] >= 0 && (Mask[2] < 4) != (Mask[3] < 4))
12922 return false;
12923
12924 return true;
12925}
12926
12927/// Test whether the specified input (0 or 1) is in-place blended by the
12928/// given mask.
12929///
12930/// This returns true if the elements from a particular input are already in the
12931/// slots required by the given mask and require no permutation.
12932static bool isShuffleMaskInputInPlace(int Input, ArrayRef<int> Mask) {
12933 assert((Input == 0 || Input == 1) && "Only two inputs to shuffles.");
12934 int Size = Mask.size();
12935 for (int i = 0; i < Size; ++i)
12936 if (Mask[i] >= 0 && Mask[i] / Size == Input && Mask[i] % Size != i)
12937 return false;
12938
12939 return true;
12940}
12941
12942/// Test whether the specified input (0 or 1) is a broadcast/splat blended by
12943/// the given mask.
12944///
12945static bool isShuffleMaskInputBroadcastable(int Input, ArrayRef<int> Mask,
12946 int BroadcastableElement = 0) {
12947 assert((Input == 0 || Input == 1) && "Only two inputs to shuffles.");
12948 int Size = Mask.size();
12949 for (int i = 0; i < Size; ++i)
12950 if (Mask[i] >= 0 && Mask[i] / Size == Input &&
12951 Mask[i] % Size != BroadcastableElement)
12952 return false;
12953 return true;
12954}
12955
12956/// If we are extracting two 128-bit halves of a vector and shuffling the
12957/// result, match that to a 256-bit AVX2 vperm* instruction to avoid a
12958/// multi-shuffle lowering.
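/// For example, shuffling (extract_subvector %wide, 0) against
/// (extract_subvector %wide, 4) with mask <0, 5, 2, 7> can become a single
/// lane-crossing permute of %wide followed by a free ymm->xmm extract.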
12959static SDValue lowerShuffleOfExtractsAsVperm(const SDLoc &DL, SDValue N0,
12960 SDValue N1, ArrayRef<int> Mask,
12961 SelectionDAG &DAG) {
12962 MVT VT = N0.getSimpleValueType();
12963 assert((VT.is128BitVector() &&
12964 (VT.getScalarSizeInBits() == 32 || VT.getScalarSizeInBits() == 64)) &&
12965 "VPERM* family of shuffles requires 32-bit or 64-bit elements");
12966
12967 // Check that both sources are extracts of the same source vector.
12968 if (N0.getOpcode() != ISD::EXTRACT_SUBVECTOR ||
12969 N1.getOpcode() != ISD::EXTRACT_SUBVECTOR ||
12970 N0.getOperand(0) != N1.getOperand(0) ||
12971 !N0.hasOneUse() || !N1.hasOneUse())
12972 return SDValue();
12973
12974 SDValue WideVec = N0.getOperand(0);
12975 MVT WideVT = WideVec.getSimpleValueType();
12976 if (!WideVT.is256BitVector())
12977 return SDValue();
12978
12979 // Match extracts of each half of the wide source vector. Commute the shuffle
12980 // if the extract of the low half is N1.
12981 unsigned NumElts = VT.getVectorNumElements();
12982 SmallVector<int, 4> NewMask(Mask);
12983 const APInt &ExtIndex0 = N0.getConstantOperandAPInt(1);
12984 const APInt &ExtIndex1 = N1.getConstantOperandAPInt(1);
12985 if (ExtIndex1 == 0 && ExtIndex0 == NumElts)
12986 ShuffleVectorSDNode::commuteMask(NewMask);
12987 else if (ExtIndex0 != 0 || ExtIndex1 != NumElts)
12988 return SDValue();
12989
12990 // Final bailout: if the mask is simple, we are better off using an extract
12991 // and a simple narrow shuffle. Prefer extract+unpack(h/l)ps to vpermps
12992 // because that avoids a constant load from memory.
12993 if (NumElts == 4 &&
12994 (isSingleSHUFPSMask(NewMask) || is128BitUnpackShuffleMask(NewMask, DAG)))
12995 return SDValue();
12996
12997 // Extend the shuffle mask with undef elements.
12998 NewMask.append(NumElts, -1);
12999
13000 // shuf (extract X, 0), (extract X, 4), M --> extract (shuf X, undef, M'), 0
13001 SDValue Shuf = DAG.getVectorShuffle(WideVT, DL, WideVec, DAG.getUNDEF(WideVT),
13002 NewMask);
13003 // This is free: ymm -> xmm.
13004 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Shuf,
13005 DAG.getVectorIdxConstant(0, DL));
13006}
13007
13008/// Try to lower broadcast of a single element.
13009///
13010/// For convenience, this code also bundles all of the subtarget feature set
13011/// filtering. While a little annoying to re-dispatch on type here, there isn't
13012/// a convenient way to factor it out.
13013static SDValue lowerShuffleAsBroadcast(const SDLoc &DL, MVT VT, SDValue V1,
13014 SDValue V2, ArrayRef<int> Mask,
13015 const X86Subtarget &Subtarget,
13016 SelectionDAG &DAG) {
13017 MVT EltVT = VT.getVectorElementType();
13018 if (!((Subtarget.hasSSE3() && VT == MVT::v2f64) ||
13019 (Subtarget.hasAVX() && (EltVT == MVT::f64 || EltVT == MVT::f32)) ||
13020 (Subtarget.hasAVX2() && (VT.isInteger() || EltVT == MVT::f16))))
13021 return SDValue();
13022
13023 // With MOVDDUP (v2f64) we can broadcast from a register or a load, otherwise
13024 // we can only broadcast from a register with AVX2.
13025 unsigned NumEltBits = VT.getScalarSizeInBits();
13026 unsigned Opcode = (VT == MVT::v2f64 && !Subtarget.hasAVX2())
13027 ? X86ISD::MOVDDUP
13028 : X86ISD::VBROADCAST;
13029 bool BroadcastFromReg = (Opcode == X86ISD::MOVDDUP) || Subtarget.hasAVX2();
13030
13031 // Check that the mask is a broadcast.
13032 int BroadcastIdx = getSplatIndex(Mask);
13033 if (BroadcastIdx < 0) {
13034 // Check for hidden broadcast.
13035 SmallVector<int, 16> BroadcastMask(VT.getVectorNumElements(), 0);
13036 if (!isShuffleEquivalent(Mask, BroadcastMask, V1, V2))
13037 return SDValue();
13038 BroadcastIdx = 0;
13039 }
13040 assert(BroadcastIdx < (int)Mask.size() && "We only expect to be called with "
13041 "a sorted mask where the broadcast "
13042 "comes from V1.");
13043 int NumActiveElts = count_if(Mask, [](int M) { return M >= 0; });
13044
13045 // Go up the chain of (vector) values to find a scalar load that we can
13046 // combine with the broadcast.
13047 // TODO: Combine this logic with findEltLoadSrc() used by
13048 // EltsFromConsecutiveLoads().
13049 int BitOffset = BroadcastIdx * NumEltBits;
13050 SDValue V = V1;
13051 for (;;) {
13052 switch (V.getOpcode()) {
13053 case ISD::BITCAST: {
13054 V = V.getOperand(0);
13055 continue;
13056 }
13057 case ISD::CONCAT_VECTORS: {
13058 int OpBitWidth = V.getOperand(0).getValueSizeInBits();
13059 int OpIdx = BitOffset / OpBitWidth;
13060 V = V.getOperand(OpIdx);
13061 BitOffset %= OpBitWidth;
13062 continue;
13063 }
13064 case ISD::EXTRACT_SUBVECTOR: {
13065 // The extraction index adds to the existing offset.
13066 unsigned EltBitWidth = V.getScalarValueSizeInBits();
13067 unsigned Idx = V.getConstantOperandVal(1);
13068 unsigned BeginOffset = Idx * EltBitWidth;
13069 BitOffset += BeginOffset;
13070 V = V.getOperand(0);
13071 continue;
13072 }
13073 case ISD::INSERT_SUBVECTOR: {
13074 SDValue VOuter = V.getOperand(0), VInner = V.getOperand(1);
13075 int EltBitWidth = VOuter.getScalarValueSizeInBits();
13076 int Idx = (int)V.getConstantOperandVal(2);
13077 int NumSubElts = (int)VInner.getSimpleValueType().getVectorNumElements();
13078 int BeginOffset = Idx * EltBitWidth;
13079 int EndOffset = BeginOffset + NumSubElts * EltBitWidth;
13080 if (BeginOffset <= BitOffset && BitOffset < EndOffset) {
13081 BitOffset -= BeginOffset;
13082 V = VInner;
13083 } else {
13084 V = VOuter;
13085 }
13086 continue;
13087 }
13088 }
13089 break;
13090 }
13091 assert((BitOffset % NumEltBits) == 0 && "Illegal bit-offset");
13092 BroadcastIdx = BitOffset / NumEltBits;
13093
13094 // Do we need to bitcast the source to retrieve the original broadcast index?
13095 bool BitCastSrc = V.getScalarValueSizeInBits() != NumEltBits;
13096
13097 // Check if this is a broadcast of a scalar. We special case lowering
13098 // for scalars so that we can more effectively fold with loads.
13099 // If the original value has a larger element type than the shuffle, the
13100 // broadcast element is in essence truncated. Make that explicit to ease
13101 // folding.
13102 if (BitCastSrc && VT.isInteger())
13103 if (SDValue TruncBroadcast = lowerShuffleAsTruncBroadcast(
13104 DL, VT, V, BroadcastIdx, Subtarget, DAG))
13105 return TruncBroadcast;
13106
13107 // Also check the simpler case, where we can directly reuse the scalar.
13108 if (!BitCastSrc &&
13109 ((V.getOpcode() == ISD::BUILD_VECTOR && V.hasOneUse()) ||
13110 (V.getOpcode() == ISD::SCALAR_TO_VECTOR && BroadcastIdx == 0))) {
13111 V = V.getOperand(BroadcastIdx);
13112
13113 // If we can't broadcast from a register, check that the input is a load.
13114 if (!BroadcastFromReg && !isShuffleFoldableLoad(V))
13115 return SDValue();
13116 } else if (ISD::isNormalLoad(V.getNode()) &&
13117 cast<LoadSDNode>(V)->isSimple()) {
13118 // We do not check for one-use of the vector load because a broadcast load
13119 // is expected to be a win for code size, register pressure, and possibly
13120 // uops even if the original vector load is not eliminated.
13121
13122 // Reduce the vector load and shuffle to a broadcasted scalar load.
13123 auto *Ld = cast<LoadSDNode>(V);
13124 SDValue BaseAddr = Ld->getBasePtr();
13125 MVT SVT = VT.getScalarType();
13126 unsigned Offset = BroadcastIdx * SVT.getStoreSize();
13127 assert((int)(Offset * 8) == BitOffset && "Unexpected bit-offset");
13128 SDValue NewAddr =
13129 DAG.getMemBasePlusOffset(BaseAddr, TypeSize::getFixed(Offset), DL);
13130
13131 // Directly form VBROADCAST_LOAD if we're using VBROADCAST opcode rather
13132 // than MOVDDUP.
13133 // FIXME: Should we add VBROADCAST_LOAD isel patterns for pre-AVX?
13134 if (Opcode == X86ISD::VBROADCAST) {
13135 SDVTList Tys = DAG.getVTList(VT, MVT::Other);
13136 SDValue Ops[] = {Ld->getChain(), NewAddr};
13137 V = DAG.getMemIntrinsicNode(
13138 X86ISD::VBROADCAST_LOAD, DL, Tys, Ops, SVT,
13139 DAG.getMachineFunction().getMachineMemOperand(
13140 Ld->getMemOperand(), Offset, SVT.getStoreSize()));
13141 DAG.makeEquivalentMemoryOrdering(Ld, V);
13142 return DAG.getBitcast(VT, V);
13143 }
13144 assert(SVT == MVT::f64 && "Unexpected VT!");
13145 V = DAG.getLoad(SVT, DL, Ld->getChain(), NewAddr,
13146 DAG.getMachineFunction().getMachineMemOperand(
13147 Ld->getMemOperand(), Offset, SVT.getStoreSize()));
13148 DAG.makeEquivalentMemoryOrdering(Ld, V);
13149 } else if (!BroadcastFromReg) {
13150 // We can't broadcast from a vector register.
13151 return SDValue();
13152 } else if (BitOffset != 0) {
13153 // We can only broadcast from the zero-element of a vector register,
13154 // but it can be advantageous to broadcast from the zero-element of a
13155 // subvector.
13156 if (!VT.is256BitVector() && !VT.is512BitVector())
13157 return SDValue();
13158
13159 // VPERMQ/VPERMPD can perform the cross-lane shuffle directly.
13160 if (VT == MVT::v4f64 || VT == MVT::v4i64)
13161 return SDValue();
13162
13163 // If we are broadcasting an element from the lowest 128-bit subvector, try
13164 // to move the element in position.
13165 if (BitOffset < 128 && NumActiveElts > 1 &&
13166 V.getScalarValueSizeInBits() == NumEltBits) {
13167 assert((BitOffset % V.getScalarValueSizeInBits()) == 0 &&
13168 "Unexpected bit-offset");
13169 SmallVector<int, 16> ExtractMask(128 / NumEltBits, SM_SentinelUndef);
13170 ExtractMask[0] = BitOffset / V.getScalarValueSizeInBits();
13171 V = extractSubVector(V, 0, DAG, DL, 128);
13172 V = DAG.getVectorShuffle(V.getValueType(), DL, V, V, ExtractMask);
13173 } else {
13174 // Only broadcast the zero-element of a 128-bit subvector.
13175 if ((BitOffset % 128) != 0)
13176 return SDValue();
13177
13178 assert((BitOffset % V.getScalarValueSizeInBits()) == 0 &&
13179 "Unexpected bit-offset");
13180 assert((V.getValueSizeInBits() == 256 || V.getValueSizeInBits() == 512) &&
13181 "Unexpected vector size");
13182 unsigned ExtractIdx = BitOffset / V.getScalarValueSizeInBits();
13183 V = extract128BitVector(V, ExtractIdx, DAG, DL);
13184 }
13185 }
13186
13187 // On AVX we can use VBROADCAST directly for scalar sources.
13188 if (Opcode == X86ISD::MOVDDUP && !V.getValueType().isVector()) {
13189 V = DAG.getBitcast(MVT::f64, V);
13190 if (Subtarget.hasAVX()) {
13191 V = DAG.getNode(X86ISD::VBROADCAST, DL, MVT::v2f64, V);
13192 return DAG.getBitcast(VT, V);
13193 }
13194 V = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v2f64, V);
13195 }
13196
13197 // If this is a scalar, do the broadcast on this type and bitcast.
13198 if (!V.getValueType().isVector()) {
13199 assert(V.getScalarValueSizeInBits() == NumEltBits &&
13200 "Unexpected scalar size");
13201 MVT BroadcastVT = MVT::getVectorVT(V.getSimpleValueType(),
13202 VT.getSizeInBits() / NumEltBits);
13203 return DAG.getBitcast(VT, DAG.getNode(Opcode, DL, BroadcastVT, V));
13204 }
13205
13206 // We only support broadcasting from 128-bit vectors to minimize the
13207 // number of patterns we need to deal with in isel. So extract down to
13208 // 128-bits, removing as many bitcasts as possible.
13209 if (V.getValueSizeInBits() > 128)
13210 V = extract128BitVector(peekThroughBitcasts(V), 0, DAG, DL);
13211
13212 // Otherwise cast V to a vector with the same element type as VT, but
13213 // possibly narrower than VT. Then perform the broadcast.
13214 unsigned NumSrcElts = V.getValueSizeInBits() / NumEltBits;
13215 MVT CastVT = MVT::getVectorVT(VT.getVectorElementType(), NumSrcElts);
13216 return DAG.getNode(Opcode, DL, VT, DAG.getBitcast(CastVT, V));
13217}
13218
13219// Check for whether we can use INSERTPS to perform the shuffle. We only use
13220// INSERTPS when the V1 elements are already in the correct locations
13221// because otherwise we can just always use two SHUFPS instructions which
13222// are much smaller to encode than a SHUFPS and an INSERTPS. We can also
13223// perform INSERTPS if a single V1 element is out of place and all V2
13224// elements are zeroable.
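// For reference, the INSERTPS immediate built below packs the source lane into
// bits [7:6] (CountS), the destination lane into bits [5:4] (CountD) and the
// zero mask into bits [3:0], i.e. "VBSrcIndex << 6 | VBDstIndex << 4 | ZMask".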
13225static bool matchShuffleAsInsertPS(SDValue &V1, SDValue &V2,
13226 unsigned &InsertPSMask,
13227 const APInt &Zeroable,
13228 ArrayRef<int> Mask, SelectionDAG &DAG) {
13229 assert(V1.getSimpleValueType().is128BitVector() && "Bad operand type!");
13230 assert(V2.getSimpleValueType().is128BitVector() && "Bad operand type!");
13231 assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!");
13232
13233 // Attempt to match INSERTPS with one element from VA or VB being
13234 // inserted into VA (or undef). If successful, V1, V2 and InsertPSMask
13235 // are updated.
13236 auto matchAsInsertPS = [&](SDValue VA, SDValue VB,
13237 ArrayRef<int> CandidateMask) {
13238 unsigned ZMask = 0;
13239 int VADstIndex = -1;
13240 int VBDstIndex = -1;
13241 bool VAUsedInPlace = false;
13242
13243 for (int i = 0; i < 4; ++i) {
13244 // Synthesize a zero mask from the zeroable elements (includes undefs).
13245 if (Zeroable[i]) {
13246 ZMask |= 1 << i;
13247 continue;
13248 }
13249
13250 // Flag if we use any VA inputs in place.
13251 if (i == CandidateMask[i]) {
13252 VAUsedInPlace = true;
13253 continue;
13254 }
13255
13256 // We can only insert a single non-zeroable element.
13257 if (VADstIndex >= 0 || VBDstIndex >= 0)
13258 return false;
13259
13260 if (CandidateMask[i] < 4) {
13261 // VA input out of place for insertion.
13262 VADstIndex = i;
13263 } else {
13264 // VB input for insertion.
13265 VBDstIndex = i;
13266 }
13267 }
13268
13269 // Don't bother if we have no (non-zeroable) element for insertion.
13270 if (VADstIndex < 0 && VBDstIndex < 0)
13271 return false;
13272
13273 // Determine element insertion src/dst indices. The src index is from the
13274 // start of the inserted vector, not the start of the concatenated vector.
13275 unsigned VBSrcIndex = 0;
13276 if (VADstIndex >= 0) {
13277 // If we have a VA input out of place, we use VA as the V2 element
13278 // insertion and don't use the original V2 at all.
13279 VBSrcIndex = CandidateMask[VADstIndex];
13280 VBDstIndex = VADstIndex;
13281 VB = VA;
13282 } else {
13283 VBSrcIndex = CandidateMask[VBDstIndex] - 4;
13284 }
13285
13286 // If no V1 inputs are used in place, then the result is created only from
13287 // the zero mask and the V2 insertion - so remove V1 dependency.
13288 if (!VAUsedInPlace)
13289 VA = DAG.getUNDEF(MVT::v4f32);
13290
13291 // Update V1, V2 and InsertPSMask accordingly.
13292 V1 = VA;
13293 V2 = VB;
13294
13295 // Insert the V2 element into the desired position.
13296 InsertPSMask = VBSrcIndex << 6 | VBDstIndex << 4 | ZMask;
13297 assert((InsertPSMask & ~0xFFu) == 0 && "Invalid mask!");
13298 return true;
13299 };
13300
13301 if (matchAsInsertPS(V1, V2, Mask))
13302 return true;
13303
13304 // Commute and try again.
13305 SmallVector<int, 4> CommutedMask(Mask);
13306 ShuffleVectorSDNode::commuteMask(CommutedMask);
13307 if (matchAsInsertPS(V2, V1, CommutedMask))
13308 return true;
13309
13310 return false;
13311}
13312
13313static SDValue lowerShuffleAsInsertPS(const SDLoc &DL, SDValue V1, SDValue V2,
13314 ArrayRef<int> Mask, const APInt &Zeroable,
13315 SelectionDAG &DAG) {
13316 assert(V1.getSimpleValueType() == MVT::v4f32 && "Bad operand type!");
13317 assert(V2.getSimpleValueType() == MVT::v4f32 && "Bad operand type!");
13318
13319 // Attempt to match the insertps pattern.
13320 unsigned InsertPSMask = 0;
13321 if (!matchShuffleAsInsertPS(V1, V2, InsertPSMask, Zeroable, Mask, DAG))
13322 return SDValue();
13323
13324 // Insert the V2 element into the desired position.
13325 return DAG.getNode(X86ISD::INSERTPS, DL, MVT::v4f32, V1, V2,
13326 DAG.getTargetConstant(InsertPSMask, DL, MVT::i8));
13327}
13328
13329/// Handle lowering of 2-lane 64-bit floating point shuffles.
13330///
13331/// This is the basis function for the 2-lane 64-bit shuffles as we have full
13332/// support for floating point shuffles but not integer shuffles. These
13333/// instructions will incur a domain crossing penalty on some chips though so
13334/// it is better to avoid lowering through this for integer vectors where
13335/// possible.
13336static SDValue lowerV2F64Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
13337 const APInt &Zeroable, SDValue V1, SDValue V2,
13338 const X86Subtarget &Subtarget,
13339 SelectionDAG &DAG) {
13340 assert(V1.getSimpleValueType() == MVT::v2f64 && "Bad operand type!");
13341 assert(V2.getSimpleValueType() == MVT::v2f64 && "Bad operand type!");
13342 assert(Mask.size() == 2 && "Unexpected mask size for v2 shuffle!");
13343
13344 if (V2.isUndef()) {
13345 // Check for being able to broadcast a single element.
13346 if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v2f64, V1, V2,
13347 Mask, Subtarget, DAG))
13348 return Broadcast;
13349
13350 // Straight shuffle of a single input vector. Simulate this by using the
13351 // single input as both of the "inputs" to this instruction.
13352 unsigned SHUFPDMask = (Mask[0] == 1) | ((Mask[1] == 1) << 1);
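// For example, the splat mask <1, 1> produces the immediate 0b11, selecting the
// high element for both result lanes.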
13353
13354 if (Subtarget.hasAVX()) {
13355 // If we have AVX, we can use VPERMILPS which will allow folding a load
13356 // into the shuffle.
13357 return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v2f64, V1,
13358 DAG.getTargetConstant(SHUFPDMask, DL, MVT::i8));
13359 }
13360
13361 return DAG.getNode(
13362 X86ISD::SHUFP, DL, MVT::v2f64,
13363 Mask[0] == SM_SentinelUndef ? DAG.getUNDEF(MVT::v2f64) : V1,
13364 Mask[1] == SM_SentinelUndef ? DAG.getUNDEF(MVT::v2f64) : V1,
13365 DAG.getTargetConstant(SHUFPDMask, DL, MVT::i8));
13366 }
13367 assert(Mask[0] >= 0 && "No undef lanes in multi-input v2 shuffles!");
13368 assert(Mask[1] >= 0 && "No undef lanes in multi-input v2 shuffles!");
13369 assert(Mask[0] < 2 && "We sort V1 to be the first input.");
13370 assert(Mask[1] >= 2 && "We sort V2 to be the second input.");
13371
13372 if (Subtarget.hasAVX2())
13373 if (SDValue Extract = lowerShuffleOfExtractsAsVperm(DL, V1, V2, Mask, DAG))
13374 return Extract;
13375
13376 // When loading a scalar and then shuffling it into a vector we can often do
13377 // the insertion cheaply.
13378 if (SDValue Insertion = lowerShuffleAsElementInsertion(
13379 DL, MVT::v2f64, V1, V2, Mask, Zeroable, Subtarget, DAG))
13380 return Insertion;
13381 // Try inverting the insertion since for v2 masks it is easy to do and we
13382 // can't reliably sort the mask one way or the other.
13383 int InverseMask[2] = {Mask[0] < 0 ? -1 : (Mask[0] ^ 2),
13384 Mask[1] < 0 ? -1 : (Mask[1] ^ 2)};
13385 if (SDValue Insertion = lowerShuffleAsElementInsertion(
13386 DL, MVT::v2f64, V2, V1, InverseMask, Zeroable, Subtarget, DAG))
13387 return Insertion;
13388
13389 // Try to use one of the special instruction patterns to handle two common
13390 // blend patterns if a zero-blend above didn't work.
13391 if (isShuffleEquivalent(Mask, {0, 3}, V1, V2) ||
13392 isShuffleEquivalent(Mask, {1, 3}, V1, V2))
13393 if (SDValue V1S = getScalarValueForVectorElement(V1, Mask[0], DAG))
13394 // We can either use a special instruction to load over the low double or
13395 // to move just the low double.
13396 return DAG.getNode(
13397 X86ISD::MOVSD, DL, MVT::v2f64, V2,
13398 DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v2f64, V1S));
13399
13400 if (Subtarget.hasSSE41())
13401 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v2f64, V1, V2, Mask,
13402 Zeroable, Subtarget, DAG))
13403 return Blend;
13404
13405 // Use dedicated unpack instructions for masks that match their pattern.
13406 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v2f64, V1, V2, Mask, DAG))
13407 return V;
13408
13409 unsigned SHUFPDMask = (Mask[0] == 1) | (((Mask[1] - 2) == 1) << 1);
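// For example, mask <1, 2> (V1's high element, then V2's low element) encodes as
// immediate 0b01.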
13410 return DAG.getNode(X86ISD::SHUFP, DL, MVT::v2f64, V1, V2,
13411 DAG.getTargetConstant(SHUFPDMask, DL, MVT::i8));
13412}
13413
13414/// Handle lowering of 2-lane 64-bit integer shuffles.
13415///
13416/// Tries to lower a 2-lane 64-bit shuffle using shuffle operations provided by
13417/// the integer unit to minimize domain crossing penalties. However, for blends
13418/// it falls back to the floating point shuffle operation with appropriate bit
13419/// casting.
13420static SDValue lowerV2I64Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
13421 const APInt &Zeroable, SDValue V1, SDValue V2,
13422 const X86Subtarget &Subtarget,
13423 SelectionDAG &DAG) {
13424 assert(V1.getSimpleValueType() == MVT::v2i64 && "Bad operand type!");
13425 assert(V2.getSimpleValueType() == MVT::v2i64 && "Bad operand type!");
13426 assert(Mask.size() == 2 && "Unexpected mask size for v2 shuffle!");
13427
13428 if (V2.isUndef()) {
13429 // Check for being able to broadcast a single element.
13430 if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v2i64, V1, V2,
13431 Mask, Subtarget, DAG))
13432 return Broadcast;
13433
13434 // Straight shuffle of a single input vector. For everything from SSE2
13435 // onward this has a single fast instruction with no scary immediates.
13436 // We have to map the mask as it is actually a v4i32 shuffle instruction.
13437 V1 = DAG.getBitcast(MVT::v4i32, V1);
13438 int WidenedMask[4] = {Mask[0] < 0 ? -1 : (Mask[0] * 2),
13439 Mask[0] < 0 ? -1 : ((Mask[0] * 2) + 1),
13440 Mask[1] < 0 ? -1 : (Mask[1] * 2),
13441 Mask[1] < 0 ? -1 : ((Mask[1] * 2) + 1)};
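// For example, a v2i64 mask <1, 0> widens to the v4i32 PSHUFD mask <2, 3, 0, 1>.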
13442 return DAG.getBitcast(
13443 MVT::v2i64,
13444 DAG.getNode(X86ISD::PSHUFD, DL, MVT::v4i32, V1,
13445 getV4X86ShuffleImm8ForMask(WidenedMask, DL, DAG)));
13446 }
13447 assert(Mask[0] != -1 && "No undef lanes in multi-input v2 shuffles!");
13448 assert(Mask[1] != -1 && "No undef lanes in multi-input v2 shuffles!");
13449 assert(Mask[0] < 2 && "We sort V1 to be the first input.");
13450 assert(Mask[1] >= 2 && "We sort V2 to be the second input.");
13451
13452 if (Subtarget.hasAVX2())
13453 if (SDValue Extract = lowerShuffleOfExtractsAsVperm(DL, V1, V2, Mask, DAG))
13454 return Extract;
13455
13456 // Try to use shift instructions.
13457 if (SDValue Shift =
13458 lowerShuffleAsShift(DL, MVT::v2i64, V1, V2, Mask, Zeroable, Subtarget,
13459 DAG, /*BitwiseOnly*/ false))
13460 return Shift;
13461
13462 // When loading a scalar and then shuffling it into a vector we can often do
13463 // the insertion cheaply.
13464 if (SDValue Insertion = lowerShuffleAsElementInsertion(
13465 DL, MVT::v2i64, V1, V2, Mask, Zeroable, Subtarget, DAG))
13466 return Insertion;
13467 // Try inverting the insertion since for v2 masks it is easy to do and we
13468 // can't reliably sort the mask one way or the other.
13469 int InverseMask[2] = {Mask[0] ^ 2, Mask[1] ^ 2};
13470 if (SDValue Insertion = lowerShuffleAsElementInsertion(
13471 DL, MVT::v2i64, V2, V1, InverseMask, Zeroable, Subtarget, DAG))
13472 return Insertion;
13473
13474 // We have different paths for blend lowering, but they all must use the
13475 // *exact* same predicate.
13476 bool IsBlendSupported = Subtarget.hasSSE41();
13477 if (IsBlendSupported)
13478 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v2i64, V1, V2, Mask,
13479 Zeroable, Subtarget, DAG))
13480 return Blend;
13481
13482 // Use dedicated unpack instructions for masks that match their pattern.
13483 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v2i64, V1, V2, Mask, DAG))
13484 return V;
13485
13486 // Try to use byte rotation instructions.
13487 // It's more profitable for pre-SSSE3 to use shuffles/unpacks.
13488 if (Subtarget.hasSSSE3()) {
13489 if (Subtarget.hasVLX())
13490 if (SDValue Rotate = lowerShuffleAsVALIGN(DL, MVT::v2i64, V1, V2, Mask,
13491 Zeroable, Subtarget, DAG))
13492 return Rotate;
13493
13494 if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v2i64, V1, V2, Mask,
13495 Subtarget, DAG))
13496 return Rotate;
13497 }
13498
13499 // If we have direct support for blends, we should lower by decomposing into
13500 // a permute. That will be faster than the domain cross.
13501 if (IsBlendSupported)
13502 return lowerShuffleAsDecomposedShuffleMerge(DL, MVT::v2i64, V1, V2, Mask,
13503 Zeroable, Subtarget, DAG);
13504
13505 // We implement this with SHUFPD which is pretty lame because it will likely
13506 // incur 2 cycles of stall for integer vectors on Nehalem and older chips.
13507 // However, all the alternatives are still more cycles and newer chips don't
13508 // have this problem. It would be really nice if x86 had better shuffles here.
13509 V1 = DAG.getBitcast(MVT::v2f64, V1);
13510 V2 = DAG.getBitcast(MVT::v2f64, V2);
13511 return DAG.getBitcast(MVT::v2i64,
13512 DAG.getVectorShuffle(MVT::v2f64, DL, V1, V2, Mask));
13513}
13514
13515/// Lower a vector shuffle using the SHUFPS instruction.
13516///
13517/// This is a helper routine dedicated to lowering vector shuffles using SHUFPS.
13518/// It makes no assumptions about whether this is the *best* lowering; it simply
13519/// uses it.
13520static SDValue lowerShuffleWithSHUFPS(const SDLoc &DL, MVT VT,
13521 ArrayRef<int> Mask, SDValue V1,
13522 SDValue V2, SelectionDAG &DAG) {
13523 SDValue LowV = V1, HighV = V2;
13524 SmallVector<int, 4> NewMask(Mask);
13525 int NumV2Elements = count_if(Mask, [](int M) { return M >= 4; });
13526
13527 if (NumV2Elements == 1) {
13528 int V2Index = find_if(Mask, [](int M) { return M >= 4; }) - Mask.begin();
13529
13530 // Compute the index adjacent to V2Index and in the same half by toggling
13531 // the low bit.
13532 int V2AdjIndex = V2Index ^ 1;
13533
13534 if (Mask[V2AdjIndex] < 0) {
13535 // Handles all the cases where we have a single V2 element and an undef.
13536 // This will only ever happen in the high lanes because we commute the
13537 // vector otherwise.
13538 if (V2Index < 2)
13539 std::swap(LowV, HighV);
13540 NewMask[V2Index] -= 4;
13541 } else {
13542 // Handle the case where the V2 element ends up adjacent to a V1 element.
13543 // To make this work, blend them together as the first step.
13544 int V1Index = V2AdjIndex;
13545 int BlendMask[4] = {Mask[V2Index] - 4, 0, Mask[V1Index], 0};
13546 V2 = DAG.getNode(X86ISD::SHUFP, DL, VT, V2, V1,
13547 getV4X86ShuffleImm8ForMask(BlendMask, DL, DAG));
13548
13549 // Now proceed to reconstruct the final blend as we have the necessary
13550 // high or low half formed.
13551 if (V2Index < 2) {
13552 LowV = V2;
13553 HighV = V1;
13554 } else {
13555 HighV = V2;
13556 }
13557 NewMask[V1Index] = 2; // We put the V1 element in V2[2].
13558 NewMask[V2Index] = 0; // We shifted the V2 element into V2[0].
13559 }
13560 } else if (NumV2Elements == 2) {
13561 if (Mask[0] < 4 && Mask[1] < 4) {
13562 // Handle the easy case where we have V1 in the low lanes and V2 in the
13563 // high lanes.
13564 NewMask[2] -= 4;
13565 NewMask[3] -= 4;
13566 } else if (Mask[2] < 4 && Mask[3] < 4) {
13567 // We also handle the reversed case because this utility may get called
13568 // when we detect a SHUFPS pattern but can't easily commute the shuffle to
13569 // arrange things in the right direction.
13570 NewMask[0] -= 4;
13571 NewMask[1] -= 4;
13572 HighV = V1;
13573 LowV = V2;
13574 } else {
13575 // We have a mixture of V1 and V2 in both low and high lanes. Rather than
13576 // trying to place elements directly, just blend them and set up the final
13577 // shuffle to place them.
13578
13579 // The first two blend mask elements are for V1, the second two are for
13580 // V2.
13581 int BlendMask[4] = {Mask[0] < 4 ? Mask[0] : Mask[1],
13582 Mask[2] < 4 ? Mask[2] : Mask[3],
13583 (Mask[0] >= 4 ? Mask[0] : Mask[1]) - 4,
13584 (Mask[2] >= 4 ? Mask[2] : Mask[3]) - 4};
13585 V1 = DAG.getNode(X86ISD::SHUFP, DL, VT, V1, V2,
13586 getV4X86ShuffleImm8ForMask(BlendMask, DL, DAG));
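// For example, with mask <0, 5, 2, 7> both this blend and the final shuffle
// below end up using the immediate for <0, 2, 1, 3>.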
13587
13588 // Now we do a normal shuffle of V1 by giving V1 as both operands to
13589 // a blend.
13590 LowV = HighV = V1;
13591 NewMask[0] = Mask[0] < 4 ? 0 : 2;
13592 NewMask[1] = Mask[0] < 4 ? 2 : 0;
13593 NewMask[2] = Mask[2] < 4 ? 1 : 3;
13594 NewMask[3] = Mask[2] < 4 ? 3 : 1;
13595 }
13596 } else if (NumV2Elements == 3) {
13597 // Ideally canonicalizeShuffleMaskWithCommute should have caught this, but
13598 // we can get here due to other paths (e.g. repeated mask matching) for
13599 // which we don't want to do another round of lowerVECTOR_SHUFFLE.
13600 ShuffleVectorSDNode::commuteMask(NewMask);
13601 return lowerShuffleWithSHUFPS(DL, VT, NewMask, V2, V1, DAG);
13602 }
13603 return DAG.getNode(X86ISD::SHUFP, DL, VT, LowV, HighV,
13604 getV4X86ShuffleImm8ForMask(NewMask, DL, DAG));
13605}
13606
13607/// Lower 4-lane 32-bit floating point shuffles.
13608///
13609/// Uses instructions exclusively from the floating point unit to minimize
13610/// domain crossing penalties, as these are sufficient to implement all v4f32
13611/// shuffles.
13612static SDValue lowerV4F32Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
13613 const APInt &Zeroable, SDValue V1, SDValue V2,
13614 const X86Subtarget &Subtarget,
13615 SelectionDAG &DAG) {
13616 assert(V1.getSimpleValueType() == MVT::v4f32 && "Bad operand type!");
13617 assert(V2.getSimpleValueType() == MVT::v4f32 && "Bad operand type!");
13618 assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!");
13619
13620 if (Subtarget.hasSSE41())
13621 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v4f32, V1, V2, Mask,
13622 Zeroable, Subtarget, DAG))
13623 return Blend;
13624
13625 int NumV2Elements = count_if(Mask, [](int M) { return M >= 4; });
13626
13627 if (NumV2Elements == 0) {
13628 // Check for being able to broadcast a single element.
13629 if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v4f32, V1, V2,
13630 Mask, Subtarget, DAG))
13631 return Broadcast;
13632
13633 // Use even/odd duplicate instructions for masks that match their pattern.
13634 if (Subtarget.hasSSE3()) {
13635 if (isShuffleEquivalent(Mask, {0, 0, 2, 2}, V1, V2))
13636 return DAG.getNode(X86ISD::MOVSLDUP, DL, MVT::v4f32, V1);
13637 if (isShuffleEquivalent(Mask, {1, 1, 3, 3}, V1, V2))
13638 return DAG.getNode(X86ISD::MOVSHDUP, DL, MVT::v4f32, V1);
13639 }
13640
13641 if (Subtarget.hasAVX()) {
13642 // If we have AVX, we can use VPERMILPS which will allow folding a load
13643 // into the shuffle.
13644 return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v4f32, V1,
13645 getV4X86ShuffleImm8ForMask(Mask, DL, DAG));
13646 }
13647
13648 // Use MOVLHPS/MOVHLPS to simulate unary shuffles. These are only valid
13649 // in SSE1 because otherwise they are widened to v2f64 and never get here.
13650 if (!Subtarget.hasSSE2()) {
13651 if (isShuffleEquivalent(Mask, {0, 1, 0, 1}, V1, V2))
13652 return DAG.getNode(X86ISD::MOVLHPS, DL, MVT::v4f32, V1, V1);
13653 if (isShuffleEquivalent(Mask, {2, 3, 2, 3}, V1, V2))
13654 return DAG.getNode(X86ISD::MOVHLPS, DL, MVT::v4f32, V1, V1);
13655 }
13656
13657 // Otherwise, use a straight shuffle of a single input vector. We pass the
13658 // input vector to both operands to simulate this with a SHUFPS.
13659 return DAG.getNode(X86ISD::SHUFP, DL, MVT::v4f32, V1, V1,
13660 getV4X86ShuffleImm8ForMask(Mask, DL, DAG));
13661 }
13662
13663 if (Subtarget.hasSSE2())
13664 if (SDValue ZExt = lowerShuffleAsZeroOrAnyExtend(
13665 DL, MVT::v4i32, V1, V2, Mask, Zeroable, Subtarget, DAG)) {
13666 ZExt = DAG.getBitcast(MVT::v4f32, ZExt);
13667 return ZExt;
13668 }
13669
13670 if (Subtarget.hasAVX2())
13671 if (SDValue Extract = lowerShuffleOfExtractsAsVperm(DL, V1, V2, Mask, DAG))
13672 return Extract;
13673
13674 // There are special ways we can lower some single-element blends. However, we
13675 // have custom ways we can lower more complex single-element blends below that
13676 // we defer to if both this and BLENDPS fail to match, so restrict this to
13677 // when the V2 input is targeting element 0 of the mask -- that is the fast
13678 // case here.
13679 if (NumV2Elements == 1 && Mask[0] >= 4)
13680 if (SDValue V = lowerShuffleAsElementInsertion(
13681 DL, MVT::v4f32, V1, V2, Mask, Zeroable, Subtarget, DAG))
13682 return V;
13683
13684 if (Subtarget.hasSSE41()) {
13685 // Use INSERTPS if we can complete the shuffle efficiently.
13686 if (SDValue V = lowerShuffleAsInsertPS(DL, V1, V2, Mask, Zeroable, DAG))
13687 return V;
13688
13689 if (!isSingleSHUFPSMask(Mask))
13690 if (SDValue BlendPerm = lowerShuffleAsBlendAndPermute(DL, MVT::v4f32, V1,
13691 V2, Mask, DAG))
13692 return BlendPerm;
13693 }
13694
13695 // Use low/high mov instructions. These are only valid in SSE1 because
13696 // otherwise they are widened to v2f64 and never get here.
13697 if (!Subtarget.hasSSE2()) {
13698 if (isShuffleEquivalent(Mask, {0, 1, 4, 5}, V1, V2))
13699 return DAG.getNode(X86ISD::MOVLHPS, DL, MVT::v4f32, V1, V2);
13700 if (isShuffleEquivalent(Mask, {2, 3, 6, 7}, V1, V2))
13701 return DAG.getNode(X86ISD::MOVHLPS, DL, MVT::v4f32, V2, V1);
13702 }
13703
13704 // Use dedicated unpack instructions for masks that match their pattern.
13705 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v4f32, V1, V2, Mask, DAG))
13706 return V;
13707
13708 // Otherwise fall back to a SHUFPS lowering strategy.
13709 return lowerShuffleWithSHUFPS(DL, MVT::v4f32, Mask, V1, V2, DAG);
13710}
13711
13712/// Lower 4-lane i32 vector shuffles.
13713///
13714/// We try to handle these with integer-domain shuffles where we can, but for
13715/// blends we use the floating point domain blend instructions.
13716static SDValue lowerV4I32Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
13717 const APInt &Zeroable, SDValue V1, SDValue V2,
13718 const X86Subtarget &Subtarget,
13719 SelectionDAG &DAG) {
13720 assert(V1.getSimpleValueType() == MVT::v4i32 && "Bad operand type!");
13721 assert(V2.getSimpleValueType() == MVT::v4i32 && "Bad operand type!");
13722 assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!");
13723
13724 // Whenever we can lower this as a zext, that instruction is strictly faster
13725 // than any alternative. It also allows us to fold memory operands into the
13726 // shuffle in many cases.
13727 if (SDValue ZExt = lowerShuffleAsZeroOrAnyExtend(DL, MVT::v4i32, V1, V2, Mask,
13728 Zeroable, Subtarget, DAG))
13729 return ZExt;
13730
13731 int NumV2Elements = count_if(Mask, [](int M) { return M >= 4; });
13732
13733 // Try to use shift instructions if fast.
13734 if (Subtarget.preferLowerShuffleAsShift()) {
13735 if (SDValue Shift =
13736 lowerShuffleAsShift(DL, MVT::v4i32, V1, V2, Mask, Zeroable,
13737 Subtarget, DAG, /*BitwiseOnly*/ true))
13738 return Shift;
13739 if (NumV2Elements == 0)
13740 if (SDValue Rotate =
13741 lowerShuffleAsBitRotate(DL, MVT::v4i32, V1, Mask, Subtarget, DAG))
13742 return Rotate;
13743 }
13744
13745 if (NumV2Elements == 0) {
13746 // Try to use broadcast unless the mask only has one non-undef element.
13747 if (count_if(Mask, [](int M) { return M >= 0 && M < 4; }) > 1) {
13748 if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v4i32, V1, V2,
13749 Mask, Subtarget, DAG))
13750 return Broadcast;
13751 }
13752
13753 // Straight shuffle of a single input vector. For everything from SSE2
13754 // onward this has a single fast instruction with no scary immediates.
13755 // We coerce the shuffle pattern to be compatible with UNPCK instructions
13756 // but we aren't actually going to use the UNPCK instruction because doing
13757 // so prevents folding a load into this instruction or making a copy.
13758 const int UnpackLoMask[] = {0, 0, 1, 1};
13759 const int UnpackHiMask[] = {2, 2, 3, 3};
13760 if (isShuffleEquivalent(Mask, {0, 0, 1, 1}, V1, V2))
13761 Mask = UnpackLoMask;
13762 else if (isShuffleEquivalent(Mask, {2, 2, 3, 3}, V1, V2))
13763 Mask = UnpackHiMask;
13764
13765 return DAG.getNode(X86ISD::PSHUFD, DL, MVT::v4i32, V1,
13766 getV4X86ShuffleImm8ForMask(Mask, DL, DAG));
13767 }
13768
13769 if (Subtarget.hasAVX2())
13770 if (SDValue Extract = lowerShuffleOfExtractsAsVperm(DL, V1, V2, Mask, DAG))
13771 return Extract;
13772
13773 // Try to use shift instructions.
13774 if (SDValue Shift =
13775 lowerShuffleAsShift(DL, MVT::v4i32, V1, V2, Mask, Zeroable, Subtarget,
13776 DAG, /*BitwiseOnly*/ false))
13777 return Shift;
13778
13779 // There are special ways we can lower some single-element blends.
13780 if (NumV2Elements == 1)
13781 if (SDValue V = lowerShuffleAsElementInsertion(
13782 DL, MVT::v4i32, V1, V2, Mask, Zeroable, Subtarget, DAG))
13783 return V;
13784
13785 // We have different paths for blend lowering, but they all must use the
13786 // *exact* same predicate.
13787 bool IsBlendSupported = Subtarget.hasSSE41();
13788 if (IsBlendSupported)
13789 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v4i32, V1, V2, Mask,
13790 Zeroable, Subtarget, DAG))
13791 return Blend;
13792
13793 if (SDValue Masked = lowerShuffleAsBitMask(DL, MVT::v4i32, V1, V2, Mask,
13794 Zeroable, Subtarget, DAG))
13795 return Masked;
13796
13797 // Use dedicated unpack instructions for masks that match their pattern.
13798 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v4i32, V1, V2, Mask, DAG))
13799 return V;
13800
13801 // Try to use byte rotation instructions.
13802 // It's more profitable for pre-SSSE3 to use shuffles/unpacks.
13803 if (Subtarget.hasSSSE3()) {
13804 if (Subtarget.hasVLX())
13805 if (SDValue Rotate = lowerShuffleAsVALIGN(DL, MVT::v4i32, V1, V2, Mask,
13806 Zeroable, Subtarget, DAG))
13807 return Rotate;
13808
13809 if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v4i32, V1, V2, Mask,
13810 Subtarget, DAG))
13811 return Rotate;
13812 }
13813
13814 // Assume that a single SHUFPS is faster than an alternative sequence of
13815 // multiple instructions (even if the CPU has a domain penalty).
13816 // If some CPU is harmed by the domain switch, we can fix it in a later pass.
13817 if (!isSingleSHUFPSMask(Mask)) {
13818 // If we have direct support for blends, we should lower by decomposing into
13819 // a permute. That will be faster than the domain cross.
13820 if (IsBlendSupported)
13821 return lowerShuffleAsDecomposedShuffleMerge(DL, MVT::v4i32, V1, V2, Mask,
13822 Zeroable, Subtarget, DAG);
13823
13824 // Try to lower by permuting the inputs into an unpack instruction.
13825 if (SDValue Unpack = lowerShuffleAsPermuteAndUnpack(DL, MVT::v4i32, V1, V2,
13826 Mask, Subtarget, DAG))
13827 return Unpack;
13828 }
13829
13830 // We implement this with SHUFPS because it can blend from two vectors.
13831 // Because we're going to eventually use SHUFPS, we use SHUFPS even to build
13832 // up the inputs, bypassing domain shift penalties that we would incur if we
13833 // directly used PSHUFD on Nehalem and older. For newer chips, this isn't
13834 // relevant.
13835 SDValue CastV1 = DAG.getBitcast(MVT::v4f32, V1);
13836 SDValue CastV2 = DAG.getBitcast(MVT::v4f32, V2);
13837 SDValue ShufPS = DAG.getVectorShuffle(MVT::v4f32, DL, CastV1, CastV2, Mask);
13838 return DAG.getBitcast(MVT::v4i32, ShufPS);
13839}
13840
13841/// Lowering of single-input v8i16 shuffles is the cornerstone of SSE2
13842/// shuffle lowering, and the most complex part.
13843///
13844/// The lowering strategy is to try to form pairs of input lanes which are
13845/// targeted at the same half of the final vector, and then use a dword shuffle
13846/// to place them onto the right half, and finally unpack the paired lanes into
13847/// their final position.
13848///
13849/// The exact breakdown of how to form these dword pairs and align them on the
13850/// correct sides is really tricky. See the comments within the function for
13851/// more of the details.
13852///
13853/// This code also handles repeated 128-bit lanes of v8i16 shuffles, but each
13854/// lane must shuffle the *exact* same way. In fact, you must pass a v8 Mask to
13855/// this routine for it to work correctly. To shuffle a 256-bit or 512-bit i16
13856/// vector, form the analogous 128-bit 8-element Mask.
13857static SDValue lowerV8I16GeneralSingleInputShuffle(
13858 const SDLoc &DL, MVT VT, SDValue V, MutableArrayRef<int> Mask,
13859 const X86Subtarget &Subtarget, SelectionDAG &DAG) {
13860 assert(VT.getVectorElementType() == MVT::i16 && "Bad input type!");
13861 MVT PSHUFDVT = MVT::getVectorVT(MVT::i32, VT.getVectorNumElements() / 2);
13862
13863 assert(Mask.size() == 8 && "Shuffle mask length doesn't match!");
13864 MutableArrayRef<int> LoMask = Mask.slice(0, 4);
13865 MutableArrayRef<int> HiMask = Mask.slice(4, 4);
13866
13867 // Attempt to directly match PSHUFLW or PSHUFHW.
13868 if (isUndefOrInRange(LoMask, 0, 4) &&
13869 isSequentialOrUndefInRange(HiMask, 0, 4, 4)) {
13870 return DAG.getNode(X86ISD::PSHUFLW, DL, VT, V,
13871 getV4X86ShuffleImm8ForMask(LoMask, DL, DAG));
13872 }
13873 if (isUndefOrInRange(HiMask, 4, 8) &&
13874 isSequentialOrUndefInRange(LoMask, 0, 4, 0)) {
13875 for (int i = 0; i != 4; ++i)
13876 HiMask[i] = (HiMask[i] < 0 ? HiMask[i] : (HiMask[i] - 4));
13877 return DAG.getNode(X86ISD::PSHUFHW, DL, VT, V,
13878 getV4X86ShuffleImm8ForMask(HiMask, DL, DAG));
13879 }
13880
13881 SmallVector<int, 4> LoInputs;
13882 copy_if(LoMask, std::back_inserter(LoInputs), [](int M) { return M >= 0; });
13883 array_pod_sort(LoInputs.begin(), LoInputs.end());
13884 LoInputs.erase(llvm::unique(LoInputs), LoInputs.end());
13885 SmallVector<int, 4> HiInputs;
13886 copy_if(HiMask, std::back_inserter(HiInputs), [](int M) { return M >= 0; });
13887 array_pod_sort(HiInputs.begin(), HiInputs.end());
13888 HiInputs.erase(llvm::unique(HiInputs), HiInputs.end());
13889 int NumLToL = llvm::lower_bound(LoInputs, 4) - LoInputs.begin();
13890 int NumHToL = LoInputs.size() - NumLToL;
13891 int NumLToH = llvm::lower_bound(HiInputs, 4) - HiInputs.begin();
13892 int NumHToH = HiInputs.size() - NumLToH;
13893 MutableArrayRef<int> LToLInputs(LoInputs.data(), NumLToL);
13894 MutableArrayRef<int> LToHInputs(HiInputs.data(), NumLToH);
13895 MutableArrayRef<int> HToLInputs(LoInputs.data() + NumLToL, NumHToL);
13896 MutableArrayRef<int> HToHInputs(HiInputs.data() + NumLToH, NumHToH);
13897
13898 // If we are shuffling values from one half - check how many different DWORD
13899 // pairs we need to create. If only 1 or 2 then we can perform this as a
13900 // PSHUFLW/PSHUFHW + PSHUFD instead of the PSHUFD+PSHUFLW+PSHUFHW chain below.
13901 auto ShuffleDWordPairs = [&](ArrayRef<int> PSHUFHalfMask,
13902 ArrayRef<int> PSHUFDMask, unsigned ShufWOp) {
13903 V = DAG.getNode(ShufWOp, DL, VT, V,
13904 getV4X86ShuffleImm8ForMask(PSHUFHalfMask, DL, DAG));
13905 V = DAG.getBitcast(PSHUFDVT, V);
13906 V = DAG.getNode(X86ISD::PSHUFD, DL, PSHUFDVT, V,
13907 getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG));
13908 return DAG.getBitcast(VT, V);
13909 };
13910
13911 if ((NumHToL + NumHToH) == 0 || (NumLToL + NumLToH) == 0) {
13912 int PSHUFDMask[4] = { -1, -1, -1, -1 };
13913 SmallVector<std::pair<int, int>, 4> DWordPairs;
13914 int DOffset = ((NumHToL + NumHToH) == 0 ? 0 : 2);
13915
13916 // Collect the different DWORD pairs.
13917 for (int DWord = 0; DWord != 4; ++DWord) {
13918 int M0 = Mask[2 * DWord + 0];
13919 int M1 = Mask[2 * DWord + 1];
13920 M0 = (M0 >= 0 ? M0 % 4 : M0);
13921 M1 = (M1 >= 0 ? M1 % 4 : M1);
13922 if (M0 < 0 && M1 < 0)
13923 continue;
13924
13925 bool Match = false;
13926 for (int j = 0, e = DWordPairs.size(); j < e; ++j) {
13927 auto &DWordPair = DWordPairs[j];
13928 if ((M0 < 0 || isUndefOrEqual(DWordPair.first, M0)) &&
13929 (M1 < 0 || isUndefOrEqual(DWordPair.second, M1))) {
13930 DWordPair.first = (M0 >= 0 ? M0 : DWordPair.first);
13931 DWordPair.second = (M1 >= 0 ? M1 : DWordPair.second);
13932 PSHUFDMask[DWord] = DOffset + j;
13933 Match = true;
13934 break;
13935 }
13936 }
13937 if (!Match) {
13938 PSHUFDMask[DWord] = DOffset + DWordPairs.size();
13939 DWordPairs.push_back(std::make_pair(M0, M1));
13940 }
13941 }
13942
13943 if (DWordPairs.size() <= 2) {
13944 DWordPairs.resize(2, std::make_pair(-1, -1));
13945 int PSHUFHalfMask[4] = {DWordPairs[0].first, DWordPairs[0].second,
13946 DWordPairs[1].first, DWordPairs[1].second};
13947 // For splat, ensure we widen the PSHUFDMask to allow vXi64 folds.
13948 if (ShuffleVectorSDNode::isSplatMask(PSHUFDMask) &&
13949 ShuffleVectorSDNode::isSplatMask(PSHUFHalfMask)) {
13950 int SplatIdx = ShuffleVectorSDNode::getSplatMaskIndex(PSHUFHalfMask);
13951 std::fill(PSHUFHalfMask, PSHUFHalfMask + 4, SplatIdx);
13952 PSHUFDMask[0] = PSHUFDMask[2] = DOffset + 0;
13953 PSHUFDMask[1] = PSHUFDMask[3] = DOffset + 1;
13954 }
13955 if ((NumHToL + NumHToH) == 0)
13956 return ShuffleDWordPairs(PSHUFHalfMask, PSHUFDMask, X86ISD::PSHUFLW);
13957 if ((NumLToL + NumLToH) == 0)
13958 return ShuffleDWordPairs(PSHUFHalfMask, PSHUFDMask, X86ISD::PSHUFHW);
13959 }
13960 }
13961
13962 // Simplify the 1-into-3 and 3-into-1 cases with a single pshufd. For all
13963 // such inputs we can swap two of the dwords across the half mark and end up
13964 // with <=2 inputs from each half in each half. Once there, we can fall through
13965 // to the generic code below. For example:
13966 //
13967 // Input: [a, b, c, d, e, f, g, h] -PSHUFD[0,2,1,3]-> [a, b, e, f, c, d, g, h]
13968 // Mask: [0, 1, 2, 7, 4, 5, 6, 3] -----------------> [0, 1, 4, 7, 2, 3, 6, 5]
13969 //
13970 // However in some very rare cases we have a 1-into-3 or 3-into-1 on one half
13971 // and an existing 2-into-2 on the other half. In this case we may have to
13972 // pre-shuffle the 2-into-2 half to avoid turning it into a 3-into-1 or
13973 // 1-into-3 which could cause us to cycle endlessly fixing each side in turn.
13974 // Fortunately, we don't have to handle anything but a 2-into-2 pattern
13975 // because any other situation (including a 3-into-1 or 1-into-3 in the half
13976 // other than the one we target for fixing) will be fixed when we re-enter this
13977 // path. We will also combine any sequence of PSHUFD instructions that
13978 // results into a single instruction. Here is an example of the tricky case:
13979 //
13980 // Input: [a, b, c, d, e, f, g, h] -PSHUFD[0,2,1,3]-> [a, b, e, f, c, d, g, h]
13981 // Mask: [3, 7, 1, 0, 2, 7, 3, 5] -THIS-IS-BAD!!!!-> [5, 7, 1, 0, 4, 7, 5, 3]
13982 //
13983 // This now has a 1-into-3 in the high half! Instead, we do two shuffles:
13984 //
13985 // Input: [a, b, c, d, e, f, g, h] PSHUFHW[0,2,1,3]-> [a, b, c, d, e, g, f, h]
13986 // Mask: [3, 7, 1, 0, 2, 7, 3, 5] -----------------> [3, 7, 1, 0, 2, 7, 3, 6]
13987 //
13988 // Input: [a, b, c, d, e, g, f, h] -PSHUFD[0,2,1,3]-> [a, b, e, g, c, d, f, h]
13989 // Mask: [3, 7, 1, 0, 2, 7, 3, 6] -----------------> [5, 7, 1, 0, 4, 7, 5, 6]
13990 //
13991 // The result is fine to be handled by the generic logic.
13992 auto balanceSides = [&](ArrayRef<int> AToAInputs, ArrayRef<int> BToAInputs,
13993 ArrayRef<int> BToBInputs, ArrayRef<int> AToBInputs,
13994 int AOffset, int BOffset) {
13995 assert((AToAInputs.size() == 3 || AToAInputs.size() == 1) &&
13996 "Must call this with A having 3 or 1 inputs from the A half.");
13997 assert((BToAInputs.size() == 1 || BToAInputs.size() == 3) &&
13998 "Must call this with B having 1 or 3 inputs from the B half.");
13999 assert(AToAInputs.size() + BToAInputs.size() == 4 &&
14000 "Must call this with either 3:1 or 1:3 inputs (summing to 4).");
14001
14002 bool ThreeAInputs = AToAInputs.size() == 3;
14003
14004 // Compute the index of dword with only one word among the three inputs in
14005 // a half by taking the sum of the half with three inputs and subtracting
14006 // the sum of the actual three inputs. The difference is the remaining
14007 // slot.
14008 int ADWord = 0, BDWord = 0;
14009 int &TripleDWord = ThreeAInputs ? ADWord : BDWord;
14010 int &OneInputDWord = ThreeAInputs ? BDWord : ADWord;
14011 int TripleInputOffset = ThreeAInputs ? AOffset : BOffset;
14012 ArrayRef<int> TripleInputs = ThreeAInputs ? AToAInputs : BToAInputs;
14013 int OneInput = ThreeAInputs ? BToAInputs[0] : AToAInputs[0];
14014 int TripleInputSum = 0 + 1 + 2 + 3 + (4 * TripleInputOffset);
14015 int TripleNonInputIdx =
14016 TripleInputSum - std::accumulate(TripleInputs.begin(), TripleInputs.end(), 0);
14017 TripleDWord = TripleNonInputIdx / 2;
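// For example, with high-half inputs {4, 5, 7} this gives
// (4 + 5 + 6 + 7) - (4 + 5 + 7) = 6, so the lone remaining word lives in dword 3.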
14018
14019 // We use xor with one to compute the adjacent DWord to whichever one the
14020 // OneInput is in.
14021 OneInputDWord = (OneInput / 2) ^ 1;
14022
14023 // Check for one tricky case: We're fixing a 3<-1 or a 1<-3 shuffle for AToA
14024 // and BToA inputs. If there is also such a problem with the BToB and AToB
14025 // inputs, we don't try to fix it necessarily -- we'll recurse and see it in
14026 // the next pass. However, if we have a 2<-2 in the BToB and AToB inputs, it
14027 // is essential that we don't *create* a 3<-1 as then we might oscillate.
14028 if (BToBInputs.size() == 2 && AToBInputs.size() == 2) {
14029 // Compute how many inputs will be flipped by swapping these DWords. We need
14030 // to balance this to ensure we don't form a 3-1 shuffle in the other
14031 // half.
14033 int NumFlippedAToBInputs = llvm::count(AToBInputs, 2 * ADWord) +
14034 llvm::count(AToBInputs, 2 * ADWord + 1);
14035 int NumFlippedBToBInputs = llvm::count(BToBInputs, 2 * BDWord) +
14036 llvm::count(BToBInputs, 2 * BDWord + 1);
14037 if ((NumFlippedAToBInputs == 1 &&
14038 (NumFlippedBToBInputs == 0 || NumFlippedBToBInputs == 2)) ||
14039 (NumFlippedBToBInputs == 1 &&
14040 (NumFlippedAToBInputs == 0 || NumFlippedAToBInputs == 2))) {
14041 // We choose whether to fix the A half or B half based on whether that
14042 // half has zero flipped inputs. At zero, we may not be able to fix it
14043 // with that half. We also bias towards fixing the B half because that
14044 // will more commonly be the high half, and we have to bias one way.
14045 auto FixFlippedInputs = [&V, &DL, &Mask, &DAG](int PinnedIdx, int DWord,
14046 ArrayRef<int> Inputs) {
14047 int FixIdx = PinnedIdx ^ 1; // The adjacent slot to the pinned slot.
14048 bool IsFixIdxInput = is_contained(Inputs, PinnedIdx ^ 1);
14049 // Determine whether the free index is in the flipped dword or the
14050 // unflipped dword based on where the pinned index is. We use this bit
14051 // in an xor to conditionally select the adjacent dword.
14052 int FixFreeIdx = 2 * (DWord ^ (PinnedIdx / 2 == DWord));
14053 bool IsFixFreeIdxInput = is_contained(Inputs, FixFreeIdx);
14054 if (IsFixIdxInput == IsFixFreeIdxInput)
14055 FixFreeIdx += 1;
14056 IsFixFreeIdxInput = is_contained(Inputs, FixFreeIdx);
14057 assert(IsFixIdxInput != IsFixFreeIdxInput &&
14058 "We need to be changing the number of flipped inputs!");
14059 int PSHUFHalfMask[] = {0, 1, 2, 3};
14060 std::swap(PSHUFHalfMask[FixFreeIdx % 4], PSHUFHalfMask[FixIdx % 4]);
14061 V = DAG.getNode(
14062 FixIdx < 4 ? X86ISD::PSHUFLW : X86ISD::PSHUFHW, DL,
14063 MVT::getVectorVT(MVT::i16, V.getValueSizeInBits() / 16), V,
14064 getV4X86ShuffleImm8ForMask(PSHUFHalfMask, DL, DAG));
14065
14066 for (int &M : Mask)
14067 if (M >= 0 && M == FixIdx)
14068 M = FixFreeIdx;
14069 else if (M >= 0 && M == FixFreeIdx)
14070 M = FixIdx;
14071 };
14072 if (NumFlippedBToBInputs != 0) {
14073 int BPinnedIdx =
14074 BToAInputs.size() == 3 ? TripleNonInputIdx : OneInput;
14075 FixFlippedInputs(BPinnedIdx, BDWord, BToBInputs);
14076 } else {
14077 assert(NumFlippedAToBInputs != 0 && "Impossible given predicates!");
14078 int APinnedIdx = ThreeAInputs ? TripleNonInputIdx : OneInput;
14079 FixFlippedInputs(APinnedIdx, ADWord, AToBInputs);
14080 }
14081 }
14082 }
14083
14084 int PSHUFDMask[] = {0, 1, 2, 3};
14085 PSHUFDMask[ADWord] = BDWord;
14086 PSHUFDMask[BDWord] = ADWord;
14087 V = DAG.getBitcast(
14088 VT,
14089 DAG.getNode(X86ISD::PSHUFD, DL, PSHUFDVT, DAG.getBitcast(PSHUFDVT, V),
14090 getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG)));
14091
14092 // Adjust the mask to match the new locations of A and B.
14093 for (int &M : Mask)
14094 if (M >= 0 && M/2 == ADWord)
14095 M = 2 * BDWord + M % 2;
14096 else if (M >= 0 && M/2 == BDWord)
14097 M = 2 * ADWord + M % 2;
14098
14099 // Recurse back into this routine to re-compute state now that this isn't
14100 // a 3 and 1 problem.
14101 return lowerV8I16GeneralSingleInputShuffle(DL, VT, V, Mask, Subtarget, DAG);
14102 };
14103 if ((NumLToL == 3 && NumHToL == 1) || (NumLToL == 1 && NumHToL == 3))
14104 return balanceSides(LToLInputs, HToLInputs, HToHInputs, LToHInputs, 0, 4);
14105 if ((NumHToH == 3 && NumLToH == 1) || (NumHToH == 1 && NumLToH == 3))
14106 return balanceSides(HToHInputs, LToHInputs, LToLInputs, HToLInputs, 4, 0);
14107
14108 // At this point there are at most two inputs to the low and high halves from
14109 // each half. That means the inputs can always be grouped into dwords and
14110 // those dwords can then be moved to the correct half with a dword shuffle.
14111 // We use at most one low and one high word shuffle to collect these paired
14112 // inputs into dwords, and finally a dword shuffle to place them.
14113 int PSHUFLMask[4] = {-1, -1, -1, -1};
14114 int PSHUFHMask[4] = {-1, -1, -1, -1};
14115 int PSHUFDMask[4] = {-1, -1, -1, -1};
14116
14117 // First fix the masks for all the inputs that are staying in their
14118 // original halves. This will then dictate the targets of the cross-half
14119 // shuffles.
14120 auto fixInPlaceInputs =
14121 [&PSHUFDMask](ArrayRef<int> InPlaceInputs, ArrayRef<int> IncomingInputs,
14122 MutableArrayRef<int> SourceHalfMask,
14123 MutableArrayRef<int> HalfMask, int HalfOffset) {
14124 if (InPlaceInputs.empty())
14125 return;
14126 if (InPlaceInputs.size() == 1) {
14127 SourceHalfMask[InPlaceInputs[0] - HalfOffset] =
14128 InPlaceInputs[0] - HalfOffset;
14129 PSHUFDMask[InPlaceInputs[0] / 2] = InPlaceInputs[0] / 2;
14130 return;
14131 }
14132 if (IncomingInputs.empty()) {
14133 // Just fix all of the in place inputs.
14134 for (int Input : InPlaceInputs) {
14135 SourceHalfMask[Input - HalfOffset] = Input - HalfOffset;
14136 PSHUFDMask[Input / 2] = Input / 2;
14137 }
14138 return;
14139 }
14140
14141 assert(InPlaceInputs.size() == 2 && "Cannot handle 3 or 4 inputs!");
14142 SourceHalfMask[InPlaceInputs[0] - HalfOffset] =
14143 InPlaceInputs[0] - HalfOffset;
14144 // Put the second input next to the first so that they are packed into
14145 // a dword. We find the adjacent index by toggling the low bit.
14146 int AdjIndex = InPlaceInputs[0] ^ 1;
14147 SourceHalfMask[AdjIndex - HalfOffset] = InPlaceInputs[1] - HalfOffset;
14148 llvm::replace(HalfMask, InPlaceInputs[1], AdjIndex);
14149 PSHUFDMask[AdjIndex / 2] = AdjIndex / 2;
14150 };
14151 fixInPlaceInputs(LToLInputs, HToLInputs, PSHUFLMask, LoMask, 0);
14152 fixInPlaceInputs(HToHInputs, LToHInputs, PSHUFHMask, HiMask, 4);
14153
14154 // Now gather the cross-half inputs and place them into a free dword of
14155 // their target half.
14156 // FIXME: This operation could almost certainly be simplified dramatically to
14157 // look more like the 3-1 fixing operation.
14158 auto moveInputsToRightHalf = [&PSHUFDMask](
14159 MutableArrayRef<int> IncomingInputs, ArrayRef<int> ExistingInputs,
14160 MutableArrayRef<int> SourceHalfMask, MutableArrayRef<int> HalfMask,
14161 MutableArrayRef<int> FinalSourceHalfMask, int SourceOffset,
14162 int DestOffset) {
14163 auto isWordClobbered = [](ArrayRef<int> SourceHalfMask, int Word) {
14164 return SourceHalfMask[Word] >= 0 && SourceHalfMask[Word] != Word;
14165 };
14166 auto isDWordClobbered = [&isWordClobbered](ArrayRef<int> SourceHalfMask,
14167 int Word) {
14168 int LowWord = Word & ~1;
14169 int HighWord = Word | 1;
14170 return isWordClobbered(SourceHalfMask, LowWord) ||
14171 isWordClobbered(SourceHalfMask, HighWord);
14172 };
14173
14174 if (IncomingInputs.empty())
14175 return;
14176
14177 if (ExistingInputs.empty()) {
14178 // Map any dwords with inputs from them into the right half.
14179 for (int Input : IncomingInputs) {
14180 // If the source half mask maps over the inputs, turn those into
14181 // swaps and use the swapped lane.
14182 if (isWordClobbered(SourceHalfMask, Input - SourceOffset)) {
14183 if (SourceHalfMask[SourceHalfMask[Input - SourceOffset]] < 0) {
14184 SourceHalfMask[SourceHalfMask[Input - SourceOffset]] =
14185 Input - SourceOffset;
14186 // We have to swap the uses in our half mask in one sweep.
14187 for (int &M : HalfMask)
14188 if (M == SourceHalfMask[Input - SourceOffset] + SourceOffset)
14189 M = Input;
14190 else if (M == Input)
14191 M = SourceHalfMask[Input - SourceOffset] + SourceOffset;
14192 } else {
14193 assert(SourceHalfMask[SourceHalfMask[Input - SourceOffset]] ==
14194 Input - SourceOffset &&
14195 "Previous placement doesn't match!");
14196 }
14197 // Note that this correctly re-maps both when we do a swap and when
14198 // we observe the other side of the swap above. We rely on that to
14199 // avoid swapping the members of the input list directly.
14200 Input = SourceHalfMask[Input - SourceOffset] + SourceOffset;
14201 }
14202
14203 // Map the input's dword into the correct half.
14204 if (PSHUFDMask[(Input - SourceOffset + DestOffset) / 2] < 0)
14205 PSHUFDMask[(Input - SourceOffset + DestOffset) / 2] = Input / 2;
14206 else
14207 assert(PSHUFDMask[(Input - SourceOffset + DestOffset) / 2] ==
14208 Input / 2 &&
14209 "Previous placement doesn't match!");
14210 }
14211
14212 // And just directly shift any other-half mask elements to be same-half
14213 // as we will have mirrored the dword containing the element into the
14214 // same position within that half.
14215 for (int &M : HalfMask)
14216 if (M >= SourceOffset && M < SourceOffset + 4) {
14217 M = M - SourceOffset + DestOffset;
14218 assert(M >= 0 && "This should never wrap below zero!");
14219 }
14220 return;
14221 }
14222
14223 // Ensure we have the input in a viable dword of its current half. This
14224 // is particularly tricky because the original position may be clobbered
14225 // by inputs being moved and *staying* in that half.
14226 if (IncomingInputs.size() == 1) {
14227 if (isWordClobbered(SourceHalfMask, IncomingInputs[0] - SourceOffset)) {
14228 int InputFixed = find(SourceHalfMask, -1) - std::begin(SourceHalfMask) +
14229 SourceOffset;
14230 SourceHalfMask[InputFixed - SourceOffset] =
14231 IncomingInputs[0] - SourceOffset;
14232 llvm::replace(HalfMask, IncomingInputs[0], InputFixed);
14233 IncomingInputs[0] = InputFixed;
14234 }
14235 } else if (IncomingInputs.size() == 2) {
14236 if (IncomingInputs[0] / 2 != IncomingInputs[1] / 2 ||
14237 isDWordClobbered(SourceHalfMask, IncomingInputs[0] - SourceOffset)) {
14238 // We have two non-adjacent or clobbered inputs we need to extract from
14239 // the source half. To do this, we need to map them into some adjacent
14240 // dword slot in the source mask.
14241 int InputsFixed[2] = {IncomingInputs[0] - SourceOffset,
14242 IncomingInputs[1] - SourceOffset};
14243
14244 // If there is a free slot in the source half mask adjacent to one of
14245 // the inputs, place the other input in it. We use (Index XOR 1) to
14246 // compute an adjacent index.
14247 if (!isWordClobbered(SourceHalfMask, InputsFixed[0]) &&
14248 SourceHalfMask[InputsFixed[0] ^ 1] < 0) {
14249 SourceHalfMask[InputsFixed[0]] = InputsFixed[0];
14250 SourceHalfMask[InputsFixed[0] ^ 1] = InputsFixed[1];
14251 InputsFixed[1] = InputsFixed[0] ^ 1;
14252 } else if (!isWordClobbered(SourceHalfMask, InputsFixed[1]) &&
14253 SourceHalfMask[InputsFixed[1] ^ 1] < 0) {
14254 SourceHalfMask[InputsFixed[1]] = InputsFixed[1];
14255 SourceHalfMask[InputsFixed[1] ^ 1] = InputsFixed[0];
14256 InputsFixed[0] = InputsFixed[1] ^ 1;
14257 } else if (SourceHalfMask[2 * ((InputsFixed[0] / 2) ^ 1)] < 0 &&
14258 SourceHalfMask[2 * ((InputsFixed[0] / 2) ^ 1) + 1] < 0) {
14259 // The two inputs are in the same DWord but it is clobbered and the
14260 // adjacent DWord isn't used at all. Move both inputs to the free
14261 // slot.
14262 SourceHalfMask[2 * ((InputsFixed[0] / 2) ^ 1)] = InputsFixed[0];
14263 SourceHalfMask[2 * ((InputsFixed[0] / 2) ^ 1) + 1] = InputsFixed[1];
14264 InputsFixed[0] = 2 * ((InputsFixed[0] / 2) ^ 1);
14265 InputsFixed[1] = 2 * ((InputsFixed[0] / 2) ^ 1) + 1;
14266 } else {
14267 // The only way we hit this point is if there is no clobbering
14268 // (because there are no off-half inputs to this half) and there is no
14269 // free slot adjacent to one of the inputs. In this case, we have to
14270 // swap an input with a non-input.
14271 for (int i = 0; i < 4; ++i)
14272 assert((SourceHalfMask[i] < 0 || SourceHalfMask[i] == i) &&
14273 "We can't handle any clobbers here!");
14274 assert(InputsFixed[1] != (InputsFixed[0] ^ 1) &&
14275 "Cannot have adjacent inputs here!");
14276
14277 SourceHalfMask[InputsFixed[0] ^ 1] = InputsFixed[1];
14278 SourceHalfMask[InputsFixed[1]] = InputsFixed[0] ^ 1;
14279
14280 // We also have to update the final source mask in this case because
14281 // it may need to undo the above swap.
14282 for (int &M : FinalSourceHalfMask)
14283 if (M == (InputsFixed[0] ^ 1) + SourceOffset)
14284 M = InputsFixed[1] + SourceOffset;
14285 else if (M == InputsFixed[1] + SourceOffset)
14286 M = (InputsFixed[0] ^ 1) + SourceOffset;
14287
14288 InputsFixed[1] = InputsFixed[0] ^ 1;
14289 }
14290
14291 // Point everything at the fixed inputs.
14292 for (int &M : HalfMask)
14293 if (M == IncomingInputs[0])
14294 M = InputsFixed[0] + SourceOffset;
14295 else if (M == IncomingInputs[1])
14296 M = InputsFixed[1] + SourceOffset;
14297
14298 IncomingInputs[0] = InputsFixed[0] + SourceOffset;
14299 IncomingInputs[1] = InputsFixed[1] + SourceOffset;
14300 }
14301 } else {
14302 llvm_unreachable("Unhandled input size!");
14303 }
14304
14305 // Now hoist the DWord down to the right half.
14306 int FreeDWord = (PSHUFDMask[DestOffset / 2] < 0 ? 0 : 1) + DestOffset / 2;
14307 assert(PSHUFDMask[FreeDWord] < 0 && "DWord not free");
14308 PSHUFDMask[FreeDWord] = IncomingInputs[0] / 2;
14309 for (int &M : HalfMask)
14310 for (int Input : IncomingInputs)
14311 if (M == Input)
14312 M = FreeDWord * 2 + Input % 2;
14313 };
14314 moveInputsToRightHalf(HToLInputs, LToLInputs, PSHUFHMask, LoMask, HiMask,
14315 /*SourceOffset*/ 4, /*DestOffset*/ 0);
14316 moveInputsToRightHalf(LToHInputs, HToHInputs, PSHUFLMask, HiMask, LoMask,
14317 /*SourceOffset*/ 0, /*DestOffset*/ 4);
14318
14319 // Now enact all the shuffles we've computed to move the inputs into their
14320 // target half.
14321 if (!isNoopShuffleMask(PSHUFLMask))
14322 V = DAG.getNode(X86ISD::PSHUFLW, DL, VT, V,
14323 getV4X86ShuffleImm8ForMask(PSHUFLMask, DL, DAG));
14324 if (!isNoopShuffleMask(PSHUFHMask))
14325 V = DAG.getNode(X86ISD::PSHUFHW, DL, VT, V,
14326 getV4X86ShuffleImm8ForMask(PSHUFHMask, DL, DAG));
14327 if (!isNoopShuffleMask(PSHUFDMask))
14328 V = DAG.getBitcast(
14329 VT,
14330 DAG.getNode(X86ISD::PSHUFD, DL, PSHUFDVT, DAG.getBitcast(PSHUFDVT, V),
14331 getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG)));
14332
14333 // At this point, each half should contain all its inputs, and we can then
14334 // just shuffle them into their final position.
14335 assert(none_of(LoMask, [](int M) { return M >= 4; }) &&
14336 "Failed to lift all the high half inputs to the low mask!");
14337 assert(none_of(HiMask, [](int M) { return M >= 0 && M < 4; }) &&
14338 "Failed to lift all the low half inputs to the high mask!");
14339
14340 // Do a half shuffle for the low mask.
14341 if (!isNoopShuffleMask(LoMask))
14342 V = DAG.getNode(X86ISD::PSHUFLW, DL, VT, V,
14343 getV4X86ShuffleImm8ForMask(LoMask, DL, DAG));
14344
14345 // Do a half shuffle with the high mask after shifting its values down.
14346 for (int &M : HiMask)
14347 if (M >= 0)
14348 M -= 4;
14349 if (!isNoopShuffleMask(HiMask))
14350 V = DAG.getNode(X86ISD::PSHUFHW, DL, VT, V,
14351 getV4X86ShuffleImm8ForMask(HiMask, DL, DAG));
14352
14353 return V;
14354}
14355
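// --- Illustrative, standalone sketch (not part of this file): a scalar model
// of the PSHUFLW / PSHUFHW / PSHUFD steps that the routine above composes.
// It does not use the SelectionDAG API; names like pshuflw_model are
// hypothetical, and -1 mask entries mean "undef / keep the existing word".
#include <array>
#include <cstdint>

// PSHUFLW: shuffle the low four words with a 4-entry mask; the high words
// pass through unchanged.
static std::array<uint16_t, 8> pshuflw_model(std::array<uint16_t, 8> V,
                                             const int (&M)[4]) {
  std::array<uint16_t, 8> R = V;
  for (int i = 0; i != 4; ++i)
    if (M[i] >= 0)
      R[i] = V[M[i]];
  return R;
}

// PSHUFHW: the same idea for words 4..7, with mask indices relative to the
// high half.
static std::array<uint16_t, 8> pshufhw_model(std::array<uint16_t, 8> V,
                                             const int (&M)[4]) {
  std::array<uint16_t, 8> R = V;
  for (int i = 0; i != 4; ++i)
    if (M[i] >= 0)
      R[4 + i] = V[4 + M[i]];
  return R;
}

// PSHUFD viewed at word granularity: it moves 32-bit dwords, i.e. word pairs
// {2*d, 2*d+1}, which is how the routine above migrates paired inputs
// between halves.
static std::array<uint16_t, 8> pshufd_model(std::array<uint16_t, 8> V,
                                            const int (&M)[4]) {
  std::array<uint16_t, 8> R = V;
  for (int d = 0; d != 4; ++d)
    if (M[d] >= 0)
      for (int w = 0; w != 2; ++w)
        R[2 * d + w] = V[2 * M[d] + w];
  return R;
}
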
14356/// Helper to form a PSHUFB-based shuffle+blend, opportunistically avoiding the
14357/// blend if only one input is used.
14358static SDValue lowerShuffleAsBlendOfPSHUFBs(
14359 const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
14360 const APInt &Zeroable, SelectionDAG &DAG, bool &V1InUse, bool &V2InUse) {
14362 "Lane crossing shuffle masks not supported");
14363
14364 int NumBytes = VT.getSizeInBits() / 8;
14365 int Size = Mask.size();
14366 int Scale = NumBytes / Size;
14367
14368 SmallVector<SDValue, 64> V1Mask(NumBytes, DAG.getUNDEF(MVT::i8));
14369 SmallVector<SDValue, 64> V2Mask(NumBytes, DAG.getUNDEF(MVT::i8));
14370 V1InUse = false;
14371 V2InUse = false;
14372
14373 for (int i = 0; i < NumBytes; ++i) {
14374 int M = Mask[i / Scale];
14375 if (M < 0)
14376 continue;
14377
14378 const int ZeroMask = 0x80;
14379 int V1Idx = M < Size ? M * Scale + i % Scale : ZeroMask;
14380 int V2Idx = M < Size ? ZeroMask : (M - Size) * Scale + i % Scale;
14381 if (Zeroable[i / Scale])
14382 V1Idx = V2Idx = ZeroMask;
14383
14384 V1Mask[i] = DAG.getConstant(V1Idx, DL, MVT::i8);
14385 V2Mask[i] = DAG.getConstant(V2Idx, DL, MVT::i8);
14386 V1InUse |= (ZeroMask != V1Idx);
14387 V2InUse |= (ZeroMask != V2Idx);
14388 }
14389
14390 MVT ShufVT = MVT::getVectorVT(MVT::i8, NumBytes);
14391 if (V1InUse)
14392 V1 = DAG.getNode(X86ISD::PSHUFB, DL, ShufVT, DAG.getBitcast(ShufVT, V1),
14393 DAG.getBuildVector(ShufVT, DL, V1Mask));
14394 if (V2InUse)
14395 V2 = DAG.getNode(X86ISD::PSHUFB, DL, ShufVT, DAG.getBitcast(ShufVT, V2),
14396 DAG.getBuildVector(ShufVT, DL, V2Mask));
14397
14398 // If we need shuffled inputs from both, blend the two.
14399 SDValue V;
14400 if (V1InUse && V2InUse)
14401 V = DAG.getNode(ISD::OR, DL, ShufVT, V1, V2);
14402 else
14403 V = V1InUse ? V1 : V2;
14404
14405 // Cast the result back to the correct type.
14406 return DAG.getBitcast(VT, V);
14407}
14408
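// --- Illustrative, standalone sketch (not part of the lowering code; helper
// names such as buildPSHUFBByteMasks are hypothetical): mirrors how the
// helper above expands an element-level mask into per-byte PSHUFB control
// values, where 0x80 in a control byte zeroes that output byte.
#include <cstdint>
#include <vector>

struct PSHUFBControls {
  std::vector<int> V1Bytes, V2Bytes; // -1 == undef, 0x80 == force zero
  bool V1InUse = false, V2InUse = false;
};

static PSHUFBControls buildPSHUFBByteMasks(const std::vector<int> &Mask,
                                           const std::vector<bool> &Zeroable,
                                           int NumBytes) {
  const int ZeroMask = 0x80;
  int Size = (int)Mask.size();
  int Scale = NumBytes / Size; // bytes per shuffle element
  PSHUFBControls C;
  C.V1Bytes.assign(NumBytes, -1);
  C.V2Bytes.assign(NumBytes, -1);
  for (int i = 0; i < NumBytes; ++i) {
    int M = Mask[i / Scale];
    if (M < 0)
      continue; // undef element: leave both control bytes undef
    int V1Idx = M < Size ? M * Scale + i % Scale : ZeroMask;
    int V2Idx = M < Size ? ZeroMask : (M - Size) * Scale + i % Scale;
    if (Zeroable[i / Scale])
      V1Idx = V2Idx = ZeroMask; // zeroable element: zero it in both inputs
    C.V1Bytes[i] = V1Idx;
    C.V2Bytes[i] = V2Idx;
    C.V1InUse |= (V1Idx != ZeroMask);
    C.V2InUse |= (V2Idx != ZeroMask);
  }
  return C;
}
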
14409/// Generic lowering of 8-lane i16 shuffles.
14410///
14411/// This handles both single-input shuffles and combined shuffle/blends with
14412/// two inputs. The single input shuffles are immediately delegated to
14413/// a dedicated lowering routine.
14414///
14415/// The blends are lowered in one of three fundamental ways. If there are few
14416/// enough inputs, it delegates to a basic UNPCK-based strategy. If the shuffle
14417/// of the input is significantly cheaper when lowered as an interleaving of
14418/// the two inputs, try to interleave them. Otherwise, blend the low and high
14419/// halves of the inputs separately (making them have relatively few inputs)
14420/// and then concatenate them.
14421static SDValue lowerV8I16Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
14422 const APInt &Zeroable, SDValue V1, SDValue V2,
14423 const X86Subtarget &Subtarget,
14424 SelectionDAG &DAG) {
14425 assert(V1.getSimpleValueType() == MVT::v8i16 && "Bad operand type!");
14426 assert(V2.getSimpleValueType() == MVT::v8i16 && "Bad operand type!");
14427 assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!");
14428
14429 // Whenever we can lower this as a zext, that instruction is strictly faster
14430 // than any alternative.
14431 if (SDValue ZExt = lowerShuffleAsZeroOrAnyExtend(DL, MVT::v8i16, V1, V2, Mask,
14432 Zeroable, Subtarget, DAG))
14433 return ZExt;
14434
14435 // Try to lower using a truncation.
14436 if (SDValue V = lowerShuffleWithVPMOV(DL, MVT::v8i16, V1, V2, Mask, Zeroable,
14437 Subtarget, DAG))
14438 return V;
14439
14440 int NumV2Inputs = count_if(Mask, [](int M) { return M >= 8; });
14441
14442 if (NumV2Inputs == 0) {
14443 // Try to use shift instructions.
14444 if (SDValue Shift =
14445 lowerShuffleAsShift(DL, MVT::v8i16, V1, V1, Mask, Zeroable,
14446 Subtarget, DAG, /*BitwiseOnly*/ false))
14447 return Shift;
14448
14449 // Check for being able to broadcast a single element.
14450 if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v8i16, V1, V2,
14451 Mask, Subtarget, DAG))
14452 return Broadcast;
14453
14454 // Try to use bit rotation instructions.
14455 if (SDValue Rotate = lowerShuffleAsBitRotate(DL, MVT::v8i16, V1, Mask,
14456 Subtarget, DAG))
14457 return Rotate;
14458
14459 // Use dedicated unpack instructions for masks that match their pattern.
14460 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v8i16, V1, V2, Mask, DAG))
14461 return V;
14462
14463 // Use dedicated pack instructions for masks that match their pattern.
14464 if (SDValue V =
14465 lowerShuffleWithPACK(DL, MVT::v8i16, V1, V2, Mask, Subtarget, DAG))
14466 return V;
14467
14468 // Try to use byte rotation instructions.
14469 if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v8i16, V1, V1, Mask,
14470 Subtarget, DAG))
14471 return Rotate;
14472
14473 // Make a copy of the mask so it can be modified.
14474 SmallVector<int, 8> MutableMask(Mask);
14475 return lowerV8I16GeneralSingleInputShuffle(DL, MVT::v8i16, V1, MutableMask,
14476 Subtarget, DAG);
14477 }
14478
14479 assert(llvm::any_of(Mask, [](int M) { return M >= 0 && M < 8; }) &&
14480 "All single-input shuffles should be canonicalized to be V1-input "
14481 "shuffles.");
14482
14483 // Try to use shift instructions.
14484 if (SDValue Shift =
14485 lowerShuffleAsShift(DL, MVT::v8i16, V1, V2, Mask, Zeroable, Subtarget,
14486 DAG, /*BitwiseOnly*/ false))
14487 return Shift;
14488
14489 // See if we can use SSE4A Extraction / Insertion.
14490 if (Subtarget.hasSSE4A())
14491 if (SDValue V = lowerShuffleWithSSE4A(DL, MVT::v8i16, V1, V2, Mask,
14492 Zeroable, DAG))
14493 return V;
14494
14495 // There are special ways we can lower some single-element blends.
14496 if (NumV2Inputs == 1)
14497 if (SDValue V = lowerShuffleAsElementInsertion(
14498 DL, MVT::v8i16, V1, V2, Mask, Zeroable, Subtarget, DAG))
14499 return V;
14500
14501 // We have different paths for blend lowering, but they all must use the
14502 // *exact* same predicate.
14503 bool IsBlendSupported = Subtarget.hasSSE41();
14504 if (IsBlendSupported)
14505 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v8i16, V1, V2, Mask,
14506 Zeroable, Subtarget, DAG))
14507 return Blend;
14508
14509 if (SDValue Masked = lowerShuffleAsBitMask(DL, MVT::v8i16, V1, V2, Mask,
14510 Zeroable, Subtarget, DAG))
14511 return Masked;
14512
14513 // Use dedicated unpack instructions for masks that match their pattern.
14514 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v8i16, V1, V2, Mask, DAG))
14515 return V;
14516
14517 // Use dedicated pack instructions for masks that match their pattern.
14518 if (SDValue V =
14519 lowerShuffleWithPACK(DL, MVT::v8i16, V1, V2, Mask, Subtarget, DAG))
14520 return V;
14521
14522 // Try to lower using a truncation.
14523 if (SDValue V = lowerShuffleAsVTRUNC(DL, MVT::v8i16, V1, V2, Mask, Zeroable,
14524 Subtarget, DAG))
14525 return V;
14526
14527 // Try to use byte rotation instructions.
14528 if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v8i16, V1, V2, Mask,
14529 Subtarget, DAG))
14530 return Rotate;
14531
14532 if (SDValue BitBlend =
14533 lowerShuffleAsBitBlend(DL, MVT::v8i16, V1, V2, Mask, DAG))
14534 return BitBlend;
14535
14536 // Try to use byte shift instructions to mask.
14537 if (SDValue V = lowerShuffleAsByteShiftMask(DL, MVT::v8i16, V1, V2, Mask,
14538 Zeroable, Subtarget, DAG))
14539 return V;
14540
14541 // Attempt to lower using compaction; SSE4.1 is necessary for PACKUSDW.
14542 int NumEvenDrops = canLowerByDroppingElements(Mask, true, false);
14543 if ((NumEvenDrops == 1 || (NumEvenDrops == 2 && Subtarget.hasSSE41())) &&
14544 !Subtarget.hasVLX()) {
14545 // Check if this is part of a 256-bit vector truncation.
14546 unsigned PackOpc = 0;
14547 if (NumEvenDrops == 2 && Subtarget.hasAVX2() &&
14548 V1.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
14549 V2.getOpcode() == ISD::EXTRACT_SUBVECTOR) {
14550 SDValue V1V2 = concatSubVectors(V1, V2, DAG, DL);
14551 V1V2 = DAG.getNode(X86ISD::BLENDI, DL, MVT::v16i16, V1V2,
14552 getZeroVector(MVT::v16i16, Subtarget, DAG, DL),
14553 DAG.getTargetConstant(0xEE, DL, MVT::i8));
14554 V1V2 = DAG.getBitcast(MVT::v8i32, V1V2);
14555 V1 = extract128BitVector(V1V2, 0, DAG, DL);
14556 V2 = extract128BitVector(V1V2, 4, DAG, DL);
14557 PackOpc = X86ISD::PACKUS;
14558 } else if (Subtarget.hasSSE41()) {
14559 SmallVector<SDValue, 4> DWordClearOps(4,
14560 DAG.getConstant(0, DL, MVT::i32));
14561 for (unsigned i = 0; i != 4; i += 1 << (NumEvenDrops - 1))
14562 DWordClearOps[i] = DAG.getConstant(0xFFFF, DL, MVT::i32);
14563 SDValue DWordClearMask =
14564 DAG.getBuildVector(MVT::v4i32, DL, DWordClearOps);
14565 V1 = DAG.getNode(ISD::AND, DL, MVT::v4i32, DAG.getBitcast(MVT::v4i32, V1),
14566 DWordClearMask);
14567 V2 = DAG.getNode(ISD::AND, DL, MVT::v4i32, DAG.getBitcast(MVT::v4i32, V2),
14568 DWordClearMask);
14569 PackOpc = X86ISD::PACKUS;
14570 } else if (!Subtarget.hasSSSE3()) {
14571 SDValue ShAmt = DAG.getTargetConstant(16, DL, MVT::i8);
14572 V1 = DAG.getBitcast(MVT::v4i32, V1);
14573 V2 = DAG.getBitcast(MVT::v4i32, V2);
14574 V1 = DAG.getNode(X86ISD::VSHLI, DL, MVT::v4i32, V1, ShAmt);
14575 V2 = DAG.getNode(X86ISD::VSHLI, DL, MVT::v4i32, V2, ShAmt);
14576 V1 = DAG.getNode(X86ISD::VSRAI, DL, MVT::v4i32, V1, ShAmt);
14577 V2 = DAG.getNode(X86ISD::VSRAI, DL, MVT::v4i32, V2, ShAmt);
14578 PackOpc = X86ISD::PACKSS;
14579 }
14580 if (PackOpc) {
14581 // Now pack things back together.
14582 SDValue Result = DAG.getNode(PackOpc, DL, MVT::v8i16, V1, V2);
14583 if (NumEvenDrops == 2) {
14584 Result = DAG.getBitcast(MVT::v4i32, Result);
14585 Result = DAG.getNode(PackOpc, DL, MVT::v8i16, Result, Result);
14586 }
14587 return Result;
14588 }
14589 }
14590
14591 // When compacting odd (upper) elements, use PACKSS pre-SSE41.
14592 int NumOddDrops = canLowerByDroppingElements(Mask, false, false);
14593 if (NumOddDrops == 1) {
14594 bool HasSSE41 = Subtarget.hasSSE41();
14595 V1 = DAG.getNode(HasSSE41 ? X86ISD::VSRLI : X86ISD::VSRAI, DL, MVT::v4i32,
14596 DAG.getBitcast(MVT::v4i32, V1),
14597 DAG.getTargetConstant(16, DL, MVT::i8));
14598 V2 = DAG.getNode(HasSSE41 ? X86ISD::VSRLI : X86ISD::VSRAI, DL, MVT::v4i32,
14599 DAG.getBitcast(MVT::v4i32, V2),
14600 DAG.getTargetConstant(16, DL, MVT::i8));
14601 return DAG.getNode(HasSSE41 ? X86ISD::PACKUS : X86ISD::PACKSS, DL,
14602 MVT::v8i16, V1, V2);
14603 }
14604
14605 // Try to lower by permuting the inputs into an unpack instruction.
14606 if (SDValue Unpack = lowerShuffleAsPermuteAndUnpack(DL, MVT::v8i16, V1, V2,
14607 Mask, Subtarget, DAG))
14608 return Unpack;
14609
14610 // If we can't directly blend but can use PSHUFB, that will be better as it
14611 // can both shuffle and set up the inefficient blend.
14612 if (!IsBlendSupported && Subtarget.hasSSSE3()) {
14613 bool V1InUse, V2InUse;
14614 return lowerShuffleAsBlendOfPSHUFBs(DL, MVT::v8i16, V1, V2, Mask,
14615 Zeroable, DAG, V1InUse, V2InUse);
14616 }
14617
14618 // We can always bit-blend if we have to, so the fallback strategy is to
14619 // decompose into single-input permutes and blends/unpacks.
14620 return lowerShuffleAsDecomposedShuffleMerge(DL, MVT::v8i16, V1, V2, Mask,
14621 Zeroable, Subtarget, DAG);
14622}
14623
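// --- Illustrative, standalone sketch (hypothetical helper names, not the DAG
// code): a scalar model of the SSE4.1 even-element compaction path above,
// where a "keep every other word" shuffle is lowered as an AND that clears
// the odd word of each dword followed by a PACKUSDW-style pack.
#include <cstdint>
#include <vector>

// PACKUSDW model: pack two v4i32 inputs into one v8i16 with unsigned
// saturation (exact here because the inputs are pre-masked to 16 bits).
static std::vector<uint16_t> packusdw_model(const std::vector<uint32_t> &A,
                                            const std::vector<uint32_t> &B) {
  auto Sat = [](uint32_t V) {
    return (uint16_t)(V > 0xFFFFu ? 0xFFFFu : V);
  };
  std::vector<uint16_t> R;
  for (uint32_t V : A)
    R.push_back(Sat(V));
  for (uint32_t V : B)
    R.push_back(Sat(V));
  return R;
}

// Keep only the even i16 elements of V1 and V2 (viewed as dwords) and pack
// them together, i.e. lower the mask <0,2,4,6,8,10,12,14>.
static std::vector<uint16_t> compactEvenWords(std::vector<uint32_t> V1,
                                              std::vector<uint32_t> V2) {
  for (uint32_t &D : V1)
    D &= 0xFFFFu; // clear the odd word in each dword
  for (uint32_t &D : V2)
    D &= 0xFFFFu;
  return packusdw_model(V1, V2);
}
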
14624/// Lower 8-lane 16-bit floating point shuffles.
14625static SDValue lowerV8F16Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
14626 const APInt &Zeroable, SDValue V1, SDValue V2,
14627 const X86Subtarget &Subtarget,
14628 SelectionDAG &DAG) {
14629 assert(V1.getSimpleValueType() == MVT::v8f16 && "Bad operand type!");
14630 assert(V2.getSimpleValueType() == MVT::v8f16 && "Bad operand type!");
14631 assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!");
14632 int NumV2Elements = count_if(Mask, [](int M) { return M >= 8; });
14633
14634 if (Subtarget.hasFP16()) {
14635 if (NumV2Elements == 0) {
14636 // Check for being able to broadcast a single element.
14637 if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v8f16, V1, V2,
14638 Mask, Subtarget, DAG))
14639 return Broadcast;
14640 }
14641 if (NumV2Elements == 1 && Mask[0] >= 8)
14642 if (SDValue V = lowerShuffleAsElementInsertion(
14643 DL, MVT::v8f16, V1, V2, Mask, Zeroable, Subtarget, DAG))
14644 return V;
14645 }
14646
14647 V1 = DAG.getBitcast(MVT::v8i16, V1);
14648 V2 = DAG.getBitcast(MVT::v8i16, V2);
14649 return DAG.getBitcast(MVT::v8f16,
14650 DAG.getVectorShuffle(MVT::v8i16, DL, V1, V2, Mask));
14651}
14652
14653// Lowers a unary/binary shuffle as VPERMV/VPERMV3. For non-VLX targets,
14654// sub-512-bit shuffles are padded to 512 bits for the shuffle and then
14655// the active subvector is extracted.
14656static SDValue lowerShuffleWithPERMV(const SDLoc &DL, MVT VT,
14657 ArrayRef<int> OriginalMask, SDValue V1,
14658 SDValue V2, const X86Subtarget &Subtarget,
14659 SelectionDAG &DAG) {
14660 // Commute binary inputs so V2 is a load to simplify VPERMI2/T2 folds.
14661 SmallVector<int, 32> Mask(OriginalMask);
14662 if (!V2.isUndef() && isShuffleFoldableLoad(V1) &&
14663 !isShuffleFoldableLoad(V2)) {
14664 ShuffleVectorSDNode::commuteMask(Mask);
14665 std::swap(V1, V2);
14666 }
14667
14668 MVT MaskVT = VT.changeTypeToInteger();
14669 SDValue MaskNode;
14670 MVT ShuffleVT = VT;
14671 if (!VT.is512BitVector() && !Subtarget.hasVLX()) {
14672 V1 = widenSubVector(V1, false, Subtarget, DAG, DL, 512);
14673 V2 = widenSubVector(V2, false, Subtarget, DAG, DL, 512);
14674 ShuffleVT = V1.getSimpleValueType();
14675
14676 // Adjust mask to correct indices for the second input.
14677 int NumElts = VT.getVectorNumElements();
14678 unsigned Scale = 512 / VT.getSizeInBits();
14679 SmallVector<int, 32> AdjustedMask(Mask);
14680 for (int &M : AdjustedMask)
14681 if (NumElts <= M)
14682 M += (Scale - 1) * NumElts;
14683 MaskNode = getConstVector(AdjustedMask, MaskVT, DAG, DL, true);
14684 MaskNode = widenSubVector(MaskNode, false, Subtarget, DAG, DL, 512);
14685 } else {
14686 MaskNode = getConstVector(Mask, MaskVT, DAG, DL, true);
14687 }
14688
14689 SDValue Result;
14690 if (V2.isUndef())
14691 Result = DAG.getNode(X86ISD::VPERMV, DL, ShuffleVT, MaskNode, V1);
14692 else
14693 Result = DAG.getNode(X86ISD::VPERMV3, DL, ShuffleVT, V1, MaskNode, V2);
14694
14695 if (VT != ShuffleVT)
14696 Result = extractSubVector(Result, 0, DAG, DL, VT.getSizeInBits());
14697
14698 return Result;
14699}
14700
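// --- Illustrative, standalone sketch (hypothetical helper name): models the
// index adjustment above when a sub-512-bit VPERMV3 shuffle is padded to 512
// bits. Indices that referred to the second input have to be rebased past
// the widened first input, i.e. shifted up by (Scale - 1) * NumElts.
#include <vector>

static std::vector<int> widenVPERMV3Mask(std::vector<int> Mask, int NumElts,
                                         int VectorBits) {
  unsigned Scale = 512 / VectorBits; // how much wider the padded vectors are
  for (int &M : Mask)
    if (M >= NumElts)               // element taken from the second input
      M += (Scale - 1) * NumElts;   // skip over the first input's padding
  return Mask;
}
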
14701/// Generic lowering of v16i8 shuffles.
14702///
14703/// This is a hybrid strategy to lower v16i8 vectors. It first attempts to
14704/// detect any complexity reducing interleaving. If that doesn't help, it uses
14705/// UNPCK to spread the i8 elements across two i16-element vectors, and uses
14706/// the existing lowering for v8i16 blends on each half, finally PACK-ing them
14707/// back together.
14708static SDValue lowerV16I8Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
14709 const APInt &Zeroable, SDValue V1, SDValue V2,
14710 const X86Subtarget &Subtarget,
14711 SelectionDAG &DAG) {
14712 assert(V1.getSimpleValueType() == MVT::v16i8 && "Bad operand type!");
14713 assert(V2.getSimpleValueType() == MVT::v16i8 && "Bad operand type!");
14714 assert(Mask.size() == 16 && "Unexpected mask size for v16 shuffle!");
14715
14716 // Try to use shift instructions.
14717 if (SDValue Shift =
14718 lowerShuffleAsShift(DL, MVT::v16i8, V1, V2, Mask, Zeroable, Subtarget,
14719 DAG, /*BitwiseOnly*/ false))
14720 return Shift;
14721
14722 // Try to use byte rotation instructions.
14723 if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v16i8, V1, V2, Mask,
14724 Subtarget, DAG))
14725 return Rotate;
14726
14727 // Use dedicated pack instructions for masks that match their pattern.
14728 if (SDValue V =
14729 lowerShuffleWithPACK(DL, MVT::v16i8, V1, V2, Mask, Subtarget, DAG))
14730 return V;
14731
14732 // Try to use a zext lowering.
14733 if (SDValue ZExt = lowerShuffleAsZeroOrAnyExtend(DL, MVT::v16i8, V1, V2, Mask,
14734 Zeroable, Subtarget, DAG))
14735 return ZExt;
14736
14737 // Try to lower using a truncation.
14738 if (SDValue V = lowerShuffleWithVPMOV(DL, MVT::v16i8, V1, V2, Mask, Zeroable,
14739 Subtarget, DAG))
14740 return V;
14741
14742 if (SDValue V = lowerShuffleAsVTRUNC(DL, MVT::v16i8, V1, V2, Mask, Zeroable,
14743 Subtarget, DAG))
14744 return V;
14745
14746 // See if we can use SSE4A Extraction / Insertion.
14747 if (Subtarget.hasSSE4A())
14748 if (SDValue V = lowerShuffleWithSSE4A(DL, MVT::v16i8, V1, V2, Mask,
14749 Zeroable, DAG))
14750 return V;
14751
14752 int NumV2Elements = count_if(Mask, [](int M) { return M >= 16; });
14753
14754 // For single-input shuffles, there are some nicer lowering tricks we can use.
14755 if (NumV2Elements == 0) {
14756 // Check for being able to broadcast a single element.
14757 if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v16i8, V1, V2,
14758 Mask, Subtarget, DAG))
14759 return Broadcast;
14760
14761 // Try to use bit rotation instructions.
14762 if (SDValue Rotate = lowerShuffleAsBitRotate(DL, MVT::v16i8, V1, Mask,
14763 Subtarget, DAG))
14764 return Rotate;
14765
14766 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v16i8, V1, V2, Mask, DAG))
14767 return V;
14768
14769 // Check whether we can widen this to an i16 shuffle by duplicating bytes.
14770 // Notably, this handles splat and partial-splat shuffles more efficiently.
14771 // However, it only makes sense if the pre-duplication shuffle simplifies
14772 // things significantly. Currently, this means we need to be able to
14773 // express the pre-duplication shuffle as an i16 shuffle.
14774 //
14775 // FIXME: We should check for other patterns which can be widened into an
14776 // i16 shuffle as well.
14777 auto canWidenViaDuplication = [](ArrayRef<int> Mask) {
14778 for (int i = 0; i < 16; i += 2)
14779 if (Mask[i] >= 0 && Mask[i + 1] >= 0 && Mask[i] != Mask[i + 1])
14780 return false;
14781
14782 return true;
14783 };
14784 auto tryToWidenViaDuplication = [&]() -> SDValue {
14785 if (!canWidenViaDuplication(Mask))
14786 return SDValue();
14787 SmallVector<int, 4> LoInputs;
14788 copy_if(Mask, std::back_inserter(LoInputs),
14789 [](int M) { return M >= 0 && M < 8; });
14790 array_pod_sort(LoInputs.begin(), LoInputs.end());
14791 LoInputs.erase(llvm::unique(LoInputs), LoInputs.end());
14792 SmallVector<int, 4> HiInputs;
14793 copy_if(Mask, std::back_inserter(HiInputs), [](int M) { return M >= 8; });
14794 array_pod_sort(HiInputs.begin(), HiInputs.end());
14795 HiInputs.erase(llvm::unique(HiInputs), HiInputs.end());
14796
14797 bool TargetLo = LoInputs.size() >= HiInputs.size();
14798 ArrayRef<int> InPlaceInputs = TargetLo ? LoInputs : HiInputs;
14799 ArrayRef<int> MovingInputs = TargetLo ? HiInputs : LoInputs;
14800
14801 int PreDupI16Shuffle[] = {-1, -1, -1, -1, -1, -1, -1, -1};
14802 SmallDenseMap<int, int, 8> LaneMap;
14803 for (int I : InPlaceInputs) {
14804 PreDupI16Shuffle[I/2] = I/2;
14805 LaneMap[I] = I;
14806 }
14807 int j = TargetLo ? 0 : 4, je = j + 4;
14808 for (int i = 0, ie = MovingInputs.size(); i < ie; ++i) {
14809 // Check if j is already a shuffle of this input. This happens when
14810 // there are two adjacent bytes after we move the low one.
14811 if (PreDupI16Shuffle[j] != MovingInputs[i] / 2) {
14812 // If we haven't yet mapped the input, search for a slot into which
14813 // we can map it.
14814 while (j < je && PreDupI16Shuffle[j] >= 0)
14815 ++j;
14816
14817 if (j == je)
14818 // We can't place the inputs into a single half with a simple i16 shuffle, so bail.
14819 return SDValue();
14820
14821 // Map this input with the i16 shuffle.
14822 PreDupI16Shuffle[j] = MovingInputs[i] / 2;
14823 }
14824
14825 // Update the lane map based on the mapping we ended up with.
14826 LaneMap[MovingInputs[i]] = 2 * j + MovingInputs[i] % 2;
14827 }
14828 V1 = DAG.getBitcast(
14829 MVT::v16i8,
14830 DAG.getVectorShuffle(MVT::v8i16, DL, DAG.getBitcast(MVT::v8i16, V1),
14831 DAG.getUNDEF(MVT::v8i16), PreDupI16Shuffle));
14832
14833 // Unpack the bytes to form the i16s that will be shuffled into place.
14834 bool EvenInUse = false, OddInUse = false;
14835 for (int i = 0; i < 16; i += 2) {
14836 EvenInUse |= (Mask[i + 0] >= 0);
14837 OddInUse |= (Mask[i + 1] >= 0);
14838 if (EvenInUse && OddInUse)
14839 break;
14840 }
14841 V1 = DAG.getNode(TargetLo ? X86ISD::UNPCKL : X86ISD::UNPCKH, DL,
14842 MVT::v16i8, EvenInUse ? V1 : DAG.getUNDEF(MVT::v16i8),
14843 OddInUse ? V1 : DAG.getUNDEF(MVT::v16i8));
14844
14845 int PostDupI16Shuffle[8] = {-1, -1, -1, -1, -1, -1, -1, -1};
14846 for (int i = 0; i < 16; ++i)
14847 if (Mask[i] >= 0) {
14848 int MappedMask = LaneMap[Mask[i]] - (TargetLo ? 0 : 8);
14849 assert(MappedMask < 8 && "Invalid v8 shuffle mask!");
14850 if (PostDupI16Shuffle[i / 2] < 0)
14851 PostDupI16Shuffle[i / 2] = MappedMask;
14852 else
14853 assert(PostDupI16Shuffle[i / 2] == MappedMask &&
14854 "Conflicting entries in the original shuffle!");
14855 }
14856 return DAG.getBitcast(
14857 MVT::v16i8,
14858 DAG.getVectorShuffle(MVT::v8i16, DL, DAG.getBitcast(MVT::v8i16, V1),
14859 DAG.getUNDEF(MVT::v8i16), PostDupI16Shuffle));
14860 };
14861 if (SDValue V = tryToWidenViaDuplication())
14862 return V;
14863 }
14864
14865 if (SDValue Masked = lowerShuffleAsBitMask(DL, MVT::v16i8, V1, V2, Mask,
14866 Zeroable, Subtarget, DAG))
14867 return Masked;
14868
14869 // Use dedicated unpack instructions for masks that match their pattern.
14870 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v16i8, V1, V2, Mask, DAG))
14871 return V;
14872
14873 // Try to use byte shift instructions to mask.
14874 if (SDValue V = lowerShuffleAsByteShiftMask(DL, MVT::v16i8, V1, V2, Mask,
14875 Zeroable, Subtarget, DAG))
14876 return V;
14877
14878 // Check for compaction patterns.
14879 bool IsSingleInput = V2.isUndef();
14880 int NumEvenDrops = canLowerByDroppingElements(Mask, true, IsSingleInput);
14881
14882 // Check for SSSE3 which lets us lower all v16i8 shuffles much more directly
14883 // with PSHUFB. It is important to do this before we attempt to generate any
14884 // blends but after all of the single-input lowerings. If the single input
14885 // lowerings can find an instruction sequence that is faster than a PSHUFB, we
14886 // want to preserve that and we can DAG combine any longer sequences into
14887 // a PSHUFB in the end. But once we start blending from multiple inputs,
14888 // the complexity of DAG combining bad patterns back into PSHUFB is too high,
14889 // and there are *very* few patterns that would actually be faster than the
14890 // PSHUFB approach because of its ability to zero lanes.
14891 //
14892 // If the mask is a binary compaction, we can more efficiently perform this
14893 // as a PACKUS(AND(),AND()) - which is quicker than UNPACK(PSHUFB(),PSHUFB()).
14894 //
14895 // FIXME: The only exceptions to the above are blends which are exact
14896 // interleavings with direct instructions supporting them. We currently don't
14897 // handle those well here.
14898 if (Subtarget.hasSSSE3() && (IsSingleInput || NumEvenDrops != 1)) {
14899 bool V1InUse = false;
14900 bool V2InUse = false;
14901
14902 SDValue PSHUFB = lowerShuffleAsBlendOfPSHUFBs(
14903 DL, MVT::v16i8, V1, V2, Mask, Zeroable, DAG, V1InUse, V2InUse);
14904
14905 // If both V1 and V2 are in use and we can use a direct blend or an unpack,
14906 // do so. This avoids using them to handle blends-with-zero which is
14907 // important as a single pshufb is significantly faster for that.
14908 if (V1InUse && V2InUse) {
14909 if (Subtarget.hasSSE41())
14910 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v16i8, V1, V2, Mask,
14911 Zeroable, Subtarget, DAG))
14912 return Blend;
14913
14914 // We can use an unpack to do the blending rather than an or in some
14915 // cases. Even though the or may be (very slightly) more efficient, we
14916 // prefer this lowering because there are common cases where part of
14917 // the complexity of the shuffles goes away when we do the final blend as
14918 // an unpack.
14919 // FIXME: It might be worth trying to detect if the unpack-feeding
14920 // shuffles will both be pshufb, in which case we shouldn't bother with
14921 // this.
14922 if (SDValue Unpack = lowerShuffleAsPermuteAndUnpack(
14923 DL, MVT::v16i8, V1, V2, Mask, Subtarget, DAG))
14924 return Unpack;
14925
14926 // AVX512VBMI can lower to VPERMB (non-VLX will pad to v64i8).
14927 if (Subtarget.hasVBMI())
14928 return lowerShuffleWithPERMV(DL, MVT::v16i8, Mask, V1, V2, Subtarget,
14929 DAG);
14930
14931 // If we have XOP we can use one VPPERM instead of multiple PSHUFBs.
14932 if (Subtarget.hasXOP()) {
14933 SDValue MaskNode = getConstVector(Mask, MVT::v16i8, DAG, DL, true);
14934 return DAG.getNode(X86ISD::VPPERM, DL, MVT::v16i8, V1, V2, MaskNode);
14935 }
14936
14937 // Use PALIGNR+Permute if possible - permute might become PSHUFB but the
14938 // PALIGNR will be cheaper than the second PSHUFB+OR.
14939 if (SDValue V = lowerShuffleAsByteRotateAndPermute(
14940 DL, MVT::v16i8, V1, V2, Mask, Subtarget, DAG))
14941 return V;
14942 }
14943
14944 return PSHUFB;
14945 }
14946
14947 // There are special ways we can lower some single-element blends.
14948 if (NumV2Elements == 1)
14949 if (SDValue V = lowerShuffleAsElementInsertion(
14950 DL, MVT::v16i8, V1, V2, Mask, Zeroable, Subtarget, DAG))
14951 return V;
14952
14953 if (SDValue Blend = lowerShuffleAsBitBlend(DL, MVT::v16i8, V1, V2, Mask, DAG))
14954 return Blend;
14955
14956 // Check whether a compaction lowering can be done. This handles shuffles
14957 // which take every Nth element for some even N. See the helper function for
14958 // details.
14959 //
14960 // We special case these as they can be particularly efficiently handled with
14961 // the PACKUSWB instruction on x86 and they show up in common patterns of
14962 // rearranging bytes to truncate wide elements.
14963 if (NumEvenDrops) {
14964 // NumEvenDrops is the power of two stride of the elements. Another way of
14965 // thinking about it is that we need to drop the even elements this many
14966 // times to get the original input.
14967
14968 // First we need to zero all the dropped bytes.
14969 assert(NumEvenDrops <= 3 &&
14970 "No support for dropping even elements more than 3 times.");
14971 SmallVector<SDValue, 8> WordClearOps(8, DAG.getConstant(0, DL, MVT::i16));
14972 for (unsigned i = 0; i != 8; i += 1 << (NumEvenDrops - 1))
14973 WordClearOps[i] = DAG.getConstant(0xFF, DL, MVT::i16);
14974 SDValue WordClearMask = DAG.getBuildVector(MVT::v8i16, DL, WordClearOps);
14975 V1 = DAG.getNode(ISD::AND, DL, MVT::v8i16, DAG.getBitcast(MVT::v8i16, V1),
14976 WordClearMask);
14977 if (!IsSingleInput)
14978 V2 = DAG.getNode(ISD::AND, DL, MVT::v8i16, DAG.getBitcast(MVT::v8i16, V2),
14979 WordClearMask);
14980
14981 // Now pack things back together.
14982 SDValue Result = DAG.getNode(X86ISD::PACKUS, DL, MVT::v16i8, V1,
14983 IsSingleInput ? V1 : V2);
14984 for (int i = 1; i < NumEvenDrops; ++i) {
14985 Result = DAG.getBitcast(MVT::v8i16, Result);
14986 Result = DAG.getNode(X86ISD::PACKUS, DL, MVT::v16i8, Result, Result);
14987 }
14988 return Result;
14989 }
14990
14991 int NumOddDrops = canLowerByDroppingElements(Mask, false, IsSingleInput);
14992 if (NumOddDrops == 1) {
14993 V1 = DAG.getNode(X86ISD::VSRLI, DL, MVT::v8i16,
14994 DAG.getBitcast(MVT::v8i16, V1),
14995 DAG.getTargetConstant(8, DL, MVT::i8));
14996 if (!IsSingleInput)
14997 V2 = DAG.getNode(X86ISD::VSRLI, DL, MVT::v8i16,
14998 DAG.getBitcast(MVT::v8i16, V2),
14999 DAG.getTargetConstant(8, DL, MVT::i8));
15000 return DAG.getNode(X86ISD::PACKUS, DL, MVT::v16i8, V1,
15001 IsSingleInput ? V1 : V2);
15002 }
15003
15004 // Handle multi-input cases by blending/unpacking single-input shuffles.
15005 if (NumV2Elements > 0)
15006 return lowerShuffleAsDecomposedShuffleMerge(DL, MVT::v16i8, V1, V2, Mask,
15007 Zeroable, Subtarget, DAG);
15008
15009 // The fallback path for single-input shuffles widens this into two v8i16
15010 // vectors with unpacks, shuffles those, and then pulls them back together
15011 // with a pack.
15012 SDValue V = V1;
15013
15014 std::array<int, 8> LoBlendMask = {{-1, -1, -1, -1, -1, -1, -1, -1}};
15015 std::array<int, 8> HiBlendMask = {{-1, -1, -1, -1, -1, -1, -1, -1}};
15016 for (int i = 0; i < 16; ++i)
15017 if (Mask[i] >= 0)
15018 (i < 8 ? LoBlendMask[i] : HiBlendMask[i % 8]) = Mask[i];
15019
15020 SDValue VLoHalf, VHiHalf;
15021 // Check if any of the odd lanes in the v16i8 are used. If not, we can mask
15022 // them out and avoid using UNPCK{L,H} to extract the elements of V as
15023 // i16s.
15024 if (none_of(LoBlendMask, [](int M) { return M >= 0 && M % 2 == 1; }) &&
15025 none_of(HiBlendMask, [](int M) { return M >= 0 && M % 2 == 1; })) {
15026 // Use a mask to drop the high bytes.
15027 VLoHalf = DAG.getBitcast(MVT::v8i16, V);
15028 VLoHalf = DAG.getNode(ISD::AND, DL, MVT::v8i16, VLoHalf,
15029 DAG.getConstant(0x00FF, DL, MVT::v8i16));
15030
15031 // This will be a single vector shuffle instead of a blend so nuke VHiHalf.
15032 VHiHalf = DAG.getUNDEF(MVT::v8i16);
15033
15034 // Squash the masks to point directly into VLoHalf.
15035 for (int &M : LoBlendMask)
15036 if (M >= 0)
15037 M /= 2;
15038 for (int &M : HiBlendMask)
15039 if (M >= 0)
15040 M /= 2;
15041 } else {
15042 // Otherwise just unpack the low half of V into VLoHalf and the high half into
15043 // VHiHalf so that we can blend them as i16s.
15044 SDValue Zero = getZeroVector(MVT::v16i8, Subtarget, DAG, DL);
15045
15046 VLoHalf = DAG.getBitcast(
15047 MVT::v8i16, DAG.getNode(X86ISD::UNPCKL, DL, MVT::v16i8, V, Zero));
15048 VHiHalf = DAG.getBitcast(
15049 MVT::v8i16, DAG.getNode(X86ISD::UNPCKH, DL, MVT::v16i8, V, Zero));
15050 }
15051
15052 SDValue LoV = DAG.getVectorShuffle(MVT::v8i16, DL, VLoHalf, VHiHalf, LoBlendMask);
15053 SDValue HiV = DAG.getVectorShuffle(MVT::v8i16, DL, VLoHalf, VHiHalf, HiBlendMask);
15054
15055 return DAG.getNode(X86ISD::PACKUS, DL, MVT::v16i8, LoV, HiV);
15056}
15057
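// --- Illustrative, standalone sketch (not the DAG code; names are
// hypothetical): models how the fallback path above splits a 16-entry byte
// mask into two 8-entry word-level blend masks, and squashes them to word
// indices when only even source bytes are referenced.
#include <algorithm>
#include <array>

struct SplitByteMask {
  std::array<int, 8> Lo, Hi; // -1 == undef
  bool OnlyEvenBytesUsed;    // true -> masks were divided down to words
};

static SplitByteMask splitV16I8FallbackMask(const std::array<int, 16> &Mask) {
  SplitByteMask S;
  S.Lo.fill(-1);
  S.Hi.fill(-1);
  for (int i = 0; i < 16; ++i)
    if (Mask[i] >= 0)
      (i < 8 ? S.Lo[i] : S.Hi[i % 8]) = Mask[i];

  auto IsOdd = [](int M) { return M >= 0 && M % 2 == 1; };
  S.OnlyEvenBytesUsed = std::none_of(S.Lo.begin(), S.Lo.end(), IsOdd) &&
                        std::none_of(S.Hi.begin(), S.Hi.end(), IsOdd);
  if (S.OnlyEvenBytesUsed) {
    // The high byte of every word can simply be masked off, so the word
    // shuffles can index words directly instead of unpacked byte pairs.
    for (int &M : S.Lo)
      if (M >= 0)
        M /= 2;
    for (int &M : S.Hi)
      if (M >= 0)
        M /= 2;
  }
  return S;
}
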
15058/// Dispatching routine to lower various 128-bit x86 vector shuffles.
15059///
15060/// This routine breaks down the specific type of 128-bit shuffle and
15061/// dispatches to the lowering routines accordingly.
15062static SDValue lower128BitShuffle(const SDLoc &DL, ArrayRef<int> Mask,
15063 MVT VT, SDValue V1, SDValue V2,
15064 const APInt &Zeroable,
15065 const X86Subtarget &Subtarget,
15066 SelectionDAG &DAG) {
15067 if (VT == MVT::v8bf16) {
15068 V1 = DAG.getBitcast(MVT::v8i16, V1);
15069 V2 = DAG.getBitcast(MVT::v8i16, V2);
15070 return DAG.getBitcast(VT,
15071 DAG.getVectorShuffle(MVT::v8i16, DL, V1, V2, Mask));
15072 }
15073
15074 switch (VT.SimpleTy) {
15075 case MVT::v2i64:
15076 return lowerV2I64Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
15077 case MVT::v2f64:
15078 return lowerV2F64Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
15079 case MVT::v4i32:
15080 return lowerV4I32Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
15081 case MVT::v4f32:
15082 return lowerV4F32Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
15083 case MVT::v8i16:
15084 return lowerV8I16Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
15085 case MVT::v8f16:
15086 return lowerV8F16Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
15087 case MVT::v16i8:
15088 return lowerV16I8Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
15089
15090 default:
15091 llvm_unreachable("Unimplemented!");
15092 }
15093}
15094
15095/// Generic routine to split vector shuffle into half-sized shuffles.
15096///
15097/// This routine just extracts two subvectors, shuffles them independently, and
15098/// then concatenates them back together. This should work effectively with all
15099/// AVX vector shuffle types.
15100static SDValue splitAndLowerShuffle(const SDLoc &DL, MVT VT, SDValue V1,
15101 SDValue V2, ArrayRef<int> Mask,
15102 SelectionDAG &DAG, bool SimpleOnly) {
15103 assert(VT.getSizeInBits() >= 256 &&
15104 "Only for 256-bit or wider vector shuffles!");
15105 assert(V1.getSimpleValueType() == VT && "Bad operand type!");
15106 assert(V2.getSimpleValueType() == VT && "Bad operand type!");
15107
15108 // If this came from the AVX1 v8i32 -> v8f32 bitcast, split using v4i32.
15109 if (VT == MVT::v8f32) {
15110 SDValue BC1 = peekThroughBitcasts(V1);
15111 SDValue BC2 = peekThroughBitcasts(V2);
15112 if (BC1.getValueType() == MVT::v8i32 && BC2.getValueType() == MVT::v8i32) {
15113 if (SDValue Split = splitAndLowerShuffle(DL, MVT::v8i32, BC1, BC2, Mask,
15114 DAG, SimpleOnly))
15115 return DAG.getBitcast(VT, Split);
15116 }
15117 }
15118
15119 ArrayRef<int> LoMask = Mask.slice(0, Mask.size() / 2);
15120 ArrayRef<int> HiMask = Mask.slice(Mask.size() / 2);
15121
15122 int NumElements = VT.getVectorNumElements();
15123 int SplitNumElements = NumElements / 2;
15124 MVT ScalarVT = VT.getVectorElementType();
15125 MVT SplitVT = MVT::getVectorVT(ScalarVT, SplitNumElements);
15126
15127 // Use splitVector/extractSubVector so that split build-vectors just build two
15128 // narrower build vectors. This helps shuffling with splats and zeros.
15129 auto SplitVector = [&](SDValue V) {
15130 SDValue LoV, HiV;
15131 std::tie(LoV, HiV) = splitVector(peekThroughBitcasts(V), DAG, DL);
15132 return std::make_pair(DAG.getBitcast(SplitVT, LoV),
15133 DAG.getBitcast(SplitVT, HiV));
15134 };
15135
15136 SDValue LoV1, HiV1, LoV2, HiV2;
15137 std::tie(LoV1, HiV1) = SplitVector(V1);
15138 std::tie(LoV2, HiV2) = SplitVector(V2);
15139
15140 // Now create two 4-way blends of these half-width vectors.
15141 auto GetHalfBlendPiecesReq = [&](const ArrayRef<int> &HalfMask, bool &UseLoV1,
15142 bool &UseHiV1, bool &UseLoV2,
15143 bool &UseHiV2) {
15144 UseLoV1 = UseHiV1 = UseLoV2 = UseHiV2 = false;
15145 for (int i = 0; i < SplitNumElements; ++i) {
15146 int M = HalfMask[i];
15147 if (M >= NumElements) {
15148 if (M >= NumElements + SplitNumElements)
15149 UseHiV2 = true;
15150 else
15151 UseLoV2 = true;
15152 } else if (M >= 0) {
15153 if (M >= SplitNumElements)
15154 UseHiV1 = true;
15155 else
15156 UseLoV1 = true;
15157 }
15158 }
15159 };
15160
15161 auto CheckHalfBlendUsable = [&](const ArrayRef<int> &HalfMask) -> bool {
15162 if (!SimpleOnly)
15163 return true;
15164
15165 bool UseLoV1, UseHiV1, UseLoV2, UseHiV2;
15166 GetHalfBlendPiecesReq(HalfMask, UseLoV1, UseHiV1, UseLoV2, UseHiV2);
15167
15168 return !(UseHiV1 || UseHiV2);
15169 };
15170
15171 auto HalfBlend = [&](ArrayRef<int> HalfMask) {
15172 SmallVector<int, 32> V1BlendMask((unsigned)SplitNumElements, -1);
15173 SmallVector<int, 32> V2BlendMask((unsigned)SplitNumElements, -1);
15174 SmallVector<int, 32> BlendMask((unsigned)SplitNumElements, -1);
15175 for (int i = 0; i < SplitNumElements; ++i) {
15176 int M = HalfMask[i];
15177 if (M >= NumElements) {
15178 V2BlendMask[i] = M - NumElements;
15179 BlendMask[i] = SplitNumElements + i;
15180 } else if (M >= 0) {
15181 V1BlendMask[i] = M;
15182 BlendMask[i] = i;
15183 }
15184 }
15185
15186 bool UseLoV1, UseHiV1, UseLoV2, UseHiV2;
15187 GetHalfBlendPiecesReq(HalfMask, UseLoV1, UseHiV1, UseLoV2, UseHiV2);
15188
15189 // Because the lowering happens after all combining takes place, we need to
15190 // manually combine these blend masks as much as possible so that we create
15191 // a minimal number of high-level vector shuffle nodes.
15192 assert((!SimpleOnly || (!UseHiV1 && !UseHiV2)) && "Shuffle isn't simple");
15193
15194 // First try just blending the halves of V1 or V2.
15195 if (!UseLoV1 && !UseHiV1 && !UseLoV2 && !UseHiV2)
15196 return DAG.getUNDEF(SplitVT);
15197 if (!UseLoV2 && !UseHiV2)
15198 return DAG.getVectorShuffle(SplitVT, DL, LoV1, HiV1, V1BlendMask);
15199 if (!UseLoV1 && !UseHiV1)
15200 return DAG.getVectorShuffle(SplitVT, DL, LoV2, HiV2, V2BlendMask);
15201
15202 SDValue V1Blend, V2Blend;
15203 if (UseLoV1 && UseHiV1) {
15204 V1Blend = DAG.getVectorShuffle(SplitVT, DL, LoV1, HiV1, V1BlendMask);
15205 } else {
15206 // We only use half of V1 so map the usage down into the final blend mask.
15207 V1Blend = UseLoV1 ? LoV1 : HiV1;
15208 for (int i = 0; i < SplitNumElements; ++i)
15209 if (BlendMask[i] >= 0 && BlendMask[i] < SplitNumElements)
15210 BlendMask[i] = V1BlendMask[i] - (UseLoV1 ? 0 : SplitNumElements);
15211 }
15212 if (UseLoV2 && UseHiV2) {
15213 V2Blend = DAG.getVectorShuffle(SplitVT, DL, LoV2, HiV2, V2BlendMask);
15214 } else {
15215 // We only use half of V2 so map the usage down into the final blend mask.
15216 V2Blend = UseLoV2 ? LoV2 : HiV2;
15217 for (int i = 0; i < SplitNumElements; ++i)
15218 if (BlendMask[i] >= SplitNumElements)
15219 BlendMask[i] = V2BlendMask[i] + (UseLoV2 ? SplitNumElements : 0);
15220 }
15221 return DAG.getVectorShuffle(SplitVT, DL, V1Blend, V2Blend, BlendMask);
15222 };
15223
15224 if (!CheckHalfBlendUsable(LoMask) || !CheckHalfBlendUsable(HiMask))
15225 return SDValue();
15226
15227 SDValue Lo = HalfBlend(LoMask);
15228 SDValue Hi = HalfBlend(HiMask);
15229 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Lo, Hi);
15230}
15231
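// --- Illustrative, standalone sketch (hypothetical names): models the
// per-half bookkeeping in the splitting routine above. For one half of the
// wide mask it derives the V1/V2 sub-masks plus the final blend mask over
// the two half-width shuffles.
#include <vector>

struct HalfBlendMasks {
  std::vector<int> V1Blend, V2Blend, Blend; // -1 == undef
};

static HalfBlendMasks buildHalfBlendMasks(const std::vector<int> &HalfMask,
                                          int NumElements) {
  int SplitNumElements = NumElements / 2;
  HalfBlendMasks R;
  R.V1Blend.assign(SplitNumElements, -1);
  R.V2Blend.assign(SplitNumElements, -1);
  R.Blend.assign(SplitNumElements, -1);
  for (int i = 0; i < SplitNumElements; ++i) {
    int M = HalfMask[i];
    if (M >= NumElements) {
      R.V2Blend[i] = M - NumElements;    // index into (LoV2, HiV2)
      R.Blend[i] = SplitNumElements + i; // take lane i of the V2 shuffle
    } else if (M >= 0) {
      R.V1Blend[i] = M;                  // index into (LoV1, HiV1)
      R.Blend[i] = i;                    // take lane i of the V1 shuffle
    }
  }
  return R;
}
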
15232/// Either split a vector in halves or decompose the shuffles and the
15233/// blend/unpack.
15234///
15235/// This is provided as a good fallback for many lowerings of non-single-input
15236/// shuffles with more than one 128-bit lane. In those cases, we want to select
15237/// between splitting the shuffle into 128-bit components and stitching those
15238/// back together vs. extracting the single-input shuffles and blending those
15239/// results.
15240static SDValue lowerShuffleAsSplitOrBlend(const SDLoc &DL, MVT VT, SDValue V1,
15241 SDValue V2, ArrayRef<int> Mask,
15242 const APInt &Zeroable,
15243 const X86Subtarget &Subtarget,
15244 SelectionDAG &DAG) {
15245 assert(!V2.isUndef() && "This routine must not be used to lower single-input "
15246 "shuffles as it could then recurse on itself.");
15247 int Size = Mask.size();
15248
15249 // If this can be modeled as a broadcast of two elements followed by a blend,
15250 // prefer that lowering. This is especially important because broadcasts can
15251 // often fold with memory operands.
15252 auto DoBothBroadcast = [&] {
15253 int V1BroadcastIdx = -1, V2BroadcastIdx = -1;
15254 for (int M : Mask)
15255 if (M >= Size) {
15256 if (V2BroadcastIdx < 0)
15257 V2BroadcastIdx = M - Size;
15258 else if ((M - Size) != V2BroadcastIdx &&
15259 !IsElementEquivalent(Size, V2, V2, M - Size, V2BroadcastIdx))
15260 return false;
15261 } else if (M >= 0) {
15262 if (V1BroadcastIdx < 0)
15263 V1BroadcastIdx = M;
15264 else if (M != V1BroadcastIdx &&
15265 !IsElementEquivalent(Size, V1, V1, M, V1BroadcastIdx))
15266 return false;
15267 }
15268 return true;
15269 };
15270 if (DoBothBroadcast())
15271 return lowerShuffleAsDecomposedShuffleMerge(DL, VT, V1, V2, Mask, Zeroable,
15272 Subtarget, DAG);
15273
15274 // If the inputs all stem from a single 128-bit lane of each input, then we
15275 // split them rather than blending because the split will decompose to
15276 // unusually few instructions.
15277 int LaneCount = VT.getSizeInBits() / 128;
15278 int LaneSize = Size / LaneCount;
15279 SmallBitVector LaneInputs[2];
15280 LaneInputs[0].resize(LaneCount, false);
15281 LaneInputs[1].resize(LaneCount, false);
15282 for (int i = 0; i < Size; ++i)
15283 if (Mask[i] >= 0)
15284 LaneInputs[Mask[i] / Size][(Mask[i] % Size) / LaneSize] = true;
15285 if (LaneInputs[0].count() <= 1 && LaneInputs[1].count() <= 1)
15286 return splitAndLowerShuffle(DL, VT, V1, V2, Mask, DAG,
15287 /*SimpleOnly*/ false);
15288
15289 // Without AVX2, if we can freely split the subvectors then we're better off
15290 // performing half width shuffles.
15291 if (!Subtarget.hasAVX2()) {
15292 SDValue BC1 = peekThroughBitcasts(V1);
15293 SDValue BC2 = peekThroughBitcasts(V2);
15294 bool SplatOrSplitV1 = isFreeToSplitVector(BC1, DAG) ||
15295 DAG.isSplatValue(BC1, /*AllowUndefs=*/true);
15296 bool SplatOrSplitV2 = isFreeToSplitVector(BC2, DAG) ||
15297 DAG.isSplatValue(BC2, /*AllowUndefs=*/true);
15298 if (SplatOrSplitV1 && SplatOrSplitV2)
15299 return splitAndLowerShuffle(DL, VT, V1, V2, Mask, DAG,
15300 /*SimpleOnly*/ false);
15301 }
15302
15303 // Otherwise, just fall back to decomposed shuffles and a blend/unpack. This
15304 // requires that the decomposed single-input shuffles don't end up here.
15305 return lowerShuffleAsDecomposedShuffleMerge(DL, VT, V1, V2, Mask, Zeroable,
15306 Subtarget, DAG);
15307}
15308
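// --- Illustrative, standalone sketch (hypothetical helper name): the
// "inputs come from a single 128-bit lane of each operand" test used above
// to prefer splitting over blending.
#include <vector>

static bool eachInputUsesSingleLane(const std::vector<int> &Mask,
                                    int VectorBits) {
  int Size = (int)Mask.size();
  int LaneCount = VectorBits / 128;
  int LaneSize = Size / LaneCount;
  std::vector<bool> LaneInputs[2] = {std::vector<bool>(LaneCount, false),
                                     std::vector<bool>(LaneCount, false)};
  for (int i = 0; i < Size; ++i)
    if (Mask[i] >= 0)
      LaneInputs[Mask[i] / Size][(Mask[i] % Size) / LaneSize] = true;
  auto CountUsed = [](const std::vector<bool> &L) {
    int N = 0;
    for (bool B : L)
      N += B;
    return N;
  };
  return CountUsed(LaneInputs[0]) <= 1 && CountUsed(LaneInputs[1]) <= 1;
}
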
15309// Lower as SHUFPD(VPERM2F128(V1, V2), VPERM2F128(V1, V2)).
15310// TODO: Extend to support v8f32 (+ 512-bit shuffles).
15311static SDValue lowerShuffleAsLanePermuteAndSHUFP(const SDLoc &DL, MVT VT,
15312 SDValue V1, SDValue V2,
15313 ArrayRef<int> Mask,
15314 SelectionDAG &DAG) {
15315 assert(VT == MVT::v4f64 && "Only for v4f64 shuffles");
15316
15317 int LHSMask[4] = {-1, -1, -1, -1};
15318 int RHSMask[4] = {-1, -1, -1, -1};
15319 int SHUFPDMask[4] = {-1, -1, -1, -1};
15320
15321 // As SHUFPD uses a single LHS/RHS element per lane, we can always
15322 // perform the shuffle once the lanes have been shuffled in place.
15323 for (int i = 0; i != 4; ++i) {
15324 int M = Mask[i];
15325 if (M < 0)
15326 continue;
15327 int LaneBase = i & ~1;
15328 auto &LaneMask = (i & 1) ? RHSMask : LHSMask;
15329 LaneMask[LaneBase + (M & 1)] = M;
15330 SHUFPDMask[i] = M & 1;
15331 }
15332
15333 SDValue LHS = DAG.getVectorShuffle(VT, DL, V1, V2, LHSMask);
15334 SDValue RHS = DAG.getVectorShuffle(VT, DL, V1, V2, RHSMask);
15335 return DAG.getNode(X86ISD::SHUFP, DL, VT, LHS, RHS,
15336 getSHUFPDImmForMask(SHUFPDMask, DL, DAG));
15337}
15338
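// --- Illustrative, standalone sketch (hypothetical names): mirrors the mask
// bookkeeping above. Each v4f64 result element takes one value from the LHS
// (even result elements) or RHS (odd result elements) of a SHUFPD, so the
// mask is split into two lane-shuffle masks plus a per-element "take the odd
// element of the lane" immediate bit.
#include <array>

struct SHUFPDPlan {
  std::array<int, 4> LHSMask, RHSMask; // -1 == undef
  int Imm;                             // SHUFPD immediate, one bit per element
};

static SHUFPDPlan planLanePermuteAndSHUFP(const std::array<int, 4> &Mask) {
  SHUFPDPlan P;
  P.LHSMask.fill(-1);
  P.RHSMask.fill(-1);
  P.Imm = 0;
  for (int i = 0; i != 4; ++i) {
    int M = Mask[i];
    if (M < 0)
      continue;
    int LaneBase = i & ~1;                        // 128-bit lane start (0 or 2)
    auto &LaneMask = (i & 1) ? P.RHSMask : P.LHSMask;
    LaneMask[LaneBase + (M & 1)] = M;             // place M within its lane slot
    P.Imm |= (M & 1) << i;                        // select low/high per element
  }
  return P;
}
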
15339/// Lower a vector shuffle crossing multiple 128-bit lanes as
15340/// a lane permutation followed by a per-lane permutation.
15341///
15342/// This is mainly for cases where we can have non-repeating permutes
15343/// in each lane.
15344///
15345/// TODO: This is very similar to lowerShuffleAsLanePermuteAndRepeatedMask,
15346/// we should investigate merging them.
15347static SDValue lowerShuffleAsLanePermuteAndPermute(
15348 const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
15349 SelectionDAG &DAG, const X86Subtarget &Subtarget) {
15350 int NumElts = VT.getVectorNumElements();
15351 int NumLanes = VT.getSizeInBits() / 128;
15352 int NumEltsPerLane = NumElts / NumLanes;
15353 bool CanUseSublanes = Subtarget.hasAVX2() && V2.isUndef();
15354
15355 /// Attempts to find a sublane permute with the given size
15356 /// that gets all elements into their target lanes.
15357 ///
15358 /// If successful, fills CrossLaneMask and InLaneMask and returns the lowered
15359 /// shuffle; if unsuccessful, returns SDValue() and may overwrite InLaneMask.
15360 auto getSublanePermute = [&](int NumSublanes) -> SDValue {
15361 int NumSublanesPerLane = NumSublanes / NumLanes;
15362 int NumEltsPerSublane = NumElts / NumSublanes;
15363
15364 SmallVector<int, 16> CrossLaneMask;
15365 SmallVector<int, 16> InLaneMask(NumElts, SM_SentinelUndef);
15366 // CrossLaneMask but one entry == one sublane.
15367 SmallVector<int, 16> CrossLaneMaskLarge(NumSublanes, SM_SentinelUndef);
15368 APInt DemandedCrossLane = APInt::getZero(NumElts);
15369
15370 for (int i = 0; i != NumElts; ++i) {
15371 int M = Mask[i];
15372 if (M < 0)
15373 continue;
15374
15375 int SrcSublane = M / NumEltsPerSublane;
15376 int DstLane = i / NumEltsPerLane;
15377
15378 // We only need to get the elements into the right lane, not sublane.
15379 // So search all sublanes that make up the destination lane.
15380 bool Found = false;
15381 int DstSubStart = DstLane * NumSublanesPerLane;
15382 int DstSubEnd = DstSubStart + NumSublanesPerLane;
15383 for (int DstSublane = DstSubStart; DstSublane < DstSubEnd; ++DstSublane) {
15384 if (!isUndefOrEqual(CrossLaneMaskLarge[DstSublane], SrcSublane))
15385 continue;
15386
15387 Found = true;
15388 CrossLaneMaskLarge[DstSublane] = SrcSublane;
15389 int DstSublaneOffset = DstSublane * NumEltsPerSublane;
15390 InLaneMask[i] = DstSublaneOffset + M % NumEltsPerSublane;
15391 DemandedCrossLane.setBit(InLaneMask[i]);
15392 break;
15393 }
15394 if (!Found)
15395 return SDValue();
15396 }
15397
15398 // Fill CrossLaneMask using CrossLaneMaskLarge.
15399 narrowShuffleMaskElts(NumEltsPerSublane, CrossLaneMaskLarge, CrossLaneMask);
15400
15401 if (!CanUseSublanes) {
15402 // If we're only shuffling a single lowest lane and the rest are identity
15403 // then don't bother.
15404 // TODO - isShuffleMaskInputInPlace could be extended to something like
15405 // this.
15406 int NumIdentityLanes = 0;
15407 bool OnlyShuffleLowestLane = true;
15408 for (int i = 0; i != NumLanes; ++i) {
15409 int LaneOffset = i * NumEltsPerLane;
15410 if (isSequentialOrUndefInRange(InLaneMask, LaneOffset, NumEltsPerLane,
15411 i * NumEltsPerLane))
15412 NumIdentityLanes++;
15413 else if (CrossLaneMask[LaneOffset] != 0)
15414 OnlyShuffleLowestLane = false;
15415 }
15416 if (OnlyShuffleLowestLane && NumIdentityLanes == (NumLanes - 1))
15417 return SDValue();
15418 }
15419
15420 // Simplify CrossLaneMask based on the actual demanded elements.
15421 if (V1.hasOneUse())
15422 for (int i = 0; i != NumElts; ++i)
15423 if (!DemandedCrossLane[i])
15424 CrossLaneMask[i] = SM_SentinelUndef;
15425
15426 // Avoid returning the same shuffle operation. For example,
15427 // t7: v16i16 = vector_shuffle<8,9,10,11,4,5,6,7,0,1,2,3,12,13,14,15> t5,
15428 // undef:v16i16
15429 if (CrossLaneMask == Mask || InLaneMask == Mask)
15430 return SDValue();
15431
15432 SDValue CrossLane = DAG.getVectorShuffle(VT, DL, V1, V2, CrossLaneMask);
15433 return DAG.getVectorShuffle(VT, DL, CrossLane, DAG.getUNDEF(VT),
15434 InLaneMask);
15435 };
15436
15437 // First attempt a solution with full lanes.
15438 if (SDValue V = getSublanePermute(/*NumSublanes=*/NumLanes))
15439 return V;
15440
15441 // The rest of the solutions use sublanes.
15442 if (!CanUseSublanes)
15443 return SDValue();
15444
15445 // Then attempt a solution with 64-bit sublanes (vpermq).
15446 if (SDValue V = getSublanePermute(/*NumSublanes=*/NumLanes * 2))
15447 return V;
15448
15449 // If that doesn't work and we have fast variable cross-lane shuffle,
15450 // attempt 32-bit sublanes (vpermd).
15451 if (!Subtarget.hasFastVariableCrossLaneShuffle())
15452 return SDValue();
15453
15454 return getSublanePermute(/*NumSublanes=*/NumLanes * 4);
15455}
15456
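// --- Illustrative, standalone sketch (hypothetical name): models the final
// mask-narrowing step above, where a per-sublane mask (one entry per
// sublane) is expanded to a per-element cross-lane mask by scaling each
// entry to its element range.
#include <vector>

static std::vector<int>
expandSublaneMaskToElements(const std::vector<int> &SublaneMask,
                            int NumEltsPerSublane) {
  std::vector<int> EltMask;
  for (int S : SublaneMask)
    for (int i = 0; i != NumEltsPerSublane; ++i)
      EltMask.push_back(S < 0 ? -1 : S * NumEltsPerSublane + i);
  return EltMask;
}
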
15457/// Helper to compute the in-lane shuffle mask for a complete shuffle mask.
15458static void computeInLaneShuffleMask(const ArrayRef<int> &Mask, int LaneSize,
15459 SmallVector<int> &InLaneMask) {
15460 int Size = Mask.size();
15461 InLaneMask.assign(Mask.begin(), Mask.end());
15462 for (int i = 0; i < Size; ++i) {
15463 int &M = InLaneMask[i];
15464 if (M < 0)
15465 continue;
15466 if (((M % Size) / LaneSize) != (i / LaneSize))
15467 M = (M % LaneSize) + ((i / LaneSize) * LaneSize) + Size;
15468 }
15469}
15470
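// --- Illustrative, standalone sketch: a self-contained version of the helper
// above plus a worked example. Elements whose source lane differs from their
// destination lane are redirected to the same in-lane position of a second
// ("flipped") operand by adding Size to the index.
#include <vector>

static std::vector<int> inLaneMask(const std::vector<int> &Mask, int LaneSize) {
  int Size = (int)Mask.size();
  std::vector<int> InLane(Mask);
  for (int i = 0; i < Size; ++i) {
    int &M = InLane[i];
    if (M < 0)
      continue;
    if (((M % Size) / LaneSize) != (i / LaneSize))
      M = (M % LaneSize) + ((i / LaneSize) * LaneSize) + Size;
  }
  return InLane;
}

// Example: Mask = <2,3,0,1> with LaneSize 2 becomes <4,5,6,7>; every element
// crosses a lane, so the in-lane shuffle takes each one from the flipped
// operand instead.
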
15471/// Lower a vector shuffle crossing multiple 128-bit lanes by shuffling one
15472/// source with a lane permutation.
15473///
15474/// This lowering strategy results in four instructions in the worst case for a
15475/// single-input cross-lane shuffle, which is fewer than any other fully general
15476/// cross-lane shuffle strategy I'm aware of. Special cases for each particular
15477/// shuffle pattern should be handled prior to trying this lowering.
15478static SDValue lowerShuffleAsLanePermuteAndShuffle(
15479 const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
15480 SelectionDAG &DAG, const X86Subtarget &Subtarget) {
15481 // FIXME: This should probably be generalized for 512-bit vectors as well.
15482 assert(VT.is256BitVector() && "Only for 256-bit vector shuffles!");
15483 int Size = Mask.size();
15484 int LaneSize = Size / 2;
15485
15486 // Fold to SHUFPD(VPERM2F128(V1, V2), VPERM2F128(V1, V2)).
15487 // Only do this if the elements aren't all from the lower lane,
15488 // otherwise we're (probably) better off doing a split.
15489 if (VT == MVT::v4f64 &&
15490 !all_of(Mask, [LaneSize](int M) { return M < LaneSize; }))
15491 return lowerShuffleAsLanePermuteAndSHUFP(DL, VT, V1, V2, Mask, DAG);
15492
15493 // If there are only inputs from one 128-bit lane, splitting will in fact be
15494 // less expensive. The flags track whether the given lane contains an element
15495 // that crosses to another lane.
15496 bool AllLanes;
15497 if (!Subtarget.hasAVX2()) {
15498 bool LaneCrossing[2] = {false, false};
15499 for (int i = 0; i < Size; ++i)
15500 if (Mask[i] >= 0 && ((Mask[i] % Size) / LaneSize) != (i / LaneSize))
15501 LaneCrossing[(Mask[i] % Size) / LaneSize] = true;
15502 AllLanes = LaneCrossing[0] && LaneCrossing[1];
15503 } else {
15504 bool LaneUsed[2] = {false, false};
15505 for (int i = 0; i < Size; ++i)
15506 if (Mask[i] >= 0)
15507 LaneUsed[(Mask[i] % Size) / LaneSize] = true;
15508 AllLanes = LaneUsed[0] && LaneUsed[1];
15509 }
15510
15511 // TODO - we could support shuffling V2 in the Flipped input.
15512 assert(V2.isUndef() &&
15513         "The last part of this routine only works on single-input shuffles");
15514
15515 SmallVector<int> InLaneMask;
15516 computeInLaneShuffleMask(Mask, Mask.size() / 2, InLaneMask);
15517
15518 assert(!is128BitLaneCrossingShuffleMask(VT, InLaneMask) &&
15519 "In-lane shuffle mask expected");
15520
15521  // If we're not using both source lanes and the in-lane mask is not
15522  // repeating, then we're better off splitting.
15523 if (!AllLanes && !is128BitLaneRepeatedShuffleMask(VT, InLaneMask))
15524 return splitAndLowerShuffle(DL, VT, V1, V2, Mask, DAG,
15525 /*SimpleOnly*/ false);
15526
15527 // Flip the lanes, and shuffle the results which should now be in-lane.
15528 MVT PVT = VT.isFloatingPoint() ? MVT::v4f64 : MVT::v4i64;
15529 SDValue Flipped = DAG.getBitcast(PVT, V1);
15530 Flipped =
15531 DAG.getVectorShuffle(PVT, DL, Flipped, DAG.getUNDEF(PVT), {2, 3, 0, 1});
15532 Flipped = DAG.getBitcast(VT, Flipped);
15533 return DAG.getVectorShuffle(VT, DL, V1, Flipped, InLaneMask);
15534}
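// Worked example (illustrative): for a single-input v8f32 shuffle with
// Mask = <4,5,2,3,0,1,6,7>, Flipped is V1 with its 128-bit halves swapped
// (the <2,3,0,1> v4f64 shuffle above). The final shuffle
// shuffle(V1, Flipped, <8,9,2,3,12,13,6,7>) then reads V1's original
// high-half elements from Flipped's low half and V1's original low-half
// elements from Flipped's high half, so no lane crossing remains.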
15535
15536/// Handle lowering 2-lane 128-bit shuffles.
15537static SDValue lowerV2X128Shuffle(const SDLoc &DL, MVT VT, SDValue V1,
15538                                  SDValue V2, ArrayRef<int> Mask,
15539 const APInt &Zeroable,
15540 const X86Subtarget &Subtarget,
15541 SelectionDAG &DAG) {
15542 if (V2.isUndef()) {
15543 // Attempt to match VBROADCAST*128 subvector broadcast load.
15544 bool SplatLo = isShuffleEquivalent(Mask, {0, 1, 0, 1}, V1);
15545 bool SplatHi = isShuffleEquivalent(Mask, {2, 3, 2, 3}, V1);
15546 if ((SplatLo || SplatHi) && !Subtarget.hasAVX512() && V1.hasOneUse() &&
15547        X86::mayFoldLoad(peekThroughOneUseBitcasts(V1), Subtarget)) {
15548      MVT MemVT = VT.getHalfNumVectorElementsVT();
15549 unsigned Ofs = SplatLo ? 0 : MemVT.getStoreSize();
15550      auto *Ld = cast<LoadSDNode>(peekThroughOneUseBitcasts(V1));
15551      if (SDValue BcstLd = getBROADCAST_LOAD(X86ISD::SUBV_BROADCAST_LOAD, DL,
15552                                             VT, MemVT, Ld, Ofs, DAG))
15553 return BcstLd;
15554 }
15555
15556 // With AVX2, use VPERMQ/VPERMPD for unary shuffles to allow memory folding.
15557 if (Subtarget.hasAVX2())
15558 return SDValue();
15559 }
15560
15561 bool V2IsZero = !V2.isUndef() && ISD::isBuildVectorAllZeros(V2.getNode());
15562
15563 SmallVector<int, 4> WidenedMask;
15564 if (!canWidenShuffleElements(Mask, Zeroable, V2IsZero, WidenedMask))
15565 return SDValue();
15566
15567 bool IsLowZero = (Zeroable & 0x3) == 0x3;
15568 bool IsHighZero = (Zeroable & 0xc) == 0xc;
15569
15570 // Try to use an insert into a zero vector.
15571 if (WidenedMask[0] == 0 && IsHighZero) {
15572 MVT SubVT = MVT::getVectorVT(VT.getVectorElementType(), 2);
15573 SDValue LoV = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT, V1,
15574 DAG.getVectorIdxConstant(0, DL));
15575 return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT,
15576 getZeroVector(VT, Subtarget, DAG, DL), LoV,
15577 DAG.getVectorIdxConstant(0, DL));
15578 }
15579
15580 // TODO: If minimizing size and one of the inputs is a zero vector and the
15581  // zero vector has only one use, we could use a VPERM2X128 to save the
15582 // instruction bytes needed to explicitly generate the zero vector.
15583
15584 // Blends are faster and handle all the non-lane-crossing cases.
15585 if (SDValue Blend = lowerShuffleAsBlend(DL, VT, V1, V2, Mask, Zeroable,
15586 Subtarget, DAG))
15587 return Blend;
15588
15589 // If either input operand is a zero vector, use VPERM2X128 because its mask
15590 // allows us to replace the zero input with an implicit zero.
15591 if (!IsLowZero && !IsHighZero) {
15592 // Check for patterns which can be matched with a single insert of a 128-bit
15593 // subvector.
15594 bool OnlyUsesV1 = isShuffleEquivalent(Mask, {0, 1, 0, 1}, V1, V2);
15595 if (OnlyUsesV1 || isShuffleEquivalent(Mask, {0, 1, 4, 5}, V1, V2)) {
15596
15597 // With AVX1, use vperm2f128 (below) to allow load folding. Otherwise,
15598 // this will likely become vinsertf128 which can't fold a 256-bit memop.
15599      if (!isa<LoadSDNode>(peekThroughBitcasts(V1))) {
15600        MVT SubVT = MVT::getVectorVT(VT.getVectorElementType(), 2);
15601 SDValue SubVec =
15602 DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT, OnlyUsesV1 ? V1 : V2,
15603 DAG.getVectorIdxConstant(0, DL));
15604 return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, V1, SubVec,
15605 DAG.getVectorIdxConstant(2, DL));
15606 }
15607 }
15608
15609 // Try to use SHUF128 if possible.
15610 if (Subtarget.hasVLX()) {
15611 if (WidenedMask[0] < 2 && WidenedMask[1] >= 2) {
15612 unsigned PermMask = ((WidenedMask[0] % 2) << 0) |
15613 ((WidenedMask[1] % 2) << 1);
15614 return DAG.getNode(X86ISD::SHUF128, DL, VT, V1, V2,
15615 DAG.getTargetConstant(PermMask, DL, MVT::i8));
15616 }
15617 }
15618 }
15619
15620 // Otherwise form a 128-bit permutation. After accounting for undefs,
15621 // convert the 64-bit shuffle mask selection values into 128-bit
15622 // selection bits by dividing the indexes by 2 and shifting into positions
15623 // defined by a vperm2*128 instruction's immediate control byte.
15624
15625 // The immediate permute control byte looks like this:
15626 // [1:0] - select 128 bits from sources for low half of destination
15627 // [2] - ignore
15628 // [3] - zero low half of destination
15629 // [5:4] - select 128 bits from sources for high half of destination
15630 // [6] - ignore
15631 // [7] - zero high half of destination
15632
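  // Worked example (illustrative): a v4i64 shuffle with Mask = <2,3,4,5>
  // widens to WidenedMask = <1,2>, so PermMask = (1 << 0) | (2 << 4) = 0x21:
  // the low 128 bits of the result come from the high half of V1 and the high
  // 128 bits come from the low half of V2. If the high half were zeroable
  // instead, bit 7 would be set (PermMask |= 0x80) to zero it implicitly.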
15633 assert((WidenedMask[0] >= 0 || IsLowZero) &&
15634 (WidenedMask[1] >= 0 || IsHighZero) && "Undef half?");
15635
15636 unsigned PermMask = 0;
15637 PermMask |= IsLowZero ? 0x08 : (WidenedMask[0] << 0);
15638 PermMask |= IsHighZero ? 0x80 : (WidenedMask[1] << 4);
15639
15640 // Check the immediate mask and replace unused sources with undef.
15641 if ((PermMask & 0x0a) != 0x00 && (PermMask & 0xa0) != 0x00)
15642 V1 = DAG.getUNDEF(VT);
15643 if ((PermMask & 0x0a) != 0x02 && (PermMask & 0xa0) != 0x20)
15644 V2 = DAG.getUNDEF(VT);
15645
15646 return DAG.getNode(X86ISD::VPERM2X128, DL, VT, V1, V2,
15647 DAG.getTargetConstant(PermMask, DL, MVT::i8));
15648}
15649
15650/// Lower a vector shuffle by first fixing the 128-bit lanes and then
15651/// shuffling each lane.
15652///
15653/// This attempts to create a repeated lane shuffle where each lane uses one
15654/// or two of the lanes of the inputs. The lanes of the input vectors are
15655/// shuffled in one or two independent shuffles to get the lanes into the
15656/// position needed by the final shuffle.
15657static SDValue lowerShuffleAsLanePermuteAndRepeatedMask(
15658    const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
15659 const X86Subtarget &Subtarget, SelectionDAG &DAG) {
15660 assert(!V2.isUndef() && "This is only useful with multiple inputs.");
15661
15662 if (is128BitLaneRepeatedShuffleMask(VT, Mask))
15663 return SDValue();
15664
15665 int NumElts = Mask.size();
15666 int NumLanes = VT.getSizeInBits() / 128;
15667 int NumLaneElts = 128 / VT.getScalarSizeInBits();
15668 SmallVector<int, 16> RepeatMask(NumLaneElts, -1);
15669 SmallVector<std::array<int, 2>, 2> LaneSrcs(NumLanes, {{-1, -1}});
15670
15671 // First pass will try to fill in the RepeatMask from lanes that need two
15672 // sources.
15673 for (int Lane = 0; Lane != NumLanes; ++Lane) {
15674 int Srcs[2] = {-1, -1};
15675 SmallVector<int, 16> InLaneMask(NumLaneElts, -1);
15676 for (int i = 0; i != NumLaneElts; ++i) {
15677 int M = Mask[(Lane * NumLaneElts) + i];
15678 if (M < 0)
15679 continue;
15680 // Determine which of the possible input lanes (NumLanes from each source)
15681 // this element comes from. Assign that as one of the sources for this
15682      // lane. We can assign up to 2 sources for this lane. If we run out of
15683      // sources we can't do anything.
15684 int LaneSrc = M / NumLaneElts;
15685 int Src;
15686 if (Srcs[0] < 0 || Srcs[0] == LaneSrc)
15687 Src = 0;
15688 else if (Srcs[1] < 0 || Srcs[1] == LaneSrc)
15689 Src = 1;
15690 else
15691 return SDValue();
15692
15693 Srcs[Src] = LaneSrc;
15694 InLaneMask[i] = (M % NumLaneElts) + Src * NumElts;
15695 }
15696
15697 // If this lane has two sources, see if it fits with the repeat mask so far.
15698 if (Srcs[1] < 0)
15699 continue;
15700
15701 LaneSrcs[Lane][0] = Srcs[0];
15702 LaneSrcs[Lane][1] = Srcs[1];
15703
15704 auto MatchMasks = [](ArrayRef<int> M1, ArrayRef<int> M2) {
15705 assert(M1.size() == M2.size() && "Unexpected mask size");
15706 for (int i = 0, e = M1.size(); i != e; ++i)
15707 if (M1[i] >= 0 && M2[i] >= 0 && M1[i] != M2[i])
15708 return false;
15709 return true;
15710 };
15711
15712 auto MergeMasks = [](ArrayRef<int> Mask, MutableArrayRef<int> MergedMask) {
15713 assert(Mask.size() == MergedMask.size() && "Unexpected mask size");
15714 for (int i = 0, e = MergedMask.size(); i != e; ++i) {
15715 int M = Mask[i];
15716 if (M < 0)
15717 continue;
15718 assert((MergedMask[i] < 0 || MergedMask[i] == M) &&
15719 "Unexpected mask element");
15720 MergedMask[i] = M;
15721 }
15722 };
15723
15724 if (MatchMasks(InLaneMask, RepeatMask)) {
15725 // Merge this lane mask into the final repeat mask.
15726 MergeMasks(InLaneMask, RepeatMask);
15727 continue;
15728 }
15729
15730 // Didn't find a match. Swap the operands and try again.
15731 std::swap(LaneSrcs[Lane][0], LaneSrcs[Lane][1]);
15732    ShuffleVectorSDNode::commuteShuffleMask(InLaneMask, NumElts);
15733
15734 if (MatchMasks(InLaneMask, RepeatMask)) {
15735 // Merge this lane mask into the final repeat mask.
15736 MergeMasks(InLaneMask, RepeatMask);
15737 continue;
15738 }
15739
15740 // Couldn't find a match with the operands in either order.
15741 return SDValue();
15742 }
15743
15744 // Now handle any lanes with only one source.
15745 for (int Lane = 0; Lane != NumLanes; ++Lane) {
15746 // If this lane has already been processed, skip it.
15747 if (LaneSrcs[Lane][0] >= 0)
15748 continue;
15749
15750 for (int i = 0; i != NumLaneElts; ++i) {
15751 int M = Mask[(Lane * NumLaneElts) + i];
15752 if (M < 0)
15753 continue;
15754
15755      // If RepeatMask isn't defined yet we can define it ourselves.
15756 if (RepeatMask[i] < 0)
15757 RepeatMask[i] = M % NumLaneElts;
15758
15759 if (RepeatMask[i] < NumElts) {
15760 if (RepeatMask[i] != M % NumLaneElts)
15761 return SDValue();
15762 LaneSrcs[Lane][0] = M / NumLaneElts;
15763 } else {
15764 if (RepeatMask[i] != ((M % NumLaneElts) + NumElts))
15765 return SDValue();
15766 LaneSrcs[Lane][1] = M / NumLaneElts;
15767 }
15768 }
15769
15770 if (LaneSrcs[Lane][0] < 0 && LaneSrcs[Lane][1] < 0)
15771 return SDValue();
15772 }
15773
15774 SmallVector<int, 16> NewMask(NumElts, -1);
15775 for (int Lane = 0; Lane != NumLanes; ++Lane) {
15776 int Src = LaneSrcs[Lane][0];
15777 for (int i = 0; i != NumLaneElts; ++i) {
15778 int M = -1;
15779 if (Src >= 0)
15780 M = Src * NumLaneElts + i;
15781 NewMask[Lane * NumLaneElts + i] = M;
15782 }
15783 }
15784 SDValue NewV1 = DAG.getVectorShuffle(VT, DL, V1, V2, NewMask);
15785 // Ensure we didn't get back the shuffle we started with.
15786 // FIXME: This is a hack to make up for some splat handling code in
15787 // getVectorShuffle.
15788 if (isa<ShuffleVectorSDNode>(NewV1) &&
15789 cast<ShuffleVectorSDNode>(NewV1)->getMask() == Mask)
15790 return SDValue();
15791
15792 for (int Lane = 0; Lane != NumLanes; ++Lane) {
15793 int Src = LaneSrcs[Lane][1];
15794 for (int i = 0; i != NumLaneElts; ++i) {
15795 int M = -1;
15796 if (Src >= 0)
15797 M = Src * NumLaneElts + i;
15798 NewMask[Lane * NumLaneElts + i] = M;
15799 }
15800 }
15801 SDValue NewV2 = DAG.getVectorShuffle(VT, DL, V1, V2, NewMask);
15802 // Ensure we didn't get back the shuffle we started with.
15803 // FIXME: This is a hack to make up for some splat handling code in
15804 // getVectorShuffle.
15805 if (isa<ShuffleVectorSDNode>(NewV2) &&
15806 cast<ShuffleVectorSDNode>(NewV2)->getMask() == Mask)
15807 return SDValue();
15808
15809 for (int i = 0; i != NumElts; ++i) {
15810 if (Mask[i] < 0) {
15811 NewMask[i] = -1;
15812 continue;
15813 }
15814 NewMask[i] = RepeatMask[i % NumLaneElts];
15815 if (NewMask[i] < 0)
15816 continue;
15817
15818 NewMask[i] += (i / NumLaneElts) * NumLaneElts;
15819 }
15820 return DAG.getVectorShuffle(VT, DL, NewV1, NewV2, NewMask);
15821}
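// Worked example (illustrative): for a v8i32 two-input shuffle with
// Mask = <4,12,6,14,0,8,2,10>, both lanes share the repeated in-lane mask
// <0,8,2,10>, but lane 0 sources from the high halves of V1/V2 and lane 1
// from their low halves (LaneSrcs = {{1,3},{0,2}}). NewV1 and NewV2 become V1
// and V2 with their 128-bit halves swapped, and the final shuffle applies the
// repeat mask per lane, i.e. <0,8,2,10,4,12,6,14>, to those fixed-up inputs.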
15822
15823/// If the input shuffle mask results in a vector that is undefined in all upper
15824/// or lower half elements and that mask accesses only 2 halves of the
15825/// shuffle's operands, return true. A mask of half the width with mask indexes
15826/// adjusted to access the extracted halves of the original shuffle operands is
15827/// returned in HalfMask. HalfIdx1 and HalfIdx2 return whether the upper or
15828/// lower half of each input operand is accessed.
15829static bool
15830getHalfShuffleMask(ArrayRef<int> Mask, MutableArrayRef<int> HalfMask,
15831                   int &HalfIdx1, int &HalfIdx2) {
15832 assert((Mask.size() == HalfMask.size() * 2) &&
15833 "Expected input mask to be twice as long as output");
15834
15835 // Exactly one half of the result must be undef to allow narrowing.
15836 bool UndefLower = isUndefLowerHalf(Mask);
15837 bool UndefUpper = isUndefUpperHalf(Mask);
15838 if (UndefLower == UndefUpper)
15839 return false;
15840
15841 unsigned HalfNumElts = HalfMask.size();
15842 unsigned MaskIndexOffset = UndefLower ? HalfNumElts : 0;
15843 HalfIdx1 = -1;
15844 HalfIdx2 = -1;
15845 for (unsigned i = 0; i != HalfNumElts; ++i) {
15846 int M = Mask[i + MaskIndexOffset];
15847 if (M < 0) {
15848 HalfMask[i] = M;
15849 continue;
15850 }
15851
15852 // Determine which of the 4 half vectors this element is from.
15853 // i.e. 0 = Lower V1, 1 = Upper V1, 2 = Lower V2, 3 = Upper V2.
15854 int HalfIdx = M / HalfNumElts;
15855
15856 // Determine the element index into its half vector source.
15857 int HalfElt = M % HalfNumElts;
15858
15859 // We can shuffle with up to 2 half vectors, set the new 'half'
15860 // shuffle mask accordingly.
15861 if (HalfIdx1 < 0 || HalfIdx1 == HalfIdx) {
15862 HalfMask[i] = HalfElt;
15863 HalfIdx1 = HalfIdx;
15864 continue;
15865 }
15866 if (HalfIdx2 < 0 || HalfIdx2 == HalfIdx) {
15867 HalfMask[i] = HalfElt + HalfNumElts;
15868 HalfIdx2 = HalfIdx;
15869 continue;
15870 }
15871
15872 // Too many half vectors referenced.
15873 return false;
15874 }
15875
15876 return true;
15877}
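// Worked example (illustrative): for a v8f32 shuffle with
// Mask = <u,u,u,u,8,9,4,5> the lower half of the result is undef, the used
// elements come from the lower half of V2 (HalfIdx1 = 2) and the upper half
// of V1 (HalfIdx2 = 1), and the returned HalfMask is <0,1,4,5>.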
15878
15879/// Given the output values from getHalfShuffleMask(), create a half width
15880/// shuffle of extracted vectors followed by an insert back to full width.
15881static SDValue getShuffleHalfVectors(const SDLoc &DL, SDValue V1, SDValue V2,
15882                                     ArrayRef<int> HalfMask, int HalfIdx1,
15883 int HalfIdx2, bool UndefLower,
15884 SelectionDAG &DAG, bool UseConcat = false) {
15885 assert(V1.getValueType() == V2.getValueType() && "Different sized vectors?");
15886 assert(V1.getValueType().isSimple() && "Expecting only simple types");
15887
15888 MVT VT = V1.getSimpleValueType();
15889 MVT HalfVT = VT.getHalfNumVectorElementsVT();
15890 unsigned HalfNumElts = HalfVT.getVectorNumElements();
15891
15892 auto getHalfVector = [&](int HalfIdx) {
15893 if (HalfIdx < 0)
15894 return DAG.getUNDEF(HalfVT);
15895 SDValue V = (HalfIdx < 2 ? V1 : V2);
15896 HalfIdx = (HalfIdx % 2) * HalfNumElts;
15897 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, HalfVT, V,
15898 DAG.getVectorIdxConstant(HalfIdx, DL));
15899 };
15900
15901 // ins undef, (shuf (ext V1, HalfIdx1), (ext V2, HalfIdx2), HalfMask), Offset
15902 SDValue Half1 = getHalfVector(HalfIdx1);
15903 SDValue Half2 = getHalfVector(HalfIdx2);
15904 SDValue V = DAG.getVectorShuffle(HalfVT, DL, Half1, Half2, HalfMask);
15905 if (UseConcat) {
15906 SDValue Op0 = V;
15907 SDValue Op1 = DAG.getUNDEF(HalfVT);
15908 if (UndefLower)
15909 std::swap(Op0, Op1);
15910 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Op0, Op1);
15911 }
15912
15913 unsigned Offset = UndefLower ? HalfNumElts : 0;
15914 return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, DAG.getUNDEF(VT), V,
15915                     DAG.getVectorIdxConstant(Offset, DL));
15916}
15917
15918/// Lower shuffles where an entire half of a 256 or 512-bit vector is UNDEF.
15919/// This allows for fast cases such as subvector extraction/insertion
15920/// or shuffling smaller vector types which can lower more efficiently.
15921static SDValue lowerShuffleWithUndefHalf(const SDLoc &DL, MVT VT, SDValue V1,
15922                                         SDValue V2, ArrayRef<int> Mask,
15923 const X86Subtarget &Subtarget,
15924 SelectionDAG &DAG) {
15925 assert((VT.is256BitVector() || VT.is512BitVector()) &&
15926 "Expected 256-bit or 512-bit vector");
15927
15928 bool UndefLower = isUndefLowerHalf(Mask);
15929 if (!UndefLower && !isUndefUpperHalf(Mask))
15930 return SDValue();
15931
15932 assert((!UndefLower || !isUndefUpperHalf(Mask)) &&
15933 "Completely undef shuffle mask should have been simplified already");
15934
15935 // Upper half is undef and lower half is whole upper subvector.
15936 // e.g. vector_shuffle <4, 5, 6, 7, u, u, u, u> or <2, 3, u, u>
15937 MVT HalfVT = VT.getHalfNumVectorElementsVT();
15938 unsigned HalfNumElts = HalfVT.getVectorNumElements();
15939 if (!UndefLower &&
15940 isSequentialOrUndefInRange(Mask, 0, HalfNumElts, HalfNumElts)) {
15941 SDValue Hi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, HalfVT, V1,
15942 DAG.getVectorIdxConstant(HalfNumElts, DL));
15943 return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, DAG.getUNDEF(VT), Hi,
15944 DAG.getVectorIdxConstant(0, DL));
15945 }
15946
15947 // Lower half is undef and upper half is whole lower subvector.
15948 // e.g. vector_shuffle <u, u, u, u, 0, 1, 2, 3> or <u, u, 0, 1>
15949 if (UndefLower &&
15950 isSequentialOrUndefInRange(Mask, HalfNumElts, HalfNumElts, 0)) {
15951 SDValue Hi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, HalfVT, V1,
15952 DAG.getVectorIdxConstant(0, DL));
15953 return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, DAG.getUNDEF(VT), Hi,
15954 DAG.getVectorIdxConstant(HalfNumElts, DL));
15955 }
15956
15957 int HalfIdx1, HalfIdx2;
15958 SmallVector<int, 8> HalfMask(HalfNumElts);
15959 if (!getHalfShuffleMask(Mask, HalfMask, HalfIdx1, HalfIdx2))
15960 return SDValue();
15961
15962 assert(HalfMask.size() == HalfNumElts && "Unexpected shuffle mask length");
15963
15964 // Only shuffle the halves of the inputs when useful.
15965 unsigned NumLowerHalves =
15966 (HalfIdx1 == 0 || HalfIdx1 == 2) + (HalfIdx2 == 0 || HalfIdx2 == 2);
15967 unsigned NumUpperHalves =
15968 (HalfIdx1 == 1 || HalfIdx1 == 3) + (HalfIdx2 == 1 || HalfIdx2 == 3);
15969 assert(NumLowerHalves + NumUpperHalves <= 2 && "Only 1 or 2 halves allowed");
15970
15971 // Determine the larger pattern of undef/halves, then decide if it's worth
15972 // splitting the shuffle based on subtarget capabilities and types.
15973 unsigned EltWidth = VT.getVectorElementType().getSizeInBits();
15974 if (!UndefLower) {
15975 // XXXXuuuu: no insert is needed.
15976 // Always extract lowers when setting lower - these are all free subreg ops.
15977 if (NumUpperHalves == 0)
15978 return getShuffleHalfVectors(DL, V1, V2, HalfMask, HalfIdx1, HalfIdx2,
15979 UndefLower, DAG);
15980
15981 if (NumUpperHalves == 1) {
15982 // AVX2 has efficient 32/64-bit element cross-lane shuffles.
15983 if (Subtarget.hasAVX2()) {
15984 // extract128 + vunpckhps/vshufps, is better than vblend + vpermps.
15985 if (EltWidth == 32 && NumLowerHalves && HalfVT.is128BitVector() &&
15986 !is128BitUnpackShuffleMask(HalfMask, DAG) &&
15987 (!isSingleSHUFPSMask(HalfMask) ||
15988 Subtarget.hasFastVariableCrossLaneShuffle()))
15989 return SDValue();
15990      // If this is a unary shuffle (assume that the 2nd operand is
15991 // canonicalized to undef), then we can use vpermpd. Otherwise, we
15992 // are better off extracting the upper half of 1 operand and using a
15993 // narrow shuffle.
15994 if (EltWidth == 64 && V2.isUndef())
15995 return SDValue();
15996      // If this is a unary vXi8 shuffle with in-place halves, then perform as a
15997      // full width pshufb, and then merge.
15998 if (EltWidth == 8 && HalfIdx1 == 0 && HalfIdx2 == 1)
15999 return SDValue();
16000 }
16001 // AVX512 has efficient cross-lane shuffles for all legal 512-bit types.
16002 if (Subtarget.hasAVX512() && VT.is512BitVector())
16003 return SDValue();
16004 // Extract + narrow shuffle is better than the wide alternative.
16005 return getShuffleHalfVectors(DL, V1, V2, HalfMask, HalfIdx1, HalfIdx2,
16006 UndefLower, DAG);
16007 }
16008
16009 // Don't extract both uppers, instead shuffle and then extract.
16010 assert(NumUpperHalves == 2 && "Half vector count went wrong");
16011 return SDValue();
16012 }
16013
16014 // UndefLower - uuuuXXXX: an insert to high half is required if we split this.
16015 if (NumUpperHalves == 0) {
16016 // AVX2 has efficient 64-bit element cross-lane shuffles.
16017 // TODO: Refine to account for unary shuffle, splat, and other masks?
16018 if (Subtarget.hasAVX2() && EltWidth == 64)
16019 return SDValue();
16020 // AVX512 has efficient cross-lane shuffles for all legal 512-bit types.
16021 if (Subtarget.hasAVX512() && VT.is512BitVector())
16022 return SDValue();
16023 // Narrow shuffle + insert is better than the wide alternative.
16024 return getShuffleHalfVectors(DL, V1, V2, HalfMask, HalfIdx1, HalfIdx2,
16025 UndefLower, DAG);
16026 }
16027
16028 // NumUpperHalves != 0: don't bother with extract, shuffle, and then insert.
16029 return SDValue();
16030}
16031
16032/// Handle case where shuffle sources are coming from the same 128-bit lane and
16033/// every lane can be represented as the same repeating mask - allowing us to
16034/// shuffle the sources with the repeating shuffle and then permute the result
16035/// to the destination lanes.
16036static SDValue lowerShuffleAsRepeatedMaskAndLanePermute(
16037    const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
16038 const X86Subtarget &Subtarget, SelectionDAG &DAG) {
16039 int NumElts = VT.getVectorNumElements();
16040 int NumLanes = VT.getSizeInBits() / 128;
16041 int NumLaneElts = NumElts / NumLanes;
16042
16043 // On AVX2 we may be able to just shuffle the lowest elements and then
16044 // broadcast the result.
16045 if (Subtarget.hasAVX2()) {
16046 for (unsigned BroadcastSize : {16, 32, 64}) {
16047 if (BroadcastSize <= VT.getScalarSizeInBits())
16048 continue;
16049 int NumBroadcastElts = BroadcastSize / VT.getScalarSizeInBits();
16050
16051 // Attempt to match a repeating pattern every NumBroadcastElts,
16052      // accounting for UNDEFs but only referencing the lowest 128-bit
16053 // lane of the inputs.
16054 auto FindRepeatingBroadcastMask = [&](SmallVectorImpl<int> &RepeatMask) {
16055 for (int i = 0; i != NumElts; i += NumBroadcastElts)
16056 for (int j = 0; j != NumBroadcastElts; ++j) {
16057 int M = Mask[i + j];
16058 if (M < 0)
16059 continue;
16060 int &R = RepeatMask[j];
16061 if (0 != ((M % NumElts) / NumLaneElts))
16062 return false;
16063 if (0 <= R && R != M)
16064 return false;
16065 R = M;
16066 }
16067 return true;
16068 };
16069
16070 SmallVector<int, 8> RepeatMask((unsigned)NumElts, -1);
16071 if (!FindRepeatingBroadcastMask(RepeatMask))
16072 continue;
16073
16074 // Shuffle the (lowest) repeated elements in place for broadcast.
16075 SDValue RepeatShuf = DAG.getVectorShuffle(VT, DL, V1, V2, RepeatMask);
16076
16077 // Shuffle the actual broadcast.
16078 SmallVector<int, 8> BroadcastMask((unsigned)NumElts, -1);
16079 for (int i = 0; i != NumElts; i += NumBroadcastElts)
16080 for (int j = 0; j != NumBroadcastElts; ++j)
16081 BroadcastMask[i + j] = j;
16082
16083 // Avoid returning the same shuffle operation. For example,
16084 // v8i32 = vector_shuffle<0,1,0,1,0,1,0,1> t5, undef:v8i32
16085 if (BroadcastMask == Mask)
16086 return SDValue();
16087
16088 return DAG.getVectorShuffle(VT, DL, RepeatShuf, DAG.getUNDEF(VT),
16089 BroadcastMask);
16090 }
16091 }
16092
16093 // Bail if the shuffle mask doesn't cross 128-bit lanes.
16094 if (!is128BitLaneCrossingShuffleMask(VT, Mask))
16095 return SDValue();
16096
16097 // Bail if we already have a repeated lane shuffle mask.
16098 if (is128BitLaneRepeatedShuffleMask(VT, Mask))
16099 return SDValue();
16100
16101 // Helper to look for repeated mask in each split sublane, and that those
16102 // sublanes can then be permuted into place.
16103 auto ShuffleSubLanes = [&](int SubLaneScale) {
16104 int NumSubLanes = NumLanes * SubLaneScale;
16105 int NumSubLaneElts = NumLaneElts / SubLaneScale;
16106
16107 // Check that all the sources are coming from the same lane and see if we
16108 // can form a repeating shuffle mask (local to each sub-lane). At the same
16109 // time, determine the source sub-lane for each destination sub-lane.
16110 int TopSrcSubLane = -1;
16111 SmallVector<int, 8> Dst2SrcSubLanes((unsigned)NumSubLanes, -1);
16112 SmallVector<SmallVector<int, 8>> RepeatedSubLaneMasks(
16113 SubLaneScale,
16114 SmallVector<int, 8>((unsigned)NumSubLaneElts, SM_SentinelUndef));
16115
16116 for (int DstSubLane = 0; DstSubLane != NumSubLanes; ++DstSubLane) {
16117 // Extract the sub-lane mask, check that it all comes from the same lane
16118 // and normalize the mask entries to come from the first lane.
16119 int SrcLane = -1;
16120 SmallVector<int, 8> SubLaneMask((unsigned)NumSubLaneElts, -1);
16121 for (int Elt = 0; Elt != NumSubLaneElts; ++Elt) {
16122 int M = Mask[(DstSubLane * NumSubLaneElts) + Elt];
16123 if (M < 0)
16124 continue;
16125 int Lane = (M % NumElts) / NumLaneElts;
16126 if ((0 <= SrcLane) && (SrcLane != Lane))
16127 return SDValue();
16128 SrcLane = Lane;
16129 int LocalM = (M % NumLaneElts) + (M < NumElts ? 0 : NumElts);
16130 SubLaneMask[Elt] = LocalM;
16131 }
16132
16133 // Whole sub-lane is UNDEF.
16134 if (SrcLane < 0)
16135 continue;
16136
16137 // Attempt to match against the candidate repeated sub-lane masks.
16138 for (int SubLane = 0; SubLane != SubLaneScale; ++SubLane) {
16139 auto MatchMasks = [NumSubLaneElts](ArrayRef<int> M1, ArrayRef<int> M2) {
16140 for (int i = 0; i != NumSubLaneElts; ++i) {
16141 if (M1[i] < 0 || M2[i] < 0)
16142 continue;
16143 if (M1[i] != M2[i])
16144 return false;
16145 }
16146 return true;
16147 };
16148
16149 auto &RepeatedSubLaneMask = RepeatedSubLaneMasks[SubLane];
16150 if (!MatchMasks(SubLaneMask, RepeatedSubLaneMask))
16151 continue;
16152
16153 // Merge the sub-lane mask into the matching repeated sub-lane mask.
16154 for (int i = 0; i != NumSubLaneElts; ++i) {
16155 int M = SubLaneMask[i];
16156 if (M < 0)
16157 continue;
16158 assert((RepeatedSubLaneMask[i] < 0 || RepeatedSubLaneMask[i] == M) &&
16159 "Unexpected mask element");
16160 RepeatedSubLaneMask[i] = M;
16161 }
16162
16163 // Track the top most source sub-lane - by setting the remaining to
16164 // UNDEF we can greatly simplify shuffle matching.
16165 int SrcSubLane = (SrcLane * SubLaneScale) + SubLane;
16166 TopSrcSubLane = std::max(TopSrcSubLane, SrcSubLane);
16167 Dst2SrcSubLanes[DstSubLane] = SrcSubLane;
16168 break;
16169 }
16170
16171 // Bail if we failed to find a matching repeated sub-lane mask.
16172 if (Dst2SrcSubLanes[DstSubLane] < 0)
16173 return SDValue();
16174 }
16175 assert(0 <= TopSrcSubLane && TopSrcSubLane < NumSubLanes &&
16176 "Unexpected source lane");
16177
16178 // Create a repeating shuffle mask for the entire vector.
16179 SmallVector<int, 8> RepeatedMask((unsigned)NumElts, -1);
16180 for (int SubLane = 0; SubLane <= TopSrcSubLane; ++SubLane) {
16181 int Lane = SubLane / SubLaneScale;
16182 auto &RepeatedSubLaneMask = RepeatedSubLaneMasks[SubLane % SubLaneScale];
16183 for (int Elt = 0; Elt != NumSubLaneElts; ++Elt) {
16184 int M = RepeatedSubLaneMask[Elt];
16185 if (M < 0)
16186 continue;
16187 int Idx = (SubLane * NumSubLaneElts) + Elt;
16188 RepeatedMask[Idx] = M + (Lane * NumLaneElts);
16189 }
16190 }
16191
16192 // Shuffle each source sub-lane to its destination.
16193 SmallVector<int, 8> SubLaneMask((unsigned)NumElts, -1);
16194 for (int i = 0; i != NumElts; i += NumSubLaneElts) {
16195 int SrcSubLane = Dst2SrcSubLanes[i / NumSubLaneElts];
16196 if (SrcSubLane < 0)
16197 continue;
16198 for (int j = 0; j != NumSubLaneElts; ++j)
16199 SubLaneMask[i + j] = j + (SrcSubLane * NumSubLaneElts);
16200 }
16201
16202 // Avoid returning the same shuffle operation.
16203 // v8i32 = vector_shuffle<0,1,4,5,2,3,6,7> t5, undef:v8i32
16204 if (RepeatedMask == Mask || SubLaneMask == Mask)
16205 return SDValue();
16206
16207 SDValue RepeatedShuffle =
16208 DAG.getVectorShuffle(VT, DL, V1, V2, RepeatedMask);
16209
16210 return DAG.getVectorShuffle(VT, DL, RepeatedShuffle, DAG.getUNDEF(VT),
16211 SubLaneMask);
16212 };
16213
16214 // On AVX2 targets we can permute 256-bit vectors as 64-bit sub-lanes
16215 // (with PERMQ/PERMPD). On AVX2/AVX512BW targets, permuting 32-bit sub-lanes,
16216 // even with a variable shuffle, can be worth it for v32i8/v64i8 vectors.
16217 // Otherwise we can only permute whole 128-bit lanes.
16218 int MinSubLaneScale = 1, MaxSubLaneScale = 1;
16219 if (Subtarget.hasAVX2() && VT.is256BitVector()) {
16220 bool OnlyLowestElts = isUndefOrInRange(Mask, 0, NumLaneElts);
16221 MinSubLaneScale = 2;
16222 MaxSubLaneScale =
16223 (!OnlyLowestElts && V2.isUndef() && VT == MVT::v32i8) ? 4 : 2;
16224 }
16225 if (Subtarget.hasBWI() && VT == MVT::v64i8)
16226 MinSubLaneScale = MaxSubLaneScale = 4;
16227
16228 for (int Scale = MinSubLaneScale; Scale <= MaxSubLaneScale; Scale *= 2)
16229 if (SDValue Shuffle = ShuffleSubLanes(Scale))
16230 return Shuffle;
16231
16232 return SDValue();
16233}
16234
16235static bool matchShuffleWithSHUFPD(MVT VT, SDValue &V1, SDValue &V2,
16236                                   bool &ForceV1Zero, bool &ForceV2Zero,
16237 unsigned &ShuffleImm, ArrayRef<int> Mask,
16238 const APInt &Zeroable) {
16239 int NumElts = VT.getVectorNumElements();
16240 assert(VT.getScalarSizeInBits() == 64 &&
16241 (NumElts == 2 || NumElts == 4 || NumElts == 8) &&
16242 "Unexpected data type for VSHUFPD");
16243 assert(isUndefOrZeroOrInRange(Mask, 0, 2 * NumElts) &&
16244 "Illegal shuffle mask");
16245
16246 bool ZeroLane[2] = { true, true };
16247 for (int i = 0; i < NumElts; ++i)
16248 ZeroLane[i & 1] &= Zeroable[i];
16249
16250 // Mask for V8F64: 0/1, 8/9, 2/3, 10/11, 4/5, ..
16251  // Mask for V4F64: 0/1, 4/5, 2/3, 6/7, ..
16252 bool IsSHUFPD = true;
16253 bool IsCommutable = true;
16254 SmallVector<int, 8> SHUFPDMask(NumElts, -1);
16255 for (int i = 0; i < NumElts; ++i) {
16256 if (Mask[i] == SM_SentinelUndef || ZeroLane[i & 1])
16257 continue;
16258 if (Mask[i] < 0)
16259 return false;
16260 int Val = (i & 6) + NumElts * (i & 1);
16261 int CommutVal = (i & 0xe) + NumElts * ((i & 1) ^ 1);
16262 if (Mask[i] < Val || Mask[i] > Val + 1)
16263 IsSHUFPD = false;
16264 if (Mask[i] < CommutVal || Mask[i] > CommutVal + 1)
16265 IsCommutable = false;
16266 SHUFPDMask[i] = Mask[i] % 2;
16267 }
16268
16269 if (!IsSHUFPD && !IsCommutable)
16270 return false;
16271
16272 if (!IsSHUFPD && IsCommutable)
16273 std::swap(V1, V2);
16274
16275 ForceV1Zero = ZeroLane[0];
16276 ForceV2Zero = ZeroLane[1];
16277 ShuffleImm = getSHUFPDImm(SHUFPDMask);
16278 return true;
16279}
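// Worked example (illustrative): a v4f64 shuffle with Mask = <1,5,2,7> picks
// one element of each V1/V2 pair per position, so it matches SHUFPD with
// SHUFPDMask = <1,1,0,1>, i.e. an immediate of 0b1011 (assuming getSHUFPDImm
// packs SHUFPDMask[i] into bit i of the immediate).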
16280
16281static SDValue lowerShuffleWithSHUFPD(const SDLoc &DL, MVT VT, SDValue V1,
16282                                      SDValue V2, ArrayRef<int> Mask,
16283 const APInt &Zeroable,
16284 const X86Subtarget &Subtarget,
16285 SelectionDAG &DAG) {
16286 assert((VT == MVT::v2f64 || VT == MVT::v4f64 || VT == MVT::v8f64) &&
16287 "Unexpected data type for VSHUFPD");
16288
16289 unsigned Immediate = 0;
16290 bool ForceV1Zero = false, ForceV2Zero = false;
16291 if (!matchShuffleWithSHUFPD(VT, V1, V2, ForceV1Zero, ForceV2Zero, Immediate,
16292 Mask, Zeroable))
16293 return SDValue();
16294
16295 // Create a REAL zero vector - ISD::isBuildVectorAllZeros allows UNDEFs.
16296 if (ForceV1Zero)
16297 V1 = getZeroVector(VT, Subtarget, DAG, DL);
16298 if (ForceV2Zero)
16299 V2 = getZeroVector(VT, Subtarget, DAG, DL);
16300
16301 return DAG.getNode(X86ISD::SHUFP, DL, VT, V1, V2,
16302 DAG.getTargetConstant(Immediate, DL, MVT::i8));
16303}
16304
16305// Look for {0, 8, 16, 24, 32, 40, 48, 56} in the first 8 elements, followed
16306// by zeroable elements in the remaining 24 elements. Turn this into two
16307// vmovqb instructions shuffled together.
16308static SDValue lowerShuffleAsVTRUNCAndUnpack(const SDLoc &DL, MVT VT,
16309                                             SDValue V1, SDValue V2,
16310 ArrayRef<int> Mask,
16311 const APInt &Zeroable,
16312 SelectionDAG &DAG) {
16313 assert(VT == MVT::v32i8 && "Unexpected type!");
16314
16315 // The first 8 indices should be every 8th element.
16316 if (!isSequentialOrUndefInRange(Mask, 0, 8, 0, 8))
16317 return SDValue();
16318
16319 // Remaining elements need to be zeroable.
16320 if (Zeroable.countl_one() < (Mask.size() - 8))
16321 return SDValue();
16322
16323 V1 = DAG.getBitcast(MVT::v4i64, V1);
16324 V2 = DAG.getBitcast(MVT::v4i64, V2);
16325
16326 V1 = DAG.getNode(X86ISD::VTRUNC, DL, MVT::v16i8, V1);
16327 V2 = DAG.getNode(X86ISD::VTRUNC, DL, MVT::v16i8, V2);
16328
16329 // The VTRUNCs will put 0s in the upper 12 bytes. Use them to put zeroes in
16330 // the upper bits of the result using an unpckldq.
16331 SDValue Unpack = DAG.getVectorShuffle(MVT::v16i8, DL, V1, V2,
16332 { 0, 1, 2, 3, 16, 17, 18, 19,
16333 4, 5, 6, 7, 20, 21, 22, 23 });
16334 // Insert the unpckldq into a zero vector to widen to v32i8.
16335 return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, MVT::v32i8,
16336 DAG.getConstant(0, DL, MVT::v32i8), Unpack,
16337 DAG.getVectorIdxConstant(0, DL));
16338}
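// Worked example (illustrative): a v32i8 shuffle whose first 8 elements are
// <0,8,16,24,32,40,48,56> with the remaining 24 elements zeroable takes byte 0
// of every i64 in V1 and V2. Each VTRUNC produces those 4 bytes (zeroing the
// rest), the v16i8 shuffle above acts as an unpckldq that interleaves the two
// 4-byte groups, and the result is inserted into a zero v32i8.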
16339
16340// a = shuffle v1, v2, mask1 ; interleaving lower lanes of v1 and v2
16341// b = shuffle v1, v2, mask2 ; interleaving higher lanes of v1 and v2
16342// =>
16343// ul = unpckl v1, v2
16344// uh = unpckh v1, v2
16345// a = vperm ul, uh
16346// b = vperm ul, uh
16347//
16348// Pattern-match interleave(256b v1, 256b v2) -> 512b v3 and lower it into unpck
16349// and permute. We cannot directly match v3 because it is split into two
16350// 256-bit vectors in earlier isel stages. Therefore, this function matches a
16351// pair of 256-bit shuffles and makes sure the masks are consecutive.
16352//
16353// Once unpck and permute nodes are created, the permute corresponding to this
16354// shuffle is returned, while the other permute replaces the other half of the
16355// shuffle in the selection dag.
16356static SDValue lowerShufflePairAsUNPCKAndPermute(const SDLoc &DL, MVT VT,
16357                                                 SDValue V1, SDValue V2,
16358 ArrayRef<int> Mask,
16359 SelectionDAG &DAG) {
16360 if (VT != MVT::v8f32 && VT != MVT::v8i32 && VT != MVT::v16i16 &&
16361 VT != MVT::v32i8)
16362 return SDValue();
16363 // <B0, B1, B0+1, B1+1, ..., >
16364 auto IsInterleavingPattern = [&](ArrayRef<int> Mask, unsigned Begin0,
16365 unsigned Begin1) {
16366 size_t Size = Mask.size();
16367 assert(Size % 2 == 0 && "Expected even mask size");
16368 for (unsigned I = 0; I < Size; I += 2) {
16369 if (Mask[I] != (int)(Begin0 + I / 2) ||
16370 Mask[I + 1] != (int)(Begin1 + I / 2))
16371 return false;
16372 }
16373 return true;
16374 };
16375  // Check which half of the interleave this shuffle node is.
16376 int NumElts = VT.getVectorNumElements();
16377 size_t FirstQtr = NumElts / 2;
16378 size_t ThirdQtr = NumElts + NumElts / 2;
16379 bool IsFirstHalf = IsInterleavingPattern(Mask, 0, NumElts);
16380 bool IsSecondHalf = IsInterleavingPattern(Mask, FirstQtr, ThirdQtr);
16381 if (!IsFirstHalf && !IsSecondHalf)
16382 return SDValue();
16383
16384 // Find the intersection between shuffle users of V1 and V2.
16385 SmallVector<SDNode *, 2> Shuffles;
16386 for (SDNode *User : V1->users())
16387 if (User->getOpcode() == ISD::VECTOR_SHUFFLE && User->getOperand(0) == V1 &&
16388 User->getOperand(1) == V2)
16389 Shuffles.push_back(User);
16390 // Limit user size to two for now.
16391 if (Shuffles.size() != 2)
16392 return SDValue();
16393  // Find out which half of the 512-bit shuffle each smaller shuffle is.
16394 auto *SVN1 = cast<ShuffleVectorSDNode>(Shuffles[0]);
16395 auto *SVN2 = cast<ShuffleVectorSDNode>(Shuffles[1]);
16396 SDNode *FirstHalf;
16397 SDNode *SecondHalf;
16398 if (IsInterleavingPattern(SVN1->getMask(), 0, NumElts) &&
16399 IsInterleavingPattern(SVN2->getMask(), FirstQtr, ThirdQtr)) {
16400 FirstHalf = Shuffles[0];
16401 SecondHalf = Shuffles[1];
16402 } else if (IsInterleavingPattern(SVN1->getMask(), FirstQtr, ThirdQtr) &&
16403 IsInterleavingPattern(SVN2->getMask(), 0, NumElts)) {
16404 FirstHalf = Shuffles[1];
16405 SecondHalf = Shuffles[0];
16406 } else {
16407 return SDValue();
16408 }
16409 // Lower into unpck and perm. Return the perm of this shuffle and replace
16410 // the other.
16411 SDValue Unpckl = DAG.getNode(X86ISD::UNPCKL, DL, VT, V1, V2);
16412 SDValue Unpckh = DAG.getNode(X86ISD::UNPCKH, DL, VT, V1, V2);
16413 SDValue Perm1 = DAG.getNode(X86ISD::VPERM2X128, DL, VT, Unpckl, Unpckh,
16414 DAG.getTargetConstant(0x20, DL, MVT::i8));
16415 SDValue Perm2 = DAG.getNode(X86ISD::VPERM2X128, DL, VT, Unpckl, Unpckh,
16416 DAG.getTargetConstant(0x31, DL, MVT::i8));
16417 if (IsFirstHalf) {
16418 DAG.ReplaceAllUsesWith(SecondHalf, &Perm2);
16419 return Perm1;
16420 }
16421 DAG.ReplaceAllUsesWith(FirstHalf, &Perm1);
16422 return Perm2;
16423}
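// Worked example (illustrative): for v8i32 inputs, the two half shuffles of a
// full interleave have masks <0,8,1,9,2,10,3,11> and <4,12,5,13,6,14,7,15>.
// They are rewritten as Unpckl/Unpckh of V1 and V2 followed by two VPERM2X128
// nodes with immediates 0x20 and 0x31, which concatenate the low and high
// 128-bit halves of the unpack results respectively.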
16424
16425/// Handle lowering of 4-lane 64-bit floating point shuffles.
16426///
16427/// Also ends up handling lowering of 4-lane 64-bit integer shuffles when AVX2
16428/// isn't available.
16429static SDValue lowerV4F64Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
16430                                 const APInt &Zeroable, SDValue V1, SDValue V2,
16431 const X86Subtarget &Subtarget,
16432 SelectionDAG &DAG) {
16433 assert(V1.getSimpleValueType() == MVT::v4f64 && "Bad operand type!");
16434 assert(V2.getSimpleValueType() == MVT::v4f64 && "Bad operand type!");
16435 assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!");
16436
16437 if (SDValue V = lowerV2X128Shuffle(DL, MVT::v4f64, V1, V2, Mask, Zeroable,
16438 Subtarget, DAG))
16439 return V;
16440
16441 if (V2.isUndef()) {
16442 // Check for being able to broadcast a single element.
16443 if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v4f64, V1, V2,
16444 Mask, Subtarget, DAG))
16445 return Broadcast;
16446
16447 // Use low duplicate instructions for masks that match their pattern.
16448 if (isShuffleEquivalent(Mask, {0, 0, 2, 2}, V1, V2))
16449 return DAG.getNode(X86ISD::MOVDDUP, DL, MVT::v4f64, V1);
16450
16451 if (!is128BitLaneCrossingShuffleMask(MVT::v4f64, Mask)) {
16452 // Non-half-crossing single input shuffles can be lowered with an
16453 // interleaved permutation.
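      // Worked example (illustrative): Mask = <1,0,3,2> yields
      // VPERMILPMask = 0b0101, i.e. swap the two doubles within each
      // 128-bit lane.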
16454 unsigned VPERMILPMask = (Mask[0] == 1) | ((Mask[1] == 1) << 1) |
16455 ((Mask[2] == 3) << 2) | ((Mask[3] == 3) << 3);
16456 return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v4f64, V1,
16457 DAG.getTargetConstant(VPERMILPMask, DL, MVT::i8));
16458 }
16459
16460 // With AVX2 we have direct support for this permutation.
16461 if (Subtarget.hasAVX2())
16462 return DAG.getNode(X86ISD::VPERMI, DL, MVT::v4f64, V1,
16463 getV4X86ShuffleImm8ForMask(Mask, DL, DAG));
16464
16465 // Try to create an in-lane repeating shuffle mask and then shuffle the
16466 // results into the target lanes.
16467    if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
16468            DL, MVT::v4f64, V1, V2, Mask, Subtarget, DAG))
16469 return V;
16470
16471 // Try to permute the lanes and then use a per-lane permute.
16472 if (SDValue V = lowerShuffleAsLanePermuteAndPermute(DL, MVT::v4f64, V1, V2,
16473 Mask, DAG, Subtarget))
16474 return V;
16475
16476 // Otherwise, fall back.
16477 return lowerShuffleAsLanePermuteAndShuffle(DL, MVT::v4f64, V1, V2, Mask,
16478 DAG, Subtarget);
16479 }
16480
16481 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v4f64, V1, V2, Mask,
16482 Zeroable, Subtarget, DAG))
16483 return Blend;
16484
16485 // Use dedicated unpack instructions for masks that match their pattern.
16486 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v4f64, V1, V2, Mask, DAG))
16487 return V;
16488
16489 if (SDValue Op = lowerShuffleWithSHUFPD(DL, MVT::v4f64, V1, V2, Mask,
16490 Zeroable, Subtarget, DAG))
16491 return Op;
16492
16493 bool V1IsInPlace = isShuffleMaskInputInPlace(0, Mask);
16494 bool V2IsInPlace = isShuffleMaskInputInPlace(1, Mask);
16495 bool V1IsSplat = isShuffleMaskInputBroadcastable(0, Mask);
16496 bool V2IsSplat = isShuffleMaskInputBroadcastable(1, Mask);
16497
16498 // If we have lane crossing shuffles AND they don't all come from the lower
16499 // lane elements, lower to SHUFPD(VPERM2F128(V1, V2), VPERM2F128(V1, V2)).
16500 // TODO: Handle BUILD_VECTOR sources which getVectorShuffle currently
16501  // canonicalizes to a blend of splats, which isn't necessary for this combine.
16502 if (is128BitLaneCrossingShuffleMask(MVT::v4f64, Mask) &&
16503 !all_of(Mask, [](int M) { return M < 2 || (4 <= M && M < 6); }) &&
16504 (V1.getOpcode() != ISD::BUILD_VECTOR) &&
16505 (V2.getOpcode() != ISD::BUILD_VECTOR) &&
16506 (!Subtarget.hasAVX2() ||
16507 !((V1IsInPlace || V1IsSplat) && (V2IsInPlace || V2IsSplat))))
16508 return lowerShuffleAsLanePermuteAndSHUFP(DL, MVT::v4f64, V1, V2, Mask, DAG);
16509
16510 // If we have one input in place, then we can permute the other input and
16511 // blend the result.
16512 if (V1IsInPlace || V2IsInPlace)
16513 return lowerShuffleAsDecomposedShuffleMerge(DL, MVT::v4f64, V1, V2, Mask,
16514 Zeroable, Subtarget, DAG);
16515
16516 // Try to create an in-lane repeating shuffle mask and then shuffle the
16517 // results into the target lanes.
16518  if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
16519          DL, MVT::v4f64, V1, V2, Mask, Subtarget, DAG))
16520 return V;
16521
16522 // Try to simplify this by merging 128-bit lanes to enable a lane-based
16523 // shuffle. However, if we have AVX2 and either inputs are already in place,
16524 // we will be able to shuffle even across lanes the other input in a single
16525 // instruction so skip this pattern.
16526 if (!(Subtarget.hasAVX2() && (V1IsInPlace || V2IsInPlace)))
16527    if (SDValue V = lowerShuffleAsLanePermuteAndRepeatedMask(
16528            DL, MVT::v4f64, V1, V2, Mask, Subtarget, DAG))
16529 return V;
16530
16531 // If we have VLX support, we can use VEXPAND.
16532 if (Subtarget.hasVLX())
16533 if (SDValue V = lowerShuffleWithEXPAND(DL, MVT::v4f64, V1, V2, Mask,
16534 Zeroable, Subtarget, DAG))
16535 return V;
16536
16537  // If we have AVX2 then we always want to lower with a blend because at v4 we
16538 // can fully permute the elements.
16539 if (Subtarget.hasAVX2())
16540 return lowerShuffleAsDecomposedShuffleMerge(DL, MVT::v4f64, V1, V2, Mask,
16541 Zeroable, Subtarget, DAG);
16542
16543 // Otherwise fall back on generic lowering.
16544 return lowerShuffleAsSplitOrBlend(DL, MVT::v4f64, V1, V2, Mask, Zeroable,
16545 Subtarget, DAG);
16546}
16547
16548/// Handle lowering of 4-lane 64-bit integer shuffles.
16549///
16550/// This routine is only called when we have AVX2 and thus a reasonable
16551/// instruction set for v4i64 shuffling.
16552static SDValue lowerV4I64Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
16553                                 const APInt &Zeroable, SDValue V1, SDValue V2,
16554 const X86Subtarget &Subtarget,
16555 SelectionDAG &DAG) {
16556 assert(V1.getSimpleValueType() == MVT::v4i64 && "Bad operand type!");
16557 assert(V2.getSimpleValueType() == MVT::v4i64 && "Bad operand type!");
16558 assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!");
16559 assert(Subtarget.hasAVX2() && "We can only lower v4i64 with AVX2!");
16560
16561 if (SDValue V = lowerV2X128Shuffle(DL, MVT::v4i64, V1, V2, Mask, Zeroable,
16562 Subtarget, DAG))
16563 return V;
16564
16565 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v4i64, V1, V2, Mask,
16566 Zeroable, Subtarget, DAG))
16567 return Blend;
16568
16569 // Check for being able to broadcast a single element.
16570 if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v4i64, V1, V2, Mask,
16571 Subtarget, DAG))
16572 return Broadcast;
16573
16574 // Try to use shift instructions if fast.
16575 if (Subtarget.preferLowerShuffleAsShift())
16576 if (SDValue Shift =
16577 lowerShuffleAsShift(DL, MVT::v4i64, V1, V2, Mask, Zeroable,
16578 Subtarget, DAG, /*BitwiseOnly*/ true))
16579 return Shift;
16580
16581 if (V2.isUndef()) {
16582 // When the shuffle is mirrored between the 128-bit lanes of the unit, we
16583 // can use lower latency instructions that will operate on both lanes.
16584 SmallVector<int, 2> RepeatedMask;
16585 if (is128BitLaneRepeatedShuffleMask(MVT::v4i64, Mask, RepeatedMask)) {
16586 SmallVector<int, 4> PSHUFDMask;
16587 narrowShuffleMaskElts(2, RepeatedMask, PSHUFDMask);
16588 return DAG.getBitcast(
16589 MVT::v4i64,
16590 DAG.getNode(X86ISD::PSHUFD, DL, MVT::v8i32,
16591 DAG.getBitcast(MVT::v8i32, V1),
16592 getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG)));
16593 }
16594
16595 // AVX2 provides a direct instruction for permuting a single input across
16596 // lanes.
16597 return DAG.getNode(X86ISD::VPERMI, DL, MVT::v4i64, V1,
16598 getV4X86ShuffleImm8ForMask(Mask, DL, DAG));
16599 }
16600
16601 // Try to use shift instructions.
16602 if (SDValue Shift =
16603 lowerShuffleAsShift(DL, MVT::v4i64, V1, V2, Mask, Zeroable, Subtarget,
16604 DAG, /*BitwiseOnly*/ false))
16605 return Shift;
16606
16607 // If we have VLX support, we can use VALIGN or VEXPAND.
16608 if (Subtarget.hasVLX()) {
16609 if (SDValue Rotate = lowerShuffleAsVALIGN(DL, MVT::v4i64, V1, V2, Mask,
16610 Zeroable, Subtarget, DAG))
16611 return Rotate;
16612
16613 if (SDValue V = lowerShuffleWithEXPAND(DL, MVT::v4i64, V1, V2, Mask,
16614 Zeroable, Subtarget, DAG))
16615 return V;
16616 }
16617
16618 // Try to use PALIGNR.
16619 if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v4i64, V1, V2, Mask,
16620 Subtarget, DAG))
16621 return Rotate;
16622
16623 // Use dedicated unpack instructions for masks that match their pattern.
16624 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v4i64, V1, V2, Mask, DAG))
16625 return V;
16626
16627 bool V1IsInPlace = isShuffleMaskInputInPlace(0, Mask);
16628 bool V2IsInPlace = isShuffleMaskInputInPlace(1, Mask);
16629
16630 // If we have one input in place, then we can permute the other input and
16631 // blend the result.
16632 if (V1IsInPlace || V2IsInPlace)
16633 return lowerShuffleAsDecomposedShuffleMerge(DL, MVT::v4i64, V1, V2, Mask,
16634 Zeroable, Subtarget, DAG);
16635
16636 // Try to create an in-lane repeating shuffle mask and then shuffle the
16637 // results into the target lanes.
16638  if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
16639          DL, MVT::v4i64, V1, V2, Mask, Subtarget, DAG))
16640 return V;
16641
16642 // Try to lower to PERMQ(BLENDD(V1,V2)).
16643 if (SDValue V =
16644 lowerShuffleAsBlendAndPermute(DL, MVT::v4i64, V1, V2, Mask, DAG))
16645 return V;
16646
16647 // Try to simplify this by merging 128-bit lanes to enable a lane-based
16648 // shuffle. However, if we have AVX2 and either inputs are already in place,
16649 // we will be able to shuffle even across lanes the other input in a single
16650 // instruction so skip this pattern.
16651 if (!V1IsInPlace && !V2IsInPlace)
16652    if (SDValue Result = lowerShuffleAsLanePermuteAndRepeatedMask(
16653            DL, MVT::v4i64, V1, V2, Mask, Subtarget, DAG))
16654 return Result;
16655
16656 // Otherwise fall back on generic blend lowering.
16657 return lowerShuffleAsDecomposedShuffleMerge(DL, MVT::v4i64, V1, V2, Mask,
16658 Zeroable, Subtarget, DAG);
16659}
16660
16661/// Handle lowering of 8-lane 32-bit floating point shuffles.
16662///
16663/// Also ends up handling lowering of 8-lane 32-bit integer shuffles when AVX2
16664/// isn't available.
16665static SDValue lowerV8F32Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
16666                                 const APInt &Zeroable, SDValue V1, SDValue V2,
16667 const X86Subtarget &Subtarget,
16668 SelectionDAG &DAG) {
16669 assert(V1.getSimpleValueType() == MVT::v8f32 && "Bad operand type!");
16670 assert(V2.getSimpleValueType() == MVT::v8f32 && "Bad operand type!");
16671 assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!");
16672
16673 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v8f32, V1, V2, Mask,
16674 Zeroable, Subtarget, DAG))
16675 return Blend;
16676
16677 // Check for being able to broadcast a single element.
16678 if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v8f32, V1, V2, Mask,
16679 Subtarget, DAG))
16680 return Broadcast;
16681
16682 if (!Subtarget.hasAVX2()) {
16683 SmallVector<int> InLaneMask;
16684 computeInLaneShuffleMask(Mask, Mask.size() / 2, InLaneMask);
16685
16686 if (!is128BitLaneRepeatedShuffleMask(MVT::v8f32, InLaneMask))
16687 if (SDValue R = splitAndLowerShuffle(DL, MVT::v8f32, V1, V2, Mask, DAG,
16688 /*SimpleOnly*/ true))
16689 return R;
16690 }
16691 if (SDValue ZExt = lowerShuffleAsZeroOrAnyExtend(DL, MVT::v8i32, V1, V2, Mask,
16692 Zeroable, Subtarget, DAG))
16693 return DAG.getBitcast(MVT::v8f32, ZExt);
16694
16695 // If the shuffle mask is repeated in each 128-bit lane, we have many more
16696 // options to efficiently lower the shuffle.
16697 SmallVector<int, 4> RepeatedMask;
16698 if (is128BitLaneRepeatedShuffleMask(MVT::v8f32, Mask, RepeatedMask)) {
16699 assert(RepeatedMask.size() == 4 &&
16700 "Repeated masks must be half the mask width!");
16701
16702 // Use even/odd duplicate instructions for masks that match their pattern.
16703 if (isShuffleEquivalent(RepeatedMask, {0, 0, 2, 2}, V1, V2))
16704 return DAG.getNode(X86ISD::MOVSLDUP, DL, MVT::v8f32, V1);
16705 if (isShuffleEquivalent(RepeatedMask, {1, 1, 3, 3}, V1, V2))
16706 return DAG.getNode(X86ISD::MOVSHDUP, DL, MVT::v8f32, V1);
16707
16708 if (V2.isUndef())
16709 return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v8f32, V1,
16710 getV4X86ShuffleImm8ForMask(RepeatedMask, DL, DAG));
16711
16712 // Use dedicated unpack instructions for masks that match their pattern.
16713 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v8f32, V1, V2, Mask, DAG))
16714 return V;
16715
16716 // Otherwise, fall back to a SHUFPS sequence. Here it is important that we
16717 // have already handled any direct blends.
16718 return lowerShuffleWithSHUFPS(DL, MVT::v8f32, RepeatedMask, V1, V2, DAG);
16719 }
16720
16721 // Try to create an in-lane repeating shuffle mask and then shuffle the
16722 // results into the target lanes.
16723  if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
16724          DL, MVT::v8f32, V1, V2, Mask, Subtarget, DAG))
16725 return V;
16726
16727 // If we have a single input shuffle with different shuffle patterns in the
16728  // two 128-bit lanes, use a variable mask with VPERMILPS.
16729 if (V2.isUndef()) {
16730 if (!is128BitLaneCrossingShuffleMask(MVT::v8f32, Mask)) {
16731 SDValue VPermMask = getConstVector(Mask, MVT::v8i32, DAG, DL, true);
16732 return DAG.getNode(X86ISD::VPERMILPV, DL, MVT::v8f32, V1, VPermMask);
16733 }
16734 if (Subtarget.hasAVX2()) {
16735 SDValue VPermMask = getConstVector(Mask, MVT::v8i32, DAG, DL, true);
16736 return DAG.getNode(X86ISD::VPERMV, DL, MVT::v8f32, VPermMask, V1);
16737 }
16738 // Otherwise, fall back.
16739 return lowerShuffleAsLanePermuteAndShuffle(DL, MVT::v8f32, V1, V2, Mask,
16740 DAG, Subtarget);
16741 }
16742
16743 // Try to simplify this by merging 128-bit lanes to enable a lane-based
16744 // shuffle.
16745  if (SDValue Result = lowerShuffleAsLanePermuteAndRepeatedMask(
16746          DL, MVT::v8f32, V1, V2, Mask, Subtarget, DAG))
16747 return Result;
16748
16749 // If we have VLX support, we can use VEXPAND.
16750 if (Subtarget.hasVLX())
16751 if (SDValue V = lowerShuffleWithEXPAND(DL, MVT::v8f32, V1, V2, Mask,
16752 Zeroable, Subtarget, DAG))
16753 return V;
16754
16755 // Try to match an interleave of two v8f32s and lower them as unpck and
16756 // permutes using ymms. This needs to go before we try to split the vectors.
16757 // Don't attempt on AVX1 if we're likely to split vectors anyway.
16758 if ((Subtarget.hasAVX2() ||
16761 !Subtarget.hasAVX512())
16762 if (SDValue V = lowerShufflePairAsUNPCKAndPermute(DL, MVT::v8f32, V1, V2,
16763 Mask, DAG))
16764 return V;
16765
16766  // For non-AVX512, if the mask is of 16-bit elements in each lane then try to
16767  // split, since after the split we get more efficient code using vpunpcklwd
16768  // and vpunpckhwd than with vblend.
16769 if (!Subtarget.hasAVX512() && isUnpackWdShuffleMask(Mask, MVT::v8f32, DAG))
16770 return lowerShuffleAsSplitOrBlend(DL, MVT::v8f32, V1, V2, Mask, Zeroable,
16771 Subtarget, DAG);
16772
16773 // If we have AVX2 then we always want to lower with a blend because at v8 we
16774 // can fully permute the elements.
16775 if (Subtarget.hasAVX2())
16776 return lowerShuffleAsDecomposedShuffleMerge(DL, MVT::v8f32, V1, V2, Mask,
16777 Zeroable, Subtarget, DAG);
16778
16779 // Otherwise fall back on generic lowering.
16780 return lowerShuffleAsSplitOrBlend(DL, MVT::v8f32, V1, V2, Mask, Zeroable,
16781 Subtarget, DAG);
16782}
16783
16784/// Handle lowering of 8-lane 32-bit integer shuffles.
16785///
16786/// This routine is only called when we have AVX2 and thus a reasonable
16787/// instruction set for v8i32 shuffling.
16788static SDValue lowerV8I32Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
16789                                 const APInt &Zeroable, SDValue V1, SDValue V2,
16790 const X86Subtarget &Subtarget,
16791 SelectionDAG &DAG) {
16792 assert(V1.getSimpleValueType() == MVT::v8i32 && "Bad operand type!");
16793 assert(V2.getSimpleValueType() == MVT::v8i32 && "Bad operand type!");
16794 assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!");
16795 assert(Subtarget.hasAVX2() && "We can only lower v8i32 with AVX2!");
16796
16797 int NumV2Elements = count_if(Mask, [](int M) { return M >= 8; });
16798
16799 // Whenever we can lower this as a zext, that instruction is strictly faster
16800 // than any alternative. It also allows us to fold memory operands into the
16801 // shuffle in many cases.
16802 if (SDValue ZExt = lowerShuffleAsZeroOrAnyExtend(DL, MVT::v8i32, V1, V2, Mask,
16803 Zeroable, Subtarget, DAG))
16804 return ZExt;
16805
16806 // Try to match an interleave of two v8i32s and lower them as unpck and
16807 // permutes using ymms. This needs to go before we try to split the vectors.
16808 if (!Subtarget.hasAVX512())
16809 if (SDValue V = lowerShufflePairAsUNPCKAndPermute(DL, MVT::v8i32, V1, V2,
16810 Mask, DAG))
16811 return V;
16812
16813  // For non-AVX512, if the mask is of 16-bit elements in each lane then try to
16814  // split, since after the split we get more efficient code than vblend by
16815  // using vpunpcklwd and vpunpckhwd.
16816 if (isUnpackWdShuffleMask(Mask, MVT::v8i32, DAG) && !V2.isUndef() &&
16817 !Subtarget.hasAVX512())
16818 return lowerShuffleAsSplitOrBlend(DL, MVT::v8i32, V1, V2, Mask, Zeroable,
16819 Subtarget, DAG);
16820
16821 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v8i32, V1, V2, Mask,
16822 Zeroable, Subtarget, DAG))
16823 return Blend;
16824
16825 // Check for being able to broadcast a single element.
16826 if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v8i32, V1, V2, Mask,
16827 Subtarget, DAG))
16828 return Broadcast;
16829
16830 // Try to use shift instructions if fast.
16831 if (Subtarget.preferLowerShuffleAsShift()) {
16832 if (SDValue Shift =
16833 lowerShuffleAsShift(DL, MVT::v8i32, V1, V2, Mask, Zeroable,
16834 Subtarget, DAG, /*BitwiseOnly*/ true))
16835 return Shift;
16836 if (NumV2Elements == 0)
16837 if (SDValue Rotate =
16838 lowerShuffleAsBitRotate(DL, MVT::v8i32, V1, Mask, Subtarget, DAG))
16839 return Rotate;
16840 }
16841
16842 // If the shuffle mask is repeated in each 128-bit lane we can use more
16843 // efficient instructions that mirror the shuffles across the two 128-bit
16844 // lanes.
16845 SmallVector<int, 4> RepeatedMask;
16846 bool Is128BitLaneRepeatedShuffle =
16847 is128BitLaneRepeatedShuffleMask(MVT::v8i32, Mask, RepeatedMask);
16848 if (Is128BitLaneRepeatedShuffle) {
16849 assert(RepeatedMask.size() == 4 && "Unexpected repeated mask size!");
16850 if (V2.isUndef())
16851 return DAG.getNode(X86ISD::PSHUFD, DL, MVT::v8i32, V1,
16852 getV4X86ShuffleImm8ForMask(RepeatedMask, DL, DAG));
16853
16854 // Use dedicated unpack instructions for masks that match their pattern.
16855 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v8i32, V1, V2, Mask, DAG))
16856 return V;
16857 }
16858
16859 // Try to use shift instructions.
16860 if (SDValue Shift =
16861 lowerShuffleAsShift(DL, MVT::v8i32, V1, V2, Mask, Zeroable, Subtarget,
16862 DAG, /*BitwiseOnly*/ false))
16863 return Shift;
16864
16865 if (!Subtarget.preferLowerShuffleAsShift() && NumV2Elements == 0)
16866 if (SDValue Rotate =
16867 lowerShuffleAsBitRotate(DL, MVT::v8i32, V1, Mask, Subtarget, DAG))
16868 return Rotate;
16869
16870 // If we have VLX support, we can use VALIGN or EXPAND.
16871 if (Subtarget.hasVLX()) {
16872 if (SDValue Rotate = lowerShuffleAsVALIGN(DL, MVT::v8i32, V1, V2, Mask,
16873 Zeroable, Subtarget, DAG))
16874 return Rotate;
16875
16876 if (SDValue V = lowerShuffleWithEXPAND(DL, MVT::v8i32, V1, V2, Mask,
16877 Zeroable, Subtarget, DAG))
16878 return V;
16879 }
16880
16881 // Try to use byte rotation instructions.
16882 if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v8i32, V1, V2, Mask,
16883 Subtarget, DAG))
16884 return Rotate;
16885
16886 // Try to create an in-lane repeating shuffle mask and then shuffle the
16887 // results into the target lanes.
16888  if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
16889          DL, MVT::v8i32, V1, V2, Mask, Subtarget, DAG))
16890 return V;
16891
16892 if (V2.isUndef()) {
16893 // Try to produce a fixed cross-128-bit lane permute followed by unpack
16894 // because that should be faster than the variable permute alternatives.
16895 if (SDValue V = lowerShuffleWithUNPCK256(DL, MVT::v8i32, V1, V2, Mask, DAG))
16896 return V;
16897
16898 // If the shuffle patterns aren't repeated but it's a single input, directly
16899 // generate a cross-lane VPERMD instruction.
16900 SDValue VPermMask = getConstVector(Mask, MVT::v8i32, DAG, DL, true);
16901 return DAG.getNode(X86ISD::VPERMV, DL, MVT::v8i32, VPermMask, V1);
16902 }
16903
16904 // Assume that a single SHUFPS is faster than an alternative sequence of
16905 // multiple instructions (even if the CPU has a domain penalty).
16906 // If some CPU is harmed by the domain switch, we can fix it in a later pass.
16907 if (Is128BitLaneRepeatedShuffle && isSingleSHUFPSMask(RepeatedMask)) {
16908 SDValue CastV1 = DAG.getBitcast(MVT::v8f32, V1);
16909 SDValue CastV2 = DAG.getBitcast(MVT::v8f32, V2);
16910 SDValue ShufPS = lowerShuffleWithSHUFPS(DL, MVT::v8f32, RepeatedMask,
16911 CastV1, CastV2, DAG);
16912 return DAG.getBitcast(MVT::v8i32, ShufPS);
16913 }
16914
16915 // Try to simplify this by merging 128-bit lanes to enable a lane-based
16916 // shuffle.
16917 if (SDValue Result = lowerShuffleAsLanePermuteAndRepeatedMask(
16918 DL, MVT::v8i32, V1, V2, Mask, Subtarget, DAG))
16919 return Result;
16920
16921 // Otherwise fall back on generic blend lowering.
16922 return lowerShuffleAsDecomposedShuffleMerge(DL, MVT::v8i32, V1, V2, Mask,
16923 Zeroable, Subtarget, DAG);
16924}
16925
16926/// Handle lowering of 16-lane 16-bit integer shuffles.
16927///
16928/// This routine is only called when we have AVX2 and thus a reasonable
16929 /// instruction set for v16i16 shuffling.
16930 static SDValue lowerV16I16Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
16931 const APInt &Zeroable, SDValue V1, SDValue V2,
16932 const X86Subtarget &Subtarget,
16933 SelectionDAG &DAG) {
16934 assert(V1.getSimpleValueType() == MVT::v16i16 && "Bad operand type!");
16935 assert(V2.getSimpleValueType() == MVT::v16i16 && "Bad operand type!");
16936 assert(Mask.size() == 16 && "Unexpected mask size for v16 shuffle!");
16937 assert(Subtarget.hasAVX2() && "We can only lower v16i16 with AVX2!");
16938
16939 // Whenever we can lower this as a zext, that instruction is strictly faster
16940 // than any alternative. It also allows us to fold memory operands into the
16941 // shuffle in many cases.
16942 if (SDValue ZExt = lowerShuffleAsZeroOrAnyExtend(
16943 DL, MVT::v16i16, V1, V2, Mask, Zeroable, Subtarget, DAG))
16944 return ZExt;
16945
16946 // Check for being able to broadcast a single element.
16947 if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v16i16, V1, V2, Mask,
16948 Subtarget, DAG))
16949 return Broadcast;
16950
16951 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v16i16, V1, V2, Mask,
16952 Zeroable, Subtarget, DAG))
16953 return Blend;
16954
16955 // Use dedicated unpack instructions for masks that match their pattern.
16956 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v16i16, V1, V2, Mask, DAG))
16957 return V;
16958
16959 // Use dedicated pack instructions for masks that match their pattern.
16960 if (SDValue V =
16961 lowerShuffleWithPACK(DL, MVT::v16i16, V1, V2, Mask, Subtarget, DAG))
16962 return V;
16963
16964 // Try to lower using a truncation.
16965 if (SDValue V = lowerShuffleAsVTRUNC(DL, MVT::v16i16, V1, V2, Mask, Zeroable,
16966 Subtarget, DAG))
16967 return V;
16968
16969 // Try to use shift instructions.
16970 if (SDValue Shift =
16971 lowerShuffleAsShift(DL, MVT::v16i16, V1, V2, Mask, Zeroable,
16972 Subtarget, DAG, /*BitwiseOnly*/ false))
16973 return Shift;
16974
16975 // Try to use byte rotation instructions.
16976 if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v16i16, V1, V2, Mask,
16977 Subtarget, DAG))
16978 return Rotate;
16979
16980 // Try to create an in-lane repeating shuffle mask and then shuffle the
16981 // results into the target lanes.
16982 if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
16983 DL, MVT::v16i16, V1, V2, Mask, Subtarget, DAG))
16984 return V;
16985
16986 if (V2.isUndef()) {
16987 // Try to use bit rotation instructions.
16988 if (SDValue Rotate =
16989 lowerShuffleAsBitRotate(DL, MVT::v16i16, V1, Mask, Subtarget, DAG))
16990 return Rotate;
16991
16992 // Try to produce a fixed cross-128-bit lane permute followed by unpack
16993 // because that should be faster than the variable permute alternatives.
16994 if (SDValue V = lowerShuffleWithUNPCK256(DL, MVT::v16i16, V1, V2, Mask, DAG))
16995 return V;
16996
16997 // There are no generalized cross-lane shuffle operations available on i16
16998 // element types.
16999 if (is128BitLaneCrossingShuffleMask(MVT::v16i16, Mask)) {
17000 if (SDValue V = lowerShuffleAsLanePermuteAndPermute(
17001 DL, MVT::v16i16, V1, V2, Mask, DAG, Subtarget))
17002 return V;
17003
17004 return lowerShuffleAsLanePermuteAndShuffle(DL, MVT::v16i16, V1, V2, Mask,
17005 DAG, Subtarget);
17006 }
17007
17008 SmallVector<int, 8> RepeatedMask;
17009 if (is128BitLaneRepeatedShuffleMask(MVT::v16i16, Mask, RepeatedMask)) {
17010 // As this is a single-input shuffle, the repeated mask should be
17011 // a strictly valid v8i16 mask that we can pass through to the v8i16
17012 // lowering to handle even the v16 case.
17013 return lowerV8I16GeneralSingleInputShuffle(
17014 DL, MVT::v16i16, V1, RepeatedMask, Subtarget, DAG);
17015 }
17016 }
17017
17018 if (SDValue PSHUFB = lowerShuffleWithPSHUFB(DL, MVT::v16i16, Mask, V1, V2,
17019 Zeroable, Subtarget, DAG))
17020 return PSHUFB;
17021
17022 // AVX512BW can lower to VPERMW (non-VLX will pad to v32i16).
17023 if (Subtarget.hasBWI())
17024 return lowerShuffleWithPERMV(DL, MVT::v16i16, Mask, V1, V2, Subtarget, DAG);
17025
17026 // Try to simplify this by merging 128-bit lanes to enable a lane-based
17027 // shuffle.
17028 if (SDValue Result = lowerShuffleAsLanePermuteAndRepeatedMask(
17029 DL, MVT::v16i16, V1, V2, Mask, Subtarget, DAG))
17030 return Result;
17031
17032 // Try to permute the lanes and then use a per-lane permute.
17033 if (SDValue V = lowerShuffleAsLanePermuteAndPermute(
17034 DL, MVT::v16i16, V1, V2, Mask, DAG, Subtarget))
17035 return V;
17036
17037 // Try to match an interleave of two v16i16s and lower them as unpck and
17038 // permutes using ymms.
17039 if (!Subtarget.hasAVX512())
17040 if (SDValue V = lowerShufflePairAsUNPCKAndPermute(DL, MVT::v16i16, V1, V2,
17041 Mask, DAG))
17042 return V;
17043
17044 // Otherwise fall back on generic lowering.
17045 return lowerShuffleAsSplitOrBlend(DL, MVT::v16i16, V1, V2, Mask, Zeroable,
17046 Subtarget, DAG);
17047}
17048
17049/// Handle lowering of 32-lane 8-bit integer shuffles.
17050///
17051/// This routine is only called when we have AVX2 and thus a reasonable
17052 /// instruction set for v32i8 shuffling.
17053 static SDValue lowerV32I8Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
17054 const APInt &Zeroable, SDValue V1, SDValue V2,
17055 const X86Subtarget &Subtarget,
17056 SelectionDAG &DAG) {
17057 assert(V1.getSimpleValueType() == MVT::v32i8 && "Bad operand type!");
17058 assert(V2.getSimpleValueType() == MVT::v32i8 && "Bad operand type!");
17059 assert(Mask.size() == 32 && "Unexpected mask size for v32 shuffle!");
17060 assert(Subtarget.hasAVX2() && "We can only lower v32i8 with AVX2!");
17061
17062 // Whenever we can lower this as a zext, that instruction is strictly faster
17063 // than any alternative. It also allows us to fold memory operands into the
17064 // shuffle in many cases.
17065 if (SDValue ZExt = lowerShuffleAsZeroOrAnyExtend(DL, MVT::v32i8, V1, V2, Mask,
17066 Zeroable, Subtarget, DAG))
17067 return ZExt;
17068
17069 // Check for being able to broadcast a single element.
17070 if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v32i8, V1, V2, Mask,
17071 Subtarget, DAG))
17072 return Broadcast;
17073
17074 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v32i8, V1, V2, Mask,
17075 Zeroable, Subtarget, DAG))
17076 return Blend;
17077
17078 // Use dedicated unpack instructions for masks that match their pattern.
17079 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v32i8, V1, V2, Mask, DAG))
17080 return V;
17081
17082 // Use dedicated pack instructions for masks that match their pattern.
17083 if (SDValue V =
17084 lowerShuffleWithPACK(DL, MVT::v32i8, V1, V2, Mask, Subtarget, DAG))
17085 return V;
17086
17087 // Try to lower using a truncation.
17088 if (SDValue V = lowerShuffleAsVTRUNC(DL, MVT::v32i8, V1, V2, Mask, Zeroable,
17089 Subtarget, DAG))
17090 return V;
17091
17092 // Try to use shift instructions.
17093 if (SDValue Shift =
17094 lowerShuffleAsShift(DL, MVT::v32i8, V1, V2, Mask, Zeroable, Subtarget,
17095 DAG, /*BitwiseOnly*/ false))
17096 return Shift;
17097
17098 // Try to use byte rotation instructions.
17099 if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v32i8, V1, V2, Mask,
17100 Subtarget, DAG))
17101 return Rotate;
17102
17103 // Try to use bit rotation instructions.
17104 if (V2.isUndef())
17105 if (SDValue Rotate =
17106 lowerShuffleAsBitRotate(DL, MVT::v32i8, V1, Mask, Subtarget, DAG))
17107 return Rotate;
17108
17109 // Try to create an in-lane repeating shuffle mask and then shuffle the
17110 // results into the target lanes.
17111 if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
17112 DL, MVT::v32i8, V1, V2, Mask, Subtarget, DAG))
17113 return V;
17114
17115 // There are no generalized cross-lane shuffle operations available on i8
17116 // element types.
17117 if (V2.isUndef() && is128BitLaneCrossingShuffleMask(MVT::v32i8, Mask)) {
17118 // Try to produce a fixed cross-128-bit lane permute followed by unpack
17119 // because that should be faster than the variable permute alternatives.
17120 if (SDValue V = lowerShuffleWithUNPCK256(DL, MVT::v32i8, V1, V2, Mask, DAG))
17121 return V;
17122
17123 if (SDValue V = lowerShuffleAsLanePermuteAndPermute(
17124 DL, MVT::v32i8, V1, V2, Mask, DAG, Subtarget))
17125 return V;
17126
17127 return lowerShuffleAsLanePermuteAndShuffle(DL, MVT::v32i8, V1, V2, Mask,
17128 DAG, Subtarget);
17129 }
17130
17131 if (SDValue PSHUFB = lowerShuffleWithPSHUFB(DL, MVT::v32i8, Mask, V1, V2,
17132 Zeroable, Subtarget, DAG))
17133 return PSHUFB;
17134
17135 // AVX512VBMI can lower to VPERMB (non-VLX will pad to v64i8).
17136 if (Subtarget.hasVBMI())
17137 return lowerShuffleWithPERMV(DL, MVT::v32i8, Mask, V1, V2, Subtarget, DAG);
17138
17139 // Try to simplify this by merging 128-bit lanes to enable a lane-based
17140 // shuffle.
17141 if (SDValue Result = lowerShuffleAsLanePermuteAndRepeatedMask(
17142 DL, MVT::v32i8, V1, V2, Mask, Subtarget, DAG))
17143 return Result;
17144
17145 // Try to permute the lanes and then use a per-lane permute.
17146 if (SDValue V = lowerShuffleAsLanePermuteAndPermute(
17147 DL, MVT::v32i8, V1, V2, Mask, DAG, Subtarget))
17148 return V;
17149
17150 // Look for {0, 8, 16, 24, 32, 40, 48, 56} in the first 8 elements, followed
17151 // by zeroable elements in the remaining 24 elements. Turn this into two
17152 // vmovqb instructions shuffled together.
17153 if (Subtarget.hasVLX())
17154 if (SDValue V = lowerShuffleAsVTRUNCAndUnpack(DL, MVT::v32i8, V1, V2,
17155 Mask, Zeroable, DAG))
17156 return V;
17157
17158 // Try to match an interleave of two v32i8s and lower them as unpck and
17159 // permutes using ymms.
17160 if (!Subtarget.hasAVX512())
17161 if (SDValue V = lowerShufflePairAsUNPCKAndPermute(DL, MVT::v32i8, V1, V2,
17162 Mask, DAG))
17163 return V;
17164
17165 // Otherwise fall back on generic lowering.
17166 return lowerShuffleAsSplitOrBlend(DL, MVT::v32i8, V1, V2, Mask, Zeroable,
17167 Subtarget, DAG);
17168}
17169
17170/// High-level routine to lower various 256-bit x86 vector shuffles.
17171///
17172/// This routine either breaks down the specific type of a 256-bit x86 vector
17173/// shuffle or splits it into two 128-bit shuffles and fuses the results back
17174/// together based on the available instructions.
17175 static SDValue lower256BitShuffle(const SDLoc &DL, ArrayRef<int> Mask, MVT VT,
17176 SDValue V1, SDValue V2, const APInt &Zeroable,
17177 const X86Subtarget &Subtarget,
17178 SelectionDAG &DAG) {
17179 // If we have a single input to the zero element, insert that into V1 if we
17180 // can do so cheaply.
17181 int NumElts = VT.getVectorNumElements();
17182 int NumV2Elements = count_if(Mask, [NumElts](int M) { return M >= NumElts; });
17183
17184 if (NumV2Elements == 1 && Mask[0] >= NumElts)
17185 if (SDValue Insertion = lowerShuffleAsElementInsertion(
17186 DL, VT, V1, V2, Mask, Zeroable, Subtarget, DAG))
17187 return Insertion;
17188
17189 // Handle special cases where the lower or upper half is UNDEF.
17190 if (SDValue V =
17191 lowerShuffleWithUndefHalf(DL, VT, V1, V2, Mask, Subtarget, DAG))
17192 return V;
17193
17194 // There is a really nice hard cut-over between AVX1 and AVX2 that means we
17195 // can check for those subtargets here and avoid much of the subtarget
17196 // querying in the per-vector-type lowering routines. With AVX1 we have
17197 // essentially *zero* ability to manipulate a 256-bit vector with integer
17198 // types. Since we'll use floating point types there eventually, just
17199 // immediately cast everything to a float and operate entirely in that domain.
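// (For 32-bit or wider elements this simply means re-lowering, e.g., a v8i32
// shuffle as the equivalent v8f32 shuffle via bitcasts.)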
17200 if (VT.isInteger() && !Subtarget.hasAVX2()) {
17201 int ElementBits = VT.getScalarSizeInBits();
17202 if (ElementBits < 32) {
17203 // No floating point type available, if we can't use the bit operations
17204 // for masking/blending then decompose into 128-bit vectors.
17205 if (SDValue V = lowerShuffleAsBitMask(DL, VT, V1, V2, Mask, Zeroable,
17206 Subtarget, DAG))
17207 return V;
17208 if (SDValue V = lowerShuffleAsBitBlend(DL, VT, V1, V2, Mask, DAG))
17209 return V;
17210 return splitAndLowerShuffle(DL, VT, V1, V2, Mask, DAG, /*SimpleOnly*/ false);
17211 }
17212
17213 MVT FpVT = MVT::getVectorVT(MVT::getFloatingPointVT(ElementBits),
17214 VT.getVectorNumElements());
17215 V1 = DAG.getBitcast(FpVT, V1);
17216 V2 = DAG.getBitcast(FpVT, V2);
17217 return DAG.getBitcast(VT, DAG.getVectorShuffle(FpVT, DL, V1, V2, Mask));
17218 }
17219
17220 if (VT == MVT::v16f16 || VT == MVT::v16bf16) {
17221 V1 = DAG.getBitcast(MVT::v16i16, V1);
17222 V2 = DAG.getBitcast(MVT::v16i16, V2);
17223 return DAG.getBitcast(VT,
17224 DAG.getVectorShuffle(MVT::v16i16, DL, V1, V2, Mask));
17225 }
17226
17227 switch (VT.SimpleTy) {
17228 case MVT::v4f64:
17229 return lowerV4F64Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
17230 case MVT::v4i64:
17231 return lowerV4I64Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
17232 case MVT::v8f32:
17233 return lowerV8F32Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
17234 case MVT::v8i32:
17235 return lowerV8I32Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
17236 case MVT::v16i16:
17237 return lowerV16I16Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
17238 case MVT::v32i8:
17239 return lowerV32I8Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
17240
17241 default:
17242 llvm_unreachable("Not a valid 256-bit x86 vector type!");
17243 }
17244}
17245
17246 /// Try to lower a vector shuffle as a series of 128-bit shuffles.
17247 static SDValue lowerV4X128Shuffle(const SDLoc &DL, MVT VT, ArrayRef<int> Mask,
17248 const APInt &Zeroable, SDValue V1, SDValue V2,
17249 const X86Subtarget &Subtarget,
17250 SelectionDAG &DAG) {
17251 assert(VT.getScalarSizeInBits() == 64 &&
17252 "Unexpected element type size for 128bit shuffle.");
17253
17254 // Handling a 256-bit vector would require VLX, and lowerV2X128VectorShuffle()
17255 // is most probably the better solution for that case.
17256 assert(VT.is512BitVector() && "Unexpected vector size for 512bit shuffle.");
17257
17258 // TODO - use Zeroable like we do for lowerV2X128VectorShuffle?
17259 SmallVector<int, 4> Widened128Mask;
17260 if (!canWidenShuffleElements(Mask, Widened128Mask))
17261 return SDValue();
17262 assert(Widened128Mask.size() == 4 && "Shuffle widening mismatch");
17263
17264 // Try to use an insert into a zero vector.
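// Zeroable has one bit per 64-bit element here: 0xf0 means the upper 256 bits
// are zeroable, and if 0x0c (elements 2-3) is zeroable as well we only need to
// insert a 128-bit subvector.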
17265 if (Widened128Mask[0] == 0 && (Zeroable & 0xf0) == 0xf0 &&
17266 (Widened128Mask[1] == 1 || (Zeroable & 0x0c) == 0x0c)) {
17267 unsigned NumElts = ((Zeroable & 0x0c) == 0x0c) ? 2 : 4;
17268 MVT SubVT = MVT::getVectorVT(VT.getVectorElementType(), NumElts);
17269 SDValue LoV = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT, V1,
17270 DAG.getVectorIdxConstant(0, DL));
17271 return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT,
17272 getZeroVector(VT, Subtarget, DAG, DL), LoV,
17273 DAG.getVectorIdxConstant(0, DL));
17274 }
17275
17276 // Check for patterns which can be matched with a single insert of a 256-bit
17277 // subvector.
17278 bool OnlyUsesV1 = isShuffleEquivalent(Mask, {0, 1, 2, 3, 0, 1, 2, 3}, V1, V2);
17279 if (OnlyUsesV1 ||
17280 isShuffleEquivalent(Mask, {0, 1, 2, 3, 8, 9, 10, 11}, V1, V2)) {
17281 MVT SubVT = MVT::getVectorVT(VT.getVectorElementType(), 4);
17282 SDValue SubVec =
17283 DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT, OnlyUsesV1 ? V1 : V2,
17284 DAG.getVectorIdxConstant(0, DL));
17285 return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, V1, SubVec,
17286 DAG.getVectorIdxConstant(4, DL));
17287 }
17288
17289 // See if this is an insertion of the lower 128-bits of V2 into V1.
17290 bool IsInsert = true;
17291 int V2Index = -1;
17292 for (int i = 0; i < 4; ++i) {
17293 assert(Widened128Mask[i] >= -1 && "Illegal shuffle sentinel value");
17294 if (Widened128Mask[i] < 0)
17295 continue;
17296
17297 // Make sure all V1 subvectors are in place.
17298 if (Widened128Mask[i] < 4) {
17299 if (Widened128Mask[i] != i) {
17300 IsInsert = false;
17301 break;
17302 }
17303 } else {
17304 // Make sure we only have a single V2 index and it's the lowest 128 bits.
17305 if (V2Index >= 0 || Widened128Mask[i] != 4) {
17306 IsInsert = false;
17307 break;
17308 }
17309 V2Index = i;
17310 }
17311 }
17312 if (IsInsert && V2Index >= 0) {
17313 MVT SubVT = MVT::getVectorVT(VT.getVectorElementType(), 2);
17314 SDValue Subvec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT, V2,
17315 DAG.getVectorIdxConstant(0, DL));
17316 return insert128BitVector(V1, Subvec, V2Index * 2, DAG, DL);
17317 }
17318
17319 // See if we can widen to a 256-bit lane shuffle; we're going to lose 128-lane
17320 // UNDEF info by lowering to X86ISD::SHUF128 anyway, so by widening where
17321 // possible we at least ensure the lanes stay sequential to help later
17322 // combines.
17323 SmallVector<int, 2> Widened256Mask;
17324 if (canWidenShuffleElements(Widened128Mask, Widened256Mask)) {
17325 Widened128Mask.clear();
17326 narrowShuffleMaskElts(2, Widened256Mask, Widened128Mask);
17327 }
17328
17329 // Try to lower to vshuf64x2/vshuf32x4.
17330 SDValue Ops[2] = {DAG.getUNDEF(VT), DAG.getUNDEF(VT)};
17331 int PermMask[4] = {-1, -1, -1, -1};
17332 // Ensure elements came from the same Op.
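// SHUF128 builds the two low 128-bit lanes of the result from its first source
// and the two high lanes from its second, so each half of PermMask must be
// satisfied by a single op.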
17333 for (int i = 0; i < 4; ++i) {
17334 assert(Widened128Mask[i] >= -1 && "Illegal shuffle sentinel value");
17335 if (Widened128Mask[i] < 0)
17336 continue;
17337
17338 SDValue Op = Widened128Mask[i] >= 4 ? V2 : V1;
17339 unsigned OpIndex = i / 2;
17340 if (Ops[OpIndex].isUndef())
17341 Ops[OpIndex] = Op;
17342 else if (Ops[OpIndex] != Op)
17343 return SDValue();
17344
17345 PermMask[i] = Widened128Mask[i] % 4;
17346 }
17347
17348 return DAG.getNode(X86ISD::SHUF128, DL, VT, Ops[0], Ops[1],
17349 getV4X86ShuffleImm8ForMask(PermMask, DL, DAG));
17350}
17351
17352/// Handle lowering of 8-lane 64-bit floating point shuffles.
17353 static SDValue lowerV8F64Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
17354 const APInt &Zeroable, SDValue V1, SDValue V2,
17355 const X86Subtarget &Subtarget,
17356 SelectionDAG &DAG) {
17357 assert(V1.getSimpleValueType() == MVT::v8f64 && "Bad operand type!");
17358 assert(V2.getSimpleValueType() == MVT::v8f64 && "Bad operand type!");
17359 assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!");
17360
17361 if (V2.isUndef()) {
17362 // Use low duplicate instructions for masks that match their pattern.
17363 if (isShuffleEquivalent(Mask, {0, 0, 2, 2, 4, 4, 6, 6}, V1, V2))
17364 return DAG.getNode(X86ISD::MOVDDUP, DL, MVT::v8f64, V1);
17365
17366 if (!is128BitLaneCrossingShuffleMask(MVT::v8f64, Mask)) {
17367 // Non-half-crossing single input shuffles can be lowered with an
17368 // interleaved permutation.
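// (VPERMILPD uses one immediate bit per element, selecting the even (0) or odd
// (1) double within that element's 128-bit lane; e.g. the identity mask
// <0,1,2,3,4,5,6,7> encodes as 0b10101010.)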
17369 unsigned VPERMILPMask = (Mask[0] == 1) | ((Mask[1] == 1) << 1) |
17370 ((Mask[2] == 3) << 2) | ((Mask[3] == 3) << 3) |
17371 ((Mask[4] == 5) << 4) | ((Mask[5] == 5) << 5) |
17372 ((Mask[6] == 7) << 6) | ((Mask[7] == 7) << 7);
17373 return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v8f64, V1,
17374 DAG.getTargetConstant(VPERMILPMask, DL, MVT::i8));
17375 }
17376
17377 SmallVector<int, 4> RepeatedMask;
17378 if (is256BitLaneRepeatedShuffleMask(MVT::v8f64, Mask, RepeatedMask))
17379 return DAG.getNode(X86ISD::VPERMI, DL, MVT::v8f64, V1,
17380 getV4X86ShuffleImm8ForMask(RepeatedMask, DL, DAG));
17381 }
17382
17383 if (SDValue Shuf128 = lowerV4X128Shuffle(DL, MVT::v8f64, Mask, Zeroable, V1,
17384 V2, Subtarget, DAG))
17385 return Shuf128;
17386
17387 if (SDValue Unpck = lowerShuffleWithUNPCK(DL, MVT::v8f64, V1, V2, Mask, DAG))
17388 return Unpck;
17389
17390 // Check if the blend happens to exactly fit that of SHUFPD.
17391 if (SDValue Op = lowerShuffleWithSHUFPD(DL, MVT::v8f64, V1, V2, Mask,
17392 Zeroable, Subtarget, DAG))
17393 return Op;
17394
17395 if (SDValue V = lowerShuffleWithEXPAND(DL, MVT::v8f64, V1, V2, Mask, Zeroable,
17396 Subtarget, DAG))
17397 return V;
17398
17399 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v8f64, V1, V2, Mask,
17400 Zeroable, Subtarget, DAG))
17401 return Blend;
17402
17403 return lowerShuffleWithPERMV(DL, MVT::v8f64, Mask, V1, V2, Subtarget, DAG);
17404}
17405
17406/// Handle lowering of 16-lane 32-bit floating point shuffles.
17407 static SDValue lowerV16F32Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
17408 const APInt &Zeroable, SDValue V1, SDValue V2,
17409 const X86Subtarget &Subtarget,
17410 SelectionDAG &DAG) {
17411 assert(V1.getSimpleValueType() == MVT::v16f32 && "Bad operand type!");
17412 assert(V2.getSimpleValueType() == MVT::v16f32 && "Bad operand type!");
17413 assert(Mask.size() == 16 && "Unexpected mask size for v16 shuffle!");
17414
17415 // If the shuffle mask is repeated in each 128-bit lane, we have many more
17416 // options to efficiently lower the shuffle.
17417 SmallVector<int, 4> RepeatedMask;
17418 if (is128BitLaneRepeatedShuffleMask(MVT::v16f32, Mask, RepeatedMask)) {
17419 assert(RepeatedMask.size() == 4 && "Unexpected repeated mask size!");
17420
17421 // Use even/odd duplicate instructions for masks that match their pattern.
17422 if (isShuffleEquivalent(RepeatedMask, {0, 0, 2, 2}, V1, V2))
17423 return DAG.getNode(X86ISD::MOVSLDUP, DL, MVT::v16f32, V1);
17424 if (isShuffleEquivalent(RepeatedMask, {1, 1, 3, 3}, V1, V2))
17425 return DAG.getNode(X86ISD::MOVSHDUP, DL, MVT::v16f32, V1);
17426
17427 if (V2.isUndef())
17428 return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v16f32, V1,
17429 getV4X86ShuffleImm8ForMask(RepeatedMask, DL, DAG));
17430
17431 // Use dedicated unpack instructions for masks that match their pattern.
17432 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v16f32, V1, V2, Mask, DAG))
17433 return V;
17434
17435 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v16f32, V1, V2, Mask,
17436 Zeroable, Subtarget, DAG))
17437 return Blend;
17438
17439 // Otherwise, fall back to a SHUFPS sequence.
17440 return lowerShuffleWithSHUFPS(DL, MVT::v16f32, RepeatedMask, V1, V2, DAG);
17441 }
17442
17443 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v16f32, V1, V2, Mask,
17444 Zeroable, Subtarget, DAG))
17445 return Blend;
17446
17447 if (SDValue ZExt = lowerShuffleAsZeroOrAnyExtend(
17448 DL, MVT::v16i32, V1, V2, Mask, Zeroable, Subtarget, DAG))
17449 return DAG.getBitcast(MVT::v16f32, ZExt);
17450
17451 // Try to create an in-lane repeating shuffle mask and then shuffle the
17452 // results into the target lanes.
17453 if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
17454 DL, MVT::v16f32, V1, V2, Mask, Subtarget, DAG))
17455 return V;
17456
17457 // If we have a single input shuffle with different shuffle patterns in the
17458 // 128-bit lanes and don't lane cross, use variable mask VPERMILPS.
17459 if (V2.isUndef() &&
17460 !is128BitLaneCrossingShuffleMask(MVT::v16f32, Mask)) {
17461 SDValue VPermMask = getConstVector(Mask, MVT::v16i32, DAG, DL, true);
17462 return DAG.getNode(X86ISD::VPERMILPV, DL, MVT::v16f32, V1, VPermMask);
17463 }
17464
17465 // If we have AVX512F support, we can use VEXPAND.
17466 if (SDValue V = lowerShuffleWithEXPAND(DL, MVT::v16f32, V1, V2, Mask,
17467 Zeroable, Subtarget, DAG))
17468 return V;
17469
17470 return lowerShuffleWithPERMV(DL, MVT::v16f32, Mask, V1, V2, Subtarget, DAG);
17471}
17472
17473/// Handle lowering of 8-lane 64-bit integer shuffles.
17474 static SDValue lowerV8I64Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
17475 const APInt &Zeroable, SDValue V1, SDValue V2,
17476 const X86Subtarget &Subtarget,
17477 SelectionDAG &DAG) {
17478 assert(V1.getSimpleValueType() == MVT::v8i64 && "Bad operand type!");
17479 assert(V2.getSimpleValueType() == MVT::v8i64 && "Bad operand type!");
17480 assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!");
17481
17482 // Try to use shift instructions if fast.
17483 if (Subtarget.preferLowerShuffleAsShift())
17484 if (SDValue Shift =
17485 lowerShuffleAsShift(DL, MVT::v8i64, V1, V2, Mask, Zeroable,
17486 Subtarget, DAG, /*BitwiseOnly*/ true))
17487 return Shift;
17488
17489 if (V2.isUndef()) {
17490 // When the shuffle is mirrored between the 128-bit lanes of the unit, we
17491 // can use lower latency instructions that will operate on all four
17492 // 128-bit lanes.
17493 SmallVector<int, 2> Repeated128Mask;
17494 if (is128BitLaneRepeatedShuffleMask(MVT::v8i64, Mask, Repeated128Mask)) {
17495 SmallVector<int, 4> PSHUFDMask;
17496 narrowShuffleMaskElts(2, Repeated128Mask, PSHUFDMask);
17497 return DAG.getBitcast(
17498 MVT::v8i64,
17499 DAG.getNode(X86ISD::PSHUFD, DL, MVT::v16i32,
17500 DAG.getBitcast(MVT::v16i32, V1),
17501 getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG)));
17502 }
17503
17504 SmallVector<int, 4> Repeated256Mask;
17505 if (is256BitLaneRepeatedShuffleMask(MVT::v8i64, Mask, Repeated256Mask))
17506 return DAG.getNode(X86ISD::VPERMI, DL, MVT::v8i64, V1,
17507 getV4X86ShuffleImm8ForMask(Repeated256Mask, DL, DAG));
17508 }
17509
17510 if (SDValue Shuf128 = lowerV4X128Shuffle(DL, MVT::v8i64, Mask, Zeroable, V1,
17511 V2, Subtarget, DAG))
17512 return Shuf128;
17513
17514 // Try to use shift instructions.
17515 if (SDValue Shift =
17516 lowerShuffleAsShift(DL, MVT::v8i64, V1, V2, Mask, Zeroable, Subtarget,
17517 DAG, /*BitwiseOnly*/ false))
17518 return Shift;
17519
17520 // Try to use VALIGN.
17521 if (SDValue Rotate = lowerShuffleAsVALIGN(DL, MVT::v8i64, V1, V2, Mask,
17522 Zeroable, Subtarget, DAG))
17523 return Rotate;
17524
17525 // Try to use PALIGNR.
17526 if (Subtarget.hasBWI())
17527 if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v8i64, V1, V2, Mask,
17528 Subtarget, DAG))
17529 return Rotate;
17530
17531 if (SDValue Unpck = lowerShuffleWithUNPCK(DL, MVT::v8i64, V1, V2, Mask, DAG))
17532 return Unpck;
17533
17534 // If we have AVX512F support, we can use VEXPAND.
17535 if (SDValue V = lowerShuffleWithEXPAND(DL, MVT::v8i64, V1, V2, Mask, Zeroable,
17536 Subtarget, DAG))
17537 return V;
17538
17539 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v8i64, V1, V2, Mask,
17540 Zeroable, Subtarget, DAG))
17541 return Blend;
17542
17543 return lowerShuffleWithPERMV(DL, MVT::v8i64, Mask, V1, V2, Subtarget, DAG);
17544}
17545
17546/// Handle lowering of 16-lane 32-bit integer shuffles.
17547 static SDValue lowerV16I32Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
17548 const APInt &Zeroable, SDValue V1, SDValue V2,
17549 const X86Subtarget &Subtarget,
17550 SelectionDAG &DAG) {
17551 assert(V1.getSimpleValueType() == MVT::v16i32 && "Bad operand type!");
17552 assert(V2.getSimpleValueType() == MVT::v16i32 && "Bad operand type!");
17553 assert(Mask.size() == 16 && "Unexpected mask size for v16 shuffle!");
17554
17555 int NumV2Elements = count_if(Mask, [](int M) { return M >= 16; });
17556
17557 // Whenever we can lower this as a zext, that instruction is strictly faster
17558 // than any alternative. It also allows us to fold memory operands into the
17559 // shuffle in many cases.
17560 if (SDValue ZExt = lowerShuffleAsZeroOrAnyExtend(
17561 DL, MVT::v16i32, V1, V2, Mask, Zeroable, Subtarget, DAG))
17562 return ZExt;
17563
17564 // Try to use shift instructions if fast.
17565 if (Subtarget.preferLowerShuffleAsShift()) {
17566 if (SDValue Shift =
17567 lowerShuffleAsShift(DL, MVT::v16i32, V1, V2, Mask, Zeroable,
17568 Subtarget, DAG, /*BitwiseOnly*/ true))
17569 return Shift;
17570 if (NumV2Elements == 0)
17571 if (SDValue Rotate = lowerShuffleAsBitRotate(DL, MVT::v16i32, V1, Mask,
17572 Subtarget, DAG))
17573 return Rotate;
17574 }
17575
17576 // If the shuffle mask is repeated in each 128-bit lane we can use more
17577 // efficient instructions that mirror the shuffles across the four 128-bit
17578 // lanes.
17579 SmallVector<int, 4> RepeatedMask;
17580 bool Is128BitLaneRepeatedShuffle =
17581 is128BitLaneRepeatedShuffleMask(MVT::v16i32, Mask, RepeatedMask);
17582 if (Is128BitLaneRepeatedShuffle) {
17583 assert(RepeatedMask.size() == 4 && "Unexpected repeated mask size!");
17584 if (V2.isUndef())
17585 return DAG.getNode(X86ISD::PSHUFD, DL, MVT::v16i32, V1,
17586 getV4X86ShuffleImm8ForMask(RepeatedMask, DL, DAG));
17587
17588 // Use dedicated unpack instructions for masks that match their pattern.
17589 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v16i32, V1, V2, Mask, DAG))
17590 return V;
17591 }
17592
17593 // Try to use shift instructions.
17594 if (SDValue Shift =
17595 lowerShuffleAsShift(DL, MVT::v16i32, V1, V2, Mask, Zeroable,
17596 Subtarget, DAG, /*BitwiseOnly*/ false))
17597 return Shift;
17598
17599 if (!Subtarget.preferLowerShuffleAsShift() && NumV2Elements == 0)
17600 if (SDValue Rotate =
17601 lowerShuffleAsBitRotate(DL, MVT::v16i32, V1, Mask, Subtarget, DAG))
17602 return Rotate;
17603
17604 // Try to use VALIGN.
17605 if (SDValue Rotate = lowerShuffleAsVALIGN(DL, MVT::v16i32, V1, V2, Mask,
17606 Zeroable, Subtarget, DAG))
17607 return Rotate;
17608
17609 // Try to use byte rotation instructions.
17610 if (Subtarget.hasBWI())
17611 if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v16i32, V1, V2, Mask,
17612 Subtarget, DAG))
17613 return Rotate;
17614
17615 // Assume that a single SHUFPS is faster than using a permv shuffle.
17616 // If some CPU is harmed by the domain switch, we can fix it in a later pass.
17617 if (Is128BitLaneRepeatedShuffle && isSingleSHUFPSMask(RepeatedMask)) {
17618 SDValue CastV1 = DAG.getBitcast(MVT::v16f32, V1);
17619 SDValue CastV2 = DAG.getBitcast(MVT::v16f32, V2);
17620 SDValue ShufPS = lowerShuffleWithSHUFPS(DL, MVT::v16f32, RepeatedMask,
17621 CastV1, CastV2, DAG);
17622 return DAG.getBitcast(MVT::v16i32, ShufPS);
17623 }
17624
17625 // Try to create an in-lane repeating shuffle mask and then shuffle the
17626 // results into the target lanes.
17627 if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
17628 DL, MVT::v16i32, V1, V2, Mask, Subtarget, DAG))
17629 return V;
17630
17631 // If we have AVX512F support, we can use VEXPAND.
17632 if (SDValue V = lowerShuffleWithEXPAND(DL, MVT::v16i32, V1, V2, Mask,
17633 Zeroable, Subtarget, DAG))
17634 return V;
17635
17636 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v16i32, V1, V2, Mask,
17637 Zeroable, Subtarget, DAG))
17638 return Blend;
17639
17640 return lowerShuffleWithPERMV(DL, MVT::v16i32, Mask, V1, V2, Subtarget, DAG);
17641}
17642
17643/// Handle lowering of 32-lane 16-bit integer shuffles.
17644 static SDValue lowerV32I16Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
17645 const APInt &Zeroable, SDValue V1, SDValue V2,
17646 const X86Subtarget &Subtarget,
17647 SelectionDAG &DAG) {
17648 assert(V1.getSimpleValueType() == MVT::v32i16 && "Bad operand type!");
17649 assert(V2.getSimpleValueType() == MVT::v32i16 && "Bad operand type!");
17650 assert(Mask.size() == 32 && "Unexpected mask size for v32 shuffle!");
17651 assert(Subtarget.hasBWI() && "We can only lower v32i16 with AVX-512-BWI!");
17652
17653 // Whenever we can lower this as a zext, that instruction is strictly faster
17654 // than any alternative. It also allows us to fold memory operands into the
17655 // shuffle in many cases.
17656 if (SDValue ZExt = lowerShuffleAsZeroOrAnyExtend(
17657 DL, MVT::v32i16, V1, V2, Mask, Zeroable, Subtarget, DAG))
17658 return ZExt;
17659
17660 // Use dedicated unpack instructions for masks that match their pattern.
17661 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v32i16, V1, V2, Mask, DAG))
17662 return V;
17663
17664 // Use dedicated pack instructions for masks that match their pattern.
17665 if (SDValue V =
17666 lowerShuffleWithPACK(DL, MVT::v32i16, V1, V2, Mask, Subtarget, DAG))
17667 return V;
17668
17669 // Try to use shift instructions.
17670 if (SDValue Shift =
17671 lowerShuffleAsShift(DL, MVT::v32i16, V1, V2, Mask, Zeroable,
17672 Subtarget, DAG, /*BitwiseOnly*/ false))
17673 return Shift;
17674
17675 // Try to use byte rotation instructions.
17676 if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v32i16, V1, V2, Mask,
17677 Subtarget, DAG))
17678 return Rotate;
17679
17680 if (V2.isUndef()) {
17681 // Try to use bit rotation instructions.
17682 if (SDValue Rotate =
17683 lowerShuffleAsBitRotate(DL, MVT::v32i16, V1, Mask, Subtarget, DAG))
17684 return Rotate;
17685
17686 SmallVector<int, 8> RepeatedMask;
17687 if (is128BitLaneRepeatedShuffleMask(MVT::v32i16, Mask, RepeatedMask)) {
17688 // As this is a single-input shuffle, the repeated mask should be
17689 // a strictly valid v8i16 mask that we can pass through to the v8i16
17690 // lowering to handle even the v32 case.
17691 return lowerV8I16GeneralSingleInputShuffle(DL, MVT::v32i16, V1,
17692 RepeatedMask, Subtarget, DAG);
17693 }
17694 }
17695
17696 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v32i16, V1, V2, Mask,
17697 Zeroable, Subtarget, DAG))
17698 return Blend;
17699
17700 if (SDValue PSHUFB = lowerShuffleWithPSHUFB(DL, MVT::v32i16, Mask, V1, V2,
17701 Zeroable, Subtarget, DAG))
17702 return PSHUFB;
17703
17704 // Try to simplify this by merging 128-bit lanes to enable a lane-based
17705 // shuffle.
17706 if (!V2.isUndef())
17707 if (SDValue Result = lowerShuffleAsLanePermuteAndRepeatedMask(
17708 DL, MVT::v32i16, V1, V2, Mask, Subtarget, DAG))
17709 return Result;
17710
17711 return lowerShuffleWithPERMV(DL, MVT::v32i16, Mask, V1, V2, Subtarget, DAG);
17712}
17713
17714/// Handle lowering of 64-lane 8-bit integer shuffles.
17715 static SDValue lowerV64I8Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
17716 const APInt &Zeroable, SDValue V1, SDValue V2,
17717 const X86Subtarget &Subtarget,
17718 SelectionDAG &DAG) {
17719 assert(V1.getSimpleValueType() == MVT::v64i8 && "Bad operand type!");
17720 assert(V2.getSimpleValueType() == MVT::v64i8 && "Bad operand type!");
17721 assert(Mask.size() == 64 && "Unexpected mask size for v64 shuffle!");
17722 assert(Subtarget.hasBWI() && "We can only lower v64i8 with AVX-512-BWI!");
17723
17724 // Whenever we can lower this as a zext, that instruction is strictly faster
17725 // than any alternative. It also allows us to fold memory operands into the
17726 // shuffle in many cases.
17727 if (SDValue ZExt = lowerShuffleAsZeroOrAnyExtend(
17728 DL, MVT::v64i8, V1, V2, Mask, Zeroable, Subtarget, DAG))
17729 return ZExt;
17730
17731 // Use dedicated unpack instructions for masks that match their pattern.
17732 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v64i8, V1, V2, Mask, DAG))
17733 return V;
17734
17735 // Use dedicated pack instructions for masks that match their pattern.
17736 if (SDValue V =
17737 lowerShuffleWithPACK(DL, MVT::v64i8, V1, V2, Mask, Subtarget, DAG))
17738 return V;
17739
17740 // Try to use shift instructions.
17741 if (SDValue Shift =
17742 lowerShuffleAsShift(DL, MVT::v64i8, V1, V2, Mask, Zeroable, Subtarget,
17743 DAG, /*BitwiseOnly*/ false))
17744 return Shift;
17745
17746 // Try to use byte rotation instructions.
17747 if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v64i8, V1, V2, Mask,
17748 Subtarget, DAG))
17749 return Rotate;
17750
17751 // Try to use bit rotation instructions.
17752 if (V2.isUndef())
17753 if (SDValue Rotate =
17754 lowerShuffleAsBitRotate(DL, MVT::v64i8, V1, Mask, Subtarget, DAG))
17755 return Rotate;
17756
17757 // Lower as AND if possible.
17758 if (SDValue Masked = lowerShuffleAsBitMask(DL, MVT::v64i8, V1, V2, Mask,
17759 Zeroable, Subtarget, DAG))
17760 return Masked;
17761
17762 if (SDValue PSHUFB = lowerShuffleWithPSHUFB(DL, MVT::v64i8, Mask, V1, V2,
17763 Zeroable, Subtarget, DAG))
17764 return PSHUFB;
17765
17766 // Try to create an in-lane repeating shuffle mask and then shuffle the
17767 // results into the target lanes.
17768 if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
17769 DL, MVT::v64i8, V1, V2, Mask, Subtarget, DAG))
17770 return V;
17771
17772 if (SDValue Result = lowerShuffleAsLanePermuteAndPermute(
17773 DL, MVT::v64i8, V1, V2, Mask, DAG, Subtarget))
17774 return Result;
17775
17776 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v64i8, V1, V2, Mask,
17777 Zeroable, Subtarget, DAG))
17778 return Blend;
17779
17780 if (!is128BitLaneCrossingShuffleMask(MVT::v64i8, Mask)) {
17781 // Use PALIGNR+Permute if possible - permute might become PSHUFB but the
17782 // PALIGNR will be cheaper than the second PSHUFB+OR.
17783 if (SDValue V = lowerShuffleAsByteRotateAndPermute(DL, MVT::v64i8, V1, V2,
17784 Mask, Subtarget, DAG))
17785 return V;
17786
17787 // If we can't directly blend but can use PSHUFB, that will be better as it
17788 // can both shuffle and set up the inefficient blend.
17789 bool V1InUse, V2InUse;
17790 return lowerShuffleAsBlendOfPSHUFBs(DL, MVT::v64i8, V1, V2, Mask, Zeroable,
17791 DAG, V1InUse, V2InUse);
17792 }
17793
17794 // Try to simplify this by merging 128-bit lanes to enable a lane-based
17795 // shuffle.
17796 if (!V2.isUndef())
17797 if (SDValue Result = lowerShuffleAsLanePermuteAndRepeatedMask(
17798 DL, MVT::v64i8, V1, V2, Mask, Subtarget, DAG))
17799 return Result;
17800
17801 // VBMI can use VPERMV/VPERMV3 byte shuffles.
17802 if (Subtarget.hasVBMI())
17803 return lowerShuffleWithPERMV(DL, MVT::v64i8, Mask, V1, V2, Subtarget, DAG);
17804
17805 return splitAndLowerShuffle(DL, MVT::v64i8, V1, V2, Mask, DAG, /*SimpleOnly*/ false);
17806}
17807
17808/// High-level routine to lower various 512-bit x86 vector shuffles.
17809///
17810/// This routine either breaks down the specific type of a 512-bit x86 vector
17811/// shuffle or splits it into two 256-bit shuffles and fuses the results back
17812/// together based on the available instructions.
17813 static SDValue lower512BitShuffle(const SDLoc &DL, ArrayRef<int> Mask,
17814 MVT VT, SDValue V1, SDValue V2,
17815 const APInt &Zeroable,
17816 const X86Subtarget &Subtarget,
17817 SelectionDAG &DAG) {
17818 assert(Subtarget.hasAVX512() &&
17819 "Cannot lower 512-bit vectors w/ basic ISA!");
17820
17821 // If we have a single input to the zero element, insert that into V1 if we
17822 // can do so cheaply.
17823 int NumElts = Mask.size();
17824 int NumV2Elements = count_if(Mask, [NumElts](int M) { return M >= NumElts; });
17825
17826 if (NumV2Elements == 1 && Mask[0] >= NumElts)
17827 if (SDValue Insertion = lowerShuffleAsElementInsertion(
17828 DL, VT, V1, V2, Mask, Zeroable, Subtarget, DAG))
17829 return Insertion;
17830
17831 // Handle special cases where the lower or upper half is UNDEF.
17832 if (SDValue V =
17833 lowerShuffleWithUndefHalf(DL, VT, V1, V2, Mask, Subtarget, DAG))
17834 return V;
17835
17836 // Check for being able to broadcast a single element.
17837 if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, VT, V1, V2, Mask,
17838 Subtarget, DAG))
17839 return Broadcast;
17840
17841 if ((VT == MVT::v32i16 || VT == MVT::v64i8) && !Subtarget.hasBWI()) {
17842 // Try using bit ops for masking and blending before falling back to
17843 // splitting.
17844 if (SDValue V = lowerShuffleAsBitMask(DL, VT, V1, V2, Mask, Zeroable,
17845 Subtarget, DAG))
17846 return V;
17847 if (SDValue V = lowerShuffleAsBitBlend(DL, VT, V1, V2, Mask, DAG))
17848 return V;
17849
17850 return splitAndLowerShuffle(DL, VT, V1, V2, Mask, DAG, /*SimpleOnly*/ false);
17851 }
17852
17853 if (VT == MVT::v32f16 || VT == MVT::v32bf16) {
17854 if (!Subtarget.hasBWI())
17855 return splitAndLowerShuffle(DL, VT, V1, V2, Mask, DAG,
17856 /*SimpleOnly*/ false);
17857
17858 V1 = DAG.getBitcast(MVT::v32i16, V1);
17859 V2 = DAG.getBitcast(MVT::v32i16, V2);
17860 return DAG.getBitcast(VT,
17861 DAG.getVectorShuffle(MVT::v32i16, DL, V1, V2, Mask));
17862 }
17863
17864 // Dispatch to each element type for lowering. If we don't have support for
17865 // specific element type shuffles at 512 bits, immediately split them and
17866 // lower them. Each lowering routine of a given type is allowed to assume that
17867 // the requisite ISA extensions for that element type are available.
17868 switch (VT.SimpleTy) {
17869 case MVT::v8f64:
17870 return lowerV8F64Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
17871 case MVT::v16f32:
17872 return lowerV16F32Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
17873 case MVT::v8i64:
17874 return lowerV8I64Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
17875 case MVT::v16i32:
17876 return lowerV16I32Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
17877 case MVT::v32i16:
17878 return lowerV32I16Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
17879 case MVT::v64i8:
17880 return lowerV64I8Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
17881
17882 default:
17883 llvm_unreachable("Not a valid 512-bit x86 vector type!");
17884 }
17885}
17886
17887 static SDValue lower1BitShuffleAsKSHIFTR(const SDLoc &DL, ArrayRef<int> Mask,
17888 MVT VT, SDValue V1, SDValue V2,
17889 const X86Subtarget &Subtarget,
17890 SelectionDAG &DAG) {
17891 // Shuffle should be unary.
17892 if (!V2.isUndef())
17893 return SDValue();
17894
17895 int ShiftAmt = -1;
17896 int NumElts = Mask.size();
17897 for (int i = 0; i != NumElts; ++i) {
17898 int M = Mask[i];
17899 assert((M == SM_SentinelUndef || (0 <= M && M < NumElts)) &&
17900 "Unexpected mask index.");
17901 if (M < 0)
17902 continue;
17903
17904 // The first non-undef element determines our shift amount.
17905 if (ShiftAmt < 0) {
17906 ShiftAmt = M - i;
17907 // Need to be shifting right.
17908 if (ShiftAmt <= 0)
17909 return SDValue();
17910 }
17911 // All non-undef elements must shift by the same amount.
17912 if (ShiftAmt != M - i)
17913 return SDValue();
17914 }
17915 assert(ShiftAmt >= 0 && "All undef?");
17916
17917 // Great, we found a shift right.
17918 SDValue Res = widenMaskVector(V1, false, Subtarget, DAG, DL);
17919 Res = DAG.getNode(X86ISD::KSHIFTR, DL, Res.getValueType(), Res,
17920 DAG.getTargetConstant(ShiftAmt, DL, MVT::i8));
17921 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Res,
17922 DAG.getVectorIdxConstant(0, DL));
17923}
17924
17925// Determine if this shuffle can be implemented with a KSHIFT instruction.
17926// Returns the shift amount if possible or -1 if not. This is a simplified
17927// version of matchShuffleAsShift.
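// E.g. the v8i1 mask <2,3,4,5,6,7,z,z> with the top two elements zeroable
// matches a KSHIFTR by 2.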
17928static int match1BitShuffleAsKSHIFT(unsigned &Opcode, ArrayRef<int> Mask,
17929 int MaskOffset, const APInt &Zeroable) {
17930 int Size = Mask.size();
17931
17932 auto CheckZeros = [&](int Shift, bool Left) {
17933 for (int j = 0; j < Shift; ++j)
17934 if (!Zeroable[j + (Left ? 0 : (Size - Shift))])
17935 return false;
17936
17937 return true;
17938 };
17939
17940 auto MatchShift = [&](int Shift, bool Left) {
17941 unsigned Pos = Left ? Shift : 0;
17942 unsigned Low = Left ? 0 : Shift;
17943 unsigned Len = Size - Shift;
17944 return isSequentialOrUndefInRange(Mask, Pos, Len, Low + MaskOffset);
17945 };
17946
17947 for (int Shift = 1; Shift != Size; ++Shift)
17948 for (bool Left : {true, false})
17949 if (CheckZeros(Shift, Left) && MatchShift(Shift, Left)) {
17950 Opcode = Left ? X86ISD::KSHIFTL : X86ISD::KSHIFTR;
17951 return Shift;
17952 }
17953
17954 return -1;
17955}
17956
17957
17958// Lower vXi1 vector shuffles.
17959 // There is no dedicated instruction on AVX-512 that shuffles the masks.
17960 // The only way to shuffle bits is to sign-extend the mask vector to a SIMD
17961 // vector, shuffle that, and then truncate it back.
17962 static SDValue lower1BitShuffle(const SDLoc &DL, ArrayRef<int> Mask,
17963 MVT VT, SDValue V1, SDValue V2,
17964 const APInt &Zeroable,
17965 const X86Subtarget &Subtarget,
17966 SelectionDAG &DAG) {
17967 assert(Subtarget.hasAVX512() &&
17968 "Cannot lower 512-bit vectors w/o basic ISA!");
17969
17970 int NumElts = Mask.size();
17971 int NumV2Elements = count_if(Mask, [NumElts](int M) { return M >= NumElts; });
17972
17973 // Try to recognize shuffles that are just padding a subvector with zeros.
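// E.g. a v8i1 shuffle <0,1,2,3,z,z,z,z> becomes an insert of the low v4i1
// subvector into a zero vector.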
17974 int SubvecElts = 0;
17975 int Src = -1;
17976 for (int i = 0; i != NumElts; ++i) {
17977 if (Mask[i] >= 0) {
17978 // Grab the source from the first valid mask. All subsequent elements need
17979 // to use this same source.
17980 if (Src < 0)
17981 Src = Mask[i] / NumElts;
17982 if (Src != (Mask[i] / NumElts) || (Mask[i] % NumElts) != i)
17983 break;
17984 }
17985
17986 ++SubvecElts;
17987 }
17988 assert(SubvecElts != NumElts && "Identity shuffle?");
17989
17990 // Clip to a power of 2.
17991 SubvecElts = llvm::bit_floor<uint32_t>(SubvecElts);
17992
17993 // Make sure the number of zeroable bits in the top at least covers the bits
17994 // not covered by the subvector.
17995 if ((int)Zeroable.countl_one() >= (NumElts - SubvecElts)) {
17996 assert(Src >= 0 && "Expected a source!");
17997 MVT ExtractVT = MVT::getVectorVT(MVT::i1, SubvecElts);
17998 SDValue Extract =
17999 DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, ExtractVT, Src == 0 ? V1 : V2,
18000 DAG.getVectorIdxConstant(0, DL));
18001 return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT,
18002 DAG.getConstant(0, DL, VT), Extract,
18003 DAG.getVectorIdxConstant(0, DL));
18004 }
18005
18006 // Try a simple shift right with undef elements. Later we'll try with zeros.
18007 if (SDValue Shift =
18008 lower1BitShuffleAsKSHIFTR(DL, Mask, VT, V1, V2, Subtarget, DAG))
18009 return Shift;
18010
18011 // Try to match KSHIFTs.
18012 unsigned Offset = 0;
18013 for (SDValue V : {V1, V2}) {
18014 unsigned Opcode;
18015 int ShiftAmt = match1BitShuffleAsKSHIFT(Opcode, Mask, Offset, Zeroable);
18016 if (ShiftAmt >= 0) {
18017 SDValue Res = widenMaskVector(V, false, Subtarget, DAG, DL);
18018 MVT WideVT = Res.getSimpleValueType();
18019 // Widened right shifts need two shifts to ensure we shift in zeroes.
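// E.g. a v8i1 KSHIFTR by 3 that was widened to v16i1 is shifted left by 8 and
// then right by 11, so zeroes rather than the extra widened bits end up in the
// top elements.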
18020 if (Opcode == X86ISD::KSHIFTR && WideVT != VT) {
18021 int WideElts = WideVT.getVectorNumElements();
18022 // Shift left to put the original vector in the MSBs of the new size.
18023 Res =
18024 DAG.getNode(X86ISD::KSHIFTL, DL, WideVT, Res,
18025 DAG.getTargetConstant(WideElts - NumElts, DL, MVT::i8));
18026 // Increase the shift amount to account for the left shift.
18027 ShiftAmt += WideElts - NumElts;
18028 }
18029
18030 Res = DAG.getNode(Opcode, DL, WideVT, Res,
18031 DAG.getTargetConstant(ShiftAmt, DL, MVT::i8));
18032 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Res,
18033 DAG.getVectorIdxConstant(0, DL));
18034 }
18035 Offset += NumElts; // Increment for next iteration.
18036 }
18037
18038 // If we're performing a unary shuffle on a SETCC result, try to shuffle the
18039 // ops instead.
18040 // TODO: What other unary shuffles would benefit from this?
18041 if (NumV2Elements == 0 && V1.getOpcode() == ISD::SETCC && V1->hasOneUse()) {
18042 SDValue Op0 = V1.getOperand(0);
18043 SDValue Op1 = V1.getOperand(1);
18044 ISD::CondCode CC = cast<CondCodeSDNode>(V1.getOperand(2))->get();
18045 EVT OpVT = Op0.getValueType();
18046 if (OpVT.getScalarSizeInBits() >= 32 || isBroadcastShuffleMask(Mask))
18047 return DAG.getSetCC(
18048 DL, VT, DAG.getVectorShuffle(OpVT, DL, Op0, DAG.getUNDEF(OpVT), Mask),
18049 DAG.getVectorShuffle(OpVT, DL, Op1, DAG.getUNDEF(OpVT), Mask), CC);
18050 }
18051
18052 MVT ExtVT;
18053 switch (VT.SimpleTy) {
18054 default:
18055 llvm_unreachable("Expected a vector of i1 elements");
18056 case MVT::v2i1:
18057 ExtVT = MVT::v2i64;
18058 break;
18059 case MVT::v4i1:
18060 ExtVT = MVT::v4i32;
18061 break;
18062 case MVT::v8i1:
18063 // Take a 512-bit type; there are more shuffles available on KNL. If we have
18064 // VLX, use a 256-bit shuffle.
18065 ExtVT = Subtarget.hasVLX() ? MVT::v8i32 : MVT::v8i64;
18066 break;
18067 case MVT::v16i1:
18068 // Take 512-bit type, unless we are avoiding 512-bit types and have the
18069 // 256-bit operation available.
18070 ExtVT = Subtarget.canExtendTo512DQ() ? MVT::v16i32 : MVT::v16i16;
18071 break;
18072 case MVT::v32i1:
18073 // Take 512-bit type, unless we are avoiding 512-bit types and have the
18074 // 256-bit operation available.
18075 assert(Subtarget.hasBWI() && "Expected AVX512BW support");
18076 ExtVT = Subtarget.canExtendTo512BW() ? MVT::v32i16 : MVT::v32i8;
18077 break;
18078 case MVT::v64i1:
18079 // Fall back to scalarization. FIXME: We can do better if the shuffle
18080 // can be partitioned cleanly.
18081 if (!Subtarget.useBWIRegs())
18082 return SDValue();
18083 ExtVT = MVT::v64i8;
18084 break;
18085 }
18086
18087 V1 = DAG.getNode(ISD::SIGN_EXTEND, DL, ExtVT, V1);
18088 V2 = DAG.getNode(ISD::SIGN_EXTEND, DL, ExtVT, V2);
18089
18090 SDValue Shuffle = DAG.getVectorShuffle(ExtVT, DL, V1, V2, Mask);
18091 // i1 was sign-extended, so we can convert back with a signed compare against zero.
18092 int NumElems = VT.getVectorNumElements();
18093 if ((Subtarget.hasBWI() && (NumElems >= 32)) ||
18094 (Subtarget.hasDQI() && (NumElems < 32)))
18095 return DAG.getSetCC(DL, VT, DAG.getConstant(0, DL, ExtVT),
18096 Shuffle, ISD::SETGT);
18097
18098 return DAG.getNode(ISD::TRUNCATE, DL, VT, Shuffle);
18099}
18100
18101/// Helper function that returns true if the shuffle mask should be
18102/// commuted to improve canonicalization.
18103 static bool canonicalizeShuffleMaskWithCommute(ArrayRef<int> Mask) {
18104 int NumElements = Mask.size();
18105
18106 int NumV1Elements = 0, NumV2Elements = 0;
18107 for (int M : Mask)
18108 if (M < 0)
18109 continue;
18110 else if (M < NumElements)
18111 ++NumV1Elements;
18112 else
18113 ++NumV2Elements;
18114
18115 // Commute the shuffle as needed such that more elements come from V1 than
18116 // V2. This allows us to match the shuffle pattern strictly on how many
18117 // elements come from V1 without handling the symmetric cases.
18118 if (NumV2Elements > NumV1Elements)
18119 return true;
18120
18121 assert(NumV1Elements > 0 && "No V1 indices");
18122
18123 if (NumV2Elements == 0)
18124 return false;
18125
18126 // When the number of V1 and V2 elements are the same, try to minimize the
18127 // number of uses of V2 in the low half of the vector. When that is tied,
18128 // ensure that the sum of indices for V1 is equal to or lower than the sum of
18129 // indices for V2. When those are equal, try to ensure that the number of odd
18130 // indices for V1 is lower than the number of odd indices for V2.
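// E.g. with 4-element inputs the mask <4,5,0,1> is commuted to <0,1,4,5> so
// the V2 elements land in the high half.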
18131 if (NumV1Elements == NumV2Elements) {
18132 int LowV1Elements = 0, LowV2Elements = 0;
18133 for (int M : Mask.slice(0, NumElements / 2))
18134 if (M >= NumElements)
18135 ++LowV2Elements;
18136 else if (M >= 0)
18137 ++LowV1Elements;
18138 if (LowV2Elements > LowV1Elements)
18139 return true;
18140 if (LowV2Elements == LowV1Elements) {
18141 int SumV1Indices = 0, SumV2Indices = 0;
18142 for (int i = 0, Size = Mask.size(); i < Size; ++i)
18143 if (Mask[i] >= NumElements)
18144 SumV2Indices += i;
18145 else if (Mask[i] >= 0)
18146 SumV1Indices += i;
18147 if (SumV2Indices < SumV1Indices)
18148 return true;
18149 if (SumV2Indices == SumV1Indices) {
18150 int NumV1OddIndices = 0, NumV2OddIndices = 0;
18151 for (int i = 0, Size = Mask.size(); i < Size; ++i)
18152 if (Mask[i] >= NumElements)
18153 NumV2OddIndices += i % 2;
18154 else if (Mask[i] >= 0)
18155 NumV1OddIndices += i % 2;
18156 if (NumV2OddIndices < NumV1OddIndices)
18157 return true;
18158 }
18159 }
18160 }
18161
18162 return false;
18163}
18164
18165 static bool canCombineAsMaskOperation(SDValue V,
18166 const X86Subtarget &Subtarget) {
18167 if (!Subtarget.hasAVX512())
18168 return false;
18169
18170 if (!V.getValueType().isSimple())
18171 return false;
18172
18173 MVT VT = V.getSimpleValueType().getScalarType();
18174 if ((VT == MVT::i16 || VT == MVT::i8) && !Subtarget.hasBWI())
18175 return false;
18176
18177 // If vec width < 512, widen i8/i16 even with BWI as blendd/blendps/blendpd
18178 // are preferable to blendw/blendvb/masked-mov.
18179 if ((VT == MVT::i16 || VT == MVT::i8) &&
18180 V.getSimpleValueType().getSizeInBits() < 512)
18181 return false;
18182
18183 auto HasMaskOperation = [&](SDValue V) {
18184 // TODO: Currently we only check a limited set of opcodes. We could probably
18185 // extend this to all binary operations by checking TLI.isBinOp().
18186 switch (V->getOpcode()) {
18187 default:
18188 return false;
18189 case ISD::ADD:
18190 case ISD::SUB:
18191 case ISD::AND:
18192 case ISD::XOR:
18193 case ISD::OR:
18194 case ISD::SMAX:
18195 case ISD::SMIN:
18196 case ISD::UMAX:
18197 case ISD::UMIN:
18198 case ISD::ABS:
18199 case ISD::SHL:
18200 case ISD::SRL:
18201 case ISD::SRA:
18202 case ISD::MUL:
18203 break;
18204 }
18205 if (!V->hasOneUse())
18206 return false;
18207
18208 return true;
18209 };
18210
18211 if (HasMaskOperation(V))
18212 return true;
18213
18214 return false;
18215}
18216
18217// Forward declaration.
18218 static SDValue canonicalizeShuffleMaskWithHorizOp(
18219 MutableArrayRef<SDValue> Ops, MutableArrayRef<int> Mask,
18220 unsigned RootSizeInBits, const SDLoc &DL, SelectionDAG &DAG,
18221 const X86Subtarget &Subtarget);
18222
18223 /// Top-level lowering for x86 vector shuffles.
18224///
18225/// This handles decomposition, canonicalization, and lowering of all x86
18226/// vector shuffles. Most of the specific lowering strategies are encapsulated
18227/// above in helper routines. The canonicalization attempts to widen shuffles
18228/// to involve fewer lanes of wider elements, consolidate symmetric patterns
18229/// s.t. only one of the two inputs needs to be tested, etc.
18230 static SDValue lowerVECTOR_SHUFFLE(SDValue Op, const X86Subtarget &Subtarget,
18231 SelectionDAG &DAG) {
18232 ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
18233 ArrayRef<int> OrigMask = SVOp->getMask();
18234 SDValue V1 = Op.getOperand(0);
18235 SDValue V2 = Op.getOperand(1);
18236 MVT VT = Op.getSimpleValueType();
18237 int NumElements = VT.getVectorNumElements();
18238 SDLoc DL(Op);
18239 bool Is1BitVector = (VT.getVectorElementType() == MVT::i1);
18240
18241 assert((VT.getSizeInBits() != 64 || Is1BitVector) &&
18242 "Can't lower MMX shuffles");
18243
18244 bool V1IsUndef = V1.isUndef();
18245 bool V2IsUndef = V2.isUndef();
18246 if (V1IsUndef && V2IsUndef)
18247 return DAG.getUNDEF(VT);
18248
18249 // When we create a shuffle node we put the UNDEF node as the second operand,
18250 // but in some cases the first operand may be transformed to UNDEF.
18251 // In this case we should just commute the node.
18252 if (V1IsUndef)
18253 return DAG.getCommutedVectorShuffle(*SVOp);
18254
18255 // Check for non-undef masks pointing at an undef vector and make the masks
18256 // undef as well. This makes it easier to match the shuffle based solely on
18257 // the mask.
18258 if (V2IsUndef &&
18259 any_of(OrigMask, [NumElements](int M) { return M >= NumElements; })) {
18260 SmallVector<int, 8> NewMask(OrigMask);
18261 for (int &M : NewMask)
18262 if (M >= NumElements)
18263 M = -1;
18264 return DAG.getVectorShuffle(VT, DL, V1, V2, NewMask);
18265 }
18266
18267 // Check for illegal shuffle mask element index values.
18268 int MaskUpperLimit = OrigMask.size() * (V2IsUndef ? 1 : 2);
18269 (void)MaskUpperLimit;
18270 assert(llvm::all_of(OrigMask,
18271 [&](int M) { return -1 <= M && M < MaskUpperLimit; }) &&
18272 "Out of bounds shuffle index");
18273
18274 // We actually see shuffles that are entirely re-arrangements of a set of
18275 // zero inputs. This mostly happens while decomposing complex shuffles into
18276 // simple ones. Directly lower these as a buildvector of zeros.
18277 APInt KnownUndef, KnownZero;
18278 computeZeroableShuffleElements(OrigMask, V1, V2, KnownUndef, KnownZero);
18279
18280 APInt Zeroable = KnownUndef | KnownZero;
18281 if (Zeroable.isAllOnes())
18282 return getZeroVector(VT, Subtarget, DAG, DL);
18283
18284 bool V2IsZero = !V2IsUndef && ISD::isBuildVectorAllZeros(V2.getNode());
18285
18286 // Try to collapse shuffles into using a vector type with fewer elements but
18287 // wider element types. We cap this to not form integers or floating point
18288 // elements wider than 64 bits. It does not seem beneficial to form i128
18289 // integers to handle flipping the low and high halves of AVX 256-bit vectors.
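// E.g. a v8i32 shuffle with mask <0,1,4,5,8,9,12,13> widens to the v4i64
// shuffle <0,2,4,6>.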
18290 SmallVector<int, 16> WidenedMask;
18291 if (VT.getScalarSizeInBits() < 64 && !Is1BitVector &&
18292 !canCombineAsMaskOperation(V1, Subtarget) &&
18293 !canCombineAsMaskOperation(V2, Subtarget) &&
18294 canWidenShuffleElements(OrigMask, Zeroable, V2IsZero, WidenedMask)) {
18295 // Shuffle mask widening should not interfere with a broadcast opportunity
18296 // by obfuscating the operands with bitcasts.
18297 // TODO: Avoid lowering directly from this top-level function: make this
18298 // a query (canLowerAsBroadcast) and defer lowering to the type-based calls.
18299 if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, VT, V1, V2, OrigMask,
18300 Subtarget, DAG))
18301 return Broadcast;
18302
18303 MVT NewEltVT = VT.isFloatingPoint()
18304 ? MVT::getFloatingPointVT(VT.getScalarSizeInBits() * 2)
18305 : MVT::getIntegerVT(VT.getScalarSizeInBits() * 2);
18306 int NewNumElts = NumElements / 2;
18307 MVT NewVT = MVT::getVectorVT(NewEltVT, NewNumElts);
18308 // Make sure that the new vector type is legal. For example, v2f64 isn't
18309 // legal on SSE1.
18310 if (DAG.getTargetLoweringInfo().isTypeLegal(NewVT)) {
18311 if (V2IsZero) {
18312 // Modify the new Mask to take all zeros from the all-zero vector.
18313 // Choose indices that are blend-friendly.
18314 bool UsedZeroVector = false;
18315 assert(is_contained(WidenedMask, SM_SentinelZero) &&
18316 "V2's non-undef elements are used?!");
18317 for (int i = 0; i != NewNumElts; ++i)
18318 if (WidenedMask[i] == SM_SentinelZero) {
18319 WidenedMask[i] = i + NewNumElts;
18320 UsedZeroVector = true;
18321 }
18322 // Ensure all elements of V2 are zero - isBuildVectorAllZeros permits
18323 // some elements to be undef.
18324 if (UsedZeroVector)
18325 V2 = getZeroVector(NewVT, Subtarget, DAG, DL);
18326 }
18327 V1 = DAG.getBitcast(NewVT, V1);
18328 V2 = DAG.getBitcast(NewVT, V2);
18329 return DAG.getBitcast(
18330 VT, DAG.getVectorShuffle(NewVT, DL, V1, V2, WidenedMask));
18331 }
18332 }
18333
18334 SmallVector<SDValue> Ops = {V1, V2};
18335 SmallVector<int> Mask(OrigMask);
18336
18337 // Canonicalize the shuffle with any horizontal ops inputs.
18338 // NOTE: This may update Ops and Mask.
18339 if (SDValue HOp = canonicalizeShuffleMaskWithHorizOp(
18340 Ops, Mask, VT.getSizeInBits(), DL, DAG, Subtarget))
18341 return DAG.getBitcast(VT, HOp);
18342
18343 V1 = DAG.getBitcast(VT, Ops[0]);
18344 V2 = DAG.getBitcast(VT, Ops[1]);
18345 assert(NumElements == (int)Mask.size() &&
18346 "canonicalizeShuffleMaskWithHorizOp "
18347 "shouldn't alter the shuffle mask size");
18348
18349 // Canonicalize zeros/ones/fp splat constants to ensure no undefs.
18350 // These will be materialized uniformly anyway, so make splat matching easier.
18351 // TODO: Allow all int constants?
18352 auto CanonicalizeConstant = [VT, &DL, &DAG](SDValue V) {
18353 if (auto *BV = dyn_cast<BuildVectorSDNode>(V)) {
18354 BitVector Undefs;
18355 if (SDValue Splat = BV->getSplatValue(&Undefs)) {
18356 if (Undefs.any() &&
18357 (isNullConstant(Splat) || isAllOnesConstant(Splat) ||
18358 isa<ConstantFPSDNode>(Splat))) {
18359 V = DAG.getBitcast(VT, DAG.getSplat(BV->getValueType(0), DL, Splat));
18360 }
18361 }
18362 }
18363 return V;
18364 };
18365 V1 = CanonicalizeConstant(V1);
18366 V2 = CanonicalizeConstant(V2);
18367
18368 // Commute the shuffle if it will improve canonicalization.
18369 if (canonicalizeShuffleMaskWithCommute(Mask)) {
18370 ShuffleVectorSDNode::commuteMask(Mask);
18371 std::swap(V1, V2);
18372 }
18373
18374 // For each vector width, delegate to a specialized lowering routine.
18375 if (VT.is128BitVector())
18376 return lower128BitShuffle(DL, Mask, VT, V1, V2, Zeroable, Subtarget, DAG);
18377
18378 if (VT.is256BitVector())
18379 return lower256BitShuffle(DL, Mask, VT, V1, V2, Zeroable, Subtarget, DAG);
18380
18381 if (VT.is512BitVector())
18382 return lower512BitShuffle(DL, Mask, VT, V1, V2, Zeroable, Subtarget, DAG);
18383
18384 if (Is1BitVector)
18385 return lower1BitShuffle(DL, Mask, VT, V1, V2, Zeroable, Subtarget, DAG);
18386
18387 llvm_unreachable("Unimplemented!");
18388}
18389
18390// As legal vpcompress instructions depend on various AVX512 extensions, try to
18391// convert illegal vector sizes to legal ones to avoid expansion.
18392 static SDValue lowerVECTOR_COMPRESS(SDValue Op, const X86Subtarget &Subtarget,
18393 SelectionDAG &DAG) {
18394 assert(Subtarget.hasAVX512() &&
18395 "Need AVX512 for custom VECTOR_COMPRESS lowering.");
18396
18397 SDLoc DL(Op);
18398 SDValue Vec = Op.getOperand(0);
18399 SDValue Mask = Op.getOperand(1);
18400 SDValue Passthru = Op.getOperand(2);
18401
18402 EVT VecVT = Vec.getValueType();
18403 EVT ElementVT = VecVT.getVectorElementType();
18404 unsigned NumElements = VecVT.getVectorNumElements();
18405 unsigned NumVecBits = VecVT.getFixedSizeInBits();
18406 unsigned NumElementBits = ElementVT.getFixedSizeInBits();
18407
18408 // 128- and 256-bit vectors with <= 16 elements can be converted to and
18409 // compressed as 512-bit vectors in AVX512F.
18410 if (NumVecBits != 128 && NumVecBits != 256)
18411 return SDValue();
18412
18413 if (NumElementBits == 32 || NumElementBits == 64) {
18414 unsigned NumLargeElements = 512 / NumElementBits;
18415 MVT LargeVecVT =
18416 MVT::getVectorVT(ElementVT.getSimpleVT(), NumLargeElements);
18417 MVT LargeMaskVT = MVT::getVectorVT(MVT::i1, NumLargeElements);
18418
18419 Vec = widenSubVector(LargeVecVT, Vec, /*ZeroNewElements=*/false, Subtarget,
18420 DAG, DL);
18421 Mask = widenSubVector(LargeMaskVT, Mask, /*ZeroNewElements=*/true,
18422 Subtarget, DAG, DL);
18423 Passthru = Passthru.isUndef() ? DAG.getUNDEF(LargeVecVT)
18424 : widenSubVector(LargeVecVT, Passthru,
18425 /*ZeroNewElements=*/false,
18426 Subtarget, DAG, DL);
18427
18428 SDValue Compressed =
18429 DAG.getNode(ISD::VECTOR_COMPRESS, DL, LargeVecVT, Vec, Mask, Passthru);
18430 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VecVT, Compressed,
18431 DAG.getConstant(0, DL, MVT::i64));
18432 }
18433
18434 if (VecVT == MVT::v8i16 || VecVT == MVT::v8i8 || VecVT == MVT::v16i8 ||
18435 VecVT == MVT::v16i16) {
18436 MVT LargeElementVT = MVT::getIntegerVT(512 / NumElements);
18437 EVT LargeVecVT = MVT::getVectorVT(LargeElementVT, NumElements);
18438
18439 Vec = DAG.getNode(ISD::ANY_EXTEND, DL, LargeVecVT, Vec);
18440 Passthru = Passthru.isUndef()
18441 ? DAG.getUNDEF(LargeVecVT)
18442 : DAG.getNode(ISD::ANY_EXTEND, DL, LargeVecVT, Passthru);
18443
18444 SDValue Compressed =
18445 DAG.getNode(ISD::VECTOR_COMPRESS, DL, LargeVecVT, Vec, Mask, Passthru);
18446 return DAG.getNode(ISD::TRUNCATE, DL, VecVT, Compressed);
18447 }
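// Illustrative example: a v8i16 VECTOR_COMPRESS is any-extended to v8i64
// (512 bits / 8 elements = 64-bit lanes), compressed via the legal 512-bit
// pattern (VPCOMPRESSQ), and truncated back to v8i16; the v8i1 mask is
// reused unchanged.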
18448
18449 return SDValue();
18450}
18451
18452/// Try to lower a VSELECT instruction to a vector shuffle.
18453 static SDValue lowerVSELECTtoVectorShuffle(SDValue Op,
18454 const X86Subtarget &Subtarget,
18455 SelectionDAG &DAG) {
18456 SDValue Cond = Op.getOperand(0);
18457 SDValue LHS = Op.getOperand(1);
18458 SDValue RHS = Op.getOperand(2);
18459 MVT VT = Op.getSimpleValueType();
18460
18461 // Only non-legal VSELECTs reach this lowering; convert those into generic
18462 // shuffles and re-use the shuffle lowering path for blends.
18463 if (ISD::isBuildVectorOfConstantSDNodes(Cond.getNode())) {
18464 SmallVector<int, 32> Mask;
18465 if (createShuffleMaskFromVSELECT(Mask, Cond))
18466 return DAG.getVectorShuffle(VT, SDLoc(Op), LHS, RHS, Mask);
18467 }
18468
18469 return SDValue();
18470}
18471
18472SDValue X86TargetLowering::LowerVSELECT(SDValue Op, SelectionDAG &DAG) const {
18473 SDValue Cond = Op.getOperand(0);
18474 SDValue LHS = Op.getOperand(1);
18475 SDValue RHS = Op.getOperand(2);
18476
18477 SDLoc dl(Op);
18478 MVT VT = Op.getSimpleValueType();
18479 if (isSoftF16(VT, Subtarget)) {
18480 MVT NVT = VT.changeVectorElementTypeToInteger();
18481 return DAG.getBitcast(VT, DAG.getNode(ISD::VSELECT, dl, NVT, Cond,
18482 DAG.getBitcast(NVT, LHS),
18483 DAG.getBitcast(NVT, RHS)));
18484 }
18485
18486 // A vselect where all conditions and data are constants can be optimized into
18487 // a single vector load by SelectionDAGLegalize::ExpandBUILD_VECTOR().
18488 if (ISD::isBuildVectorOfConstantSDNodes(Cond.getNode()) &&
18489 ISD::isBuildVectorOfConstantSDNodes(LHS.getNode()) &&
18490 ISD::isBuildVectorOfConstantSDNodes(RHS.getNode()))
18491 return SDValue();
18492
18493 // Try to lower this to a blend-style vector shuffle. This can handle all
18494 // constant condition cases.
18495 if (SDValue BlendOp = lowerVSELECTtoVectorShuffle(Op, Subtarget, DAG))
18496 return BlendOp;
18497
18498 // If this VSELECT has a vector of i1 as a mask, it will be directly matched
18499 // with patterns on the mask registers on AVX-512.
18500 MVT CondVT = Cond.getSimpleValueType();
18501 unsigned CondEltSize = Cond.getScalarValueSizeInBits();
18502 if (CondEltSize == 1)
18503 return Op;
18504
18505 // Variable blends are only legal from SSE4.1 onward.
18506 if (!Subtarget.hasSSE41())
18507 return SDValue();
18508
18509 unsigned EltSize = VT.getScalarSizeInBits();
18510 unsigned NumElts = VT.getVectorNumElements();
18511
18512 // Expand v32i16/v64i8 without BWI.
18513 if ((VT == MVT::v32i16 || VT == MVT::v64i8) && !Subtarget.hasBWI())
18514 return SDValue();
18515
18516 // If the VSELECT is on a 512-bit type, we have to convert a non-i1 condition
18517 // into an i1 condition so that we can use the mask-based 512-bit blend
18518 // instructions.
18519 if (VT.getSizeInBits() == 512) {
18520 // Build a mask by testing the condition against zero.
18521 MVT MaskVT = MVT::getVectorVT(MVT::i1, NumElts);
18522 SDValue Mask = DAG.getSetCC(dl, MaskVT, Cond,
18523 DAG.getConstant(0, dl, CondVT),
18524 ISD::SETNE);
18525 // Now return a new VSELECT using the mask.
18526 return DAG.getSelect(dl, VT, Mask, LHS, RHS);
18527 }
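// Illustrative example: a v16i32 VSELECT whose condition is also v16i32
// becomes (vselect (setne Cond, 0), LHS, RHS) with a v16i1 mask, which the
// AVX-512 masked-move/blend patterns can then match.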
18528
18529 // SEXT/TRUNC cases where the mask doesn't match the destination size.
18530 if (CondEltSize != EltSize) {
18531 // If we don't have a sign splat, rely on the expansion.
18532 if (CondEltSize != DAG.ComputeNumSignBits(Cond))
18533 return SDValue();
18534
18535 MVT NewCondSVT = MVT::getIntegerVT(EltSize);
18536 MVT NewCondVT = MVT::getVectorVT(NewCondSVT, NumElts);
18537 Cond = DAG.getSExtOrTrunc(Cond, dl, NewCondVT);
18538 return DAG.getNode(ISD::VSELECT, dl, VT, Cond, LHS, RHS);
18539 }
18540
18541 // For v16i16/v32i8 selects without AVX2, if the condition and another operand
18542 // are free to split, then it is better to split before expanding the
18543 // select. Don't bother with XOP as it has the fast VPCMOV instruction.
18544 // TODO: This is very similar to narrowVectorSelect.
18545 // TODO: Add Load splitting to isFreeToSplitVector ?
18546 if (EltSize < 32 && VT.is256BitVector() && !Subtarget.hasAVX2() &&
18547 !Subtarget.hasXOP()) {
18548 bool FreeCond = isFreeToSplitVector(Cond, DAG);
18549 bool FreeLHS = isFreeToSplitVector(LHS, DAG) ||
18550 (ISD::isNormalLoad(LHS.getNode()) && LHS.hasOneUse());
18551 bool FreeRHS = isFreeToSplitVector(RHS, DAG) ||
18552 (ISD::isNormalLoad(RHS.getNode()) && RHS.hasOneUse());
18553 if (FreeCond && (FreeLHS || FreeRHS))
18554 return splitVectorOp(Op, DAG, dl);
18555 }
18556
18557 // Only some types will be legal on some subtargets. If we can emit a legal
18558 // VSELECT-matching blend, return Op, but if we need to expand, return
18559 // a null value.
18560 switch (VT.SimpleTy) {
18561 default:
18562 // Most of the vector types have blends past SSE4.1.
18563 return Op;
18564
18565 case MVT::v32i8:
18566 // The byte blends for AVX vectors were introduced only in AVX2.
18567 if (Subtarget.hasAVX2())
18568 return Op;
18569
18570 return SDValue();
18571
18572 case MVT::v8i16:
18573 case MVT::v16i16:
18574 case MVT::v8f16:
18575 case MVT::v16f16: {
18576 // Bitcast everything to the vXi8 type and use a vXi8 vselect.
18577 MVT CastVT = MVT::getVectorVT(MVT::i8, NumElts * 2);
18578 Cond = DAG.getBitcast(CastVT, Cond);
18579 LHS = DAG.getBitcast(CastVT, LHS);
18580 RHS = DAG.getBitcast(CastVT, RHS);
18581 SDValue Select = DAG.getNode(ISD::VSELECT, dl, CastVT, Cond, LHS, RHS);
18582 return DAG.getBitcast(VT, Select);
18583 }
18584 }
18585}
18586
18587 static SDValue LowerEXTRACT_VECTOR_ELT_SSE4(SDValue Op, SelectionDAG &DAG) {
18588 MVT VT = Op.getSimpleValueType();
18589 SDValue Vec = Op.getOperand(0);
18590 SDValue Idx = Op.getOperand(1);
18591 assert(isa<ConstantSDNode>(Idx) && "Constant index expected");
18592 SDLoc dl(Op);
18593
18594 if (!Vec.getSimpleValueType().is128BitVector())
18595 return SDValue();
18596
18597 if (VT.getSizeInBits() == 8) {
18598 // If IdxVal is 0, it's cheaper to do a move instead of a pextrb, unless
18599 // we're going to zero extend the register or fold the store.
18600 if (isNullConstant(Idx) && !X86::mayFoldIntoZeroExtend(Op) &&
18601 !X86::mayFoldIntoStore(Op))
18602 return DAG.getNode(ISD::TRUNCATE, dl, MVT::i8,
18603 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32,
18604 DAG.getBitcast(MVT::v4i32, Vec), Idx));
18605
18606 unsigned IdxVal = Idx->getAsZExtVal();
18607 SDValue Extract = DAG.getNode(X86ISD::PEXTRB, dl, MVT::i32, Vec,
18608 DAG.getTargetConstant(IdxVal, dl, MVT::i8));
18609 return DAG.getNode(ISD::TRUNCATE, dl, VT, Extract);
18610 }
18611
18612 if (VT == MVT::f32) {
18613 // EXTRACTPS outputs to a GPR32 register which will require a movd to copy
18614 // the result back to FR32 register. It's only worth matching if the
18615 // result has a single use which is a store or a bitcast to i32. And in
18616 // the case of a store, it's not worth it if the index is a constant 0,
18617 // because a MOVSSmr can be used instead, which is smaller and faster.
18618 if (!Op.hasOneUse())
18619 return SDValue();
18620 SDNode *User = *Op.getNode()->user_begin();
18621 if ((User->getOpcode() != ISD::STORE || isNullConstant(Idx)) &&
18622 (User->getOpcode() != ISD::BITCAST ||
18623 User->getValueType(0) != MVT::i32))
18624 return SDValue();
18625 SDValue Extract = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32,
18626 DAG.getBitcast(MVT::v4i32, Vec), Idx);
18627 return DAG.getBitcast(MVT::f32, Extract);
18628 }
18629
18630 if (VT == MVT::i32 || VT == MVT::i64)
18631 return Op;
18632
18633 return SDValue();
18634}
18635
18636/// Extract one bit from mask vector, like v16i1 or v8i1.
18637/// AVX-512 feature.
18638 static SDValue ExtractBitFromMaskVector(SDValue Op, SelectionDAG &DAG,
18639 const X86Subtarget &Subtarget) {
18640 SDValue Vec = Op.getOperand(0);
18641 SDLoc dl(Vec);
18642 MVT VecVT = Vec.getSimpleValueType();
18643 SDValue Idx = Op.getOperand(1);
18644 auto* IdxC = dyn_cast<ConstantSDNode>(Idx);
18645 MVT EltVT = Op.getSimpleValueType();
18646
18647 assert((VecVT.getVectorNumElements() <= 16 || Subtarget.hasBWI()) &&
18648 "Unexpected vector type in ExtractBitFromMaskVector");
18649
18650 // A variable index can't be handled in mask registers,
18651 // so extend the vector to VR512/128.
18652 if (!IdxC) {
18653 unsigned NumElts = VecVT.getVectorNumElements();
18654 // Extending v8i1/v16i1 to 512-bit gets better performance on KNL
18655 // than extending to 128/256-bit.
18656 if (NumElts == 1) {
18657 Vec = widenMaskVector(Vec, false, Subtarget, DAG, dl);
18658 MVT IntVT = MVT::getIntegerVT(Vec.getValueType().getVectorNumElements());
18659 return DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, DAG.getBitcast(IntVT, Vec));
18660 }
18661 MVT ExtEltVT = (NumElts <= 8) ? MVT::getIntegerVT(128 / NumElts) : MVT::i8;
18662 MVT ExtVecVT = MVT::getVectorVT(ExtEltVT, NumElts);
18663 SDValue Ext = DAG.getNode(ISD::SIGN_EXTEND, dl, ExtVecVT, Vec);
18664 SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, ExtEltVT, Ext, Idx);
18665 return DAG.getNode(ISD::TRUNCATE, dl, EltVT, Elt);
18666 }
18667
18668 unsigned IdxVal = IdxC->getZExtValue();
18669 if (IdxVal == 0) // the operation is legal
18670 return Op;
18671
18672 // Extend to natively supported kshift.
18673 Vec = widenMaskVector(Vec, false, Subtarget, DAG, dl);
18674
18675 // Use kshiftr instruction to move to the lower element.
18676 Vec = DAG.getNode(X86ISD::KSHIFTR, dl, Vec.getSimpleValueType(), Vec,
18677 DAG.getTargetConstant(IdxVal, dl, MVT::i8));
18678
18679 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, Op.getValueType(), Vec,
18680 DAG.getVectorIdxConstant(0, dl));
18681}
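// Illustrative example: extracting bit 5 of a v16i1 mask becomes
// (extract_elt (kshiftr Vec, 5), 0), i.e. a KSHIFTRW by 5 followed by a
// read of the low mask bit.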
18682
18683// Helper to find all the extracted elements from a vector.
18684 static APInt getExtractedDemandedElts(SDNode *N) {
18685 MVT VT = N->getSimpleValueType(0);
18686 unsigned NumElts = VT.getVectorNumElements();
18687 APInt DemandedElts = APInt::getZero(NumElts);
18688 for (SDNode *User : N->users()) {
18689 switch (User->getOpcode()) {
18690 case X86ISD::PEXTRB:
18691 case X86ISD::PEXTRW:
18692 case ISD::EXTRACT_VECTOR_ELT:
18693 if (!isa<ConstantSDNode>(User->getOperand(1))) {
18694 DemandedElts.setAllBits();
18695 return DemandedElts;
18696 }
18697 DemandedElts.setBit(User->getConstantOperandVal(1));
18698 break;
18699 case ISD::BITCAST: {
18700 if (!User->getValueType(0).isSimple() ||
18701 !User->getValueType(0).isVector()) {
18702 DemandedElts.setAllBits();
18703 return DemandedElts;
18704 }
18705 APInt DemandedSrcElts = getExtractedDemandedElts(User);
18706 DemandedElts |= APIntOps::ScaleBitMask(DemandedSrcElts, NumElts);
18707 break;
18708 }
18709 default:
18710 DemandedElts.setAllBits();
18711 return DemandedElts;
18712 }
18713 }
18714 return DemandedElts;
18715}
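// Illustrative example: if a v16i8 node is only used by PEXTRB extractions
// of constant indices 0 and 3, the returned demanded-elements mask is
// 0b0000000000001001.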
18716
18717SDValue
18718X86TargetLowering::LowerEXTRACT_VECTOR_ELT(SDValue Op,
18719 SelectionDAG &DAG) const {
18720 SDLoc dl(Op);
18721 SDValue Vec = Op.getOperand(0);
18722 MVT VecVT = Vec.getSimpleValueType();
18723 SDValue Idx = Op.getOperand(1);
18724 auto* IdxC = dyn_cast<ConstantSDNode>(Idx);
18725
18726 if (VecVT.getVectorElementType() == MVT::i1)
18727 return ExtractBitFromMaskVector(Op, DAG, Subtarget);
18728
18729 if (!IdxC) {
18730 // It's more profitable to go through memory (1 cycle throughput)
18731 // than using VMOVD + VPERMV/PSHUFB sequence (2/3 cycles throughput)
18732 // IACA tool was used to get performance estimation
18733 // (https://software.intel.com/en-us/articles/intel-architecture-code-analyzer)
18734 //
18735 // example : extractelement <16 x i8> %a, i32 %i
18736 //
18737 // Block Throughput: 3.00 Cycles
18738 // Throughput Bottleneck: Port5
18739 //
18740 // | Num Of | Ports pressure in cycles | |
18741 // | Uops | 0 - DV | 5 | 6 | 7 | |
18742 // ---------------------------------------------
18743 // | 1 | | 1.0 | | | CP | vmovd xmm1, edi
18744 // | 1 | | 1.0 | | | CP | vpshufb xmm0, xmm0, xmm1
18745 // | 2 | 1.0 | 1.0 | | | CP | vpextrb eax, xmm0, 0x0
18746 // Total Num Of Uops: 4
18747 //
18748 //
18749 // Block Throughput: 1.00 Cycles
18750 // Throughput Bottleneck: PORT2_AGU, PORT3_AGU, Port4
18751 //
18752 // | | Ports pressure in cycles | |
18753 // |Uops| 1 | 2 - D |3 - D | 4 | 5 | |
18754 // ---------------------------------------------------------
18755 // |2^ | | 0.5 | 0.5 |1.0| |CP| vmovaps xmmword ptr [rsp-0x18], xmm0
18756 // |1 |0.5| | | |0.5| | lea rax, ptr [rsp-0x18]
18757 // |1 | |0.5, 0.5|0.5, 0.5| | |CP| mov al, byte ptr [rdi+rax*1]
18758 // Total Num Of Uops: 4
18759
18760 return SDValue();
18761 }
18762
18763 unsigned IdxVal = IdxC->getZExtValue();
18764
18765 // If this is a 256-bit vector result, first extract the 128-bit vector and
18766 // then extract the element from the 128-bit vector.
18767 if (VecVT.is256BitVector() || VecVT.is512BitVector()) {
18768 // Get the 128-bit vector.
18769 Vec = extract128BitVector(Vec, IdxVal, DAG, dl);
18770 MVT EltVT = VecVT.getVectorElementType();
18771
18772 unsigned ElemsPerChunk = 128 / EltVT.getSizeInBits();
18773 assert(isPowerOf2_32(ElemsPerChunk) && "Elements per chunk not power of 2");
18774
18775 // Find IdxVal modulo ElemsPerChunk. Since ElemsPerChunk is a power of 2
18776 // this can be done with a mask.
18777 IdxVal &= ElemsPerChunk - 1;
18778 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, Op.getValueType(), Vec,
18779 DAG.getVectorIdxConstant(IdxVal, dl));
18780 }
18781
18782 assert(VecVT.is128BitVector() && "Unexpected vector length");
18783
18784 MVT VT = Op.getSimpleValueType();
18785
18786 if (VT == MVT::i16) {
18787 // If IdxVal is 0, it's cheaper to do a move instead of a pextrw, unless
18788 // we're going to zero extend the register or fold the store (SSE41 only).
18789 if (IdxVal == 0 && !X86::mayFoldIntoZeroExtend(Op) &&
18790 !(Subtarget.hasSSE41() && X86::mayFoldIntoStore(Op))) {
18791 if (Subtarget.hasFP16())
18792 return Op;
18793
18794 return DAG.getNode(ISD::TRUNCATE, dl, MVT::i16,
18795 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32,
18796 DAG.getBitcast(MVT::v4i32, Vec), Idx));
18797 }
18798
18799 SDValue Extract = DAG.getNode(X86ISD::PEXTRW, dl, MVT::i32, Vec,
18800 DAG.getTargetConstant(IdxVal, dl, MVT::i8));
18801 return DAG.getNode(ISD::TRUNCATE, dl, VT, Extract);
18802 }
18803
18804 if (Subtarget.hasSSE41())
18805 if (SDValue Res = LowerEXTRACT_VECTOR_ELT_SSE4(Op, DAG))
18806 return Res;
18807
18808 // Only extract a single element from a v16i8 source - determine the common
18809 // DWORD/WORD that all extractions share, and extract the sub-byte.
18810 // TODO: Add QWORD MOVQ extraction?
18811 if (VT == MVT::i8) {
18812 APInt DemandedElts = getExtractedDemandedElts(Vec.getNode());
18813 assert(DemandedElts.getBitWidth() == 16 && "Vector width mismatch");
18814
18815 // Extract either the lowest i32 or any i16, and extract the sub-byte.
18816 int DWordIdx = IdxVal / 4;
18817 if (DWordIdx == 0 && DemandedElts == (DemandedElts & 15)) {
18818 SDValue Res = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32,
18819 DAG.getBitcast(MVT::v4i32, Vec),
18820 DAG.getVectorIdxConstant(DWordIdx, dl));
18821 int ShiftVal = (IdxVal % 4) * 8;
18822 if (ShiftVal != 0)
18823 Res = DAG.getNode(ISD::SRL, dl, MVT::i32, Res,
18824 DAG.getConstant(ShiftVal, dl, MVT::i8));
18825 return DAG.getNode(ISD::TRUNCATE, dl, VT, Res);
18826 }
18827
18828 int WordIdx = IdxVal / 2;
18829 if (DemandedElts == (DemandedElts & (3 << (WordIdx * 2)))) {
18830 SDValue Res = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i16,
18831 DAG.getBitcast(MVT::v8i16, Vec),
18832 DAG.getVectorIdxConstant(WordIdx, dl));
18833 int ShiftVal = (IdxVal % 2) * 8;
18834 if (ShiftVal != 0)
18835 Res = DAG.getNode(ISD::SRL, dl, MVT::i16, Res,
18836 DAG.getConstant(ShiftVal, dl, MVT::i8));
18837 return DAG.getNode(ISD::TRUNCATE, dl, VT, Res);
18838 }
18839 }
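// Illustrative example: if only byte 5 of a v16i8 source is demanded, the
// code above extracts word 2 of the v8i16 bitcast, shifts right by 8 and
// truncates, avoiding a PEXTRB on pre-SSE4.1 targets.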
18840
18841 if (VT == MVT::f16 || VT.getSizeInBits() == 32) {
18842 if (IdxVal == 0)
18843 return Op;
18844
18845 // Shuffle the element to the lowest element, then movss or movsh.
18846 SmallVector<int, 8> Mask(VecVT.getVectorNumElements(), -1);
18847 Mask[0] = static_cast<int>(IdxVal);
18848 Vec = DAG.getVectorShuffle(VecVT, dl, Vec, DAG.getUNDEF(VecVT), Mask);
18849 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, Vec,
18850 DAG.getVectorIdxConstant(0, dl));
18851 }
18852
18853 if (VT.getSizeInBits() == 64) {
18854 // FIXME: .td only matches this for <2 x f64>, not <2 x i64> on 32b
18855 // FIXME: seems like this should be unnecessary if mov{h,l}pd were taught
18856 // to match extract_elt for f64.
18857 if (IdxVal == 0)
18858 return Op;
18859
18860 // UNPCKHPD the element to the lowest double word, then movsd.
18861 // Note if the lower 64 bits of the result of the UNPCKHPD is then stored
18862 // to a f64mem, the whole operation is folded into a single MOVHPDmr.
18863 int Mask[2] = { 1, -1 };
18864 Vec = DAG.getVectorShuffle(VecVT, dl, Vec, DAG.getUNDEF(VecVT), Mask);
18865 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, Vec,
18866 DAG.getVectorIdxConstant(0, dl));
18867 }
18868
18869 return SDValue();
18870}
18871
18872/// Insert one bit to mask vector, like v16i1 or v8i1.
18873/// AVX-512 feature.
18874 static SDValue InsertBitToMaskVector(SDValue Op, SelectionDAG &DAG,
18875 const X86Subtarget &Subtarget) {
18876 SDLoc dl(Op);
18877 SDValue Vec = Op.getOperand(0);
18878 SDValue Elt = Op.getOperand(1);
18879 SDValue Idx = Op.getOperand(2);
18880 MVT VecVT = Vec.getSimpleValueType();
18881
18882 if (!isa<ConstantSDNode>(Idx)) {
18883 // Non-constant index: extend the source and destination,
18884 // insert the element, and then truncate the result.
18885 unsigned NumElts = VecVT.getVectorNumElements();
18886 MVT ExtEltVT = (NumElts <= 8) ? MVT::getIntegerVT(128 / NumElts) : MVT::i8;
18887 MVT ExtVecVT = MVT::getVectorVT(ExtEltVT, NumElts);
18888 SDValue ExtOp = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, ExtVecVT,
18889 DAG.getNode(ISD::SIGN_EXTEND, dl, ExtVecVT, Vec),
18890 DAG.getNode(ISD::SIGN_EXTEND, dl, ExtEltVT, Elt), Idx);
18891 return DAG.getNode(ISD::TRUNCATE, dl, VecVT, ExtOp);
18892 }
18893
18894 // Copy into a k-register, extract to v1i1 and insert_subvector.
18895 SDValue EltInVec = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v1i1, Elt);
18896 return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, VecVT, Vec, EltInVec, Idx);
18897}
18898
18899SDValue X86TargetLowering::LowerINSERT_VECTOR_ELT(SDValue Op,
18900 SelectionDAG &DAG) const {
18901 MVT VT = Op.getSimpleValueType();
18902 MVT EltVT = VT.getVectorElementType();
18903 unsigned NumElts = VT.getVectorNumElements();
18904 unsigned EltSizeInBits = EltVT.getScalarSizeInBits();
18905
18906 if (EltVT == MVT::i1)
18907 return InsertBitToMaskVector(Op, DAG, Subtarget);
18908
18909 SDLoc dl(Op);
18910 SDValue N0 = Op.getOperand(0);
18911 SDValue N1 = Op.getOperand(1);
18912 SDValue N2 = Op.getOperand(2);
18913 auto *N2C = dyn_cast<ConstantSDNode>(N2);
18914
18915 if (EltVT == MVT::bf16) {
18916 MVT IVT = VT.changeVectorElementTypeToInteger();
18917 SDValue Res = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, IVT,
18918 DAG.getBitcast(IVT, N0),
18919 DAG.getBitcast(MVT::i16, N1), N2);
18920 return DAG.getBitcast(VT, Res);
18921 }
18922
18923 if (!N2C) {
18924 // For variable insertion indices we're usually better off spilling to stack,
18925 // but AVX512 can use a variable compare+select by comparing against all
18926 // possible vector indices, and FP insertion has less gpr->simd traffic.
18927 if (!(Subtarget.hasBWI() ||
18928 (Subtarget.hasAVX512() && EltSizeInBits >= 32) ||
18929 (Subtarget.hasSSE41() && (EltVT == MVT::f32 || EltVT == MVT::f64))))
18930 return SDValue();
18931
18932 MVT IdxSVT = MVT::getIntegerVT(EltSizeInBits);
18933 MVT IdxVT = MVT::getVectorVT(IdxSVT, NumElts);
18934 if (!isTypeLegal(IdxSVT) || !isTypeLegal(IdxVT))
18935 return SDValue();
18936
18937 SDValue IdxExt = DAG.getZExtOrTrunc(N2, dl, IdxSVT);
18938 SDValue IdxSplat = DAG.getSplatBuildVector(IdxVT, dl, IdxExt);
18939 SDValue EltSplat = DAG.getSplatBuildVector(VT, dl, N1);
18940
18941 SmallVector<SDValue, 16> RawIndices;
18942 for (unsigned I = 0; I != NumElts; ++I)
18943 RawIndices.push_back(DAG.getConstant(I, dl, IdxSVT));
18944 SDValue Indices = DAG.getBuildVector(IdxVT, dl, RawIndices);
18945
18946 // inselt N0, N1, N2 --> select (SplatN2 == {0,1,2...}) ? SplatN1 : N0.
18947 return DAG.getSelectCC(dl, IdxSplat, Indices, EltSplat, N0,
18948 ISD::CondCode::SETEQ);
18949 }
18950
18951 if (N2C->getAPIntValue().uge(NumElts))
18952 return SDValue();
18953 uint64_t IdxVal = N2C->getZExtValue();
18954
18955 bool IsZeroElt = X86::isZeroNode(N1);
18956 bool IsAllOnesElt = VT.isInteger() && llvm::isAllOnesConstant(N1);
18957
18958 if (IsZeroElt || IsAllOnesElt) {
18959 // Lower insertion of v16i8/v32i8/v16i16 -1 elts as an 'OR' blend.
18960 // We don't deal with i8 0 since it appears to be handled elsewhere.
18961 if (IsAllOnesElt &&
18962 ((VT == MVT::v16i8 && !Subtarget.hasSSE41()) ||
18963 ((VT == MVT::v32i8 || VT == MVT::v16i16) && !Subtarget.hasInt256()))) {
18964 SDValue ZeroCst = DAG.getConstant(0, dl, VT.getScalarType());
18965 SDValue OnesCst = DAG.getAllOnesConstant(dl, VT.getScalarType());
18966 SmallVector<SDValue, 8> CstVectorElts(NumElts, ZeroCst);
18967 CstVectorElts[IdxVal] = OnesCst;
18968 SDValue CstVector = DAG.getBuildVector(VT, dl, CstVectorElts);
18969 return DAG.getNode(ISD::OR, dl, VT, N0, CstVector);
18970 }
18971 // See if we can do this more efficiently with a blend shuffle with a
18972 // rematerializable vector.
18973 if (Subtarget.hasSSE41() &&
18974 (EltSizeInBits >= 16 || (IsZeroElt && !VT.is128BitVector()))) {
18975 SmallVector<int, 8> BlendMask;
18976 for (unsigned i = 0; i != NumElts; ++i)
18977 BlendMask.push_back(i == IdxVal ? i + NumElts : i);
18978 SDValue CstVector = IsZeroElt ? getZeroVector(VT, Subtarget, DAG, dl)
18979 : getOnesVector(VT, DAG, dl);
18980 return DAG.getVectorShuffle(VT, dl, N0, CstVector, BlendMask);
18981 }
18982 }
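// Illustrative example: inserting -1 into lane 3 of a v16i8 without SSE4.1
// becomes (or N0, <0,0,0,255,0,...,0>): OR with zero preserves the other
// lanes while OR with all-ones forces lane 3 to -1.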
18983
18984 // If the vector is wider than 128 bits, extract the 128-bit subvector, insert
18985 // into that, and then insert the subvector back into the result.
18986 if (VT.is256BitVector() || VT.is512BitVector()) {
18987 // With a 256-bit vector, we can insert into the zero element efficiently
18988 // using a blend if we have AVX or AVX2 and the right data type.
18989 if (VT.is256BitVector() && IdxVal == 0) {
18990 // TODO: It is worthwhile to cast integer to floating point and back
18991 // and incur a domain crossing penalty if that's what we'll end up
18992 // doing anyway after extracting to a 128-bit vector.
18993 if ((Subtarget.hasAVX() && (EltVT == MVT::f64 || EltVT == MVT::f32)) ||
18994 (Subtarget.hasAVX2() && (EltVT == MVT::i32 || EltVT == MVT::i64))) {
18995 SDValue N1Vec = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, N1);
18996 return DAG.getNode(X86ISD::BLENDI, dl, VT, N0, N1Vec,
18997 DAG.getTargetConstant(1, dl, MVT::i8));
18998 }
18999 }
19000
19001 unsigned NumEltsIn128 = 128 / EltSizeInBits;
19002 assert(isPowerOf2_32(NumEltsIn128) &&
19003 "Vectors will always have power-of-two number of elements.");
19004
19005 // If we are not inserting into the low 128-bit vector chunk,
19006 // then prefer the broadcast+blend sequence.
19007 // FIXME: relax the profitability check iff all N1 uses are insertions.
19008 if (IdxVal >= NumEltsIn128 &&
19009 ((Subtarget.hasAVX2() && EltSizeInBits != 8) ||
19010 (Subtarget.hasAVX() && (EltSizeInBits >= 32) &&
19011 X86::mayFoldLoad(N1, Subtarget)))) {
19012 SDValue N1SplatVec = DAG.getSplatBuildVector(VT, dl, N1);
19013 SmallVector<int, 8> BlendMask;
19014 for (unsigned i = 0; i != NumElts; ++i)
19015 BlendMask.push_back(i == IdxVal ? i + NumElts : i);
19016 return DAG.getVectorShuffle(VT, dl, N0, N1SplatVec, BlendMask);
19017 }
19018
19019 // Get the desired 128-bit vector chunk.
19020 SDValue V = extract128BitVector(N0, IdxVal, DAG, dl);
19021
19022 // Insert the element into the desired chunk.
19023 // Since NumEltsIn128 is a power of 2 we can use mask instead of modulo.
19024 unsigned IdxIn128 = IdxVal & (NumEltsIn128 - 1);
19025
19026 V = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, V.getValueType(), V, N1,
19027 DAG.getVectorIdxConstant(IdxIn128, dl));
19028
19029 // Insert the changed part back into the bigger vector
19030 return insert128BitVector(N0, V, IdxVal, DAG, dl);
19031 }
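// Illustrative example: inserting a scalar into lane 9 of a v16i16 with AVX2
// takes the broadcast+blend path above: splat the scalar (e.g. VPBROADCASTW)
// and blend it in with a mask that picks lane 9 from the splat, avoiding an
// extract/insert of the upper 128-bit half.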
19032 assert(VT.is128BitVector() && "Only 128-bit vector types should be left!");
19033
19034 // This will be just movw/movd/movq/movsh/movss/movsd.
19035 if (IdxVal == 0 && ISD::isBuildVectorAllZeros(N0.getNode())) {
19036 if (EltVT == MVT::i32 || EltVT == MVT::f32 || EltVT == MVT::f64 ||
19037 EltVT == MVT::f16 || EltVT == MVT::i64) {
19038 N1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, N1);
19039 return getShuffleVectorZeroOrUndef(N1, 0, true, Subtarget, DAG);
19040 }
19041
19042 // We can't directly insert an i8 or i16 into a vector, so zero extend
19043 // it to i32 first.
19044 if (EltVT == MVT::i16 || EltVT == MVT::i8) {
19045 N1 = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, N1);
19046 MVT ShufVT = MVT::getVectorVT(MVT::i32, VT.getSizeInBits() / 32);
19047 N1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, ShufVT, N1);
19048 N1 = getShuffleVectorZeroOrUndef(N1, 0, true, Subtarget, DAG);
19049 return DAG.getBitcast(VT, N1);
19050 }
19051 }
19052
19053 // Transform it so it matches pinsr{b,w}, which expects a GR32 as its second
19054 // argument. SSE41 required for pinsrb.
19055 if (VT == MVT::v8i16 || (VT == MVT::v16i8 && Subtarget.hasSSE41())) {
19056 unsigned Opc;
19057 if (VT == MVT::v8i16) {
19058 assert(Subtarget.hasSSE2() && "SSE2 required for PINSRW");
19059 Opc = X86ISD::PINSRW;
19060 } else {
19061 assert(VT == MVT::v16i8 && "PINSRB requires v16i8 vector");
19062 assert(Subtarget.hasSSE41() && "SSE41 required for PINSRB");
19063 Opc = X86ISD::PINSRB;
19064 }
19065
19066 assert(N1.getValueType() != MVT::i32 && "Unexpected VT");
19067 N1 = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, N1);
19068 N2 = DAG.getTargetConstant(IdxVal, dl, MVT::i8);
19069 return DAG.getNode(Opc, dl, VT, N0, N1, N2);
19070 }
19071
19072 if (Subtarget.hasSSE41()) {
19073 if (EltVT == MVT::f32) {
19074 // Bits [7:6] of the constant are the source select. This will always be
19075 // zero here. The DAG Combiner may combine an extract_elt index into
19076 // these bits. For example (insert (extract, 3), 2) could be matched by
19077 // putting the '3' into bits [7:6] of X86ISD::INSERTPS.
19078 // Bits [5:4] of the constant are the destination select. This is the
19079 // value of the incoming immediate.
19080 // Bits [3:0] of the constant are the zero mask. The DAG Combiner may
19081 // combine either bitwise AND or insert of float 0.0 to set these bits.
19082
19083 bool MinSize = DAG.getMachineFunction().getFunction().hasMinSize();
19084 if (IdxVal == 0 && (!MinSize || !X86::mayFoldLoad(N1, Subtarget))) {
19085 // If this is an insertion of 32-bits into the low 32-bits of
19086 // a vector, we prefer to generate a blend with immediate rather
19087 // than an insertps. Blends are simpler operations in hardware and so
19088 // will always have equal or better performance than insertps.
19089 // But if optimizing for size and there's a load folding opportunity,
19090 // generate insertps because blendps does not have a 32-bit memory
19091 // operand form.
19092 N1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4f32, N1);
19093 return DAG.getNode(X86ISD::BLENDI, dl, VT, N0, N1,
19094 DAG.getTargetConstant(1, dl, MVT::i8));
19095 }
19096 // Create this as a scalar to vector.
19097 N1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4f32, N1);
19098 return DAG.getNode(X86ISD::INSERTPS, dl, VT, N0, N1,
19099 DAG.getTargetConstant(IdxVal << 4, dl, MVT::i8));
19100 }
19101
19102 // PINSR* works with constant index.
19103 if (EltVT == MVT::i32 || EltVT == MVT::i64)
19104 return Op;
19105 }
19106
19107 return SDValue();
19108}
19109
19110 static SDValue LowerSCALAR_TO_VECTOR(SDValue Op, const X86Subtarget &Subtarget,
19111 SelectionDAG &DAG) {
19112 SDLoc dl(Op);
19113 MVT OpVT = Op.getSimpleValueType();
19114
19115 // It's always cheaper to replace a xor+movd with xorps, and it simplifies
19116 // further combines.
19117 if (X86::isZeroNode(Op.getOperand(0)))
19118 return getZeroVector(OpVT, Subtarget, DAG, dl);
19119
19120 // If this is a 256-bit vector result, first insert into a 128-bit
19121 // vector and then insert into the 256-bit vector.
19122 if (!OpVT.is128BitVector()) {
19123 // Insert into a 128-bit vector.
19124 unsigned SizeFactor = OpVT.getSizeInBits() / 128;
19125 MVT VT128 = MVT::getVectorVT(OpVT.getVectorElementType(),
19126 OpVT.getVectorNumElements() / SizeFactor);
19127
19128 Op = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT128, Op.getOperand(0));
19129
19130 // Insert the 128-bit vector.
19131 return insert128BitVector(DAG.getUNDEF(OpVT), Op, 0, DAG, dl);
19132 }
19133 assert(OpVT.is128BitVector() && OpVT.isInteger() && OpVT != MVT::v2i64 &&
19134 "Expected an SSE type!");
19135
19136 // Pass through a v4i32 or V8i16 SCALAR_TO_VECTOR as that's what we use in
19137 // tblgen.
19138 if (OpVT == MVT::v4i32 || (OpVT == MVT::v8i16 && Subtarget.hasFP16()))
19139 return Op;
19140
19141 SDValue AnyExt = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, Op.getOperand(0));
19142 return DAG.getBitcast(
19143 OpVT, DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, AnyExt));
19144}
19145
19146// Lower a node with an INSERT_SUBVECTOR opcode. This may result in a
19147// simple superregister reference or explicit instructions to insert
19148// the upper bits of a vector.
19149 static SDValue LowerINSERT_SUBVECTOR(SDValue Op, const X86Subtarget &Subtarget,
19150 SelectionDAG &DAG) {
19151 assert(Op.getSimpleValueType().getVectorElementType() == MVT::i1);
19152
19153 return insert1BitVector(Op, DAG, Subtarget);
19154}
19155
19156 static SDValue LowerEXTRACT_SUBVECTOR(SDValue Op, const X86Subtarget &Subtarget,
19157 SelectionDAG &DAG) {
19158 assert(Op.getSimpleValueType().getVectorElementType() == MVT::i1 &&
19159 "Only vXi1 extract_subvectors need custom lowering");
19160
19161 SDLoc dl(Op);
19162 SDValue Vec = Op.getOperand(0);
19163 uint64_t IdxVal = Op.getConstantOperandVal(1);
19164
19165 if (IdxVal == 0) // the operation is legal
19166 return Op;
19167
19168 // Extend to natively supported kshift.
19169 Vec = widenMaskVector(Vec, false, Subtarget, DAG, dl);
19170
19171 // Shift to the LSB.
19172 Vec = DAG.getNode(X86ISD::KSHIFTR, dl, Vec.getSimpleValueType(), Vec,
19173 DAG.getTargetConstant(IdxVal, dl, MVT::i8));
19174
19175 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, Op.getValueType(), Vec,
19176 DAG.getVectorIdxConstant(0, dl));
19177}
19178
19179// Returns the appropriate wrapper opcode for a global reference.
19180unsigned X86TargetLowering::getGlobalWrapperKind(
19181 const GlobalValue *GV, const unsigned char OpFlags) const {
19182 // References to absolute symbols are never PC-relative.
19183 if (GV && GV->isAbsoluteSymbolRef())
19184 return X86ISD::Wrapper;
19185
19186 // The following OpFlags under RIP-rel PIC use RIP.
19187 if (Subtarget.isPICStyleRIPRel() &&
19188 (OpFlags == X86II::MO_NO_FLAG || OpFlags == X86II::MO_COFFSTUB ||
19189 OpFlags == X86II::MO_DLLIMPORT))
19190 return X86ISD::WrapperRIP;
19191
19192 // GOTPCREL references must always use RIP.
19193 if (OpFlags == X86II::MO_GOTPCREL || OpFlags == X86II::MO_GOTPCREL_NORELAX)
19194 return X86ISD::WrapperRIP;
19195
19196 return X86ISD::Wrapper;
19197}
19198
19199// ConstantPool, JumpTable, GlobalAddress, and ExternalSymbol are lowered as
19200 // their target counterparts wrapped in the X86ISD::Wrapper node. Suppose N is
19201 // one of the above-mentioned nodes. It has to be wrapped because otherwise
19202 // Select(N) returns N. So the raw TargetGlobalAddress nodes, etc. can only
19203 // be used to form an addressing mode. These wrapped nodes will be selected
19204// into MOV32ri.
19205SDValue
19206X86TargetLowering::LowerConstantPool(SDValue Op, SelectionDAG &DAG) const {
19207 ConstantPoolSDNode *CP = cast<ConstantPoolSDNode>(Op);
19208
19209 // In PIC mode (unless we're in RIPRel PIC mode) we add an offset to the
19210 // global base reg.
19211 unsigned char OpFlag = Subtarget.classifyLocalReference(nullptr);
19212
19213 auto PtrVT = getPointerTy(DAG.getDataLayout());
19214 SDValue Result = DAG.getTargetConstantPool(
19215 CP->getConstVal(), PtrVT, CP->getAlign(), CP->getOffset(), OpFlag);
19216 SDLoc DL(CP);
19217 Result =
19218 DAG.getNode(getGlobalWrapperKind(nullptr, OpFlag), DL, PtrVT, Result);
19219 // With PIC, the address is actually $g + Offset.
19220 if (OpFlag) {
19221 Result =
19222 DAG.getNode(ISD::ADD, DL, PtrVT,
19223 DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), PtrVT), Result);
19224 }
19225
19226 return Result;
19227}
19228
19229SDValue X86TargetLowering::LowerJumpTable(SDValue Op, SelectionDAG &DAG) const {
19230 JumpTableSDNode *JT = cast<JumpTableSDNode>(Op);
19231
19232 // In PIC mode (unless we're in RIPRel PIC mode) we add an offset to the
19233 // global base reg.
19234 unsigned char OpFlag = Subtarget.classifyLocalReference(nullptr);
19235
19236 EVT PtrVT = Op.getValueType();
19237 SDValue Result = DAG.getTargetJumpTable(JT->getIndex(), PtrVT, OpFlag);
19238 SDLoc DL(JT);
19239 Result =
19240 DAG.getNode(getGlobalWrapperKind(nullptr, OpFlag), DL, PtrVT, Result);
19241
19242 // With PIC, the address is actually $g + Offset.
19243 if (OpFlag)
19244 Result =
19245 DAG.getNode(ISD::ADD, DL, PtrVT,
19246 DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), PtrVT), Result);
19247
19248 return Result;
19249}
19250
19251SDValue X86TargetLowering::LowerExternalSymbol(SDValue Op,
19252 SelectionDAG &DAG) const {
19253 return LowerGlobalOrExternal(Op, DAG, /*ForCall=*/false, nullptr);
19254}
19255
19256SDValue
19257X86TargetLowering::LowerBlockAddress(SDValue Op, SelectionDAG &DAG) const {
19258 // Create the TargetBlockAddressAddress node.
19259 unsigned char OpFlags =
19260 Subtarget.classifyBlockAddressReference();
19261 const BlockAddress *BA = cast<BlockAddressSDNode>(Op)->getBlockAddress();
19262 int64_t Offset = cast<BlockAddressSDNode>(Op)->getOffset();
19263 SDLoc dl(Op);
19264 EVT PtrVT = Op.getValueType();
19265 SDValue Result = DAG.getTargetBlockAddress(BA, PtrVT, Offset, OpFlags);
19266 Result =
19267 DAG.getNode(getGlobalWrapperKind(nullptr, OpFlags), dl, PtrVT, Result);
19268
19269 // With PIC, the address is actually $g + Offset.
19270 if (isGlobalRelativeToPICBase(OpFlags)) {
19271 Result = DAG.getNode(ISD::ADD, dl, PtrVT,
19272 DAG.getNode(X86ISD::GlobalBaseReg, dl, PtrVT), Result);
19273 }
19274
19275 return Result;
19276}
19277
19278/// Creates target global address or external symbol nodes for calls or
19279/// other uses.
19280SDValue X86TargetLowering::LowerGlobalOrExternal(SDValue Op, SelectionDAG &DAG,
19281 bool ForCall,
19282 bool *IsImpCall) const {
19283 // Unpack the global address or external symbol.
19284 SDLoc dl(Op);
19285 const GlobalValue *GV = nullptr;
19286 int64_t Offset = 0;
19287 const char *ExternalSym = nullptr;
19288 if (const auto *G = dyn_cast<GlobalAddressSDNode>(Op)) {
19289 GV = G->getGlobal();
19290 Offset = G->getOffset();
19291 } else {
19292 const auto *ES = cast<ExternalSymbolSDNode>(Op);
19293 ExternalSym = ES->getSymbol();
19294 }
19295
19296 // Calculate some flags for address lowering.
19297 const Module &Mod = *DAG.getMachineFunction().getFunction().getParent();
19298 unsigned char OpFlags;
19299 if (ForCall)
19300 OpFlags = Subtarget.classifyGlobalFunctionReference(GV, Mod);
19301 else
19302 OpFlags = Subtarget.classifyGlobalReference(GV, Mod);
19303 bool HasPICReg = isGlobalRelativeToPICBase(OpFlags);
19304 bool NeedsLoad = isGlobalStubReference(OpFlags);
19305
19306 CodeModel::Model M = DAG.getTarget().getCodeModel();
19307 EVT PtrVT = Op.getValueType();
19308 SDValue Result;
19309
19310 if (GV) {
19311 // Create a target global address if this is a global. If possible, fold the
19312 // offset into the global address reference. Otherwise, ADD it on later.
19313 // Suppress the folding if Offset is negative: movl foo-1, %eax is not
19314 // allowed because if the address of foo is 0, the ELF R_X86_64_32
19315 // relocation will compute to a negative value, which is invalid.
19316 int64_t GlobalOffset = 0;
19317 if (OpFlags == X86II::MO_NO_FLAG && Offset >= 0 &&
19318 X86::isOffsetSuitableForCodeModel(Offset, M, true)) {
19319 std::swap(GlobalOffset, Offset);
19320 }
19321 Result = DAG.getTargetGlobalAddress(GV, dl, PtrVT, GlobalOffset, OpFlags);
19322 } else {
19323 // If this is not a global address, this must be an external symbol.
19324 Result = DAG.getTargetExternalSymbol(ExternalSym, PtrVT, OpFlags);
19325 }
19326
19327 // If this is a direct call, avoid the wrapper if we don't need to do any
19328 // loads or adds. This allows SDAG ISel to match direct calls.
19329 if (ForCall && !NeedsLoad && !HasPICReg && Offset == 0)
19330 return Result;
19331
19332 // If Import Call Optimization is enabled and this is an imported function
19333 // then make a note of it and return the global address without wrapping.
19334 if (IsImpCall && (OpFlags == X86II::MO_DLLIMPORT) &&
19335 Mod.getModuleFlag("import-call-optimization")) {
19336 assert(ForCall && "Should only enable import call optimization if we are "
19337 "lowering a call");
19338 *IsImpCall = true;
19339 return Result;
19340 }
19341
19342 Result = DAG.getNode(getGlobalWrapperKind(GV, OpFlags), dl, PtrVT, Result);
19343
19344 // With PIC, the address is actually $g + Offset.
19345 if (HasPICReg) {
19346 Result = DAG.getNode(ISD::ADD, dl, PtrVT,
19347 DAG.getNode(X86ISD::GlobalBaseReg, dl, PtrVT), Result);
19348 }
19349
19350 // For globals that require a load from a stub to get the address, emit the
19351 // load.
19352 if (NeedsLoad)
19353 Result = DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), Result,
19354 MachinePointerInfo::getGOT(DAG.getMachineFunction()));
19355
19356 // If there was a non-zero offset that we didn't fold, create an explicit
19357 // addition for it.
19358 if (Offset != 0)
19359 Result = DAG.getNode(ISD::ADD, dl, PtrVT, Result,
19360 DAG.getSignedConstant(Offset, dl, PtrVT));
19361
19362 return Result;
19363}
19364
19365SDValue
19366X86TargetLowering::LowerGlobalAddress(SDValue Op, SelectionDAG &DAG) const {
19367 return LowerGlobalOrExternal(Op, DAG, /*ForCall=*/false, nullptr);
19368}
19369
19370 static SDValue GetTLSADDR(SelectionDAG &DAG, GlobalAddressSDNode *GA,
19371 const EVT PtrVT, unsigned ReturnReg,
19372 unsigned char OperandFlags,
19373 bool LoadGlobalBaseReg = false,
19374 bool LocalDynamic = false) {
19375 MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
19376 SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
19377 SDLoc dl(GA);
19378 SDValue TGA;
19379 bool UseTLSDESC = DAG.getTarget().useTLSDESC();
19380 SDValue Chain = DAG.getEntryNode();
19381 SDValue Ret;
19382 if (LocalDynamic && UseTLSDESC) {
19383 TGA = DAG.getTargetExternalSymbol("_TLS_MODULE_BASE_", PtrVT, OperandFlags);
19384 // Reuse existing GetTLSADDR node if we can find it.
19385 if (TGA->hasOneUse()) {
19386 // TLSDESC uses TGA.
19387 SDNode *TLSDescOp = *TGA->user_begin();
19388 assert(TLSDescOp->getOpcode() == X86ISD::TLSDESC &&
19389 "Unexpected TLSDESC DAG");
19390 // CALLSEQ_END uses TGA via a chain and glue.
19391 auto *CallSeqEndOp = TLSDescOp->getGluedUser();
19392 assert(CallSeqEndOp && CallSeqEndOp->getOpcode() == ISD::CALLSEQ_END &&
19393 "Unexpected TLSDESC DAG");
19394 // CopyFromReg uses CALLSEQ_END via a chain and glue.
19395 auto *CopyFromRegOp = CallSeqEndOp->getGluedUser();
19396 assert(CopyFromRegOp && CopyFromRegOp->getOpcode() == ISD::CopyFromReg &&
19397 "Unexpected TLSDESC DAG");
19398 Ret = SDValue(CopyFromRegOp, 0);
19399 }
19400 } else {
19401 TGA = DAG.getTargetGlobalAddress(GA->getGlobal(), dl, GA->getValueType(0),
19402 GA->getOffset(), OperandFlags);
19403 }
19404
19405 if (!Ret) {
19406 X86ISD::NodeType CallType = UseTLSDESC ? X86ISD::TLSDESC
19407 : LocalDynamic ? X86ISD::TLSBASEADDR
19408 : X86ISD::TLSADDR;
19409
19410 Chain = DAG.getCALLSEQ_START(Chain, 0, 0, dl);
19411 if (LoadGlobalBaseReg) {
19412 SDValue InGlue;
19413 Chain = DAG.getCopyToReg(Chain, dl, X86::EBX,
19414 DAG.getNode(X86ISD::GlobalBaseReg, dl, PtrVT),
19415 InGlue);
19416 InGlue = Chain.getValue(1);
19417 Chain = DAG.getNode(CallType, dl, NodeTys, {Chain, TGA, InGlue});
19418 } else {
19419 Chain = DAG.getNode(CallType, dl, NodeTys, {Chain, TGA});
19420 }
19421 Chain = DAG.getCALLSEQ_END(Chain, 0, 0, Chain.getValue(1), dl);
19422
19423 // TLSADDR will be codegen'ed as call. Inform MFI that function has calls.
19424 MFI.setHasCalls(true);
19425
19426 SDValue Glue = Chain.getValue(1);
19427 Ret = DAG.getCopyFromReg(Chain, dl, ReturnReg, PtrVT, Glue);
19428 }
19429
19430 if (!UseTLSDESC)
19431 return Ret;
19432
19433 const X86Subtarget &Subtarget = DAG.getSubtarget<X86Subtarget>();
19434 unsigned Seg = Subtarget.is64Bit() ? X86AS::FS : X86AS::GS;
19435
19437 SDValue Offset =
19438 DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), DAG.getIntPtrConstant(0, dl),
19440 return DAG.getNode(ISD::ADD, dl, PtrVT, Ret, Offset);
19441}
19442
19443// Lower ISD::GlobalTLSAddress using the "general dynamic" model, 32 bit
19444static SDValue
19445 LowerToTLSGeneralDynamicModel32(GlobalAddressSDNode *GA, SelectionDAG &DAG,
19446 const EVT PtrVT) {
19447 return GetTLSADDR(DAG, GA, PtrVT, X86::EAX, X86II::MO_TLSGD,
19448 /*LoadGlobalBaseReg=*/true);
19449}
19450
19451// Lower ISD::GlobalTLSAddress using the "general dynamic" model, 64 bit LP64
19452static SDValue
19454 const EVT PtrVT) {
19455 return GetTLSADDR(DAG, GA, PtrVT, X86::RAX, X86II::MO_TLSGD);
19456}
19457
19458// Lower ISD::GlobalTLSAddress using the "general dynamic" model, 64 bit ILP32
19459static SDValue
19460 LowerToTLSGeneralDynamicModelX32(GlobalAddressSDNode *GA, SelectionDAG &DAG,
19461 const EVT PtrVT) {
19462 return GetTLSADDR(DAG, GA, PtrVT, X86::EAX, X86II::MO_TLSGD);
19463}
19464
19465 static SDValue LowerToTLSLocalDynamicModel(GlobalAddressSDNode *GA,
19466 SelectionDAG &DAG, const EVT PtrVT,
19467 bool Is64Bit, bool Is64BitLP64) {
19468 SDLoc dl(GA);
19469
19470 // Get the start address of the TLS block for this module.
19471 X86MachineFunctionInfo *MFI =
19472 DAG.getMachineFunction().getInfo<X86MachineFunctionInfo>();
19473 MFI->incNumLocalDynamicTLSAccesses();
19474
19475 SDValue Base;
19476 if (Is64Bit) {
19477 unsigned ReturnReg = Is64BitLP64 ? X86::RAX : X86::EAX;
19478 Base = GetTLSADDR(DAG, GA, PtrVT, ReturnReg, X86II::MO_TLSLD,
19479 /*LoadGlobalBaseReg=*/false,
19480 /*LocalDynamic=*/true);
19481 } else {
19482 Base = GetTLSADDR(DAG, GA, PtrVT, X86::EAX, X86II::MO_TLSLDM,
19483 /*LoadGlobalBaseReg=*/true,
19484 /*LocalDynamic=*/true);
19485 }
19486
19487 // Note: the CleanupLocalDynamicTLSPass will remove redundant computations
19488 // of Base.
19489
19490 // Build x@dtpoff.
19491 unsigned char OperandFlags = X86II::MO_DTPOFF;
19492 unsigned WrapperKind = X86ISD::Wrapper;
19493 SDValue TGA = DAG.getTargetGlobalAddress(GA->getGlobal(), dl,
19494 GA->getValueType(0),
19495 GA->getOffset(), OperandFlags);
19496 SDValue Offset = DAG.getNode(WrapperKind, dl, PtrVT, TGA);
19497
19498 // Add x@dtpoff with the base.
19499 return DAG.getNode(ISD::ADD, dl, PtrVT, Offset, Base);
19500}
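// Roughly, on x86-64 ELF with the traditional (non-TLSDESC) model this emits
// the usual local-dynamic sequence (illustrative):
//   leaq  x@tlsld(%rip), %rdi
//   callq __tls_get_addr@PLT   # TLS base of this module in %rax
//   leaq  x@dtpoff(%rax), %rax # add the variable's offset within the block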
19501
19502// Lower ISD::GlobalTLSAddress using the "initial exec" or "local exec" model.
19503 static SDValue LowerToTLSExecModel(GlobalAddressSDNode *GA, SelectionDAG &DAG,
19504 const EVT PtrVT, TLSModel::Model model,
19505 bool is64Bit, bool isPIC) {
19506 SDLoc dl(GA);
19507
19508 // Get the Thread Pointer, which is %gs:0 (32-bit) or %fs:0 (64-bit).
19509 Value *Ptr = Constant::getNullValue(
19510 PointerType::get(*DAG.getContext(), is64Bit ? X86AS::FS : X86AS::GS));
19511
19512 SDValue ThreadPointer =
19513 DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), DAG.getIntPtrConstant(0, dl),
19514 MachinePointerInfo(Ptr));
19515
19516 unsigned char OperandFlags = 0;
19517 // Most TLS accesses are not RIP relative, even on x86-64. One exception is
19518 // initialexec.
19519 unsigned WrapperKind = X86ISD::Wrapper;
19520 if (model == TLSModel::LocalExec) {
19521 OperandFlags = is64Bit ? X86II::MO_TPOFF : X86II::MO_NTPOFF;
19522 } else if (model == TLSModel::InitialExec) {
19523 if (is64Bit) {
19524 OperandFlags = X86II::MO_GOTTPOFF;
19525 WrapperKind = X86ISD::WrapperRIP;
19526 } else {
19527 OperandFlags = isPIC ? X86II::MO_GOTNTPOFF : X86II::MO_INDNTPOFF;
19528 }
19529 } else {
19530 llvm_unreachable("Unexpected model");
19531 }
19532
19533 // emit "addl x@ntpoff,%eax" (local exec)
19534 // or "addl x@indntpoff,%eax" (initial exec)
19535 // or "addl x@gotntpoff(%ebx) ,%eax" (initial exec, 32-bit pic)
19536 SDValue TGA =
19537 DAG.getTargetGlobalAddress(GA->getGlobal(), dl, GA->getValueType(0),
19538 GA->getOffset(), OperandFlags);
19539 SDValue Offset = DAG.getNode(WrapperKind, dl, PtrVT, TGA);
19540
19541 if (model == TLSModel::InitialExec) {
19542 if (isPIC && !is64Bit) {
19543 Offset = DAG.getNode(ISD::ADD, dl, PtrVT,
19544 DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), PtrVT),
19545 Offset);
19546 }
19547
19548 Offset = DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), Offset,
19549 MachinePointerInfo::getGOT(DAG.getMachineFunction()));
19550 }
19551
19552 // The address of the thread local variable is the add of the thread
19553 // pointer with the offset of the variable.
19554 return DAG.getNode(ISD::ADD, dl, PtrVT, ThreadPointer, Offset);
19555}
19556
19557SDValue
19558X86TargetLowering::LowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const {
19559
19560 GlobalAddressSDNode *GA = cast<GlobalAddressSDNode>(Op);
19561
19562 if (DAG.getTarget().useEmulatedTLS())
19563 return LowerToTLSEmulatedModel(GA, DAG);
19564
19565 const GlobalValue *GV = GA->getGlobal();
19566 EVT PtrVT = Op.getValueType();
19567 bool PositionIndependent = isPositionIndependent();
19568
19569 if (Subtarget.isTargetELF()) {
19570 TLSModel::Model model = DAG.getTarget().getTLSModel(GV);
19571 switch (model) {
19572 case TLSModel::GeneralDynamic:
19573 if (Subtarget.is64Bit()) {
19574 if (Subtarget.isTarget64BitLP64())
19575 return LowerToTLSGeneralDynamicModel64(GA, DAG, PtrVT);
19576 return LowerToTLSGeneralDynamicModelX32(GA, DAG, PtrVT);
19577 }
19578 return LowerToTLSGeneralDynamicModel32(GA, DAG, PtrVT);
19579 case TLSModel::LocalDynamic:
19580 return LowerToTLSLocalDynamicModel(GA, DAG, PtrVT, Subtarget.is64Bit(),
19581 Subtarget.isTarget64BitLP64());
19582 case TLSModel::InitialExec:
19583 case TLSModel::LocalExec:
19584 return LowerToTLSExecModel(GA, DAG, PtrVT, model, Subtarget.is64Bit(),
19585 PositionIndependent);
19586 }
19587 llvm_unreachable("Unknown TLS model.");
19588 }
19589
19590 if (Subtarget.isTargetDarwin()) {
19591 // Darwin only has one model of TLS. Lower to that.
19592 unsigned char OpFlag = 0;
19593 unsigned WrapperKind = 0;
19594
19595 // In PIC mode (unless we're in RIPRel PIC mode) we add an offset to the
19596 // global base reg.
19597 bool PIC32 = PositionIndependent && !Subtarget.is64Bit();
19598 if (PIC32) {
19599 OpFlag = X86II::MO_TLVP_PIC_BASE;
19600 WrapperKind = X86ISD::Wrapper;
19601 } else {
19602 OpFlag = X86II::MO_TLVP;
19603 WrapperKind = X86ISD::WrapperRIP;
19604 }
19605 SDLoc DL(Op);
19606 SDValue Result = DAG.getTargetGlobalAddress(GA->getGlobal(), DL,
19607 GA->getValueType(0),
19608 GA->getOffset(), OpFlag);
19609 SDValue Offset = DAG.getNode(WrapperKind, DL, PtrVT, Result);
19610
19611 // With PIC32, the address is actually $g + Offset.
19612 if (PIC32)
19613 Offset = DAG.getNode(ISD::ADD, DL, PtrVT,
19614 DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), PtrVT),
19615 Offset);
19616
19617 // Lowering the machine isd will make sure everything is in the right
19618 // location.
19619 SDValue Chain = DAG.getEntryNode();
19620 SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
19621 Chain = DAG.getCALLSEQ_START(Chain, 0, 0, DL);
19622 SDValue Args[] = { Chain, Offset };
19623 Chain = DAG.getNode(X86ISD::TLSCALL, DL, NodeTys, Args);
19624 Chain = DAG.getCALLSEQ_END(Chain, 0, 0, Chain.getValue(1), DL);
19625
19626 // TLSCALL will be codegen'ed as call. Inform MFI that function has calls.
19627 MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
19628 MFI.setAdjustsStack(true);
19629
19630 // And our return value (tls address) is in the standard call return value
19631 // location.
19632 unsigned Reg = Subtarget.is64Bit() ? X86::RAX : X86::EAX;
19633 return DAG.getCopyFromReg(Chain, DL, Reg, PtrVT, Chain.getValue(1));
19634 }
19635
19636 if (Subtarget.isOSWindows()) {
19637 // Just use the implicit TLS architecture
19638 // Need to generate something similar to:
19639 // mov rdx, qword [gs:abs 58H]; Load pointer to ThreadLocalStorage
19640 // ; from TEB
19641 // mov ecx, dword [rel _tls_index]: Load index (from C runtime)
19642 // mov rcx, qword [rdx+rcx*8]
19643 // mov eax, .tls$:tlsvar
19644 // [rax+rcx] contains the address
19645 // Windows 64bit: gs:0x58
19646 // Windows 32bit: fs:__tls_array
19647
19648 SDLoc dl(GA);
19649 SDValue Chain = DAG.getEntryNode();
19650
19651 // Get the Thread Pointer, which is %fs:__tls_array (32-bit) or
19652 // %gs:0x58 (64-bit). On MinGW, __tls_array is not available, so directly
19653 // use its literal value of 0x2C.
19654 Value *Ptr = Constant::getNullValue(
19655 Subtarget.is64Bit() ? PointerType::get(*DAG.getContext(), X86AS::GS)
19656 : PointerType::get(*DAG.getContext(), X86AS::FS));
19657
19658 SDValue TlsArray = Subtarget.is64Bit()
19659 ? DAG.getIntPtrConstant(0x58, dl)
19660 : (Subtarget.isTargetWindowsGNU()
19661 ? DAG.getIntPtrConstant(0x2C, dl)
19662 : DAG.getExternalSymbol("_tls_array", PtrVT));
19663
19664 SDValue ThreadPointer =
19665 DAG.getLoad(PtrVT, dl, Chain, TlsArray, MachinePointerInfo(Ptr));
19666
19667 SDValue res;
19668 if (GV->getThreadLocalMode() == GlobalVariable::LocalExecTLSModel) {
19669 res = ThreadPointer;
19670 } else {
19671 // Load the _tls_index variable
19672 SDValue IDX = DAG.getExternalSymbol("_tls_index", PtrVT);
19673 if (Subtarget.is64Bit())
19674 IDX = DAG.getExtLoad(ISD::ZEXTLOAD, dl, PtrVT, Chain, IDX,
19675 MachinePointerInfo(), MVT::i32);
19676 else
19677 IDX = DAG.getLoad(PtrVT, dl, Chain, IDX, MachinePointerInfo());
19678
19679 const DataLayout &DL = DAG.getDataLayout();
19680 SDValue Scale =
19681 DAG.getConstant(Log2_64_Ceil(DL.getPointerSize()), dl, MVT::i8);
19682 IDX = DAG.getNode(ISD::SHL, dl, PtrVT, IDX, Scale);
19683
19684 res = DAG.getNode(ISD::ADD, dl, PtrVT, ThreadPointer, IDX);
19685 }
19686
19687 res = DAG.getLoad(PtrVT, dl, Chain, res, MachinePointerInfo());
19688
19689 // Get the offset of start of .tls section
19690 SDValue TGA = DAG.getTargetGlobalAddress(GA->getGlobal(), dl,
19691 GA->getValueType(0),
19692 GA->getOffset(), X86II::MO_SECREL);
19693 SDValue Offset = DAG.getNode(X86ISD::Wrapper, dl, PtrVT, TGA);
19694
19695 // The address of the thread local variable is the add of the thread
19696 // pointer with the offset of the variable.
19697 return DAG.getNode(ISD::ADD, dl, PtrVT, res, Offset);
19698 }
19699
19700 llvm_unreachable("TLS not implemented for this target.");
19701}
19702
19703 bool X86TargetLowering::addressingModeSupportsTLS(const GlobalValue &GV) const {
19704 if (Subtarget.is64Bit() && Subtarget.isTargetELF()) {
19705 const TargetMachine &TM = getTargetMachine();
19706 TLSModel::Model Model = TM.getTLSModel(&GV);
19707 switch (Model) {
19708 case TLSModel::LocalExec:
19709 case TLSModel::InitialExec:
19710 // We can include the %fs segment register in addressing modes.
19711 return true;
19712 case TLSModel::GeneralDynamic:
19713 case TLSModel::LocalDynamic:
19714 // These models do not result in %fs relative addresses unless
19715 // TLS descriptors are used.
19716 //
19717 // Even in the case of TLS descriptors we currently have no way to model
19718 // the difference between %fs access and the computations needed for the
19719 // offset; returning `true` for TLS-desc currently duplicates both,
19720 // which is detrimental :-/
19721 return false;
19722 }
19723 }
19724 return false;
19725}
19726
19727/// Lower SRA_PARTS and friends, which return two i32 values
19728/// and take a 2 x i32 value to shift plus a shift amount.
19729/// TODO: Can this be moved to general expansion code?
19731 SDValue Lo, Hi;
19732 DAG.getTargetLoweringInfo().expandShiftParts(Op.getNode(), Lo, Hi, DAG);
19733 return DAG.getMergeValues({Lo, Hi}, SDLoc(Op));
19734}
19735
19736// Try to use a packed vector operation to handle i64 on 32-bit targets when
19737// AVX512DQ is enabled.
19738 static SDValue LowerI64IntToFP_AVX512DQ(SDValue Op, const SDLoc &dl,
19739 SelectionDAG &DAG,
19740 const X86Subtarget &Subtarget) {
19741 assert((Op.getOpcode() == ISD::SINT_TO_FP ||
19742 Op.getOpcode() == ISD::STRICT_SINT_TO_FP ||
19743 Op.getOpcode() == ISD::STRICT_UINT_TO_FP ||
19744 Op.getOpcode() == ISD::UINT_TO_FP) &&
19745 "Unexpected opcode!");
19746 bool IsStrict = Op->isStrictFPOpcode();
19747 unsigned OpNo = IsStrict ? 1 : 0;
19748 SDValue Src = Op.getOperand(OpNo);
19749 MVT SrcVT = Src.getSimpleValueType();
19750 MVT VT = Op.getSimpleValueType();
19751
19752 if (!Subtarget.hasDQI() || SrcVT != MVT::i64 || Subtarget.is64Bit() ||
19753 (VT != MVT::f32 && VT != MVT::f64))
19754 return SDValue();
19755
19756 // Pack the i64 into a vector, do the operation and extract.
19757
19758 // Using 256-bit to ensure result is 128-bits for f32 case.
19759 unsigned NumElts = Subtarget.hasVLX() ? 4 : 8;
19760 MVT VecInVT = MVT::getVectorVT(MVT::i64, NumElts);
19761 MVT VecVT = MVT::getVectorVT(VT, NumElts);
19762
19763 SDValue InVec = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VecInVT, Src);
19764 if (IsStrict) {
19765 SDValue CvtVec = DAG.getNode(Op.getOpcode(), dl, {VecVT, MVT::Other},
19766 {Op.getOperand(0), InVec});
19767 SDValue Chain = CvtVec.getValue(1);
19768 SDValue Value = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, CvtVec,
19769 DAG.getVectorIdxConstant(0, dl));
19770 return DAG.getMergeValues({Value, Chain}, dl);
19771 }
19772
19773 SDValue CvtVec = DAG.getNode(Op.getOpcode(), dl, VecVT, InVec);
19774
19775 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, CvtVec,
19776 DAG.getVectorIdxConstant(0, dl));
19777}
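// Illustrative example: on a 32-bit target with AVX512DQ but no AVX512VL,
// (f64 (sint_to_fp i64 %x)) becomes: insert %x into lane 0 of a v8i64,
// convert the whole 512-bit vector (VCVTQQ2PD), then extract lane 0 of the
// v8f64 result.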
19778
19779// Try to use a packed vector operation to handle i64 on 32-bit targets.
19780 static SDValue LowerI64IntToFP16(SDValue Op, const SDLoc &dl, SelectionDAG &DAG,
19781 const X86Subtarget &Subtarget) {
19782 assert((Op.getOpcode() == ISD::SINT_TO_FP ||
19783 Op.getOpcode() == ISD::STRICT_SINT_TO_FP ||
19784 Op.getOpcode() == ISD::STRICT_UINT_TO_FP ||
19785 Op.getOpcode() == ISD::UINT_TO_FP) &&
19786 "Unexpected opcode!");
19787 bool IsStrict = Op->isStrictFPOpcode();
19788 SDValue Src = Op.getOperand(IsStrict ? 1 : 0);
19789 MVT SrcVT = Src.getSimpleValueType();
19790 MVT VT = Op.getSimpleValueType();
19791
19792 if (SrcVT != MVT::i64 || Subtarget.is64Bit() || VT != MVT::f16)
19793 return SDValue();
19794
19795 // Pack the i64 into a vector, do the operation and extract.
19796
19797 assert(Subtarget.hasFP16() && "Expected FP16");
19798
19799 SDValue InVec = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64, Src);
19800 if (IsStrict) {
19801 SDValue CvtVec = DAG.getNode(Op.getOpcode(), dl, {MVT::v2f16, MVT::Other},
19802 {Op.getOperand(0), InVec});
19803 SDValue Chain = CvtVec.getValue(1);
19804 SDValue Value = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, CvtVec,
19805 DAG.getVectorIdxConstant(0, dl));
19806 return DAG.getMergeValues({Value, Chain}, dl);
19807 }
19808
19809 SDValue CvtVec = DAG.getNode(Op.getOpcode(), dl, MVT::v2f16, InVec);
19810
19811 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, CvtVec,
19812 DAG.getVectorIdxConstant(0, dl));
19813}
19814
19815static bool useVectorCast(unsigned Opcode, MVT FromVT, MVT ToVT,
19816 const X86Subtarget &Subtarget) {
19817 switch (Opcode) {
19818 case ISD::SINT_TO_FP:
19819 // TODO: Handle wider types with AVX/AVX512.
19820 if (!Subtarget.hasSSE2() || FromVT != MVT::v4i32)
19821 return false;
19822 // CVTDQ2PS or (V)CVTDQ2PD
19823 return ToVT == MVT::v4f32 || (Subtarget.hasAVX() && ToVT == MVT::v4f64);
19824
19825 case ISD::UINT_TO_FP:
19826 // TODO: Handle wider types and i64 elements.
19827 if (!Subtarget.hasAVX512() || FromVT != MVT::v4i32)
19828 return false;
19829 // VCVTUDQ2PS or VCVTUDQ2PD
19830 return ToVT == MVT::v4f32 || ToVT == MVT::v4f64;
19831
19832 default:
19833 return false;
19834 }
19835}
19836
19837/// Given a scalar cast operation that is extracted from a vector, try to
19838/// vectorize the cast op followed by extraction. This will avoid an expensive
19839/// round-trip between XMM and GPR.
19840 static SDValue vectorizeExtractedCast(SDValue Cast, const SDLoc &DL,
19841 SelectionDAG &DAG,
19842 const X86Subtarget &Subtarget) {
19843 // TODO: This could be enhanced to handle smaller integer types by peeking
19844 // through an extend.
19845 SDValue Extract = Cast.getOperand(0);
19846 MVT DestVT = Cast.getSimpleValueType();
19847 if (Extract.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
19848 !isa<ConstantSDNode>(Extract.getOperand(1)))
19849 return SDValue();
19850
19851 // See if we have a 128-bit vector cast op for this type of cast.
19852 SDValue VecOp = Extract.getOperand(0);
19853 MVT FromVT = VecOp.getSimpleValueType();
19854 unsigned NumEltsInXMM = 128 / FromVT.getScalarSizeInBits();
19855 MVT Vec128VT = MVT::getVectorVT(FromVT.getScalarType(), NumEltsInXMM);
19856 MVT ToVT = MVT::getVectorVT(DestVT, NumEltsInXMM);
19857 if (!useVectorCast(Cast.getOpcode(), Vec128VT, ToVT, Subtarget))
19858 return SDValue();
19859
19860 // If we are extracting from a non-zero element, first shuffle the source
19861 // vector to allow extracting from element zero.
19862 if (!isNullConstant(Extract.getOperand(1))) {
19863 SmallVector<int, 16> Mask(FromVT.getVectorNumElements(), -1);
19864 Mask[0] = Extract.getConstantOperandVal(1);
19865 VecOp = DAG.getVectorShuffle(FromVT, DL, VecOp, DAG.getUNDEF(FromVT), Mask);
19866 }
19867 // If the source vector is wider than 128-bits, extract the low part. Do not
19868 // create an unnecessarily wide vector cast op.
19869 if (FromVT != Vec128VT)
19870 VecOp = extract128BitVector(VecOp, 0, DAG, DL);
19871
19872 // cast (extelt V, 0) --> extelt (cast (extract_subv V)), 0
19873 // cast (extelt V, C) --> extelt (cast (extract_subv (shuffle V, [C...]))), 0
19874 SDValue VCast = DAG.getNode(Cast.getOpcode(), DL, ToVT, VecOp);
19875 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, DestVT, VCast,
19876 DAG.getVectorIdxConstant(0, DL));
19877}
19878
19879/// Given a scalar cast to FP with a cast to integer operand (almost an ftrunc),
19880/// try to vectorize the cast ops. This will avoid an expensive round-trip
19881/// between XMM and GPR.
19882static SDValue lowerFPToIntToFP(SDValue CastToFP, const SDLoc &DL,
19883 SelectionDAG &DAG,
19884 const X86Subtarget &Subtarget) {
19885 // TODO: Allow FP_TO_UINT.
19886 SDValue CastToInt = CastToFP.getOperand(0);
19887 MVT VT = CastToFP.getSimpleValueType();
19888 if (CastToInt.getOpcode() != ISD::FP_TO_SINT || VT.isVector())
19889 return SDValue();
19890
19891 MVT IntVT = CastToInt.getSimpleValueType();
19892 SDValue X = CastToInt.getOperand(0);
19893 MVT SrcVT = X.getSimpleValueType();
19894 if (SrcVT != MVT::f32 && SrcVT != MVT::f64)
19895 return SDValue();
19896
19897 // See if we have 128-bit vector cast instructions for this type of cast.
19898 // We need cvttps2dq/cvttpd2dq and cvtdq2ps/cvtdq2pd.
19899 if (!Subtarget.hasSSE2() || (VT != MVT::f32 && VT != MVT::f64) ||
19900 IntVT != MVT::i32)
19901 return SDValue();
19902
19903 unsigned SrcSize = SrcVT.getSizeInBits();
19904 unsigned IntSize = IntVT.getSizeInBits();
19905 unsigned VTSize = VT.getSizeInBits();
19906 MVT VecSrcVT = MVT::getVectorVT(SrcVT, 128 / SrcSize);
19907 MVT VecIntVT = MVT::getVectorVT(IntVT, 128 / IntSize);
19908 MVT VecVT = MVT::getVectorVT(VT, 128 / VTSize);
19909
19910 // We need target-specific opcodes if this is v2f64 -> v4i32 -> v2f64.
19911 unsigned ToIntOpcode =
19912 SrcSize != IntSize ? X86ISD::CVTTP2SI : (unsigned)ISD::FP_TO_SINT;
19913 unsigned ToFPOpcode =
19914 IntSize != VTSize ? X86ISD::CVTSI2P : (unsigned)ISD::SINT_TO_FP;
19915
19916 // sint_to_fp (fp_to_sint X) --> extelt (sint_to_fp (fp_to_sint (s2v X))), 0
19917 //
19918 // We are not defining the high elements (for example, zero them) because
19919 // that could nullify any performance advantage that we hoped to gain from
19920 // this vector op hack. We do not expect any adverse effects (like denorm
19921 // penalties) with cast ops.
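// NOTE (editorial, not part of the LLVM source): for scalar code such as
// 'double y = (double)(int)x;' this rewrite turns cvttsd2si + cvtsi2sd
// (an XMM -> GPR -> XMM round-trip) into cvttpd2dq + cvtdq2pd that stay
// entirely inside the XMM register file.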
19922 SDValue ZeroIdx = DAG.getVectorIdxConstant(0, DL);
19923 SDValue VecX = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecSrcVT, X);
19924 SDValue VCastToInt = DAG.getNode(ToIntOpcode, DL, VecIntVT, VecX);
19925 SDValue VCastToFP = DAG.getNode(ToFPOpcode, DL, VecVT, VCastToInt);
19926 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, VCastToFP, ZeroIdx);
19927}
19928
19929static SDValue lowerINT_TO_FP_vXi64(SDValue Op, const SDLoc &DL,
19930 SelectionDAG &DAG,
19931 const X86Subtarget &Subtarget) {
19932 bool IsStrict = Op->isStrictFPOpcode();
19933 MVT VT = Op->getSimpleValueType(0);
19934 SDValue Src = Op->getOperand(IsStrict ? 1 : 0);
19935
19936 if (Subtarget.hasDQI()) {
19937 assert(!Subtarget.hasVLX() && "Unexpected features");
19938
19939 assert((Src.getSimpleValueType() == MVT::v2i64 ||
19940 Src.getSimpleValueType() == MVT::v4i64) &&
19941 "Unsupported custom type");
19942
19943 // With AVX512DQ, but not VLX we need to widen to get a 512-bit result type.
19944 assert((VT == MVT::v4f32 || VT == MVT::v2f64 || VT == MVT::v4f64) &&
19945 "Unexpected VT!");
19946 MVT WideVT = VT == MVT::v4f32 ? MVT::v8f32 : MVT::v8f64;
19947
19948 // Need to concat with zero vector for strict fp to avoid spurious
19949 // exceptions.
19950 SDValue Tmp = IsStrict ? DAG.getConstant(0, DL, MVT::v8i64)
19951 : DAG.getUNDEF(MVT::v8i64);
19952 Src = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, MVT::v8i64, Tmp, Src,
19953 DAG.getVectorIdxConstant(0, DL));
19954 SDValue Res, Chain;
19955 if (IsStrict) {
19956 Res = DAG.getNode(Op.getOpcode(), DL, {WideVT, MVT::Other},
19957 {Op->getOperand(0), Src});
19958 Chain = Res.getValue(1);
19959 } else {
19960 Res = DAG.getNode(Op.getOpcode(), DL, WideVT, Src);
19961 }
19962
19963 Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Res,
19964 DAG.getVectorIdxConstant(0, DL));
19965
19966 if (IsStrict)
19967 return DAG.getMergeValues({Res, Chain}, DL);
19968 return Res;
19969 }
19970
19971 bool IsSigned = Op->getOpcode() == ISD::SINT_TO_FP ||
19972 Op->getOpcode() == ISD::STRICT_SINT_TO_FP;
19973 if (VT != MVT::v4f32 || IsSigned)
19974 return SDValue();
19975
19976 SDValue Zero = DAG.getConstant(0, DL, MVT::v4i64);
19977 SDValue One = DAG.getConstant(1, DL, MVT::v4i64);
19978 SDValue Sign = DAG.getNode(ISD::OR, DL, MVT::v4i64,
19979 DAG.getNode(ISD::SRL, DL, MVT::v4i64, Src, One),
19980 DAG.getNode(ISD::AND, DL, MVT::v4i64, Src, One));
19981 SDValue IsNeg = DAG.getSetCC(DL, MVT::v4i64, Src, Zero, ISD::SETLT);
19982 SDValue SignSrc = DAG.getSelect(DL, MVT::v4i64, IsNeg, Sign, Src);
19983 SmallVector<SDValue, 4> SignCvts(4);
19984 SmallVector<SDValue, 4> Chains(4);
19985 for (int i = 0; i != 4; ++i) {
19986 SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i64, SignSrc,
19987 DAG.getVectorIdxConstant(i, DL));
19988 if (IsStrict) {
19989 SignCvts[i] =
19990 DAG.getNode(ISD::STRICT_SINT_TO_FP, DL, {MVT::f32, MVT::Other},
19991 {Op.getOperand(0), Elt});
19992 Chains[i] = SignCvts[i].getValue(1);
19993 } else {
19994 SignCvts[i] = DAG.getNode(ISD::SINT_TO_FP, DL, MVT::f32, Elt);
19995 }
19996 }
19997 SDValue SignCvt = DAG.getBuildVector(VT, DL, SignCvts);
19998
19999 SDValue Slow, Chain;
20000 if (IsStrict) {
20001 Chain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Chains);
20002 Slow = DAG.getNode(ISD::STRICT_FADD, DL, {MVT::v4f32, MVT::Other},
20003 {Chain, SignCvt, SignCvt});
20004 Chain = Slow.getValue(1);
20005 } else {
20006 Slow = DAG.getNode(ISD::FADD, DL, MVT::v4f32, SignCvt, SignCvt);
20007 }
20008
20009 IsNeg = DAG.getNode(ISD::TRUNCATE, DL, MVT::v4i32, IsNeg);
20010 SDValue Cvt = DAG.getSelect(DL, MVT::v4f32, IsNeg, Slow, SignCvt);
20011
20012 if (IsStrict)
20013 return DAG.getMergeValues({Cvt, Chain}, DL);
20014
20015 return Cvt;
20016}
20017
20018static SDValue promoteXINT_TO_FP(SDValue Op, const SDLoc &dl,
20019 SelectionDAG &DAG) {
20020 bool IsStrict = Op->isStrictFPOpcode();
20021 SDValue Src = Op.getOperand(IsStrict ? 1 : 0);
20022 SDValue Chain = IsStrict ? Op->getOperand(0) : DAG.getEntryNode();
20023 MVT VT = Op.getSimpleValueType();
20024 MVT NVT = VT.isVector() ? VT.changeVectorElementType(MVT::f32) : MVT::f32;
20025
20026 SDValue Rnd = DAG.getIntPtrConstant(0, dl, /*isTarget=*/true);
20027 if (IsStrict)
20028 return DAG.getNode(
20029 ISD::STRICT_FP_ROUND, dl, {VT, MVT::Other},
20030 {Chain,
20031 DAG.getNode(Op.getOpcode(), dl, {NVT, MVT::Other}, {Chain, Src}),
20032 Rnd});
20033 return DAG.getNode(ISD::FP_ROUND, dl, VT,
20034 DAG.getNode(Op.getOpcode(), dl, NVT, Src), Rnd);
20035}
20036
20037static bool isLegalConversion(MVT VT, MVT FloatVT, bool IsSigned,
20038 const X86Subtarget &Subtarget) {
20039 if (FloatVT.getScalarType() != MVT::f16 || Subtarget.hasVLX()) {
20040 if (VT == MVT::v4i32 && Subtarget.hasSSE2() && IsSigned)
20041 return true;
20042 if (VT == MVT::v8i32 && Subtarget.hasAVX() && IsSigned)
20043 return true;
20044 }
20045 if (Subtarget.hasVLX() && (VT == MVT::v4i32 || VT == MVT::v8i32))
20046 return true;
20047 if (Subtarget.useAVX512Regs()) {
20048 if (VT == MVT::v16i32)
20049 return true;
20050 if (VT == MVT::v8i64 && FloatVT == MVT::v8f16 && Subtarget.hasFP16())
20051 return true;
20052 if (VT == MVT::v8i64 && Subtarget.hasDQI())
20053 return true;
20054 }
20055 if (Subtarget.hasDQI() && Subtarget.hasVLX() &&
20056 (VT == MVT::v2i64 || VT == MVT::v4i64))
20057 return true;
20058 return false;
20059}
20060
20061SDValue X86TargetLowering::LowerSINT_TO_FP(SDValue Op,
20062 SelectionDAG &DAG) const {
20063 bool IsStrict = Op->isStrictFPOpcode();
20064 unsigned OpNo = IsStrict ? 1 : 0;
20065 SDValue Src = Op.getOperand(OpNo);
20066 SDValue Chain = IsStrict ? Op->getOperand(0) : DAG.getEntryNode();
20067 MVT SrcVT = Src.getSimpleValueType();
20068 MVT VT = Op.getSimpleValueType();
20069 SDLoc dl(Op);
20070
20071 if (isSoftF16(VT, Subtarget))
20072 return promoteXINT_TO_FP(Op, dl, DAG);
20073 else if (isLegalConversion(SrcVT, VT, true, Subtarget))
20074 return Op;
20075
20076 if (Subtarget.isTargetWin64() && SrcVT == MVT::i128)
20077 return LowerWin64_INT128_TO_FP(Op, DAG);
20078
20079 if (SDValue Extract = vectorizeExtractedCast(Op, dl, DAG, Subtarget))
20080 return Extract;
20081
20082 if (SDValue R = lowerFPToIntToFP(Op, dl, DAG, Subtarget))
20083 return R;
20084
20085 if (SrcVT.isVector()) {
20086 if (SrcVT == MVT::v2i32 && VT == MVT::v2f64) {
20087 // Note: Since v2f64 is a legal type, we don't need to zero extend the
20088 // source for strict FP.
20089 if (IsStrict)
20090 return DAG.getNode(
20091 X86ISD::STRICT_CVTSI2P, dl, {VT, MVT::Other},
20092 {Chain, DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4i32, Src,
20093 DAG.getUNDEF(SrcVT))});
20094 return DAG.getNode(X86ISD::CVTSI2P, dl, VT,
20095 DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4i32, Src,
20096 DAG.getUNDEF(SrcVT)));
20097 }
20098 if (SrcVT == MVT::v2i64 || SrcVT == MVT::v4i64)
20099 return lowerINT_TO_FP_vXi64(Op, dl, DAG, Subtarget);
20100
20101 return SDValue();
20102 }
20103
20104 assert(SrcVT <= MVT::i64 && SrcVT >= MVT::i16 &&
20105 "Unknown SINT_TO_FP to lower!");
20106
20107 bool UseSSEReg = isScalarFPTypeInSSEReg(VT);
20108
20109 // These are really Legal; return the operand so the caller accepts it as
20110 // Legal.
20111 if (SrcVT == MVT::i32 && UseSSEReg)
20112 return Op;
20113 if (SrcVT == MVT::i64 && UseSSEReg && Subtarget.is64Bit())
20114 return Op;
20115
20116 if (SDValue V = LowerI64IntToFP_AVX512DQ(Op, dl, DAG, Subtarget))
20117 return V;
20118 if (SDValue V = LowerI64IntToFP16(Op, dl, DAG, Subtarget))
20119 return V;
20120
20121 // SSE doesn't have an i16 conversion so we need to promote.
20122 if (SrcVT == MVT::i16 && (UseSSEReg || VT == MVT::f128)) {
20123 SDValue Ext = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::i32, Src);
20124 if (IsStrict)
20125 return DAG.getNode(ISD::STRICT_SINT_TO_FP, dl, {VT, MVT::Other},
20126 {Chain, Ext});
20127
20128 return DAG.getNode(ISD::SINT_TO_FP, dl, VT, Ext);
20129 }
20130
20131 if (VT == MVT::f128 || !Subtarget.hasX87())
20132 return SDValue();
20133
20134 SDValue ValueToStore = Src;
20135 if (SrcVT == MVT::i64 && Subtarget.hasSSE2() && !Subtarget.is64Bit())
20136 // Bitcasting to f64 here allows us to do a single 64-bit store from
20137 // an SSE register, avoiding the store forwarding penalty that would come
20138 // with two 32-bit stores.
20139 ValueToStore = DAG.getBitcast(MVT::f64, ValueToStore);
20140
20141 unsigned Size = SrcVT.getStoreSize();
20142 Align Alignment(Size);
20143 MachineFunction &MF = DAG.getMachineFunction();
20144 auto PtrVT = getPointerTy(MF.getDataLayout());
20145 int SSFI = MF.getFrameInfo().CreateStackObject(Size, Alignment, false);
20146 MachinePointerInfo MPI =
20147 MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SSFI);
20148 SDValue StackSlot = DAG.getFrameIndex(SSFI, PtrVT);
20149 Chain = DAG.getStore(Chain, dl, ValueToStore, StackSlot, MPI, Alignment);
20150 std::pair<SDValue, SDValue> Tmp =
20151 BuildFILD(VT, SrcVT, dl, Chain, StackSlot, MPI, Alignment, DAG);
20152
20153 if (IsStrict)
20154 return DAG.getMergeValues({Tmp.first, Tmp.second}, dl);
20155
20156 return Tmp.first;
20157}
20158
20159std::pair<SDValue, SDValue> X86TargetLowering::BuildFILD(
20160 EVT DstVT, EVT SrcVT, const SDLoc &DL, SDValue Chain, SDValue Pointer,
20161 MachinePointerInfo PtrInfo, Align Alignment, SelectionDAG &DAG) const {
20162 // Build the FILD
20163 SDVTList Tys;
20164 bool useSSE = isScalarFPTypeInSSEReg(DstVT);
20165 if (useSSE)
20166 Tys = DAG.getVTList(MVT::f80, MVT::Other);
20167 else
20168 Tys = DAG.getVTList(DstVT, MVT::Other);
20169
20170 SDValue FILDOps[] = {Chain, Pointer};
20171 SDValue Result =
20172 DAG.getMemIntrinsicNode(X86ISD::FILD, DL, Tys, FILDOps, SrcVT, PtrInfo,
20173 Alignment, MachineMemOperand::MOLoad);
20174 Chain = Result.getValue(1);
20175
20176 if (useSSE) {
20177 MachineFunction &MF = DAG.getMachineFunction();
20178 unsigned SSFISize = DstVT.getStoreSize();
20179 int SSFI =
20180 MF.getFrameInfo().CreateStackObject(SSFISize, Align(SSFISize), false);
20181 auto PtrVT = getPointerTy(MF.getDataLayout());
20182 SDValue StackSlot = DAG.getFrameIndex(SSFI, PtrVT);
20183 Tys = DAG.getVTList(MVT::Other);
20184 SDValue FSTOps[] = {Chain, Result, StackSlot};
20185 MachineMemOperand *StoreMMO = MF.getMachineMemOperand(
20186 MachinePointerInfo::getFixedStack(MF, SSFI),
20187 MachineMemOperand::MOStore, SSFISize, Align(SSFISize));
20188
20189 Chain =
20190 DAG.getMemIntrinsicNode(X86ISD::FST, DL, Tys, FSTOps, DstVT, StoreMMO);
20191 Result = DAG.getLoad(
20192 DstVT, DL, Chain, StackSlot,
20193 MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SSFI));
20194 Chain = Result.getValue(1);
20195 }
20196
20197 return { Result, Chain };
20198}
20199
20200/// Horizontal vector math instructions may be slower than normal math with
20201/// shuffles. Limit horizontal op codegen based on size/speed trade-offs, uarch
20202/// implementation, and likely shuffle complexity of the alternate sequence.
20203static bool shouldUseHorizontalOp(bool IsSingleSource, SelectionDAG &DAG,
20204 const X86Subtarget &Subtarget) {
20205 bool IsOptimizingSize = DAG.shouldOptForSize();
20206 bool HasFastHOps = Subtarget.hasFastHorizontalOps();
20207 return !IsSingleSource || IsOptimizingSize || HasFastHOps;
20208}
20209
20210/// 64-bit unsigned integer to double expansion.
20211static SDValue LowerUINT_TO_FP_i64(SDValue Op, const SDLoc &dl,
20212 SelectionDAG &DAG,
20213 const X86Subtarget &Subtarget) {
20214 // We can't use this algorithm for strict fp. It produces -0.0 instead of +0.0
20215 // when converting 0 while rounding toward negative infinity. The caller will
20216 // fall back to Expand when i64 is legal, or use FILD in 32-bit mode.
20217 assert(!Op->isStrictFPOpcode() && "Expected non-strict uint_to_fp!");
20218 // This algorithm is not obvious. Here is what we're trying to output:
20219 /*
20220 movq %rax, %xmm0
20221 punpckldq (c0), %xmm0 // c0: (uint4){ 0x43300000U, 0x45300000U, 0U, 0U }
20222 subpd (c1), %xmm0 // c1: (double2){ 0x1.0p52, 0x1.0p52 * 0x1.0p32 }
20223 #ifdef __SSE3__
20224 haddpd %xmm0, %xmm0
20225 #else
20226 pshufd $0x4e, %xmm0, %xmm1
20227 addpd %xmm1, %xmm0
20228 #endif
20229 */
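// NOTE (editorial sketch, not part of the LLVM source): the scalar arithmetic
// behind the punpckldq/subpd sequence above, written as plain C++ for
// exposition (u64_to_f64 is a hypothetical helper, not an LLVM API):
//
//   #include <cstdint>
//   #include <cstring>
//   static double u64_to_f64(uint64_t x) {
//     uint64_t lo_bits = 0x4330000000000000ULL | (x & 0xffffffffULL); // 2^52 + lo32
//     uint64_t hi_bits = 0x4530000000000000ULL | (x >> 32);           // 2^84 + hi32 * 2^32
//     double lo, hi;
//     std::memcpy(&lo, &lo_bits, sizeof(lo));
//     std::memcpy(&hi, &hi_bits, sizeof(hi));
//     // subpd removes the biases, haddpd (or pshufd+addpd) sums the halves;
//     // only the final addition rounds, matching a direct u64 -> f64 convert.
//     return (hi - 0x1.0p84) + (lo - 0x1.0p52);
//   }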
20230
20231 LLVMContext *Context = DAG.getContext();
20232
20233 // Build some magic constants.
20234 static const uint32_t CV0[] = { 0x43300000, 0x45300000, 0, 0 };
20235 Constant *C0 = ConstantDataVector::get(*Context, CV0);
20236 auto PtrVT = DAG.getTargetLoweringInfo().getPointerTy(DAG.getDataLayout());
20237 SDValue CPIdx0 = DAG.getConstantPool(C0, PtrVT, Align(16));
20238
20239 SmallVector<Constant*,2> CV1;
20240 CV1.push_back(
20241 ConstantFP::get(*Context, APFloat(APFloat::IEEEdouble(),
20242 APInt(64, 0x4330000000000000ULL))));
20243 CV1.push_back(
20244 ConstantFP::get(*Context, APFloat(APFloat::IEEEdouble(),
20245 APInt(64, 0x4530000000000000ULL))));
20246 Constant *C1 = ConstantVector::get(CV1);
20247 SDValue CPIdx1 = DAG.getConstantPool(C1, PtrVT, Align(16));
20248
20249 // Load the 64-bit value into an XMM register.
20250 SDValue XR1 =
20251 DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64, Op.getOperand(0));
20252 SDValue CLod0 = DAG.getLoad(
20253 MVT::v4i32, dl, DAG.getEntryNode(), CPIdx0,
20254 MachinePointerInfo::getConstantPool(DAG.getMachineFunction()));
20255 SDValue Unpck1 =
20256 getUnpackl(DAG, dl, MVT::v4i32, DAG.getBitcast(MVT::v4i32, XR1), CLod0);
20257
20258 SDValue CLod1 = DAG.getLoad(
20259 MVT::v2f64, dl, CLod0.getValue(1), CPIdx1,
20260 MachinePointerInfo::getConstantPool(DAG.getMachineFunction()));
20261 SDValue XR2F = DAG.getBitcast(MVT::v2f64, Unpck1);
20262 // TODO: Are there any fast-math-flags to propagate here?
20263 SDValue Sub = DAG.getNode(ISD::FSUB, dl, MVT::v2f64, XR2F, CLod1);
20264 SDValue Result;
20265
20266 if (Subtarget.hasSSE3() &&
20267 shouldUseHorizontalOp(true, DAG, Subtarget)) {
20268 Result = DAG.getNode(X86ISD::FHADD, dl, MVT::v2f64, Sub, Sub);
20269 } else {
20270 SDValue Shuffle = DAG.getVectorShuffle(MVT::v2f64, dl, Sub, Sub, {1,-1});
20271 Result = DAG.getNode(ISD::FADD, dl, MVT::v2f64, Shuffle, Sub);
20272 }
20273 Result = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64, Result,
20274 DAG.getVectorIdxConstant(0, dl));
20275 return Result;
20276}
20277
20278/// 32-bit unsigned integer to float expansion.
20279static SDValue LowerUINT_TO_FP_i32(SDValue Op, const SDLoc &dl,
20280 SelectionDAG &DAG,
20281 const X86Subtarget &Subtarget) {
20282 unsigned OpNo = Op.getNode()->isStrictFPOpcode() ? 1 : 0;
20283 // FP constant to bias correct the final result.
20284 SDValue Bias = DAG.getConstantFP(
20285 llvm::bit_cast<double>(0x4330000000000000ULL), dl, MVT::f64);
20286
20287 // Load the 32-bit value into an XMM register.
20288 SDValue Load =
20289 DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, Op.getOperand(OpNo));
20290
20291 // Zero out the upper parts of the register.
20292 Load = getShuffleVectorZeroOrUndef(Load, 0, true, Subtarget, DAG);
20293
20294 // Or the load with the bias.
20295 SDValue Or = DAG.getNode(
20296 ISD::OR, dl, MVT::v2i64,
20297 DAG.getBitcast(MVT::v2i64, Load),
20298 DAG.getBitcast(MVT::v2i64,
20299 DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2f64, Bias)));
20300 Or = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64,
20301 DAG.getBitcast(MVT::v2f64, Or),
20302 DAG.getVectorIdxConstant(0, dl));
20303
20304 if (Op.getNode()->isStrictFPOpcode()) {
20305 // Subtract the bias.
20306 // TODO: Are there any fast-math-flags to propagate here?
20307 SDValue Chain = Op.getOperand(0);
20308 SDValue Sub = DAG.getNode(ISD::STRICT_FSUB, dl, {MVT::f64, MVT::Other},
20309 {Chain, Or, Bias});
20310
20311 if (Op.getValueType() == Sub.getValueType())
20312 return Sub;
20313
20314 // Handle final rounding.
20315 std::pair<SDValue, SDValue> ResultPair = DAG.getStrictFPExtendOrRound(
20316 Sub, Sub.getValue(1), dl, Op.getSimpleValueType());
20317
20318 return DAG.getMergeValues({ResultPair.first, ResultPair.second}, dl);
20319 }
20320
20321 // Subtract the bias.
20322 // TODO: Are there any fast-math-flags to propagate here?
20323 SDValue Sub = DAG.getNode(ISD::FSUB, dl, MVT::f64, Or, Bias);
20324
20325 // Handle final rounding.
20326 return DAG.getFPExtendOrRound(Sub, dl, Op.getSimpleValueType());
20327}
20328
20329static SDValue lowerUINT_TO_FP_v2i32(SDValue Op, const SDLoc &DL,
20330 SelectionDAG &DAG,
20331 const X86Subtarget &Subtarget) {
20332 if (Op.getSimpleValueType() != MVT::v2f64)
20333 return SDValue();
20334
20335 bool IsStrict = Op->isStrictFPOpcode();
20336
20337 SDValue N0 = Op.getOperand(IsStrict ? 1 : 0);
20338 assert(N0.getSimpleValueType() == MVT::v2i32 && "Unexpected input type");
20339
20340 if (Subtarget.hasAVX512()) {
20341 if (!Subtarget.hasVLX()) {
20342 // Let generic type legalization widen this.
20343 if (!IsStrict)
20344 return SDValue();
20345 // Otherwise pad the integer input with 0s and widen the operation.
20346 N0 = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v4i32, N0,
20347 DAG.getConstant(0, DL, MVT::v2i32));
20348 SDValue Res = DAG.getNode(Op->getOpcode(), DL, {MVT::v4f64, MVT::Other},
20349 {Op.getOperand(0), N0});
20350 SDValue Chain = Res.getValue(1);
20351 Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v2f64, Res,
20352 DAG.getVectorIdxConstant(0, DL));
20353 return DAG.getMergeValues({Res, Chain}, DL);
20354 }
20355
20356 // Legalize to v4i32 type.
20357 N0 = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v4i32, N0,
20358 DAG.getUNDEF(MVT::v2i32));
20359 if (IsStrict)
20360 return DAG.getNode(X86ISD::STRICT_CVTUI2P, DL, {MVT::v2f64, MVT::Other},
20361 {Op.getOperand(0), N0});
20362 return DAG.getNode(X86ISD::CVTUI2P, DL, MVT::v2f64, N0);
20363 }
20364
20365 // Zero extend to 2i64, OR with the floating point representation of 2^52.
20366 // This gives us the floating point equivalent of 2^52 + the i32 integer
20367 // since double has 52-bits of mantissa. Then subtract 2^52 in floating
20368 // point leaving just our i32 integers in double format.
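// NOTE (editorial, not part of the LLVM source): per lane this is the usual
// 2^52 bias trick: the bit pattern 0x4330000000000000 | x (x zero-extended to
// 64 bits) is exactly the double 2^52 + x, so the FSUB of 2^52 below leaves x
// converted to double with no rounding at all.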
20369 SDValue ZExtIn = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::v2i64, N0);
20370 SDValue VBias = DAG.getConstantFP(
20371 llvm::bit_cast<double>(0x4330000000000000ULL), DL, MVT::v2f64);
20372 SDValue Or = DAG.getNode(ISD::OR, DL, MVT::v2i64, ZExtIn,
20373 DAG.getBitcast(MVT::v2i64, VBias));
20374 Or = DAG.getBitcast(MVT::v2f64, Or);
20375
20376 if (IsStrict)
20377 return DAG.getNode(ISD::STRICT_FSUB, DL, {MVT::v2f64, MVT::Other},
20378 {Op.getOperand(0), Or, VBias});
20379 return DAG.getNode(ISD::FSUB, DL, MVT::v2f64, Or, VBias);
20380}
20381
20382static SDValue lowerUINT_TO_FP_vXi32(SDValue Op, const SDLoc &DL,
20383 SelectionDAG &DAG,
20384 const X86Subtarget &Subtarget) {
20385 bool IsStrict = Op->isStrictFPOpcode();
20386 SDValue V = Op->getOperand(IsStrict ? 1 : 0);
20387 MVT VecIntVT = V.getSimpleValueType();
20388 assert((VecIntVT == MVT::v4i32 || VecIntVT == MVT::v8i32) &&
20389 "Unsupported custom type");
20390
20391 if (Subtarget.hasAVX512()) {
20392 // With AVX512, but not VLX we need to widen to get a 512-bit result type.
20393 assert(!Subtarget.hasVLX() && "Unexpected features");
20394 MVT VT = Op->getSimpleValueType(0);
20395
20396 // v8i32->v8f64 is legal with AVX512 so just return it.
20397 if (VT == MVT::v8f64)
20398 return Op;
20399
20400 assert((VT == MVT::v4f32 || VT == MVT::v8f32 || VT == MVT::v4f64 ||
20401 VT == MVT::v8f16) &&
20402 "Unexpected VT!");
20403 MVT WideVT = VT == MVT::v8f16 ? MVT::v16f16 : MVT::v16f32;
20404 MVT WideIntVT = MVT::v16i32;
20405 if (VT == MVT::v4f64) {
20406 WideVT = MVT::v8f64;
20407 WideIntVT = MVT::v8i32;
20408 }
20409
20410 // Need to concat with zero vector for strict fp to avoid spurious
20411 // exceptions.
20412 SDValue Tmp =
20413 IsStrict ? DAG.getConstant(0, DL, WideIntVT) : DAG.getUNDEF(WideIntVT);
20414 V = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, WideIntVT, Tmp, V,
20415 DAG.getVectorIdxConstant(0, DL));
20416 SDValue Res, Chain;
20417 if (IsStrict) {
20418 Res = DAG.getNode(ISD::STRICT_UINT_TO_FP, DL, {WideVT, MVT::Other},
20419 {Op->getOperand(0), V});
20420 Chain = Res.getValue(1);
20421 } else {
20422 Res = DAG.getNode(ISD::UINT_TO_FP, DL, WideVT, V);
20423 }
20424
20425 Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Res,
20426 DAG.getVectorIdxConstant(0, DL));
20427
20428 if (IsStrict)
20429 return DAG.getMergeValues({Res, Chain}, DL);
20430 return Res;
20431 }
20432
20433 if (Subtarget.hasAVX() && VecIntVT == MVT::v4i32 &&
20434 Op->getSimpleValueType(0) == MVT::v4f64) {
20435 SDValue ZExtIn = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::v4i64, V);
20436 Constant *Bias = ConstantFP::get(
20437 *DAG.getContext(),
20438 APFloat(APFloat::IEEEdouble(), APInt(64, 0x4330000000000000ULL)));
20439 auto PtrVT = DAG.getTargetLoweringInfo().getPointerTy(DAG.getDataLayout());
20440 SDValue CPIdx = DAG.getConstantPool(Bias, PtrVT, Align(8));
20441 SDVTList Tys = DAG.getVTList(MVT::v4f64, MVT::Other);
20442 SDValue Ops[] = {DAG.getEntryNode(), CPIdx};
20443 SDValue VBias = DAG.getMemIntrinsicNode(
20444 X86ISD::VBROADCAST_LOAD, DL, Tys, Ops, MVT::f64,
20445 MachinePointerInfo::getConstantPool(DAG.getMachineFunction()), Align(8),
20446 MachineMemOperand::MOLoad);
20447
20448 SDValue Or = DAG.getNode(ISD::OR, DL, MVT::v4i64, ZExtIn,
20449 DAG.getBitcast(MVT::v4i64, VBias));
20450 Or = DAG.getBitcast(MVT::v4f64, Or);
20451
20452 if (IsStrict)
20453 return DAG.getNode(ISD::STRICT_FSUB, DL, {MVT::v4f64, MVT::Other},
20454 {Op.getOperand(0), Or, VBias});
20455 return DAG.getNode(ISD::FSUB, DL, MVT::v4f64, Or, VBias);
20456 }
20457
20458 // The algorithm is the following:
20459 // #ifdef __SSE4_1__
20460 // uint4 lo = _mm_blend_epi16( v, (uint4) 0x4b000000, 0xaa);
20461 // uint4 hi = _mm_blend_epi16( _mm_srli_epi32(v,16),
20462 // (uint4) 0x53000000, 0xaa);
20463 // #else
20464 // uint4 lo = (v & (uint4) 0xffff) | (uint4) 0x4b000000;
20465 // uint4 hi = (v >> 16) | (uint4) 0x53000000;
20466 // #endif
20467 // float4 fhi = (float4) hi - (0x1.0p39f + 0x1.0p23f);
20468 // return (float4) lo + fhi;
20469
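// NOTE (editorial, not part of the LLVM source): the constants work because
// 0x4b000000 is 2^23 as a float and 0x53000000 is 2^39, so for a u32 value v:
//   lo bits = 0x4b000000 | (v & 0xffff) -> float 2^23 + (v & 0xffff)
//   hi bits = 0x53000000 | (v >> 16)    -> float 2^39 + (v >> 16) * 2^16
//   (hi - (2^39 + 2^23)) + lo
//     = ((v >> 16) * 2^16 - 2^23) + (2^23 + (v & 0xffff)) = v
// with only the final addition rounding, matching a direct u32 -> f32 convert.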
20470 bool Is128 = VecIntVT == MVT::v4i32;
20471 MVT VecFloatVT = Is128 ? MVT::v4f32 : MVT::v8f32;
20472 // If we convert to something other than the supported type, e.g., to v4f64,
20473 // abort early.
20474 if (VecFloatVT != Op->getSimpleValueType(0))
20475 return SDValue();
20476
20477 // In the #ifdef/#else code, we have in common:
20478 // - The vector of constants:
20479 // -- 0x4b000000
20480 // -- 0x53000000
20481 // - A shift:
20482 // -- v >> 16
20483
20484 // Create the splat vector for 0x4b000000.
20485 SDValue VecCstLow = DAG.getConstant(0x4b000000, DL, VecIntVT);
20486 // Create the splat vector for 0x53000000.
20487 SDValue VecCstHigh = DAG.getConstant(0x53000000, DL, VecIntVT);
20488
20489 // Create the right shift.
20490 SDValue VecCstShift = DAG.getConstant(16, DL, VecIntVT);
20491 SDValue HighShift = DAG.getNode(ISD::SRL, DL, VecIntVT, V, VecCstShift);
20492
20493 SDValue Low, High;
20494 if (Subtarget.hasSSE41()) {
20495 MVT VecI16VT = Is128 ? MVT::v8i16 : MVT::v16i16;
20496 // uint4 lo = _mm_blend_epi16( v, (uint4) 0x4b000000, 0xaa);
20497 SDValue VecCstLowBitcast = DAG.getBitcast(VecI16VT, VecCstLow);
20498 SDValue VecBitcast = DAG.getBitcast(VecI16VT, V);
20499 // Low will be bitcasted right away, so do not bother bitcasting back to its
20500 // original type.
20501 Low = DAG.getNode(X86ISD::BLENDI, DL, VecI16VT, VecBitcast,
20502 VecCstLowBitcast, DAG.getTargetConstant(0xaa, DL, MVT::i8));
20503 // uint4 hi = _mm_blend_epi16( _mm_srli_epi32(v,16),
20504 // (uint4) 0x53000000, 0xaa);
20505 SDValue VecCstHighBitcast = DAG.getBitcast(VecI16VT, VecCstHigh);
20506 SDValue VecShiftBitcast = DAG.getBitcast(VecI16VT, HighShift);
20507 // High will be bitcasted right away, so do not bother bitcasting back to
20508 // its original type.
20509 High = DAG.getNode(X86ISD::BLENDI, DL, VecI16VT, VecShiftBitcast,
20510 VecCstHighBitcast, DAG.getTargetConstant(0xaa, DL, MVT::i8));
20511 } else {
20512 SDValue VecCstMask = DAG.getConstant(0xffff, DL, VecIntVT);
20513 // uint4 lo = (v & (uint4) 0xffff) | (uint4) 0x4b000000;
20514 SDValue LowAnd = DAG.getNode(ISD::AND, DL, VecIntVT, V, VecCstMask);
20515 Low = DAG.getNode(ISD::OR, DL, VecIntVT, LowAnd, VecCstLow);
20516
20517 // uint4 hi = (v >> 16) | (uint4) 0x53000000;
20518 High = DAG.getNode(ISD::OR, DL, VecIntVT, HighShift, VecCstHigh);
20519 }
20520
20521 // Create the vector constant for (0x1.0p39f + 0x1.0p23f).
20522 SDValue VecCstFSub = DAG.getConstantFP(
20523 APFloat(APFloat::IEEEsingle(), APInt(32, 0x53000080)), DL, VecFloatVT);
20524
20525 // float4 fhi = (float4) hi - (0x1.0p39f + 0x1.0p23f);
20526 // NOTE: By using fsub of a positive constant instead of fadd of a negative
20527 // constant, we avoid reassociation in MachineCombiner when unsafe-fp-math is
20528 // enabled. See PR24512.
20529 SDValue HighBitcast = DAG.getBitcast(VecFloatVT, High);
20530 // TODO: Are there any fast-math-flags to propagate here?
20531 // (float4) lo;
20532 SDValue LowBitcast = DAG.getBitcast(VecFloatVT, Low);
20533 // return (float4) lo + fhi;
20534 if (IsStrict) {
20535 SDValue FHigh = DAG.getNode(ISD::STRICT_FSUB, DL, {VecFloatVT, MVT::Other},
20536 {Op.getOperand(0), HighBitcast, VecCstFSub});
20537 return DAG.getNode(ISD::STRICT_FADD, DL, {VecFloatVT, MVT::Other},
20538 {FHigh.getValue(1), LowBitcast, FHigh});
20539 }
20540
20541 SDValue FHigh =
20542 DAG.getNode(ISD::FSUB, DL, VecFloatVT, HighBitcast, VecCstFSub);
20543 return DAG.getNode(ISD::FADD, DL, VecFloatVT, LowBitcast, FHigh);
20544}
20545
20546static SDValue lowerUINT_TO_FP_vec(SDValue Op, const SDLoc &dl, SelectionDAG &DAG,
20547 const X86Subtarget &Subtarget) {
20548 unsigned OpNo = Op.getNode()->isStrictFPOpcode() ? 1 : 0;
20549 SDValue N0 = Op.getOperand(OpNo);
20550 MVT SrcVT = N0.getSimpleValueType();
20551
20552 switch (SrcVT.SimpleTy) {
20553 default:
20554 llvm_unreachable("Custom UINT_TO_FP is not supported!");
20555 case MVT::v2i32:
20556 return lowerUINT_TO_FP_v2i32(Op, dl, DAG, Subtarget);
20557 case MVT::v4i32:
20558 case MVT::v8i32:
20559 return lowerUINT_TO_FP_vXi32(Op, dl, DAG, Subtarget);
20560 case MVT::v2i64:
20561 case MVT::v4i64:
20562 return lowerINT_TO_FP_vXi64(Op, dl, DAG, Subtarget);
20563 }
20564}
20565
20566SDValue X86TargetLowering::LowerUINT_TO_FP(SDValue Op,
20567 SelectionDAG &DAG) const {
20568 bool IsStrict = Op->isStrictFPOpcode();
20569 unsigned OpNo = IsStrict ? 1 : 0;
20570 SDValue Src = Op.getOperand(OpNo);
20571 SDLoc dl(Op);
20572 auto PtrVT = getPointerTy(DAG.getDataLayout());
20573 MVT SrcVT = Src.getSimpleValueType();
20574 MVT DstVT = Op->getSimpleValueType(0);
20575 SDValue Chain = IsStrict ? Op.getOperand(0) : DAG.getEntryNode();
20576
20577 // Bail out when we don't have native conversion instructions.
20578 if (DstVT == MVT::f128)
20579 return SDValue();
20580
20581 if (isSoftF16(DstVT, Subtarget))
20582 return promoteXINT_TO_FP(Op, dl, DAG);
20583 else if (isLegalConversion(SrcVT, DstVT, false, Subtarget))
20584 return Op;
20585
20586 if (DstVT.isVector())
20587 return lowerUINT_TO_FP_vec(Op, dl, DAG, Subtarget);
20588
20589 if (Subtarget.isTargetWin64() && SrcVT == MVT::i128)
20590 return LowerWin64_INT128_TO_FP(Op, DAG);
20591
20592 if (SDValue Extract = vectorizeExtractedCast(Op, dl, DAG, Subtarget))
20593 return Extract;
20594
20595 if (Subtarget.hasAVX512() && isScalarFPTypeInSSEReg(DstVT) &&
20596 (SrcVT == MVT::i32 || (SrcVT == MVT::i64 && Subtarget.is64Bit()))) {
20597 // Conversions from unsigned i32 to f32/f64 are legal,
20598 // using VCVTUSI2SS/SD. Same for i64 in 64-bit mode.
20599 return Op;
20600 }
20601
20602 // Promote i32 to i64 and use a signed conversion on 64-bit targets.
20603 if (SrcVT == MVT::i32 && Subtarget.is64Bit()) {
20604 Src = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i64, Src);
20605 if (IsStrict)
20606 return DAG.getNode(ISD::STRICT_SINT_TO_FP, dl, {DstVT, MVT::Other},
20607 {Chain, Src});
20608 return DAG.getNode(ISD::SINT_TO_FP, dl, DstVT, Src);
20609 }
20610
20611 if (SDValue V = LowerI64IntToFP_AVX512DQ(Op, dl, DAG, Subtarget))
20612 return V;
20613 if (SDValue V = LowerI64IntToFP16(Op, dl, DAG, Subtarget))
20614 return V;
20615
20616 // The transform for i64->f64 isn't correct for 0 when rounding to negative
20617 // infinity. It produces -0.0, so disable under strictfp.
20618 if (SrcVT == MVT::i64 && DstVT == MVT::f64 && Subtarget.hasSSE2() &&
20619 !IsStrict)
20620 return LowerUINT_TO_FP_i64(Op, dl, DAG, Subtarget);
20621 // The transform for i32->f64/f32 isn't correct for 0 when rounding to
20622 // negative infinity, so disable it under strictfp and use FILD instead.
20623 if (SrcVT == MVT::i32 && Subtarget.hasSSE2() && DstVT != MVT::f80 &&
20624 !IsStrict)
20625 return LowerUINT_TO_FP_i32(Op, dl, DAG, Subtarget);
20626 if (Subtarget.is64Bit() && SrcVT == MVT::i64 &&
20627 (DstVT == MVT::f32 || DstVT == MVT::f64))
20628 return SDValue();
20629
20630 // Make a 64-bit buffer, and use it to build an FILD.
20631 SDValue StackSlot = DAG.CreateStackTemporary(MVT::i64, 8);
20632 int SSFI = cast<FrameIndexSDNode>(StackSlot)->getIndex();
20633 Align SlotAlign(8);
20634 MachinePointerInfo MPI =
20635 MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SSFI);
20636 if (SrcVT == MVT::i32) {
20637 SDValue OffsetSlot =
20638 DAG.getMemBasePlusOffset(StackSlot, TypeSize::getFixed(4), dl);
20639 SDValue Store1 = DAG.getStore(Chain, dl, Src, StackSlot, MPI, SlotAlign);
20640 SDValue Store2 = DAG.getStore(Store1, dl, DAG.getConstant(0, dl, MVT::i32),
20641 OffsetSlot, MPI.getWithOffset(4), SlotAlign);
20642 std::pair<SDValue, SDValue> Tmp =
20643 BuildFILD(DstVT, MVT::i64, dl, Store2, StackSlot, MPI, SlotAlign, DAG);
20644 if (IsStrict)
20645 return DAG.getMergeValues({Tmp.first, Tmp.second}, dl);
20646
20647 return Tmp.first;
20648 }
20649
20650 assert(SrcVT == MVT::i64 && "Unexpected type in UINT_TO_FP");
20651 SDValue ValueToStore = Src;
20652 if (isScalarFPTypeInSSEReg(Op.getValueType()) && !Subtarget.is64Bit()) {
20653 // Bitcasting to f64 here allows us to do a single 64-bit store from
20654 // an SSE register, avoiding the store forwarding penalty that would come
20655 // with two 32-bit stores.
20656 ValueToStore = DAG.getBitcast(MVT::f64, ValueToStore);
20657 }
20658 SDValue Store =
20659 DAG.getStore(Chain, dl, ValueToStore, StackSlot, MPI, SlotAlign);
20660 // For i64 source, we need to add the appropriate power of 2 if the input
20661 // was negative. We must be careful to do the computation in x87 extended
20662 // precision, not in SSE.
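// NOTE (editorial, not part of the LLVM source): FILD reads the 64-bit slot as
// a signed integer, so an input with the sign bit set is loaded as (x - 2^64).
// The fudge constant below (0x5F800000 as a float) is exactly 2^64; adding it
// only when the sign bit was set restores the unsigned value:
// (x - 2^64) + 2^64 == x, evaluated in x87 80-bit precision.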
20663 SDVTList Tys = DAG.getVTList(MVT::f80, MVT::Other);
20664 SDValue Ops[] = {Store, StackSlot};
20665 SDValue Fild =
20666 DAG.getMemIntrinsicNode(X86ISD::FILD, dl, Tys, Ops, MVT::i64, MPI,
20667 SlotAlign, MachineMemOperand::MOLoad);
20668 Chain = Fild.getValue(1);
20669
20670 // Check whether the sign bit is set.
20671 SDValue SignSet = DAG.getSetCC(
20672 dl, getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), MVT::i64),
20673 Op.getOperand(OpNo), DAG.getConstant(0, dl, MVT::i64), ISD::SETLT);
20674
20675 // Build a 64 bit pair (FF, 0) in the constant pool, with FF in the hi bits.
20676 APInt FF(64, 0x5F80000000000000ULL);
20677 SDValue FudgePtr =
20678 DAG.getConstantPool(ConstantInt::get(*DAG.getContext(), FF), PtrVT);
20679 Align CPAlignment = cast<ConstantPoolSDNode>(FudgePtr)->getAlign();
20680
20681 // Get a pointer to FF if the sign bit was set, or to 0 otherwise.
20682 SDValue Zero = DAG.getIntPtrConstant(0, dl);
20683 SDValue Four = DAG.getIntPtrConstant(4, dl);
20684 SDValue Offset = DAG.getSelect(dl, Zero.getValueType(), SignSet, Four, Zero);
20685 FudgePtr = DAG.getNode(ISD::ADD, dl, PtrVT, FudgePtr, Offset);
20686
20687 // Load the value out, extending it from f32 to f80.
20688 SDValue Fudge = DAG.getExtLoad(
20689 ISD::EXTLOAD, dl, MVT::f80, Chain, FudgePtr,
20690 MachinePointerInfo::getConstantPool(DAG.getMachineFunction()), MVT::f32,
20691 CPAlignment);
20692 Chain = Fudge.getValue(1);
20693 // Extend everything to 80 bits to force it to be done on x87.
20694 // TODO: Are there any fast-math-flags to propagate here?
20695 if (IsStrict) {
20696 unsigned Opc = ISD::STRICT_FADD;
20697 // Windows needs the precision control changed to 80bits around this add.
20698 if (Subtarget.isOSWindows() && DstVT == MVT::f32)
20699 Opc = X86ISD::STRICT_FP80_ADD;
20700
20701 SDValue Add =
20702 DAG.getNode(Opc, dl, {MVT::f80, MVT::Other}, {Chain, Fild, Fudge});
20703 // STRICT_FP_ROUND can't handle equal types.
20704 if (DstVT == MVT::f80)
20705 return Add;
20706 return DAG.getNode(ISD::STRICT_FP_ROUND, dl, {DstVT, MVT::Other},
20707 {Add.getValue(1), Add,
20708 DAG.getIntPtrConstant(0, dl, /*isTarget=*/true)});
20709 }
20710 unsigned Opc = ISD::FADD;
20711 // Windows needs the precision control changed to 80bits around this add.
20712 if (Subtarget.isOSWindows() && DstVT == MVT::f32)
20713 Opc = X86ISD::FP80_ADD;
20714
20715 SDValue Add = DAG.getNode(Opc, dl, MVT::f80, Fild, Fudge);
20716 return DAG.getNode(ISD::FP_ROUND, dl, DstVT, Add,
20717 DAG.getIntPtrConstant(0, dl, /*isTarget=*/true));
20718}
20719
20720// If the given FP_TO_SINT (IsSigned) or FP_TO_UINT (!IsSigned) operation
20721// is legal, or has an fp128 or f16 source (which needs to be promoted to f32),
20722// just return an SDValue().
20723// Otherwise it is assumed to be a conversion from one of f32, f64 or f80
20724// to i16, i32 or i64, and we lower it to a legal sequence and return the
20725// result.
20726SDValue X86TargetLowering::FP_TO_INTHelper(SDValue Op, SelectionDAG &DAG,
20727 bool IsSigned,
20728 SDValue &Chain) const {
20729 bool IsStrict = Op->isStrictFPOpcode();
20730 SDLoc DL(Op);
20731
20732 EVT DstTy = Op.getValueType();
20733 SDValue Value = Op.getOperand(IsStrict ? 1 : 0);
20734 EVT TheVT = Value.getValueType();
20735 auto PtrVT = getPointerTy(DAG.getDataLayout());
20736
20737 if (TheVT != MVT::f32 && TheVT != MVT::f64 && TheVT != MVT::f80) {
20738 // f16 must be promoted before using the lowering in this routine.
20739 // fp128 does not use this lowering.
20740 return SDValue();
20741 }
20742
20743 // If using FIST to compute an unsigned i64, we'll need some fixup
20744 // to handle values above the maximum signed i64. A FIST is always
20745 // used for the 32-bit subtarget, but also for f80 on a 64-bit target.
20746 bool UnsignedFixup = !IsSigned && DstTy == MVT::i64;
20747
20748 // FIXME: This does not generate an invalid exception if the input does not
20749 // fit in i32. PR44019
20750 if (!IsSigned && DstTy != MVT::i64) {
20751 // Replace the fp-to-uint32 operation with an fp-to-sint64 FIST.
20752 // The low 32 bits of the fist result will have the correct uint32 result.
20753 assert(DstTy == MVT::i32 && "Unexpected FP_TO_UINT");
20754 DstTy = MVT::i64;
20755 }
20756
20757 assert(DstTy.getSimpleVT() <= MVT::i64 &&
20758 DstTy.getSimpleVT() >= MVT::i16 &&
20759 "Unknown FP_TO_INT to lower!");
20760
20761 // We lower FP->int64 into FISTP64 followed by a load from a temporary
20762 // stack slot.
20763 MachineFunction &MF = DAG.getMachineFunction();
20764 unsigned MemSize = DstTy.getStoreSize();
20765 int SSFI =
20766 MF.getFrameInfo().CreateStackObject(MemSize, Align(MemSize), false);
20767 SDValue StackSlot = DAG.getFrameIndex(SSFI, PtrVT);
20768
20769 Chain = IsStrict ? Op.getOperand(0) : DAG.getEntryNode();
20770
20771 SDValue Adjust; // 0x0 or 0x80000000, for result sign bit adjustment.
20772
20773 if (UnsignedFixup) {
20774 //
20775 // Conversion to unsigned i64 is implemented with a select,
20776 // depending on whether the source value fits in the range
20777 // of a signed i64. Let Thresh be the FP equivalent of
20778 // 0x8000000000000000ULL.
20779 //
20780 // Adjust = (Value >= Thresh) ? 0x80000000 : 0;
20781 // FltOfs = (Value >= Thresh) ? 0x80000000 : 0;
20782 // FistSrc = (Value - FltOfs);
20783 // Fist-to-mem64 FistSrc
20784 // Add 0 or 0x800...0ULL to the 64-bit result, which is equivalent
20785 // to XOR'ing the high 32 bits with Adjust.
20786 //
20787 // Being a power of 2, Thresh is exactly representable in all FP formats.
20788 // For X87 we'd like to use the smallest FP type for this constant, but
20789 // for DAG type consistency we have to match the FP operand type.
20790
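// NOTE (editorial sketch, not part of the LLVM source): the scalar shape of
// this fixup, with f64_to_u64 a hypothetical helper used only for exposition:
//
//   static uint64_t f64_to_u64(double v) {
//     const double Thresh = 0x1.0p63;            // 2^63, exactly representable
//     uint64_t Adjust = (v >= Thresh) ? 0x8000000000000000ULL : 0;
//     double FltOfs   = (v >= Thresh) ? Thresh : 0.0;
//     int64_t Fist    = (int64_t)(v - FltOfs);   // the FIST of the offset value
//     return (uint64_t)Fist ^ Adjust;            // put the high bit back
//   }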
20791 APFloat Thresh(APFloat::IEEEsingle(), APInt(32, 0x5f000000));
20792 APFloat::opStatus Status = APFloat::opOK;
20793 bool LosesInfo = false;
20794 if (TheVT == MVT::f64)
20795 // The rounding mode is irrelevant as the conversion should be exact.
20796 Status = Thresh.convert(APFloat::IEEEdouble(), APFloat::rmNearestTiesToEven,
20797 &LosesInfo);
20798 else if (TheVT == MVT::f80)
20799 Status = Thresh.convert(APFloat::x87DoubleExtended(),
20800 APFloat::rmNearestTiesToEven, &LosesInfo);
20801
20802 assert(Status == APFloat::opOK && !LosesInfo &&
20803 "FP conversion should have been exact");
20804
20805 SDValue ThreshVal = DAG.getConstantFP(Thresh, DL, TheVT);
20806
20807 EVT ResVT = getSetCCResultType(DAG.getDataLayout(),
20808 *DAG.getContext(), TheVT);
20809 SDValue Cmp;
20810 if (IsStrict) {
20811 Cmp = DAG.getSetCC(DL, ResVT, Value, ThreshVal, ISD::SETGE, Chain,
20812 /*IsSignaling*/ true);
20813 Chain = Cmp.getValue(1);
20814 } else {
20815 Cmp = DAG.getSetCC(DL, ResVT, Value, ThreshVal, ISD::SETGE);
20816 }
20817
20818 // Our preferred lowering of
20819 //
20820 // (Value >= Thresh) ? 0x8000000000000000ULL : 0
20821 //
20822 // is
20823 //
20824 // (Value >= Thresh) << 63
20825 //
20826 // but since we can get here after LegalOperations, DAGCombine might do the
20827 // wrong thing if we create a select. So, directly create the preferred
20828 // version.
20829 SDValue Zext = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i64, Cmp);
20830 SDValue Const63 = DAG.getConstant(63, DL, MVT::i8);
20831 Adjust = DAG.getNode(ISD::SHL, DL, MVT::i64, Zext, Const63);
20832
20833 SDValue FltOfs = DAG.getSelect(DL, TheVT, Cmp, ThreshVal,
20834 DAG.getConstantFP(0.0, DL, TheVT));
20835
20836 if (IsStrict) {
20837 Value = DAG.getNode(ISD::STRICT_FSUB, DL, { TheVT, MVT::Other},
20838 { Chain, Value, FltOfs });
20839 Chain = Value.getValue(1);
20840 } else
20841 Value = DAG.getNode(ISD::FSUB, DL, TheVT, Value, FltOfs);
20842 }
20843
20844 MachinePointerInfo MPI = MachinePointerInfo::getFixedStack(MF, SSFI);
20845
20846 // FIXME This causes a redundant load/store if the SSE-class value is already
20847 // in memory, such as if it is on the callstack.
20848 if (isScalarFPTypeInSSEReg(TheVT)) {
20849 assert(DstTy == MVT::i64 && "Invalid FP_TO_SINT to lower!");
20850 Chain = DAG.getStore(Chain, DL, Value, StackSlot, MPI);
20851 SDVTList Tys = DAG.getVTList(MVT::f80, MVT::Other);
20852 SDValue Ops[] = { Chain, StackSlot };
20853
20854 unsigned FLDSize = TheVT.getStoreSize();
20855 assert(FLDSize <= MemSize && "Stack slot not big enough");
20856 MachineMemOperand *MMO = MF.getMachineMemOperand(
20857 MPI, MachineMemOperand::MOLoad, FLDSize, Align(FLDSize));
20858 Value = DAG.getMemIntrinsicNode(X86ISD::FLD, DL, Tys, Ops, TheVT, MMO);
20859 Chain = Value.getValue(1);
20860 }
20861
20862 // Build the FP_TO_INT*_IN_MEM
20863 MachineMemOperand *MMO = MF.getMachineMemOperand(
20864 MPI, MachineMemOperand::MOStore, MemSize, Align(MemSize));
20865 SDValue Ops[] = { Chain, Value, StackSlot };
20866 SDValue FIST = DAG.getMemIntrinsicNode(X86ISD::FP_TO_INT_IN_MEM, DL,
20867 DAG.getVTList(MVT::Other),
20868 Ops, DstTy, MMO);
20869
20870 SDValue Res = DAG.getLoad(Op.getValueType(), DL, FIST, StackSlot, MPI);
20871 Chain = Res.getValue(1);
20872
20873 // If we need an unsigned fixup, XOR the result with adjust.
20874 if (UnsignedFixup)
20875 Res = DAG.getNode(ISD::XOR, DL, MVT::i64, Res, Adjust);
20876
20877 return Res;
20878}
20879
20880static SDValue LowerAVXExtend(SDValue Op, const SDLoc &dl, SelectionDAG &DAG,
20881 const X86Subtarget &Subtarget) {
20882 MVT VT = Op.getSimpleValueType();
20883 SDValue In = Op.getOperand(0);
20884 MVT InVT = In.getSimpleValueType();
20885 unsigned Opc = Op.getOpcode();
20886
20887 assert(VT.isVector() && InVT.isVector() && "Expected vector type");
20888 assert((Opc == ISD::ANY_EXTEND || Opc == ISD::ZERO_EXTEND) &&
20889 "Unexpected extension opcode");
20890 assert(VT.getVectorNumElements() == InVT.getVectorNumElements() &&
20891 "Expected same number of elements");
20892 assert((VT.getVectorElementType() == MVT::i16 ||
20893 VT.getVectorElementType() == MVT::i32 ||
20894 VT.getVectorElementType() == MVT::i64) &&
20895 "Unexpected element type");
20896 assert((InVT.getVectorElementType() == MVT::i8 ||
20897 InVT.getVectorElementType() == MVT::i16 ||
20898 InVT.getVectorElementType() == MVT::i32) &&
20899 "Unexpected element type");
20900
20901 unsigned ExtendInVecOpc = DAG.getOpcode_EXTEND_VECTOR_INREG(Opc);
20902
20903 if (VT == MVT::v32i16 && !Subtarget.hasBWI()) {
20904 assert(InVT == MVT::v32i8 && "Unexpected VT!");
20905 return splitVectorIntUnary(Op, DAG, dl);
20906 }
20907
20908 if (Subtarget.hasInt256())
20909 return Op;
20910
20911 // Optimize vectors in AVX mode:
20912 //
20913 // v8i16 -> v8i32
20914 // Use vpmovzwd for 4 lower elements v8i16 -> v4i32.
20915 // Use vpunpckhwd for 4 upper elements v8i16 -> v4i32.
20916 // Concat upper and lower parts.
20917 //
20918 // v4i32 -> v4i64
20919 // Use vpmovzdq for 4 lower elements v4i32 -> v2i64.
20920 // Use vpunpckhdq for 4 upper elements v4i32 -> v2i64.
20921 // Concat upper and lower parts.
20922 //
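// NOTE (editorial sketch, not part of the LLVM source): for the zero-extend
// v8i16 -> v8i32 case this corresponds roughly to the following intrinsics
// (illustrative only, assuming SSE4.1 + AVX):
//
//   #include <immintrin.h>
//   static __m256i zext_v8i16_to_v8i32(__m128i v) {
//     __m128i lo = _mm_cvtepu16_epi32(v);                      // vpmovzxwd
//     __m128i hi = _mm_unpackhi_epi16(v, _mm_setzero_si128()); // vpunpckhwd
//     return _mm256_set_m128i(hi, lo);                         // concat halves
//   }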
20923 MVT HalfVT = VT.getHalfNumVectorElementsVT();
20924 SDValue OpLo = DAG.getNode(ExtendInVecOpc, dl, HalfVT, In);
20925
20926 // Short-circuit if we can determine that each 128-bit half is the same value.
20927 // Otherwise, this is difficult to match and optimize.
20928 if (auto *Shuf = dyn_cast<ShuffleVectorSDNode>(In))
20929 if (hasIdenticalHalvesShuffleMask(Shuf->getMask()))
20930 return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, OpLo, OpLo);
20931
20932 SDValue ZeroVec = DAG.getConstant(0, dl, InVT);
20933 SDValue Undef = DAG.getUNDEF(InVT);
20934 bool NeedZero = Opc == ISD::ZERO_EXTEND;
20935 SDValue OpHi = getUnpackh(DAG, dl, InVT, In, NeedZero ? ZeroVec : Undef);
20936 OpHi = DAG.getBitcast(HalfVT, OpHi);
20937
20938 return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, OpLo, OpHi);
20939}
20940
20941// Helper to split and extend a v16i1 mask to v16i8 or v16i16.
20942static SDValue SplitAndExtendv16i1(unsigned ExtOpc, MVT VT, SDValue In,
20943 const SDLoc &dl, SelectionDAG &DAG) {
20944 assert((VT == MVT::v16i8 || VT == MVT::v16i16) && "Unexpected VT.");
20945 SDValue Lo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v8i1, In,
20946 DAG.getVectorIdxConstant(0, dl));
20947 SDValue Hi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v8i1, In,
20948 DAG.getVectorIdxConstant(8, dl));
20949 Lo = DAG.getNode(ExtOpc, dl, MVT::v8i16, Lo);
20950 Hi = DAG.getNode(ExtOpc, dl, MVT::v8i16, Hi);
20951 SDValue Res = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v16i16, Lo, Hi);
20952 return DAG.getNode(ISD::TRUNCATE, dl, VT, Res);
20953}
20954
20955static SDValue LowerZERO_EXTEND_Mask(SDValue Op, const SDLoc &DL,
20956 const X86Subtarget &Subtarget,
20957 SelectionDAG &DAG) {
20958 MVT VT = Op->getSimpleValueType(0);
20959 SDValue In = Op->getOperand(0);
20960 MVT InVT = In.getSimpleValueType();
20961 assert(InVT.getVectorElementType() == MVT::i1 && "Unexpected input type!");
20962 unsigned NumElts = VT.getVectorNumElements();
20963
20964 // For all vectors except vXi8 we can just emit a sign_extend and a shift. This
20965 // avoids a constant pool load.
20966 if (VT.getVectorElementType() != MVT::i8) {
20967 SDValue Extend = DAG.getNode(ISD::SIGN_EXTEND, DL, VT, In);
20968 return DAG.getNode(ISD::SRL, DL, VT, Extend,
20969 DAG.getConstant(VT.getScalarSizeInBits() - 1, DL, VT));
20970 }
20971
20972 // Extend VT if BWI is not supported.
20973 MVT ExtVT = VT;
20974 if (!Subtarget.hasBWI()) {
20975 // If v16i32 is to be avoided, we'll need to split and concatenate.
20976 if (NumElts == 16 && !Subtarget.canExtendTo512DQ())
20977 return SplitAndExtendv16i1(ISD::ZERO_EXTEND, VT, In, DL, DAG);
20978
20979 ExtVT = MVT::getVectorVT(MVT::i32, NumElts);
20980 }
20981
20982 // Widen to 512-bits if VLX is not supported.
20983 MVT WideVT = ExtVT;
20984 if (!ExtVT.is512BitVector() && !Subtarget.hasVLX()) {
20985 NumElts *= 512 / ExtVT.getSizeInBits();
20986 InVT = MVT::getVectorVT(MVT::i1, NumElts);
20987 In = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, InVT, DAG.getUNDEF(InVT), In,
20988 DAG.getVectorIdxConstant(0, DL));
20989 WideVT = MVT::getVectorVT(ExtVT.getVectorElementType(), NumElts);
20990 }
20991
20992 SDValue One = DAG.getConstant(1, DL, WideVT);
20993 SDValue Zero = DAG.getConstant(0, DL, WideVT);
20994
20995 SDValue SelectedVal = DAG.getSelect(DL, WideVT, In, One, Zero);
20996
20997 // Truncate if we had to extend above.
20998 if (VT != ExtVT) {
20999 WideVT = MVT::getVectorVT(MVT::i8, NumElts);
21000 SelectedVal = DAG.getNode(ISD::TRUNCATE, DL, WideVT, SelectedVal);
21001 }
21002
21003 // Extract back to 128/256-bit if we widened.
21004 if (WideVT != VT)
21005 SelectedVal = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, SelectedVal,
21006 DAG.getVectorIdxConstant(0, DL));
21007
21008 return SelectedVal;
21009}
21010
21011static SDValue LowerZERO_EXTEND(SDValue Op, const X86Subtarget &Subtarget,
21012 SelectionDAG &DAG) {
21013 SDValue In = Op.getOperand(0);
21014 MVT SVT = In.getSimpleValueType();
21015 SDLoc DL(Op);
21016
21017 if (SVT.getVectorElementType() == MVT::i1)
21018 return LowerZERO_EXTEND_Mask(Op, DL, Subtarget, DAG);
21019
21020 assert(Subtarget.hasAVX() && "Expected AVX support");
21021 return LowerAVXExtend(Op, DL, DAG, Subtarget);
21022}
21023
21024/// Helper to recursively truncate vector elements in half with PACKSS/PACKUS.
21025/// It makes use of the fact that vectors with enough leading sign/zero bits
21026/// prevent the PACKSS/PACKUS from saturating the results.
21027/// AVX2 (Int256) sub-targets require extra shuffling as the PACK*S operates
21028/// within each 128-bit lane.
21029static SDValue truncateVectorWithPACK(unsigned Opcode, EVT DstVT, SDValue In,
21030 const SDLoc &DL, SelectionDAG &DAG,
21031 const X86Subtarget &Subtarget) {
21032 assert((Opcode == X86ISD::PACKSS || Opcode == X86ISD::PACKUS) &&
21033 "Unexpected PACK opcode");
21034 assert(DstVT.isVector() && "VT not a vector?");
21035
21036 // Requires SSE2 for PACKSS (SSE41 PACKUSDW is handled below).
21037 if (!Subtarget.hasSSE2())
21038 return SDValue();
21039
21040 EVT SrcVT = In.getValueType();
21041
21042 // No truncation required, we might get here due to recursive calls.
21043 if (SrcVT == DstVT)
21044 return In;
21045
21046 unsigned NumElems = SrcVT.getVectorNumElements();
21047 if (NumElems < 2 || !isPowerOf2_32(NumElems))
21048 return SDValue();
21049
21050 unsigned DstSizeInBits = DstVT.getSizeInBits();
21051 unsigned SrcSizeInBits = SrcVT.getSizeInBits();
21052 assert(DstVT.getVectorNumElements() == NumElems && "Illegal truncation");
21053 assert(SrcSizeInBits > DstSizeInBits && "Illegal truncation");
21054
21055 LLVMContext &Ctx = *DAG.getContext();
21056 EVT PackedSVT = EVT::getIntegerVT(Ctx, SrcVT.getScalarSizeInBits() / 2);
21057 EVT PackedVT = EVT::getVectorVT(Ctx, PackedSVT, NumElems);
21058
21059 // Pack to the largest type possible:
21060 // vXi64/vXi32 -> PACK*SDW and vXi16 -> PACK*SWB.
21061 EVT InVT = MVT::i16, OutVT = MVT::i8;
21062 if (SrcVT.getScalarSizeInBits() > 16 &&
21063 (Opcode == X86ISD::PACKSS || Subtarget.hasSSE41())) {
21064 InVT = MVT::i32;
21065 OutVT = MVT::i16;
21066 }
21067
21068 // Sub-128-bit truncation - widen to 128-bit src and pack in the lower half.
21069 // On pre-AVX512, pack the src in both halves to help value tracking.
21070 if (SrcSizeInBits <= 128) {
21071 InVT = EVT::getVectorVT(Ctx, InVT, 128 / InVT.getSizeInBits());
21072 OutVT = EVT::getVectorVT(Ctx, OutVT, 128 / OutVT.getSizeInBits());
21073 In = widenSubVector(In, false, Subtarget, DAG, DL, 128);
21074 SDValue LHS = DAG.getBitcast(InVT, In);
21075 SDValue RHS = Subtarget.hasAVX512() ? DAG.getUNDEF(InVT) : LHS;
21076 SDValue Res = DAG.getNode(Opcode, DL, OutVT, LHS, RHS);
21077 Res = extractSubVector(Res, 0, DAG, DL, SrcSizeInBits / 2);
21078 Res = DAG.getBitcast(PackedVT, Res);
21079 return truncateVectorWithPACK(Opcode, DstVT, Res, DL, DAG, Subtarget);
21080 }
21081
21082 // Split lower/upper subvectors.
21083 SDValue Lo, Hi;
21084 std::tie(Lo, Hi) = splitVector(In, DAG, DL);
21085
21086 // If Hi is undef, then don't bother packing it and widen the result instead.
21087 if (Hi.isUndef()) {
21088 EVT DstHalfVT = DstVT.getHalfNumVectorElementsVT(Ctx);
21089 if (SDValue Res =
21090 truncateVectorWithPACK(Opcode, DstHalfVT, Lo, DL, DAG, Subtarget))
21091 return widenSubVector(Res, false, Subtarget, DAG, DL, DstSizeInBits);
21092 }
21093
21094 unsigned SubSizeInBits = SrcSizeInBits / 2;
21095 InVT = EVT::getVectorVT(Ctx, InVT, SubSizeInBits / InVT.getSizeInBits());
21096 OutVT = EVT::getVectorVT(Ctx, OutVT, SubSizeInBits / OutVT.getSizeInBits());
21097
21098 // 256bit -> 128bit truncate - PACK lower/upper 128-bit subvectors.
21099 if (SrcVT.is256BitVector() && DstVT.is128BitVector()) {
21100 Lo = DAG.getBitcast(InVT, Lo);
21101 Hi = DAG.getBitcast(InVT, Hi);
21102 SDValue Res = DAG.getNode(Opcode, DL, OutVT, Lo, Hi);
21103 return DAG.getBitcast(DstVT, Res);
21104 }
21105
21106 // AVX2: 512bit -> 256bit truncate - PACK lower/upper 256-bit subvectors.
21107 // AVX2: 512bit -> 128bit truncate - PACK(PACK, PACK).
21108 if (SrcVT.is512BitVector() && Subtarget.hasInt256()) {
21109 Lo = DAG.getBitcast(InVT, Lo);
21110 Hi = DAG.getBitcast(InVT, Hi);
21111 SDValue Res = DAG.getNode(Opcode, DL, OutVT, Lo, Hi);
21112
21113 // 256-bit PACK(ARG0, ARG1) leaves us with ((LO0,LO1),(HI0,HI1)),
21114 // so we need to shuffle to get ((LO0,HI0),(LO1,HI1)).
21115 // Scale shuffle mask to avoid bitcasts and help ComputeNumSignBits.
21116 SmallVector<int, 64> Mask;
21117 int Scale = 64 / OutVT.getScalarSizeInBits();
21118 narrowShuffleMaskElts(Scale, { 0, 2, 1, 3 }, Mask);
21119 Res = DAG.getVectorShuffle(OutVT, DL, Res, Res, Mask);
21120
21121 if (DstVT.is256BitVector())
21122 return DAG.getBitcast(DstVT, Res);
21123
21124 // If 512bit -> 128bit truncate another stage.
21125 Res = DAG.getBitcast(PackedVT, Res);
21126 return truncateVectorWithPACK(Opcode, DstVT, Res, DL, DAG, Subtarget);
21127 }
21128
21129 // Recursively pack lower/upper subvectors, concat result and pack again.
21130 assert(SrcSizeInBits >= 256 && "Expected 256-bit vector or greater");
21131
21132 if (PackedVT.is128BitVector()) {
21133 // Avoid CONCAT_VECTORS on sub-128bit nodes as these can fail after
21134 // type legalization.
21135 SDValue Res =
21136 truncateVectorWithPACK(Opcode, PackedVT, In, DL, DAG, Subtarget);
21137 return truncateVectorWithPACK(Opcode, DstVT, Res, DL, DAG, Subtarget);
21138 }
21139
21140 EVT HalfPackedVT = EVT::getVectorVT(Ctx, PackedSVT, NumElems / 2);
21141 Lo = truncateVectorWithPACK(Opcode, HalfPackedVT, Lo, DL, DAG, Subtarget);
21142 Hi = truncateVectorWithPACK(Opcode, HalfPackedVT, Hi, DL, DAG, Subtarget);
21143 SDValue Res = DAG.getNode(ISD::CONCAT_VECTORS, DL, PackedVT, Lo, Hi);
21144 return truncateVectorWithPACK(Opcode, DstVT, Res, DL, DAG, Subtarget);
21145}
21146
21147/// Truncate using inreg zero extension (AND mask) and X86ISD::PACKUS.
21148/// e.g. trunc <8 x i32> X to <8 x i16> -->
21149/// MaskX = X & 0xffff (clear high bits to prevent saturation)
21150/// packus (extract_subv MaskX, 0), (extract_subv MaskX, 1)
21151static SDValue truncateVectorWithPACKUS(MVT DstVT, SDValue In, const SDLoc &DL,
21152 const X86Subtarget &Subtarget,
21153 SelectionDAG &DAG) {
21154 In = DAG.getZeroExtendInReg(In, DL, DstVT);
21155 return truncateVectorWithPACK(X86ISD::PACKUS, DstVT, In, DL, DAG, Subtarget);
21156}
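// NOTE (editorial sketch, not part of the LLVM source): the masked-PACKUS idea
// above, written with SSE4.1 intrinsics for a 2 x v4i32 -> v8i16 truncation
// (illustrative only):
//
//   #include <smmintrin.h>
//   static __m128i trunc_2xv4i32_to_v8i16(__m128i lo, __m128i hi) {
//     const __m128i mask = _mm_set1_epi32(0xffff); // clear the high 16 bits so
//     lo = _mm_and_si128(lo, mask);                // the saturating pack below
//     hi = _mm_and_si128(hi, mask);                // cannot clamp any lane
//     return _mm_packus_epi32(lo, hi);             // packusdw
//   }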
21157
21158/// Truncate using inreg sign extension and X86ISD::PACKSS.
21159static SDValue truncateVectorWithPACKSS(MVT DstVT, SDValue In, const SDLoc &DL,
21160 const X86Subtarget &Subtarget,
21161 SelectionDAG &DAG) {
21162 EVT SrcVT = In.getValueType();
21163 In = DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, SrcVT, In,
21164 DAG.getValueType(DstVT));
21165 return truncateVectorWithPACK(X86ISD::PACKSS, DstVT, In, DL, DAG, Subtarget);
21166}
21167
21168/// Helper to determine if \p In truncated to \p DstVT has the necessary
21169/// signbits / leading zero bits to be truncated with PACKSS / PACKUS,
21170/// possibly by converting a SRL node to SRA for sign extension.
21171static SDValue matchTruncateWithPACK(unsigned &PackOpcode, EVT DstVT,
21172 SDValue In, const SDLoc &DL,
21173 SelectionDAG &DAG,
21174 const X86Subtarget &Subtarget,
21175 const SDNodeFlags Flags = SDNodeFlags()) {
21176 // Requires SSE2.
21177 if (!Subtarget.hasSSE2())
21178 return SDValue();
21179
21180 EVT SrcVT = In.getValueType();
21181 EVT DstSVT = DstVT.getVectorElementType();
21182 EVT SrcSVT = SrcVT.getVectorElementType();
21183 unsigned NumDstEltBits = DstSVT.getSizeInBits();
21184 unsigned NumSrcEltBits = SrcSVT.getSizeInBits();
21185
21186 // Check we have a truncation suited for PACKSS/PACKUS.
21187 if (!((SrcSVT == MVT::i16 || SrcSVT == MVT::i32 || SrcSVT == MVT::i64) &&
21188 (DstSVT == MVT::i8 || DstSVT == MVT::i16 || DstSVT == MVT::i32)))
21189 return SDValue();
21190
21191 assert(NumSrcEltBits > NumDstEltBits && "Bad truncation");
21192 unsigned NumStages = Log2_32(NumSrcEltBits / NumDstEltBits);
21193
21194 // Truncation from 128-bit to vXi32 can be better handled with PSHUFD.
21195 // Truncation to sub-64-bit vXi16 can be better handled with PSHUFD/PSHUFLW.
21196 // Truncation from v2i64 to v2i8 can be better handled with PSHUFB.
21197 if ((DstSVT == MVT::i32 && SrcVT.getSizeInBits() <= 128) ||
21198 (DstSVT == MVT::i16 && SrcVT.getSizeInBits() <= (64 * NumStages)) ||
21199 (DstVT == MVT::v2i8 && SrcVT == MVT::v2i64 && Subtarget.hasSSSE3()))
21200 return SDValue();
21201
21202 // Prefer to lower v4i64 -> v4i32 as a shuffle unless we can cheaply
21203 // split this for packing.
21204 if (SrcVT == MVT::v4i64 && DstVT == MVT::v4i32 &&
21205 !isFreeToSplitVector(In, DAG) &&
21206 (!Subtarget.hasAVX() || DAG.ComputeNumSignBits(In) != 64))
21207 return SDValue();
21208
21209 // Don't truncate on AVX512 targets as multiple PACK node stages.
21210 if (Subtarget.hasAVX512() && NumStages > 1)
21211 return SDValue();
21212
21213 unsigned NumPackedSignBits = std::min<unsigned>(NumDstEltBits, 16);
21214 unsigned NumPackedZeroBits = Subtarget.hasSSE41() ? NumPackedSignBits : 8;
21215
21216 // Truncate with PACKUS if we are truncating a vector with leading zero
21217 // bits that extend all the way to the packed/truncated value.
21218 // e.g. Masks, zext_in_reg, etc.
21219 // Pre-SSE41 we can only use PACKUSWB.
21220 KnownBits Known = DAG.computeKnownBits(In);
21221 if ((Flags.hasNoUnsignedWrap() && NumDstEltBits <= NumPackedZeroBits) ||
21222 (NumSrcEltBits - NumPackedZeroBits) <= Known.countMinLeadingZeros()) {
21223 PackOpcode = X86ISD::PACKUS;
21224 return In;
21225 }
21226
21227 // Truncate with PACKSS if we are truncating a vector with sign-bits
21228 // that extend all the way to the packed/truncated value.
21229 // e.g. Comparison result, sext_in_reg, etc.
21230 unsigned NumSignBits = DAG.ComputeNumSignBits(In);
21231
21232 // Don't use PACKSS for vXi64 -> vXi32 truncations unless we're dealing with
21233 // a sign splat (or AVX512 VPSRAQ support). ComputeNumSignBits struggles to
21234 // see through BITCASTs later on and combines/simplifications can't then use
21235 // it.
21236 if (DstSVT == MVT::i32 && NumSignBits != NumSrcEltBits &&
21237 !Subtarget.hasAVX512())
21238 return SDValue();
21239
21240 unsigned MinSignBits = NumSrcEltBits - NumPackedSignBits;
21241 if ((Flags.hasNoSignedWrap() && DstSVT != MVT::i32) ||
21242 MinSignBits < NumSignBits) {
21243 PackOpcode = X86ISD::PACKSS;
21244 return In;
21245 }
21246
21247 // If we have a srl that only generates signbits that we will discard in
21248 // the truncation then we can use PACKSS by converting the srl to a sra.
21249 // SimplifyDemandedBits often relaxes sra to srl so we need to reverse it.
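// NOTE (editorial, not part of the LLVM source): e.g. truncating
// (srl v4i32 X, 16) to v4i16 keeps only bits [31:16] of each lane; rewriting
// the shift as (sra X, 16) yields the same low 16 bits but with 16 copies of
// the sign bit above them, which PACKSSDW then packs without saturating.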
21250 if (In.getOpcode() == ISD::SRL && In->hasOneUse())
21251 if (std::optional<unsigned> ShAmt = DAG.getValidShiftAmount(In)) {
21252 if (*ShAmt == MinSignBits) {
21253 PackOpcode = X86ISD::PACKSS;
21254 return DAG.getNode(ISD::SRA, DL, SrcVT, In->ops());
21255 }
21256 }
21257
21258 return SDValue();
21259}
21260
21261/// This function lowers a vector truncation of 'extended sign-bits' or
21262/// 'extended zero-bits' values.
21263/// vXi16/vXi32/vXi64 to vXi8/vXi16/vXi32 into X86ISD::PACKSS/PACKUS operations.
21264static SDValue LowerTruncateVecPackWithSignBits(
21265 MVT DstVT, SDValue In, const SDLoc &DL, const X86Subtarget &Subtarget,
21266 SelectionDAG &DAG, const SDNodeFlags Flags = SDNodeFlags()) {
21267 MVT SrcVT = In.getSimpleValueType();
21268 MVT DstSVT = DstVT.getVectorElementType();
21269 MVT SrcSVT = SrcVT.getVectorElementType();
21270 if (!((SrcSVT == MVT::i16 || SrcSVT == MVT::i32 || SrcSVT == MVT::i64) &&
21271 (DstSVT == MVT::i8 || DstSVT == MVT::i16 || DstSVT == MVT::i32)))
21272 return SDValue();
21273
21274 // If the upper half of the source is undef, then attempt to split and
21275 // only truncate the lower half.
21276 if (DstVT.getSizeInBits() >= 128) {
21277 if (SDValue Lo = isUpperSubvectorUndef(In, DL, DAG)) {
21278 MVT DstHalfVT = DstVT.getHalfNumVectorElementsVT();
21279 if (SDValue Res = LowerTruncateVecPackWithSignBits(DstHalfVT, Lo, DL,
21280 Subtarget, DAG))
21281 return widenSubVector(Res, false, Subtarget, DAG, DL,
21282 DstVT.getSizeInBits());
21283 }
21284 }
21285
21286 unsigned PackOpcode;
21287 if (SDValue Src = matchTruncateWithPACK(PackOpcode, DstVT, In, DL, DAG,
21288 Subtarget, Flags))
21289 return truncateVectorWithPACK(PackOpcode, DstVT, Src, DL, DAG, Subtarget);
21290
21291 return SDValue();
21292}
21293
21294/// This function lowers a vector truncation from vXi32/vXi64 to vXi8/vXi16 into
21295/// X86ISD::PACKUS/X86ISD::PACKSS operations.
21296static SDValue LowerTruncateVecPack(MVT DstVT, SDValue In, const SDLoc &DL,
21297 const X86Subtarget &Subtarget,
21298 SelectionDAG &DAG) {
21299 MVT SrcVT = In.getSimpleValueType();
21300 MVT DstSVT = DstVT.getVectorElementType();
21301 MVT SrcSVT = SrcVT.getVectorElementType();
21302 unsigned NumElems = DstVT.getVectorNumElements();
21303 if (!((SrcSVT == MVT::i16 || SrcSVT == MVT::i32 || SrcSVT == MVT::i64) &&
21304 (DstSVT == MVT::i8 || DstSVT == MVT::i16) && isPowerOf2_32(NumElems) &&
21305 NumElems >= 8))
21306 return SDValue();
21307
21308 // SSSE3's pshufb results in fewer instructions in the cases below.
21309 if (Subtarget.hasSSSE3() && NumElems == 8) {
21310 if (SrcSVT == MVT::i16)
21311 return SDValue();
21312 if (SrcSVT == MVT::i32 && (DstSVT == MVT::i8 || !Subtarget.hasSSE41()))
21313 return SDValue();
21314 }
21315
21316 // If the upper half of the source is undef, then attempt to split and
21317 // only truncate the lower half.
21318 if (DstVT.getSizeInBits() >= 128) {
21319 if (SDValue Lo = isUpperSubvectorUndef(In, DL, DAG)) {
21320 MVT DstHalfVT = DstVT.getHalfNumVectorElementsVT();
21321 if (SDValue Res = LowerTruncateVecPack(DstHalfVT, Lo, DL, Subtarget, DAG))
21322 return widenSubVector(Res, false, Subtarget, DAG, DL,
21323 DstVT.getSizeInBits());
21324 }
21325 }
21326
21327 // SSE2 provides PACKUS for only 2 x v8i16 -> v16i8 and SSE4.1 provides PACKUS
21328 // for 2 x v4i32 -> v8i16. For SSSE3 and below, we need to use PACKSS to
21329 // truncate 2 x v4i32 to v8i16.
21330 if (Subtarget.hasSSE41() || DstSVT == MVT::i8)
21331 return truncateVectorWithPACKUS(DstVT, In, DL, Subtarget, DAG);
21332
21333 if (SrcSVT == MVT::i16 || SrcSVT == MVT::i32)
21334 return truncateVectorWithPACKSS(DstVT, In, DL, Subtarget, DAG);
21335
21336 // Special case vXi64 -> vXi16, shuffle to vXi32 and then use PACKSS.
21337 if (DstSVT == MVT::i16 && SrcSVT == MVT::i64) {
21338 MVT TruncVT = MVT::getVectorVT(MVT::i32, NumElems);
21339 SDValue Trunc = DAG.getNode(ISD::TRUNCATE, DL, TruncVT, In);
21340 return truncateVectorWithPACKSS(DstVT, Trunc, DL, Subtarget, DAG);
21341 }
21342
21343 return SDValue();
21344}
21345
21346static SDValue LowerTruncateVecI1(SDValue Op, const SDLoc &DL,
21347 SelectionDAG &DAG,
21348 const X86Subtarget &Subtarget) {
21349 MVT VT = Op.getSimpleValueType();
21350 SDValue In = Op.getOperand(0);
21351 MVT InVT = In.getSimpleValueType();
21352 assert(VT.getVectorElementType() == MVT::i1 && "Unexpected vector type.");
21353
21354 // Shift LSB to MSB and use VPMOVB/W2M or TESTD/Q.
21355 unsigned ShiftInx = InVT.getScalarSizeInBits() - 1;
21356 if (InVT.getScalarSizeInBits() <= 16) {
21357 if (Subtarget.hasBWI()) {
21358 // legal, will go to VPMOVB2M, VPMOVW2M
21359 if (DAG.ComputeNumSignBits(In) < InVT.getScalarSizeInBits()) {
21360 // We need to shift to get the lsb into sign position.
21361 // Shifting packed bytes isn't supported natively, so bitcast to words.
21362 MVT ExtVT = MVT::getVectorVT(MVT::i16, InVT.getSizeInBits()/16);
21363 In = DAG.getNode(ISD::SHL, DL, ExtVT,
21364 DAG.getBitcast(ExtVT, In),
21365 DAG.getConstant(ShiftInx, DL, ExtVT));
21366 In = DAG.getBitcast(InVT, In);
21367 }
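// With the low bit now in the sign position, the signed compare "0 > In"
// is true exactly for the elements whose original LSB was set.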
21368 return DAG.getSetCC(DL, VT, DAG.getConstant(0, DL, InVT),
21369 In, ISD::SETGT);
21370 }
21371 // Use TESTD/Q after extending the vector to packed dword/qword elements.
21372 assert((InVT.is256BitVector() || InVT.is128BitVector()) &&
21373 "Unexpected vector type.");
21374 unsigned NumElts = InVT.getVectorNumElements();
21375 assert((NumElts == 8 || NumElts == 16) && "Unexpected number of elements");
21376 // We need to change to a wider element type that we have support for.
21377 // For 8 element vectors this is easy, we either extend to v8i32 or v8i64.
21378 // For 16 element vectors we extend to v16i32 unless we are explicitly
21379 // trying to avoid 512-bit vectors. If we are avoiding 512-bit vectors
21380 // we need to split into two 8 element vectors which we can extend to v8i32,
21381 // truncate and concat the results. There's an additional complication if
21382 // the original type is v16i8. In that case we can't split the v16i8
21383 // directly, so we need to shuffle high elements to low and use
21384 // sign_extend_vector_inreg.
21385 if (NumElts == 16 && !Subtarget.canExtendTo512DQ()) {
21386 SDValue Lo, Hi;
21387 if (InVT == MVT::v16i8) {
21388 Lo = DAG.getNode(ISD::SIGN_EXTEND_VECTOR_INREG, DL, MVT::v8i32, In);
21389 Hi = DAG.getVectorShuffle(
21390 InVT, DL, In, In,
21391 {8, 9, 10, 11, 12, 13, 14, 15, -1, -1, -1, -1, -1, -1, -1, -1});
21392 Hi = DAG.getNode(ISD::SIGN_EXTEND_VECTOR_INREG, DL, MVT::v8i32, Hi);
21393 } else {
21394 assert(InVT == MVT::v16i16 && "Unexpected VT!");
21395 Lo = extract128BitVector(In, 0, DAG, DL);
21396 Hi = extract128BitVector(In, 8, DAG, DL);
21397 }
21398 // We're split now, just emit two truncates and a concat. The two
21399 // truncates will trigger legalization to come back to this function.
21400 Lo = DAG.getNode(ISD::TRUNCATE, DL, MVT::v8i1, Lo);
21401 Hi = DAG.getNode(ISD::TRUNCATE, DL, MVT::v8i1, Hi);
21402 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Lo, Hi);
21403 }
21404 // We either have 8 elements or we're allowed to use 512-bit vectors.
21405 // If we have VLX, we want to use the narrowest vector that can get the
21406 // job done so we use vXi32.
21407 MVT EltVT = Subtarget.hasVLX() ? MVT::i32 : MVT::getIntegerVT(512/NumElts);
21408 MVT ExtVT = MVT::getVectorVT(EltVT, NumElts);
21409 In = DAG.getNode(ISD::SIGN_EXTEND, DL, ExtVT, In);
21410 InVT = ExtVT;
21411 ShiftInx = InVT.getScalarSizeInBits() - 1;
21412 }
21413
21414 if (DAG.ComputeNumSignBits(In) < InVT.getScalarSizeInBits()) {
21415 // We need to shift to get the lsb into sign position.
21416 In = DAG.getNode(ISD::SHL, DL, InVT, In,
21417 DAG.getConstant(ShiftInx, DL, InVT));
21418 }
21419 // If we have DQI, emit a pattern that will be iseled as vpmovq2m/vpmovd2m.
21420 if (Subtarget.hasDQI())
21421 return DAG.getSetCC(DL, VT, DAG.getConstant(0, DL, InVT), In, ISD::SETGT);
21422 return DAG.getSetCC(DL, VT, In, DAG.getConstant(0, DL, InVT), ISD::SETNE);
21423}
21424
21425SDValue X86TargetLowering::LowerTRUNCATE(SDValue Op, SelectionDAG &DAG) const {
21426 SDLoc DL(Op);
21427 MVT VT = Op.getSimpleValueType();
21428 SDValue In = Op.getOperand(0);
21429 MVT InVT = In.getSimpleValueType();
21431 "Invalid TRUNCATE operation");
21432
21433 // If we're called by the type legalizer, handle a few cases.
21434 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
21435 if (!TLI.isTypeLegal(VT) || !TLI.isTypeLegal(InVT)) {
21436 if ((InVT == MVT::v8i64 || InVT == MVT::v16i32 || InVT == MVT::v16i64) &&
21437 VT.is128BitVector() && Subtarget.hasAVX512()) {
21438 assert((InVT == MVT::v16i64 || Subtarget.hasVLX()) &&
21439 "Unexpected subtarget!");
21440 // The default behavior is to truncate one step, concatenate, and then
21441 // truncate the remainder. We'd rather produce two 64-bit results and
21442 // concatenate those.
21443 SDValue Lo, Hi;
21444 std::tie(Lo, Hi) = DAG.SplitVector(In, DL);
21445
21446 EVT LoVT, HiVT;
21447 std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(VT);
21448
21449 Lo = DAG.getNode(ISD::TRUNCATE, DL, LoVT, Lo);
21450 Hi = DAG.getNode(ISD::TRUNCATE, DL, HiVT, Hi);
21451 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Lo, Hi);
21452 }
21453
21454 // Pre-AVX512 (or prefer-256bit) see if we can make use of PACKSS/PACKUS.
21455 if (!Subtarget.hasAVX512() ||
21456 (InVT.is512BitVector() && VT.is256BitVector()))
21457 if (SDValue SignPack = LowerTruncateVecPackWithSignBits(
21458 VT, In, DL, Subtarget, DAG, Op->getFlags()))
21459 return SignPack;
21460
21461 // Pre-AVX512 see if we can make use of PACKSS/PACKUS.
21462 if (!Subtarget.hasAVX512())
21463 return LowerTruncateVecPack(VT, In, DL, Subtarget, DAG);
21464
21465 // Otherwise let default legalization handle it.
21466 return SDValue();
21467 }
21468
21469 if (VT.getVectorElementType() == MVT::i1)
21470 return LowerTruncateVecI1(Op, DL, DAG, Subtarget);
21471
21472 // Attempt to truncate with PACKUS/PACKSS even on AVX512 if we'd have to
21473 // concat from subvectors to use VPTRUNC etc.
21474 if (!Subtarget.hasAVX512() || isFreeToSplitVector(In, DAG))
21475 if (SDValue SignPack = LowerTruncateVecPackWithSignBits(
21476 VT, In, DL, Subtarget, DAG, Op->getFlags()))
21477 return SignPack;
21478
21479 // vpmovqb/w/d, vpmovdb/w, vpmovwb
21480 if (Subtarget.hasAVX512()) {
21481 if (InVT == MVT::v32i16 && !Subtarget.hasBWI()) {
21482 assert(VT == MVT::v32i8 && "Unexpected VT!");
21483 return splitVectorIntUnary(Op, DAG, DL);
21484 }
21485
21486 // Word to byte only under BWI. Otherwise we have to promote to v16i32
21487 // and then truncate that. But we should only do that if we haven't been
21488 // asked to avoid 512-bit vectors. The actual promotion to v16i32 will be
21489 // handled by isel patterns.
21490 if (InVT != MVT::v16i16 || Subtarget.hasBWI() ||
21491 Subtarget.canExtendTo512DQ())
21492 return Op;
21493 }
21494
21495 // Handle truncation of V256 to V128 using shuffles.
21496 assert(VT.is128BitVector() && InVT.is256BitVector() && "Unexpected types!");
21497
21498 if ((VT == MVT::v4i32) && (InVT == MVT::v4i64)) {
21499 // On AVX2, v4i64 -> v4i32 becomes VPERMD.
21500 if (Subtarget.hasInt256()) {
21501 static const int ShufMask[] = {0, 2, 4, 6, -1, -1, -1, -1};
21502 In = DAG.getBitcast(MVT::v8i32, In);
21503 In = DAG.getVectorShuffle(MVT::v8i32, DL, In, In, ShufMask);
21504 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, In,
21505 DAG.getVectorIdxConstant(0, DL));
21506 }
21507
21508 SDValue OpLo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v2i64, In,
21509 DAG.getVectorIdxConstant(0, DL));
21510 SDValue OpHi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v2i64, In,
21511 DAG.getVectorIdxConstant(2, DL));
21512 static const int ShufMask[] = {0, 2, 4, 6};
21513 return DAG.getVectorShuffle(VT, DL, DAG.getBitcast(MVT::v4i32, OpLo),
21514 DAG.getBitcast(MVT::v4i32, OpHi), ShufMask);
21515 }
21516
21517 if ((VT == MVT::v8i16) && (InVT == MVT::v8i32)) {
21518 // On AVX2, v8i32 -> v8i16 becomes PSHUFB.
21519 if (Subtarget.hasInt256()) {
21520 // The PSHUFB mask:
21521 static const int ShufMask1[] = { 0, 1, 4, 5, 8, 9, 12, 13,
21522 -1, -1, -1, -1, -1, -1, -1, -1,
21523 16, 17, 20, 21, 24, 25, 28, 29,
21524 -1, -1, -1, -1, -1, -1, -1, -1 };
21525 In = DAG.getBitcast(MVT::v32i8, In);
21526 In = DAG.getVectorShuffle(MVT::v32i8, DL, In, In, ShufMask1);
21527 In = DAG.getBitcast(MVT::v4i64, In);
21528
21529 static const int ShufMask2[] = {0, 2, -1, -1};
21530 In = DAG.getVectorShuffle(MVT::v4i64, DL, In, In, ShufMask2);
21531 In = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v2i64, In,
21532 DAG.getVectorIdxConstant(0, DL));
21533 return DAG.getBitcast(MVT::v8i16, In);
21534 }
21535
21536 return Subtarget.hasSSE41()
21537 ? truncateVectorWithPACKUS(VT, In, DL, Subtarget, DAG)
21538 : truncateVectorWithPACKSS(VT, In, DL, Subtarget, DAG);
21539 }
21540
21541 if (VT == MVT::v16i8 && InVT == MVT::v16i16)
21542 return truncateVectorWithPACKUS(VT, In, DL, Subtarget, DAG);
21543
21544 llvm_unreachable("All 256->128 cases should have been handled above!");
21545}
21546
21547// We can leverage the specific way the "cvttps2dq/cvttpd2dq" instruction
21548// behaves on out of range inputs to generate optimized conversions.
21549static SDValue expandFP_TO_UINT_SSE(MVT VT, SDValue Src, const SDLoc &dl,
21550 SelectionDAG &DAG,
21551 const X86Subtarget &Subtarget) {
21552 MVT SrcVT = Src.getSimpleValueType();
21553 unsigned DstBits = VT.getScalarSizeInBits();
21554 assert(DstBits == 32 && "expandFP_TO_UINT_SSE - only vXi32 supported");
21555
21556 // Calculate the converted result for values in the range 0 to
21557 // 2^31-1 ("Small") and from 2^31 to 2^32-1 ("Big").
21558 SDValue Small = DAG.getNode(X86ISD::CVTTP2SI, dl, VT, Src);
21559 SDValue Big =
21560 DAG.getNode(X86ISD::CVTTP2SI, dl, VT,
21561 DAG.getNode(ISD::FSUB, dl, SrcVT, Src,
21562 DAG.getConstantFP(2147483648.0f, dl, SrcVT)));
21563
21564 // The "CVTTP2SI" instruction conveniently sets the sign bit if
21565 // and only if the value was out of range. So we can use that
21566 // as our indicator that we'd rather use "Big" instead of "Small".
21567 //
21568 // Use "Small" if "IsOverflown" has all bits cleared
21569 // and "0x80000000 | Big" if all bits in "IsOverflown" are set.
21570
21571 // AVX1 can't use the signsplat masking for 256-bit vectors - we have to
21572 // use the slightly slower blendv select instead.
21573 if (VT == MVT::v8i32 && !Subtarget.hasAVX2()) {
21574 SDValue Overflow = DAG.getNode(ISD::OR, dl, VT, Small, Big);
21575 return DAG.getNode(X86ISD::BLENDV, dl, VT, Small, Overflow, Small);
21576 }
21577
21578 SDValue IsOverflown =
21579 DAG.getNode(X86ISD::VSRAI, dl, VT, Small,
21580 DAG.getTargetConstant(DstBits - 1, dl, MVT::i8));
21581 return DAG.getNode(ISD::OR, dl, VT, Small,
21582 DAG.getNode(ISD::AND, dl, VT, Big, IsOverflown));
21583}
21584
21585SDValue X86TargetLowering::LowerFP_TO_INT(SDValue Op, SelectionDAG &DAG) const {
21586 bool IsStrict = Op->isStrictFPOpcode();
21587 bool IsSigned = Op.getOpcode() == ISD::FP_TO_SINT ||
21588 Op.getOpcode() == ISD::STRICT_FP_TO_SINT;
21589 bool HasVLX = Subtarget.hasVLX();
21590 MVT VT = Op->getSimpleValueType(0);
21591 SDValue Src = Op.getOperand(IsStrict ? 1 : 0);
21592 SDValue Chain = IsStrict ? Op->getOperand(0) : SDValue();
21593 MVT SrcVT = Src.getSimpleValueType();
21594 SDLoc dl(Op);
21595
21596 SDValue Res;
21597 if (isSoftF16(SrcVT, Subtarget)) {
21598 MVT NVT = VT.isVector() ? VT.changeVectorElementType(MVT::f32) : MVT::f32;
21599 if (IsStrict)
21600 return DAG.getNode(Op.getOpcode(), dl, {VT, MVT::Other},
21601 {Chain, DAG.getNode(ISD::STRICT_FP_EXTEND, dl,
21602 {NVT, MVT::Other}, {Chain, Src})});
21603 return DAG.getNode(Op.getOpcode(), dl, VT,
21604 DAG.getNode(ISD::FP_EXTEND, dl, NVT, Src));
21605 } else if (isTypeLegal(SrcVT) &&
21606 isLegalConversion(VT, SrcVT, IsSigned, Subtarget)) {
21607 return Op;
21608 }
21609
21610 if (VT.isVector()) {
21611 if (VT == MVT::v2i1 && SrcVT == MVT::v2f64) {
21612 MVT ResVT = MVT::v4i32;
21613 MVT TruncVT = MVT::v4i1;
21614 unsigned Opc;
21615 if (IsStrict)
21616 Opc = IsSigned ? X86ISD::STRICT_CVTTP2SI : X86ISD::STRICT_CVTTP2UI;
21617 else
21618 Opc = IsSigned ? X86ISD::CVTTP2SI : X86ISD::CVTTP2UI;
21619
21620 if (!IsSigned && !HasVLX) {
21621 assert(Subtarget.useAVX512Regs() && "Unexpected features!");
21622 // Widen to 512-bits.
21623 ResVT = MVT::v8i32;
21624 TruncVT = MVT::v8i1;
21625 Opc = Op.getOpcode();
21626 // Need to concat with zero vector for strict fp to avoid spurious
21627 // exceptions.
21628 // TODO: Should we just do this for non-strict as well?
21629 SDValue Tmp = IsStrict ? DAG.getConstantFP(0.0, dl, MVT::v8f64)
21630 : DAG.getUNDEF(MVT::v8f64);
21631 Src = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, MVT::v8f64, Tmp, Src,
21632 DAG.getVectorIdxConstant(0, dl));
21633 }
21634 if (IsStrict) {
21635 Res = DAG.getNode(Opc, dl, {ResVT, MVT::Other}, {Chain, Src});
21636 Chain = Res.getValue(1);
21637 } else {
21638 Res = DAG.getNode(Opc, dl, ResVT, Src);
21639 }
21640
21641 Res = DAG.getNode(ISD::TRUNCATE, dl, TruncVT, Res);
21642 Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v2i1, Res,
21643 DAG.getVectorIdxConstant(0, dl));
21644 if (IsStrict)
21645 return DAG.getMergeValues({Res, Chain}, dl);
21646 return Res;
21647 }
21648
21649 if (Subtarget.hasFP16() && SrcVT.getVectorElementType() == MVT::f16) {
21650 if ((HasVLX && (VT == MVT::v8i16 || VT == MVT::v16i16)) ||
21651 VT == MVT::v32i16)
21652 return Op;
21653
21654 MVT ResVT = VT;
21655 MVT EleVT = VT.getVectorElementType();
21656 if (EleVT != MVT::i64)
21657 ResVT = EleVT == MVT::i32 ? MVT::v4i32 : MVT::v8i16;
21658
21659 if (SrcVT == MVT::v2f16 || SrcVT == MVT::v4f16) {
21660 SDValue Tmp =
21661 IsStrict ? DAG.getConstantFP(0.0, dl, SrcVT) : DAG.getUNDEF(SrcVT);
21662 SmallVector<SDValue, 4> Ops(SrcVT == MVT::v2f16 ? 4 : 2, Tmp);
21663 Ops[0] = Src;
21664 Src = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v8f16, Ops);
21665 }
21666
21667 if (!HasVLX) {
21668 assert(Subtarget.useAVX512Regs() && "Unexpected features!");
21669 // Widen to 512-bits.
21670 unsigned IntSize = EleVT.getSizeInBits();
21671 unsigned Num = IntSize > 16 ? 512 / IntSize : 32;
21672 ResVT = MVT::getVectorVT(EleVT, Num);
21673 Src = widenSubVector(MVT::getVectorVT(MVT::f16, Num), Src, IsStrict,
21674 Subtarget, DAG, dl);
21675 }
21676
21677 if (IsStrict) {
21678 Res = DAG.getNode(IsSigned ? X86ISD::STRICT_CVTTP2SI
21679 : X86ISD::STRICT_CVTTP2UI,
21680 dl, {ResVT, MVT::Other}, {Chain, Src});
21681 Chain = Res.getValue(1);
21682 } else {
21683 Res = DAG.getNode(IsSigned ? X86ISD::CVTTP2SI : X86ISD::CVTTP2UI, dl,
21684 ResVT, Src);
21685 }
21686
21687 // TODO: Need to add exception check code for strict FP.
21688 if (EleVT.getSizeInBits() < 16) {
21689 if (HasVLX)
21690 ResVT = MVT::getVectorVT(EleVT, 8);
21691 Res = DAG.getNode(ISD::TRUNCATE, dl, ResVT, Res);
21692 }
21693
21694 if (ResVT != VT)
21695 Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, Res,
21696 DAG.getVectorIdxConstant(0, dl));
21697
21698 if (IsStrict)
21699 return DAG.getMergeValues({Res, Chain}, dl);
21700 return Res;
21701 }
21702
21703 // v8f32/v16f32/v8f64->v8i16/v16i16 need to widen first.
21704 if (VT.getVectorElementType() == MVT::i16) {
21705 assert((SrcVT.getVectorElementType() == MVT::f32 ||
21706 SrcVT.getVectorElementType() == MVT::f64) &&
21707 "Expected f32/f64 vector!");
21708 MVT NVT = VT.changeVectorElementType(MVT::i32);
21709 if (IsStrict) {
21710 Res = DAG.getNode(IsSigned ? ISD::STRICT_FP_TO_SINT
21711 : ISD::STRICT_FP_TO_UINT,
21712 dl, {NVT, MVT::Other}, {Chain, Src});
21713 Chain = Res.getValue(1);
21714 } else {
21715 Res = DAG.getNode(IsSigned ? ISD::FP_TO_SINT : ISD::FP_TO_UINT, dl,
21716 NVT, Src);
21717 }
21718
21719 // TODO: Need to add exception check code for strict FP.
21720 Res = DAG.getNode(ISD::TRUNCATE, dl, VT, Res);
21721
21722 if (IsStrict)
21723 return DAG.getMergeValues({Res, Chain}, dl);
21724 return Res;
21725 }
21726
21727 // v8f64->v8i32 is legal, but we need v8i32 to be custom for v8f32.
21728 if (VT == MVT::v8i32 && SrcVT == MVT::v8f64) {
21729 assert(!IsSigned && "Expected unsigned conversion!");
21730 assert(Subtarget.useAVX512Regs() && "Requires avx512f");
21731 return Op;
21732 }
21733
21734 // Widen vXi32 fp_to_uint with avx512f to 512-bit source.
21735 if ((VT == MVT::v4i32 || VT == MVT::v8i32) &&
21736 (SrcVT == MVT::v4f64 || SrcVT == MVT::v4f32 || SrcVT == MVT::v8f32) &&
21737 Subtarget.useAVX512Regs()) {
21738 assert(!IsSigned && "Expected unsigned conversion!");
21739 assert(!Subtarget.hasVLX() && "Unexpected features!");
21740 MVT WideVT = SrcVT == MVT::v4f64 ? MVT::v8f64 : MVT::v16f32;
21741 MVT ResVT = SrcVT == MVT::v4f64 ? MVT::v8i32 : MVT::v16i32;
21742 // Need to concat with zero vector for strict fp to avoid spurious
21743 // exceptions.
21744 // TODO: Should we just do this for non-strict as well?
21745 SDValue Tmp =
21746 IsStrict ? DAG.getConstantFP(0.0, dl, WideVT) : DAG.getUNDEF(WideVT);
21747 Src = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideVT, Tmp, Src,
21748 DAG.getVectorIdxConstant(0, dl));
21749
21750 if (IsStrict) {
21751 Res = DAG.getNode(ISD::STRICT_FP_TO_UINT, dl, {ResVT, MVT::Other},
21752 {Chain, Src});
21753 Chain = Res.getValue(1);
21754 } else {
21755 Res = DAG.getNode(ISD::FP_TO_UINT, dl, ResVT, Src);
21756 }
21757
21758 Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, Res,
21759 DAG.getVectorIdxConstant(0, dl));
21760
21761 if (IsStrict)
21762 return DAG.getMergeValues({Res, Chain}, dl);
21763 return Res;
21764 }
21765
21766 // Widen vXi64 fp_to_uint/fp_to_sint with avx512dq to 512-bit source.
21767 if ((VT == MVT::v2i64 || VT == MVT::v4i64) &&
21768 (SrcVT == MVT::v2f64 || SrcVT == MVT::v4f64 || SrcVT == MVT::v4f32) &&
21769 Subtarget.useAVX512Regs() && Subtarget.hasDQI()) {
21770 assert(!Subtarget.hasVLX() && "Unexpected features!");
21771 MVT WideVT = SrcVT == MVT::v4f32 ? MVT::v8f32 : MVT::v8f64;
21772 // Need to concat with zero vector for strict fp to avoid spurious
21773 // exceptions.
21774 // TODO: Should we just do this for non-strict as well?
21775 SDValue Tmp =
21776 IsStrict ? DAG.getConstantFP(0.0, dl, WideVT) : DAG.getUNDEF(WideVT);
21777 Src = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideVT, Tmp, Src,
21778 DAG.getVectorIdxConstant(0, dl));
21779
21780 if (IsStrict) {
21781 Res = DAG.getNode(Op.getOpcode(), dl, {MVT::v8i64, MVT::Other},
21782 {Chain, Src});
21783 Chain = Res.getValue(1);
21784 } else {
21785 Res = DAG.getNode(Op.getOpcode(), dl, MVT::v8i64, Src);
21786 }
21787
21788 Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, Res,
21789 DAG.getVectorIdxConstant(0, dl));
21790
21791 if (IsStrict)
21792 return DAG.getMergeValues({Res, Chain}, dl);
21793 return Res;
21794 }
21795
21796 if (VT == MVT::v2i64 && SrcVT == MVT::v2f32) {
21797 if (!Subtarget.hasVLX()) {
21798 // Non-strict nodes without VLX can be widened to v4f32->v4i64 by the type
21799 // legalizer and then widened again by vector op legalization.
21800 if (!IsStrict)
21801 return SDValue();
21802
21803 SDValue Zero = DAG.getConstantFP(0.0, dl, MVT::v2f32);
21804 SDValue Tmp = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v8f32,
21805 {Src, Zero, Zero, Zero});
21806 Tmp = DAG.getNode(Op.getOpcode(), dl, {MVT::v8i64, MVT::Other},
21807 {Chain, Tmp});
21808 SDValue Chain = Tmp.getValue(1);
21809 Tmp = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v2i64, Tmp,
21810 DAG.getVectorIdxConstant(0, dl));
21811 return DAG.getMergeValues({Tmp, Chain}, dl);
21812 }
21813
21814 assert(Subtarget.hasDQI() && Subtarget.hasVLX() && "Requires AVX512DQVL");
21815 SDValue Tmp = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f32, Src,
21816 DAG.getUNDEF(MVT::v2f32));
21817 if (IsStrict) {
21818 unsigned Opc = IsSigned ? X86ISD::STRICT_CVTTP2SI
21819 : X86ISD::STRICT_CVTTP2UI;
21820 return DAG.getNode(Opc, dl, {VT, MVT::Other}, {Op->getOperand(0), Tmp});
21821 }
21822 unsigned Opc = IsSigned ? X86ISD::CVTTP2SI : X86ISD::CVTTP2UI;
21823 return DAG.getNode(Opc, dl, VT, Tmp);
21824 }
21825
21826 // Generate optimized instructions for pre AVX512 unsigned conversions from
21827 // vXf32 to vXi32.
21828 if ((VT == MVT::v4i32 && SrcVT == MVT::v4f32) ||
21829 (VT == MVT::v4i32 && SrcVT == MVT::v4f64) ||
21830 (VT == MVT::v8i32 && SrcVT == MVT::v8f32)) {
21831 assert(!IsSigned && "Expected unsigned conversion!");
21832 return expandFP_TO_UINT_SSE(VT, Src, dl, DAG, Subtarget);
21833 }
21834
21835 return SDValue();
21836 }
21837
21838 assert(!VT.isVector());
21839
21840 bool UseSSEReg = isScalarFPTypeInSSEReg(SrcVT);
21841
21842 if (!IsSigned && UseSSEReg) {
21843 // Conversions from f32/f64 with AVX512 should be legal.
21844 if (Subtarget.hasAVX512())
21845 return Op;
21846
21847 // We can leverage the specific way the "cvttss2si/cvttsd2si" instruction
21848 // behaves on out of range inputs to generate optimized conversions.
21849 if (!IsStrict && ((VT == MVT::i32 && !Subtarget.is64Bit()) ||
21850 (VT == MVT::i64 && Subtarget.is64Bit()))) {
21851 unsigned DstBits = VT.getScalarSizeInBits();
21852 APInt UIntLimit = APInt::getSignMask(DstBits);
21853 SDValue FloatOffset = DAG.getNode(ISD::UINT_TO_FP, dl, SrcVT,
21854 DAG.getConstant(UIntLimit, dl, VT));
21855 MVT SrcVecVT = MVT::getVectorVT(SrcVT, 128 / SrcVT.getScalarSizeInBits());
21856
21857 // Calculate the converted result for values in the range:
21858 // (i32) 0 to 2^31-1 ("Small") and from 2^31 to 2^32-1 ("Big").
21859 // (i64) 0 to 2^63-1 ("Small") and from 2^63 to 2^64-1 ("Big").
21860 SDValue Small =
21861 DAG.getNode(X86ISD::CVTTS2SI, dl, VT,
21862 DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, SrcVecVT, Src));
21863 SDValue Big = DAG.getNode(
21864 X86ISD::CVTTS2SI, dl, VT,
21865 DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, SrcVecVT,
21866 DAG.getNode(ISD::FSUB, dl, SrcVT, Src, FloatOffset)));
21867
21868 // The "CVTTS2SI" instruction conveniently sets the sign bit if
21869 // and only if the value was out of range. So we can use that
21870 // as our indicator that we'd rather use "Big" instead of "Small".
21871 //
21872 // Use "Small" if "IsOverflown" has all bits cleared
21873 // and "0x80000000 | Big" if all bits in "IsOverflown" are set.
21874 SDValue IsOverflown = DAG.getNode(
21875 ISD::SRA, dl, VT, Small, DAG.getConstant(DstBits - 1, dl, MVT::i8));
21876 return DAG.getNode(ISD::OR, dl, VT, Small,
21877 DAG.getNode(ISD::AND, dl, VT, Big, IsOverflown));
21878 }
21879
21880 // Use default expansion for i64.
21881 if (VT == MVT::i64)
21882 return SDValue();
21883
21884 assert(VT == MVT::i32 && "Unexpected VT!");
21885
21886 // Promote i32 to i64 and use a signed operation on 64-bit targets.
21887 // FIXME: This does not generate an invalid exception if the input does not
21888 // fit in i32. PR44019
21889 if (Subtarget.is64Bit()) {
21890 if (IsStrict) {
21891 Res = DAG.getNode(ISD::STRICT_FP_TO_SINT, dl, {MVT::i64, MVT::Other},
21892 {Chain, Src});
21893 Chain = Res.getValue(1);
21894 } else
21895 Res = DAG.getNode(ISD::FP_TO_SINT, dl, MVT::i64, Src);
21896
21897 Res = DAG.getNode(ISD::TRUNCATE, dl, VT, Res);
21898 if (IsStrict)
21899 return DAG.getMergeValues({Res, Chain}, dl);
21900 return Res;
21901 }
21902
21903 // Use default expansion for SSE1/2 targets without SSE3. With SSE3 we can
21904 // use fisttp which will be handled later.
21905 if (!Subtarget.hasSSE3())
21906 return SDValue();
21907 }
21908
21909 // Promote i16 to i32 if we can use a SSE operation or the type is f128.
21910 // FIXME: This does not generate an invalid exception if the input does not
21911 // fit in i16. PR44019
21912 if (VT == MVT::i16 && (UseSSEReg || SrcVT == MVT::f128)) {
21913 assert(IsSigned && "Expected i16 FP_TO_UINT to have been promoted!");
21914 if (IsStrict) {
21915 Res = DAG.getNode(ISD::STRICT_FP_TO_SINT, dl, {MVT::i32, MVT::Other},
21916 {Chain, Src});
21917 Chain = Res.getValue(1);
21918 } else
21919 Res = DAG.getNode(ISD::FP_TO_SINT, dl, MVT::i32, Src);
21920
21921 Res = DAG.getNode(ISD::TRUNCATE, dl, VT, Res);
21922 if (IsStrict)
21923 return DAG.getMergeValues({Res, Chain}, dl);
21924 return Res;
21925 }
21926
21927 // If this is a FP_TO_SINT using SSEReg we're done.
21928 if (UseSSEReg && IsSigned)
21929 return Op;
21930
21931 // fp128 needs to use a libcall.
21932 if (SrcVT == MVT::f128) {
21933 RTLIB::Libcall LC;
21934 if (IsSigned)
21935 LC = RTLIB::getFPTOSINT(SrcVT, VT);
21936 else
21937 LC = RTLIB::getFPTOUINT(SrcVT, VT);
21938
21939 MakeLibCallOptions CallOptions;
21940 std::pair<SDValue, SDValue> Tmp =
21941 makeLibCall(DAG, LC, VT, Src, CallOptions, dl, Chain);
21942
21943 if (IsStrict)
21944 return DAG.getMergeValues({ Tmp.first, Tmp.second }, dl);
21945
21946 return Tmp.first;
21947 }
21948
21949 // Fall back to X87.
21950 if (SDValue V = FP_TO_INTHelper(Op, DAG, IsSigned, Chain)) {
21951 if (IsStrict)
21952 return DAG.getMergeValues({V, Chain}, dl);
21953 return V;
21954 }
21955
21956 llvm_unreachable("Expected FP_TO_INTHelper to handle all remaining cases.");
21957}
21958
21959SDValue X86TargetLowering::LowerLRINT_LLRINT(SDValue Op,
21960 SelectionDAG &DAG) const {
21961 SDValue Src = Op.getOperand(0);
21962 EVT DstVT = Op.getSimpleValueType();
21963 MVT SrcVT = Src.getSimpleValueType();
21964
21965 if (SrcVT.isVector())
21966 return DstVT.getScalarType() == MVT::i32 ? Op : SDValue();
21967
21968 if (SrcVT == MVT::f16)
21969 return SDValue();
21970
21971 // If the source is in an SSE register, the node is Legal.
21972 if (isScalarFPTypeInSSEReg(SrcVT))
21973 return Op;
21974
21975 return LRINT_LLRINTHelper(Op.getNode(), DAG);
21976}
21977
21978SDValue X86TargetLowering::LRINT_LLRINTHelper(SDNode *N,
21979 SelectionDAG &DAG) const {
21980 EVT DstVT = N->getValueType(0);
21981 SDValue Src = N->getOperand(0);
21982 EVT SrcVT = Src.getValueType();
21983
21984 if (SrcVT != MVT::f32 && SrcVT != MVT::f64 && SrcVT != MVT::f80) {
21985 // f16 must be promoted before using the lowering in this routine.
21986 // fp128 does not use this lowering.
21987 return SDValue();
21988 }
21989
21990 SDLoc DL(N);
21991 SDValue Chain = DAG.getEntryNode();
21992
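// The lowering uses the x87 FIST/FISTP store, which rounds using the current
// rounding mode as LRINT/LLRINT requires: an SSE source is first spilled and
// reloaded onto the x87 stack via FLD, then the converted integer is stored
// to a stack slot and loaded back.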
21993 bool UseSSE = isScalarFPTypeInSSEReg(SrcVT);
21994
21995 // If we're converting from SSE, the stack slot needs to hold both types.
21996 // Otherwise it only needs to hold the DstVT.
21997 EVT OtherVT = UseSSE ? SrcVT : DstVT;
21998 SDValue StackPtr = DAG.CreateStackTemporary(DstVT, OtherVT);
21999 int SPFI = cast<FrameIndexSDNode>(StackPtr.getNode())->getIndex();
22000 MachinePointerInfo MPI =
22001 MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SPFI);
22002
22003 if (UseSSE) {
22004 assert(DstVT == MVT::i64 && "Invalid LRINT/LLRINT to lower!");
22005 Chain = DAG.getStore(Chain, DL, Src, StackPtr, MPI);
22006 SDVTList Tys = DAG.getVTList(MVT::f80, MVT::Other);
22007 SDValue Ops[] = { Chain, StackPtr };
22008
22009 Src = DAG.getMemIntrinsicNode(X86ISD::FLD, DL, Tys, Ops, SrcVT, MPI,
22010 /*Align*/ std::nullopt,
22011 MachineMemOperand::MOLoad);
22012 Chain = Src.getValue(1);
22013 }
22014
22015 SDValue StoreOps[] = { Chain, Src, StackPtr };
22016 Chain = DAG.getMemIntrinsicNode(X86ISD::FIST, DL, DAG.getVTList(MVT::Other),
22017 StoreOps, DstVT, MPI, /*Align*/ std::nullopt,
22018 MachineMemOperand::MOStore);
22019
22020 return DAG.getLoad(DstVT, DL, Chain, StackPtr, MPI);
22021}
22022
22023SDValue
22024X86TargetLowering::LowerFP_TO_INT_SAT(SDValue Op, SelectionDAG &DAG) const {
22025 // This is based on the TargetLowering::expandFP_TO_INT_SAT implementation,
22026 // but making use of X86 specifics to produce better instruction sequences.
22027 SDNode *Node = Op.getNode();
22028 bool IsSigned = Node->getOpcode() == ISD::FP_TO_SINT_SAT;
22029 unsigned FpToIntOpcode = IsSigned ? ISD::FP_TO_SINT : ISD::FP_TO_UINT;
22030 SDLoc dl(SDValue(Node, 0));
22031 SDValue Src = Node->getOperand(0);
22032
22033 // There are three types involved here: SrcVT is the source floating point
22034 // type, DstVT is the type of the result, and TmpVT is the result of the
22035 // intermediate FP_TO_*INT operation we'll use (which may be a promotion of
22036 // DstVT).
22037 EVT SrcVT = Src.getValueType();
22038 EVT DstVT = Node->getValueType(0);
22039 EVT TmpVT = DstVT;
22040
22041 // This code is only for floats and doubles. Fall back to generic code for
22042 // anything else.
22043 if (!isScalarFPTypeInSSEReg(SrcVT) || isSoftF16(SrcVT, Subtarget))
22044 return SDValue();
22045
22046 EVT SatVT = cast<VTSDNode>(Node->getOperand(1))->getVT();
22047 unsigned SatWidth = SatVT.getScalarSizeInBits();
22048 unsigned DstWidth = DstVT.getScalarSizeInBits();
22049 unsigned TmpWidth = TmpVT.getScalarSizeInBits();
22050 assert(SatWidth <= DstWidth && SatWidth <= TmpWidth &&
22051 "Expected saturation width smaller than result width");
22052
22053 // Promote result of FP_TO_*INT to at least 32 bits.
22054 if (TmpWidth < 32) {
22055 TmpVT = MVT::i32;
22056 TmpWidth = 32;
22057 }
22058
22059 // Promote unsigned 32-bit conversions to 64-bit, because that allows us to
22060 // use a native signed conversion instead.
22061 if (SatWidth == 32 && !IsSigned && Subtarget.is64Bit()) {
22062 TmpVT = MVT::i64;
22063 TmpWidth = 64;
22064 }
22065
22066 // If the saturation width is smaller than the size of the temporary result,
22067 // we can always use signed conversion, which is native.
22068 if (SatWidth < TmpWidth)
22069 FpToIntOpcode = ISD::FP_TO_SINT;
22070
22071 // Determine minimum and maximum integer values and their corresponding
22072 // floating-point values.
22073 APInt MinInt, MaxInt;
22074 if (IsSigned) {
22075 MinInt = APInt::getSignedMinValue(SatWidth).sext(DstWidth);
22076 MaxInt = APInt::getSignedMaxValue(SatWidth).sext(DstWidth);
22077 } else {
22078 MinInt = APInt::getMinValue(SatWidth).zext(DstWidth);
22079 MaxInt = APInt::getMaxValue(SatWidth).zext(DstWidth);
22080 }
22081
22082 const fltSemantics &Sem = SrcVT.getFltSemantics();
22083 APFloat MinFloat(Sem);
22084 APFloat MaxFloat(Sem);
22085
22086 APFloat::opStatus MinStatus = MinFloat.convertFromAPInt(
22087 MinInt, IsSigned, APFloat::rmTowardZero);
22088 APFloat::opStatus MaxStatus = MaxFloat.convertFromAPInt(
22089 MaxInt, IsSigned, APFloat::rmTowardZero);
22090 bool AreExactFloatBounds = !(MinStatus & APFloat::opStatus::opInexact)
22091 && !(MaxStatus & APFloat::opStatus::opInexact);
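// e.g. for f32->i32 signed saturation, INT32_MIN (-2^31) is exactly
// representable but INT32_MAX (2^31-1) rounds down to 2^31-128, so the
// bounds are inexact and the compare+select path below is used; for
// f64->i32 both bounds are exact.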
22092
22093 SDValue MinFloatNode = DAG.getConstantFP(MinFloat, dl, SrcVT);
22094 SDValue MaxFloatNode = DAG.getConstantFP(MaxFloat, dl, SrcVT);
22095
22096 // If the integer bounds are exactly representable as floats, emit a
22097 // min+max+fptoi sequence. Otherwise use comparisons and selects.
22098 if (AreExactFloatBounds) {
22099 if (DstVT != TmpVT) {
22100 // Clamp by MinFloat from below. If Src is NaN, propagate NaN.
22101 SDValue MinClamped = DAG.getNode(
22102 X86ISD::FMAX, dl, SrcVT, MinFloatNode, Src);
22103 // Clamp by MaxFloat from above. If Src is NaN, propagate NaN.
22104 SDValue BothClamped = DAG.getNode(
22105 X86ISD::FMIN, dl, SrcVT, MaxFloatNode, MinClamped);
22106 // Convert clamped value to integer.
22107 SDValue FpToInt = DAG.getNode(FpToIntOpcode, dl, TmpVT, BothClamped);
22108
22109 // NaN will become INDVAL, with the top bit set and the rest zero.
22110 // Truncation will discard the top bit, resulting in zero.
22111 return DAG.getNode(ISD::TRUNCATE, dl, DstVT, FpToInt);
22112 }
22113
22114 // Clamp by MinFloat from below. If Src is NaN, the result is MinFloat.
22115 SDValue MinClamped = DAG.getNode(
22116 X86ISD::FMAX, dl, SrcVT, Src, MinFloatNode);
22117 // Clamp by MaxFloat from above. NaN cannot occur.
22118 SDValue BothClamped = DAG.getNode(
22119 X86ISD::FMINC, dl, SrcVT, MinClamped, MaxFloatNode);
22120 // Convert clamped value to integer.
22121 SDValue FpToInt = DAG.getNode(FpToIntOpcode, dl, DstVT, BothClamped);
22122
22123 if (!IsSigned) {
22124 // In the unsigned case we're done, because we mapped NaN to MinFloat,
22125 // which is zero.
22126 return FpToInt;
22127 }
22128
22129 // Otherwise, select zero if Src is NaN.
22130 SDValue ZeroInt = DAG.getConstant(0, dl, DstVT);
22131 return DAG.getSelectCC(
22132 dl, Src, Src, ZeroInt, FpToInt, ISD::CondCode::SETUO);
22133 }
22134
22135 SDValue MinIntNode = DAG.getConstant(MinInt, dl, DstVT);
22136 SDValue MaxIntNode = DAG.getConstant(MaxInt, dl, DstVT);
22137
22138 // Result of direct conversion, which may be selected away.
22139 SDValue FpToInt = DAG.getNode(FpToIntOpcode, dl, TmpVT, Src);
22140
22141 if (DstVT != TmpVT) {
22142 // NaN will become INDVAL, with the top bit set and the rest zero.
22143 // Truncation will discard the top bit, resulting in zero.
22144 FpToInt = DAG.getNode(ISD::TRUNCATE, dl, DstVT, FpToInt);
22145 }
22146
22147 SDValue Select = FpToInt;
22148 // For signed conversions where we saturate to the same size as the
22149 // result type of the fptoi instructions, INDVAL coincides with integer
22150 // minimum, so we don't need to explicitly check it.
22151 if (!IsSigned || SatWidth != TmpVT.getScalarSizeInBits()) {
22152 // If Src ULT MinFloat, select MinInt. In particular, this also selects
22153 // MinInt if Src is NaN.
22154 Select = DAG.getSelectCC(
22155 dl, Src, MinFloatNode, MinIntNode, Select, ISD::CondCode::SETULT);
22156 }
22157
22158 // If Src OGT MaxFloat, select MaxInt.
22159 Select = DAG.getSelectCC(
22160 dl, Src, MaxFloatNode, MaxIntNode, Select, ISD::CondCode::SETOGT);
22161
22162 // In the unsigned case we are done, because we mapped NaN to MinInt, which
22163 // is already zero. The promoted case was already handled above.
22164 if (!IsSigned || DstVT != TmpVT) {
22165 return Select;
22166 }
22167
22168 // Otherwise, select 0 if Src is NaN.
22169 SDValue ZeroInt = DAG.getConstant(0, dl, DstVT);
22170 return DAG.getSelectCC(
22171 dl, Src, Src, ZeroInt, Select, ISD::CondCode::SETUO);
22172}
22173
22174SDValue X86TargetLowering::LowerFP_EXTEND(SDValue Op, SelectionDAG &DAG) const {
22175 bool IsStrict = Op->isStrictFPOpcode();
22176
22177 SDLoc DL(Op);
22178 MVT VT = Op.getSimpleValueType();
22179 SDValue Chain = IsStrict ? Op.getOperand(0) : SDValue();
22180 SDValue In = Op.getOperand(IsStrict ? 1 : 0);
22181 MVT SVT = In.getSimpleValueType();
22182
22183 // Let f16->f80 get lowered to a libcall, except for darwin, where we should
22184 // lower it to an fp_extend via f32 (as only f16<>f32 libcalls are available)
22185 if (VT == MVT::f128 || (SVT == MVT::f16 && VT == MVT::f80 &&
22186 !Subtarget.getTargetTriple().isOSDarwin()))
22187 return SDValue();
22188
22189 if ((SVT == MVT::v8f16 && Subtarget.hasF16C()) ||
22190 (SVT == MVT::v16f16 && Subtarget.useAVX512Regs()))
22191 return Op;
22192
22193 if (SVT == MVT::f16) {
22194 if (Subtarget.hasFP16())
22195 return Op;
22196
22197 if (VT != MVT::f32) {
22198 if (IsStrict)
22199 return DAG.getNode(
22200 ISD::STRICT_FP_EXTEND, DL, {VT, MVT::Other},
22201 {Chain, DAG.getNode(ISD::STRICT_FP_EXTEND, DL,
22202 {MVT::f32, MVT::Other}, {Chain, In})});
22203
22204 return DAG.getNode(ISD::FP_EXTEND, DL, VT,
22205 DAG.getNode(ISD::FP_EXTEND, DL, MVT::f32, In));
22206 }
22207
22208 if (!Subtarget.hasF16C()) {
22209 if (!Subtarget.getTargetTriple().isOSDarwin())
22210 return SDValue();
22211
22212 assert(VT == MVT::f32 && SVT == MVT::f16 && "unexpected extend libcall");
22213
22214 // Need a libcall, but ABI for f16 is soft-float on MacOS.
22215 TargetLowering::CallLoweringInfo CLI(DAG);
22216 Chain = IsStrict ? Op.getOperand(0) : DAG.getEntryNode();
22217
22218 In = DAG.getBitcast(MVT::i16, In);
22219 TargetLowering::ArgListTy Args;
22220 TargetLowering::ArgListEntry Entry(
22221 In, EVT(MVT::i16).getTypeForEVT(*DAG.getContext()));
22222 Entry.IsSExt = false;
22223 Entry.IsZExt = true;
22224 Args.push_back(Entry);
22225
22226 SDValue Callee = DAG.getExternalSymbol(
22227 getLibcallName(RTLIB::FPEXT_F16_F32),
22228 getPointerTy(DAG.getDataLayout()));
22229 CLI.setDebugLoc(DL).setChain(Chain).setLibCallee(
22230 CallingConv::C, EVT(VT).getTypeForEVT(*DAG.getContext()), Callee,
22231 std::move(Args));
22232
22233 SDValue Res;
22234 std::tie(Res,Chain) = LowerCallTo(CLI);
22235 if (IsStrict)
22236 Res = DAG.getMergeValues({Res, Chain}, DL);
22237
22238 return Res;
22239 }
22240
22241 In = DAG.getBitcast(MVT::i16, In);
22242 SDValue Res;
22243 if (IsStrict) {
22244 In = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, MVT::v8i16,
22245 getZeroVector(MVT::v8i16, Subtarget, DAG, DL), In,
22246 DAG.getVectorIdxConstant(0, DL));
22247 Res = DAG.getNode(X86ISD::STRICT_CVTPH2PS, DL, {MVT::v4f32, MVT::Other},
22248 {Chain, In});
22249 Chain = Res.getValue(1);
22250 } else {
22251 In = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, In);
22252 In = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, MVT::v4i32,
22253 DAG.getUNDEF(MVT::v4i32), In,
22254 DAG.getVectorIdxConstant(0, DL));
22255 In = DAG.getBitcast(MVT::v8i16, In);
22256 Res = DAG.getNode(X86ISD::CVTPH2PS, DL, MVT::v4f32, In,
22257 DAG.getTargetConstant(4, DL, MVT::i32));
22258 }
22259 Res = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, Res,
22260 DAG.getVectorIdxConstant(0, DL));
22261 if (IsStrict)
22262 return DAG.getMergeValues({Res, Chain}, DL);
22263 return Res;
22264 }
22265
22266 if (!SVT.isVector() || SVT.getVectorElementType() == MVT::bf16)
22267 return Op;
22268
22269 if (SVT.getVectorElementType() == MVT::f16) {
22270 if (Subtarget.hasFP16() && isTypeLegal(SVT))
22271 return Op;
22272 assert(Subtarget.hasF16C() && "Unexpected features!");
22273 if (SVT == MVT::v2f16)
22274 In = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v4f16, In,
22275 DAG.getUNDEF(MVT::v2f16));
22276 SDValue Res = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v8f16, In,
22277 DAG.getUNDEF(MVT::v4f16));
22278 if (IsStrict)
22279 return DAG.getNode(X86ISD::STRICT_VFPEXT, DL, {VT, MVT::Other},
22280 {Op->getOperand(0), Res});
22281 return DAG.getNode(X86ISD::VFPEXT, DL, VT, Res);
22282 } else if (VT == MVT::v4f64 || VT == MVT::v8f64) {
22283 return Op;
22284 }
22285
22286 assert(SVT == MVT::v2f32 && "Only customize MVT::v2f32 type legalization!");
22287
22288 SDValue Res =
22289 DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v4f32, In, DAG.getUNDEF(SVT));
22290 if (IsStrict)
22291 return DAG.getNode(X86ISD::STRICT_VFPEXT, DL, {VT, MVT::Other},
22292 {Op->getOperand(0), Res});
22293 return DAG.getNode(X86ISD::VFPEXT, DL, VT, Res);
22294}
22295
22296SDValue X86TargetLowering::LowerFP_ROUND(SDValue Op, SelectionDAG &DAG) const {
22297 bool IsStrict = Op->isStrictFPOpcode();
22298
22299 SDLoc DL(Op);
22300 SDValue Chain = IsStrict ? Op.getOperand(0) : SDValue();
22301 SDValue In = Op.getOperand(IsStrict ? 1 : 0);
22302 MVT VT = Op.getSimpleValueType();
22303 MVT SVT = In.getSimpleValueType();
22304
22305 if (SVT == MVT::f128 || (VT == MVT::f16 && SVT == MVT::f80))
22306 return SDValue();
22307
22308 if (VT == MVT::f16 && (SVT == MVT::f64 || SVT == MVT::f32) &&
22309 !Subtarget.hasFP16() && (SVT == MVT::f64 || !Subtarget.hasF16C())) {
22310 if (!Subtarget.getTargetTriple().isOSDarwin())
22311 return SDValue();
22312
22313 // We need a libcall but the ABI for f16 libcalls on MacOS is soft.
22314 TargetLowering::CallLoweringInfo CLI(DAG);
22315 Chain = IsStrict ? Op.getOperand(0) : DAG.getEntryNode();
22316
22317 TargetLowering::ArgListTy Args;
22318 TargetLowering::ArgListEntry Entry(
22319 In, EVT(SVT).getTypeForEVT(*DAG.getContext()));
22320 Entry.IsSExt = false;
22321 Entry.IsZExt = true;
22322 Args.push_back(Entry);
22323
22324 SDValue Callee = DAG.getExternalSymbol(
22325 getLibcallName(SVT == MVT::f64 ? RTLIB::FPROUND_F64_F16
22326 : RTLIB::FPROUND_F32_F16),
22327 getPointerTy(DAG.getDataLayout()));
22328 CLI.setDebugLoc(DL).setChain(Chain).setLibCallee(
22329 CallingConv::C, EVT(MVT::i16).getTypeForEVT(*DAG.getContext()), Callee,
22330 std::move(Args));
22331
22332 SDValue Res;
22333 std::tie(Res, Chain) = LowerCallTo(CLI);
22334
22335 Res = DAG.getBitcast(MVT::f16, Res);
22336
22337 if (IsStrict)
22338 Res = DAG.getMergeValues({Res, Chain}, DL);
22339
22340 return Res;
22341 }
22342
22343 if (VT.getScalarType() == MVT::bf16) {
22344 if (SVT.getScalarType() == MVT::f32 &&
22345 ((Subtarget.hasBF16() && Subtarget.hasVLX()) ||
22346 Subtarget.hasAVXNECONVERT()))
22347 return Op;
22348 return SDValue();
22349 }
22350
22351 if (VT.getScalarType() == MVT::f16 && !Subtarget.hasFP16()) {
22352 if (!Subtarget.hasF16C() || SVT.getScalarType() != MVT::f32)
22353 return SDValue();
22354
22355 if (VT.isVector())
22356 return Op;
22357
22358 SDValue Res;
22359 SDValue Rnd = DAG.getTargetConstant(X86::STATIC_ROUNDING::CUR_DIRECTION, DL,
22360 MVT::i32);
22361 if (IsStrict) {
22362 Res = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, MVT::v4f32,
22363 DAG.getConstantFP(0, DL, MVT::v4f32), In,
22364 DAG.getVectorIdxConstant(0, DL));
22365 Res = DAG.getNode(X86ISD::STRICT_CVTPS2PH, DL, {MVT::v8i16, MVT::Other},
22366 {Chain, Res, Rnd});
22367 Chain = Res.getValue(1);
22368 } else {
22369 // FIXME: Should we use zeros for upper elements for non-strict?
22370 Res = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v4f32, In);
22371 Res = DAG.getNode(X86ISD::CVTPS2PH, DL, MVT::v8i16, Res, Rnd);
22372 }
22373
22374 Res = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i16, Res,
22375 DAG.getVectorIdxConstant(0, DL));
22376 Res = DAG.getBitcast(MVT::f16, Res);
22377
22378 if (IsStrict)
22379 return DAG.getMergeValues({Res, Chain}, DL);
22380
22381 return Res;
22382 }
22383
22384 return Op;
22385}
22386
22388 bool IsStrict = Op->isStrictFPOpcode();
22389 SDValue Src = Op.getOperand(IsStrict ? 1 : 0);
22390 assert(Src.getValueType() == MVT::i16 && Op.getValueType() == MVT::f32 &&
22391 "Unexpected VT!");
22392
22393 SDLoc dl(Op);
22394 SDValue Res = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v8i16,
22395 DAG.getConstant(0, dl, MVT::v8i16), Src,
22396 DAG.getVectorIdxConstant(0, dl));
22397
22398 SDValue Chain;
22399 if (IsStrict) {
22400 Res = DAG.getNode(X86ISD::STRICT_CVTPH2PS, dl, {MVT::v4f32, MVT::Other},
22401 {Op.getOperand(0), Res});
22402 Chain = Res.getValue(1);
22403 } else {
22404 Res = DAG.getNode(X86ISD::CVTPH2PS, dl, MVT::v4f32, Res);
22405 }
22406
22407 Res = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f32, Res,
22408 DAG.getVectorIdxConstant(0, dl));
22409
22410 if (IsStrict)
22411 return DAG.getMergeValues({Res, Chain}, dl);
22412
22413 return Res;
22414}
22415
22417 bool IsStrict = Op->isStrictFPOpcode();
22418 SDValue Src = Op.getOperand(IsStrict ? 1 : 0);
22419 assert(Src.getValueType() == MVT::f32 && Op.getValueType() == MVT::i16 &&
22420 "Unexpected VT!");
22421
22422 SDLoc dl(Op);
22423 SDValue Res, Chain;
22424 if (IsStrict) {
22425 Res = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v4f32,
22426 DAG.getConstantFP(0, dl, MVT::v4f32), Src,
22427 DAG.getVectorIdxConstant(0, dl));
22428 Res = DAG.getNode(
22429 X86ISD::STRICT_CVTPS2PH, dl, {MVT::v8i16, MVT::Other},
22430 {Op.getOperand(0), Res, DAG.getTargetConstant(4, dl, MVT::i32)});
22431 Chain = Res.getValue(1);
22432 } else {
22433 // FIXME: Should we use zeros for upper elements for non-strict?
22434 Res = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4f32, Src);
22435 Res = DAG.getNode(X86ISD::CVTPS2PH, dl, MVT::v8i16, Res,
22436 DAG.getTargetConstant(4, dl, MVT::i32));
22437 }
22438
22439 Res = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i16, Res,
22440 DAG.getVectorIdxConstant(0, dl));
22441
22442 if (IsStrict)
22443 return DAG.getMergeValues({Res, Chain}, dl);
22444
22445 return Res;
22446}
22447
22448SDValue X86TargetLowering::LowerFP_TO_BF16(SDValue Op,
22449 SelectionDAG &DAG) const {
22450 SDLoc DL(Op);
22451
22452 MVT SVT = Op.getOperand(0).getSimpleValueType();
22453 if (SVT == MVT::f32 && ((Subtarget.hasBF16() && Subtarget.hasVLX()) ||
22454 Subtarget.hasAVXNECONVERT())) {
22455 SDValue Res;
22456 Res = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v4f32, Op.getOperand(0));
22457 Res = DAG.getNode(X86ISD::CVTNEPS2BF16, DL, MVT::v8bf16, Res);
22458 Res = DAG.getBitcast(MVT::v8i16, Res);
22459 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i16, Res,
22460 DAG.getVectorIdxConstant(0, DL));
22461 }
22462
22463 MakeLibCallOptions CallOptions;
22464 RTLIB::Libcall LC = RTLIB::getFPROUND(SVT, MVT::bf16);
22465 SDValue Res =
22466 makeLibCall(DAG, LC, MVT::f16, Op.getOperand(0), CallOptions, DL).first;
22467 return DAG.getBitcast(MVT::i16, Res);
22468}
22469
22470/// Depending on uarch and/or optimizing for size, we might prefer to use a
22471/// vector operation in place of the typical scalar operation.
22473 SelectionDAG &DAG,
22474 const X86Subtarget &Subtarget) {
22475 // If both operands have other uses, this is probably not profitable.
22476 SDValue LHS = Op.getOperand(0);
22477 SDValue RHS = Op.getOperand(1);
22478 if (!LHS.hasOneUse() && !RHS.hasOneUse())
22479 return Op;
22480
22481 // FP horizontal add/sub were added with SSE3. Integer with SSSE3.
22482 bool IsFP = Op.getSimpleValueType().isFloatingPoint();
22483 if (IsFP && !Subtarget.hasSSE3())
22484 return Op;
22485 if (!IsFP && !Subtarget.hasSSSE3())
22486 return Op;
22487
22488 // Extract from a common vector.
22489 if (LHS.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
22490 RHS.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
22491 LHS.getOperand(0) != RHS.getOperand(0) ||
22492 !isa<ConstantSDNode>(LHS.getOperand(1)) ||
22493 !isa<ConstantSDNode>(RHS.getOperand(1)) ||
22494 !shouldUseHorizontalOp(true, DAG, Subtarget))
22495 return Op;
22496
22497 // Allow commuted 'hadd' ops.
22498 // TODO: Allow commuted (f)sub by negating the result of (F)HSUB?
22499 unsigned HOpcode;
22500 switch (Op.getOpcode()) {
22501 // clang-format off
22502 case ISD::ADD: HOpcode = X86ISD::HADD; break;
22503 case ISD::SUB: HOpcode = X86ISD::HSUB; break;
22504 case ISD::FADD: HOpcode = X86ISD::FHADD; break;
22505 case ISD::FSUB: HOpcode = X86ISD::FHSUB; break;
22506 default:
22507 llvm_unreachable("Trying to lower unsupported opcode to horizontal op");
22508 // clang-format on
22509 }
22510 unsigned LExtIndex = LHS.getConstantOperandVal(1);
22511 unsigned RExtIndex = RHS.getConstantOperandVal(1);
22512 if ((LExtIndex & 1) == 1 && (RExtIndex & 1) == 0 &&
22513 (HOpcode == X86ISD::HADD || HOpcode == X86ISD::FHADD))
22514 std::swap(LExtIndex, RExtIndex);
22515
22516 if ((LExtIndex & 1) != 0 || RExtIndex != (LExtIndex + 1))
22517 return Op;
22518
22519 SDValue X = LHS.getOperand(0);
22520 EVT VecVT = X.getValueType();
22521 unsigned BitWidth = VecVT.getSizeInBits();
22522 unsigned NumLanes = BitWidth / 128;
22523 unsigned NumEltsPerLane = VecVT.getVectorNumElements() / NumLanes;
22524 assert((BitWidth == 128 || BitWidth == 256 || BitWidth == 512) &&
22525 "Not expecting illegal vector widths here");
22526
22527 // Creating a 256-bit horizontal op would be wasteful, and there is no 512-bit
22528 // equivalent, so extract the 256/512-bit source op to 128-bit if we can.
22529 if (BitWidth == 256 || BitWidth == 512) {
22530 unsigned LaneIdx = LExtIndex / NumEltsPerLane;
22531 X = extract128BitVector(X, LaneIdx * NumEltsPerLane, DAG, DL);
22532 LExtIndex %= NumEltsPerLane;
22533 }
22534
22535 // add (extractelt (X, 0), extractelt (X, 1)) --> extractelt (hadd X, X), 0
22536 // add (extractelt (X, 1), extractelt (X, 0)) --> extractelt (hadd X, X), 0
22537 // add (extractelt (X, 2), extractelt (X, 3)) --> extractelt (hadd X, X), 1
22538 // sub (extractelt (X, 0), extractelt (X, 1)) --> extractelt (hsub X, X), 0
22539 SDValue HOp = DAG.getNode(HOpcode, DL, X.getValueType(), X, X);
22540 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, Op.getSimpleValueType(), HOp,
22541 DAG.getVectorIdxConstant(LExtIndex / 2, DL));
22542}
22543
22544/// Depending on uarch and/or optimizing for size, we might prefer to use a
22545/// vector operation in place of the typical scalar operation.
22546SDValue X86TargetLowering::lowerFaddFsub(SDValue Op, SelectionDAG &DAG) const {
22547 assert((Op.getValueType() == MVT::f32 || Op.getValueType() == MVT::f64) &&
22548 "Only expecting float/double");
22549 return lowerAddSubToHorizontalOp(Op, SDLoc(Op), DAG, Subtarget);
22550}
22551
22552/// ISD::FROUND is defined to round to nearest with ties rounding away from 0.
22553/// This mode isn't supported in hardware on X86. But as long as we aren't
22554/// compiling with trapping math, we can emulate this with
22555/// trunc(X + copysign(nextafter(0.5, 0.0), X)).
22556static SDValue LowerFROUND(SDValue Op, SelectionDAG &DAG) {
22557 SDValue N0 = Op.getOperand(0);
22558 SDLoc dl(Op);
22559 MVT VT = Op.getSimpleValueType();
22560
22561 // N0 += copysign(nextafter(0.5, 0.0), N0)
22562 const fltSemantics &Sem = VT.getFltSemantics();
22563 bool Ignored;
22564 APFloat Point5Pred = APFloat(0.5f);
22565 Point5Pred.convert(Sem, APFloat::rmNearestTiesToEven, &Ignored);
22566 Point5Pred.next(/*nextDown*/true);
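// Using the value just below 0.5 (rather than 0.5 itself) avoids rounding
// up values slightly below 0.5: e.g. for the largest float below 0.5,
// adding exactly 0.5 would round to 1.0 and truncate to 1 instead of 0.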
22567
22568 SDValue Adder = DAG.getNode(ISD::FCOPYSIGN, dl, VT,
22569 DAG.getConstantFP(Point5Pred, dl, VT), N0);
22570 N0 = DAG.getNode(ISD::FADD, dl, VT, N0, Adder);
22571
22572 // Truncate the result to remove fraction.
22573 return DAG.getNode(ISD::FTRUNC, dl, VT, N0);
22574}
22575
22576/// The only differences between FABS and FNEG are the mask and the logic op.
22577/// FNEG also has a folding opportunity for FNEG(FABS(x)).
22578static SDValue LowerFABSorFNEG(SDValue Op, SelectionDAG &DAG) {
22579 assert((Op.getOpcode() == ISD::FABS || Op.getOpcode() == ISD::FNEG) &&
22580 "Wrong opcode for lowering FABS or FNEG.");
22581
22582 bool IsFABS = (Op.getOpcode() == ISD::FABS);
22583
22584 // If this is a FABS and it has an FNEG user, bail out to fold the combination
22585 // into an FNABS. We'll lower the FABS after that if it is still in use.
22586 if (IsFABS)
22587 for (SDNode *User : Op->users())
22588 if (User->getOpcode() == ISD::FNEG)
22589 return Op;
22590
22591 SDLoc dl(Op);
22592 MVT VT = Op.getSimpleValueType();
22593
22594 bool IsF128 = (VT == MVT::f128);
22595 assert(VT.isFloatingPoint() && VT != MVT::f80 &&
22597 "Unexpected type in LowerFABSorFNEG");
22598
22599 // FIXME: Use function attribute "OptimizeForSize" and/or CodeGenOptLevel to
22600 // decide if we should generate a 16-byte constant mask when we only need 4 or
22601 // 8 bytes for the scalar case.
22602
22603 // There are no scalar bitwise logical SSE/AVX instructions, so we
22604 // generate a 16-byte vector constant and logic op even for the scalar case.
22605 // Using a 16-byte mask allows folding the load of the mask with
22606 // the logic op, so it can save (~4 bytes) on code size.
22607 bool IsFakeVector = !VT.isVector() && !IsF128;
22608 MVT LogicVT = VT;
22609 if (IsFakeVector)
22610 LogicVT = (VT == MVT::f64) ? MVT::v2f64
22611 : (VT == MVT::f32) ? MVT::v4f32
22612 : MVT::v8f16;
22613
22614 unsigned EltBits = VT.getScalarSizeInBits();
22615 // For FABS, mask is 0x7f...; for FNEG, mask is 0x80...
22616 APInt MaskElt = IsFABS ? APInt::getSignedMaxValue(EltBits) :
22617 APInt::getSignMask(EltBits);
22618 const fltSemantics &Sem = VT.getFltSemantics();
22619 SDValue Mask = DAG.getConstantFP(APFloat(Sem, MaskElt), dl, LogicVT);
22620
22621 SDValue Op0 = Op.getOperand(0);
22622 bool IsFNABS = !IsFABS && (Op0.getOpcode() == ISD::FABS);
22623 unsigned LogicOp = IsFABS ? X86ISD::FAND :
22624 IsFNABS ? X86ISD::FOR :
22625 X86ISD::FXOR;
22626 SDValue Operand = IsFNABS ? Op0.getOperand(0) : Op0;
22627
22628 if (VT.isVector() || IsF128)
22629 return DAG.getNode(LogicOp, dl, LogicVT, Operand, Mask);
22630
22631 // For the scalar case extend to a 128-bit vector, perform the logic op,
22632 // and extract the scalar result back out.
22633 Operand = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, LogicVT, Operand);
22634 SDValue LogicNode = DAG.getNode(LogicOp, dl, LogicVT, Operand, Mask);
22635 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, LogicNode,
22636 DAG.getVectorIdxConstant(0, dl));
22637}
22638
22639static SDValue LowerFCOPYSIGN(SDValue Op, SelectionDAG &DAG) {
22640 SDValue Mag = Op.getOperand(0);
22641 SDValue Sign = Op.getOperand(1);
22642 SDLoc dl(Op);
22643
22644 // If the sign operand is smaller, extend it first.
22645 MVT VT = Op.getSimpleValueType();
22646 if (Sign.getSimpleValueType().bitsLT(VT))
22647 Sign = DAG.getNode(ISD::FP_EXTEND, dl, VT, Sign);
22648
22649 // And if it is bigger, shrink it first.
22650 if (Sign.getSimpleValueType().bitsGT(VT))
22651 Sign = DAG.getNode(ISD::FP_ROUND, dl, VT, Sign,
22652 DAG.getIntPtrConstant(0, dl, /*isTarget=*/true));
22653
22654 // At this point the operands and the result should have the same
22655 // type, and that won't be f80 since that is not custom lowered.
22656 bool IsF128 = (VT == MVT::f128);
22657 assert(VT.isFloatingPoint() && VT != MVT::f80 &&
22659 "Unexpected type in LowerFCOPYSIGN");
22660
22661 const fltSemantics &Sem = VT.getFltSemantics();
22662
22663 // Perform all scalar logic operations as 16-byte vectors because there are no
22664 // scalar FP logic instructions in SSE.
22665 // TODO: This isn't necessary. If we used scalar types, we might avoid some
22666 // unnecessary splats, but we might miss load folding opportunities. Should
22667 // this decision be based on OptimizeForSize?
22668 bool IsFakeVector = !VT.isVector() && !IsF128;
22669 MVT LogicVT = VT;
22670 if (IsFakeVector)
22671 LogicVT = (VT == MVT::f64) ? MVT::v2f64
22672 : (VT == MVT::f32) ? MVT::v4f32
22673 : MVT::v8f16;
22674
22675 // The mask constants are automatically splatted for vector types.
22676 unsigned EltSizeInBits = VT.getScalarSizeInBits();
22677 SDValue SignMask = DAG.getConstantFP(
22678 APFloat(Sem, APInt::getSignMask(EltSizeInBits)), dl, LogicVT);
22679 SDValue MagMask = DAG.getConstantFP(
22680 APFloat(Sem, APInt::getSignedMaxValue(EltSizeInBits)), dl, LogicVT);
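// e.g. for f64 the result is (Mag & 0x7fffffffffffffff) | (Sign & 0x8000000000000000),
// with the masks splatted across the 128-bit vector in the scalar case.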
22681
22682 // First, clear all bits but the sign bit from the second operand (sign).
22683 if (IsFakeVector)
22684 Sign = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, LogicVT, Sign);
22685 SDValue SignBit = DAG.getNode(X86ISD::FAND, dl, LogicVT, Sign, SignMask);
22686
22687 // Next, clear the sign bit from the first operand (magnitude).
22688 // TODO: If we had general constant folding for FP logic ops, this check
22689 // wouldn't be necessary.
22690 SDValue MagBits;
22691 if (ConstantFPSDNode *Op0CN = isConstOrConstSplatFP(Mag)) {
22692 APFloat APF = Op0CN->getValueAPF();
22693 APF.clearSign();
22694 MagBits = DAG.getConstantFP(APF, dl, LogicVT);
22695 } else {
22696 // If the magnitude operand wasn't a constant, we need to AND out the sign.
22697 if (IsFakeVector)
22698 Mag = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, LogicVT, Mag);
22699 MagBits = DAG.getNode(X86ISD::FAND, dl, LogicVT, Mag, MagMask);
22700 }
22701
22702 // OR the magnitude value with the sign bit.
22703 SDValue Or = DAG.getNode(X86ISD::FOR, dl, LogicVT, MagBits, SignBit);
22704 return !IsFakeVector ? Or
22705 : DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, Or,
22706 DAG.getVectorIdxConstant(0, dl));
22707}
22708
22709static SDValue LowerFGETSIGN(SDValue Op, SelectionDAG &DAG) {
22710 SDValue N0 = Op.getOperand(0);
22711 SDLoc dl(Op);
22712 MVT VT = Op.getSimpleValueType();
22713
22714 MVT OpVT = N0.getSimpleValueType();
22715 assert((OpVT == MVT::f32 || OpVT == MVT::f64) &&
22716 "Unexpected type for FGETSIGN");
22717
22718 // Lower ISD::FGETSIGN to (AND (X86ISD::MOVMSK ...) 1).
22719 MVT VecVT = (OpVT == MVT::f32 ? MVT::v4f32 : MVT::v2f64);
22720 SDValue Res = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VecVT, N0);
22721 Res = DAG.getNode(X86ISD::MOVMSK, dl, MVT::i32, Res);
22722 Res = DAG.getZExtOrTrunc(Res, dl, VT);
22723 Res = DAG.getNode(ISD::AND, dl, VT, Res, DAG.getConstant(1, dl, VT));
22724 return Res;
22725}
22726
22727/// Helper for attempting to create a X86ISD::BT node.
22728static SDValue getBT(SDValue Src, SDValue BitNo, const SDLoc &DL, SelectionDAG &DAG) {
22729 // If Src is i8, promote it to i32 with any_extend. There is no i8 BT
22730 // instruction. Since the shift amount is in-range-or-undefined, we know
22731 // that doing a bittest on the i32 value is ok. We extend to i32 because
22732 // the encoding for the i16 version is larger than the i32 version.
22733 // Also promote i16 to i32 for performance / code size reasons.
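// For example, a bit test of an i8 value becomes (roughly):
//   BT (any_extend i8 %src to i32), %bitno
// which selects to the 32-bit BT instruction.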
22734 if (Src.getValueType().getScalarSizeInBits() < 32)
22735 Src = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, Src);
22736
22737 // No legal type found, give up.
22738 if (!DAG.getTargetLoweringInfo().isTypeLegal(Src.getValueType()))
22739 return SDValue();
22740
22741 // See if we can use the 32-bit instruction instead of the 64-bit one for a
22742 // shorter encoding. Since the former takes the modulo 32 of BitNo and the
22743 // latter takes the modulo 64, this is only valid if the 5th bit of BitNo is
22744 // known to be zero.
22745 if (Src.getValueType() == MVT::i64 &&
22746 DAG.MaskedValueIsZero(BitNo, APInt(BitNo.getValueSizeInBits(), 32)))
22747 Src = DAG.getNode(ISD::TRUNCATE, DL, MVT::i32, Src);
22748
22749 // If the operand types disagree, extend the shift amount to match. Since
22750 // BT ignores high bits (like shifts) we can use anyextend.
22751 if (Src.getValueType() != BitNo.getValueType()) {
22752 // Peek through a mask/modulo operation.
22753 // TODO: DAGCombine fails to do this as it just checks isTruncateFree, but
22754 // we probably need a better IsDesirableToPromoteOp to handle this as well.
22755 if (BitNo.getOpcode() == ISD::AND && BitNo->hasOneUse())
22756 BitNo = DAG.getNode(ISD::AND, DL, Src.getValueType(),
22757 DAG.getNode(ISD::ANY_EXTEND, DL, Src.getValueType(),
22758 BitNo.getOperand(0)),
22759 DAG.getNode(ISD::ANY_EXTEND, DL, Src.getValueType(),
22760 BitNo.getOperand(1)));
22761 else
22762 BitNo = DAG.getNode(ISD::ANY_EXTEND, DL, Src.getValueType(), BitNo);
22763 }
22764
22765 return DAG.getNode(X86ISD::BT, DL, MVT::i32, Src, BitNo);
22766}
22767
22768/// Helper for creating a X86ISD::SETCC node.
22769 static SDValue getSETCC(X86::CondCode Cond, SDValue EFLAGS, const SDLoc &dl,
22770 SelectionDAG &DAG) {
22771 return DAG.getNode(X86ISD::SETCC, dl, MVT::i8,
22772 DAG.getTargetConstant(Cond, dl, MVT::i8), EFLAGS);
22773}
22774
22775/// Recursive helper for combineVectorSizedSetCCEquality() to see if we have a
22776/// recognizable memcmp expansion.
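/// For example, (or (or (xor a, b), (xor c, d)), (xor e, f)) is accepted,
/// while a bare (xor a, b) at the root is not.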
22777static bool isOrXorXorTree(SDValue X, bool Root = true) {
22778 if (X.getOpcode() == ISD::OR)
22779 return isOrXorXorTree(X.getOperand(0), false) &&
22780 isOrXorXorTree(X.getOperand(1), false);
22781 if (Root)
22782 return false;
22783 return X.getOpcode() == ISD::XOR;
22784}
22785
22786/// Recursive helper for combineVectorSizedSetCCEquality() to emit the memcmp
22787/// expansion.
22788template <typename F>
22789 static SDValue emitOrXorXorTree(SDValue X, const SDLoc &DL, SelectionDAG &DAG,
22790 EVT VecVT, EVT CmpVT, bool HasPT, F SToV) {
22791 SDValue Op0 = X.getOperand(0);
22792 SDValue Op1 = X.getOperand(1);
22793 if (X.getOpcode() == ISD::OR) {
22794 SDValue A = emitOrXorXorTree(Op0, DL, DAG, VecVT, CmpVT, HasPT, SToV);
22795 SDValue B = emitOrXorXorTree(Op1, DL, DAG, VecVT, CmpVT, HasPT, SToV);
22796 if (VecVT != CmpVT)
22797 return DAG.getNode(ISD::OR, DL, CmpVT, A, B);
22798 if (HasPT)
22799 return DAG.getNode(ISD::OR, DL, VecVT, A, B);
22800 return DAG.getNode(ISD::AND, DL, CmpVT, A, B);
22801 }
22802 if (X.getOpcode() == ISD::XOR) {
22803 SDValue A = SToV(Op0);
22804 SDValue B = SToV(Op1);
22805 if (VecVT != CmpVT)
22806 return DAG.getSetCC(DL, CmpVT, A, B, ISD::SETNE);
22807 if (HasPT)
22808 return DAG.getNode(ISD::XOR, DL, VecVT, A, B);
22809 return DAG.getSetCC(DL, CmpVT, A, B, ISD::SETEQ);
22810 }
22811 llvm_unreachable("Impossible");
22812}
22813
22814/// Try to map a 128-bit or larger integer comparison to vector instructions
22815/// before type legalization splits it up into chunks.
22816 static SDValue combineVectorSizedSetCCEquality(EVT VT, SDValue X, SDValue Y,
22817 ISD::CondCode CC,
22818 const SDLoc &DL,
22819 SelectionDAG &DAG,
22820 const X86Subtarget &Subtarget) {
22821 assert((CC == ISD::SETNE || CC == ISD::SETEQ) && "Bad comparison predicate");
22822
22823 // We're looking for an oversized integer equality comparison.
22824 EVT OpVT = X.getValueType();
22825 unsigned OpSize = OpVT.getSizeInBits();
22826 if (!OpVT.isScalarInteger() || OpSize < 128)
22827 return SDValue();
22828
22829 // Ignore a comparison with zero because that gets special treatment in
22830 // EmitTest(). But make an exception for the special case of a pair of
22831 // logically-combined vector-sized operands compared to zero. This pattern may
22832 // be generated by the memcmp expansion pass with oversized integer compares
22833 // (see PR33325).
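// For example, memcmp expansion can produce (roughly):
//   setcc (or (xor (load a), (load b)), (xor (load a+16), (load b+16))), 0, eq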
22834 bool IsOrXorXorTreeCCZero = isNullConstant(Y) && isOrXorXorTree(X);
22835 if (isNullConstant(Y) && !IsOrXorXorTreeCCZero)
22836 return SDValue();
22837
22838 // Don't perform this combine if constructing the vector will be expensive.
22839 auto IsVectorBitCastCheap = [](SDValue X) {
22840 X = peekThroughBitcasts(X);
22841 return isa<ConstantSDNode>(X) || X.getValueType().isVector() ||
22842 X.getOpcode() == ISD::LOAD;
22843 };
22844 if ((!IsVectorBitCastCheap(X) || !IsVectorBitCastCheap(Y)) &&
22845 !IsOrXorXorTreeCCZero)
22846 return SDValue();
22847
22848 // Use XOR (plus OR) and PTEST after SSE4.1 for 128/256-bit operands.
22849 // Use PCMPNEQ (plus OR) and KORTEST for 512-bit operands.
22850 // Otherwise use PCMPEQ (plus AND) and mask testing.
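// e.g. on plain SSE2 a 128-bit equality compare becomes (roughly):
//   setcc (movmsk (pcmpeqb (bitcast X), (bitcast Y))), 0xFFFF, eq|ne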
22851 bool NoImplicitFloatOps =
22852 DAG.getMachineFunction().getFunction().hasFnAttribute(
22853 Attribute::NoImplicitFloat);
22854 if (!Subtarget.useSoftFloat() && !NoImplicitFloatOps &&
22855 ((OpSize == 128 && Subtarget.hasSSE2()) ||
22856 (OpSize == 256 && Subtarget.hasAVX()) ||
22857 (OpSize == 512 && Subtarget.useAVX512Regs()))) {
22858 bool HasPT = Subtarget.hasSSE41();
22859
22860 // PTEST and MOVMSK are slow on Knights Landing and Knights Mill and widened
22861 // vector registers are essentially free. (Technically, widening registers
22862 // prevents load folding, but the tradeoff is worth it.)
22863 bool PreferKOT = Subtarget.preferMaskRegisters();
22864 bool NeedZExt = PreferKOT && !Subtarget.hasVLX() && OpSize != 512;
22865
22866 EVT VecVT = MVT::v16i8;
22867 EVT CmpVT = PreferKOT ? MVT::v16i1 : VecVT;
22868 if (OpSize == 256) {
22869 VecVT = MVT::v32i8;
22870 CmpVT = PreferKOT ? MVT::v32i1 : VecVT;
22871 }
22872 EVT CastVT = VecVT;
22873 bool NeedsAVX512FCast = false;
22874 if (OpSize == 512 || NeedZExt) {
22875 if (Subtarget.hasBWI()) {
22876 VecVT = MVT::v64i8;
22877 CmpVT = MVT::v64i1;
22878 if (OpSize == 512)
22879 CastVT = VecVT;
22880 } else {
22881 VecVT = MVT::v16i32;
22882 CmpVT = MVT::v16i1;
22883 CastVT = OpSize == 512 ? VecVT
22884 : OpSize == 256 ? MVT::v8i32
22885 : MVT::v4i32;
22886 NeedsAVX512FCast = true;
22887 }
22888 }
22889
22890 auto ScalarToVector = [&](SDValue X) -> SDValue {
22891 bool TmpZext = false;
22892 EVT TmpCastVT = CastVT;
22893 if (X.getOpcode() == ISD::ZERO_EXTEND) {
22894 SDValue OrigX = X.getOperand(0);
22895 unsigned OrigSize = OrigX.getScalarValueSizeInBits();
22896 if (OrigSize < OpSize) {
22897 if (OrigSize == 128) {
22898 TmpCastVT = NeedsAVX512FCast ? MVT::v4i32 : MVT::v16i8;
22899 X = OrigX;
22900 TmpZext = true;
22901 } else if (OrigSize == 256) {
22902 TmpCastVT = NeedsAVX512FCast ? MVT::v8i32 : MVT::v32i8;
22903 X = OrigX;
22904 TmpZext = true;
22905 }
22906 }
22907 }
22908 X = DAG.getBitcast(TmpCastVT, X);
22909 if (!NeedZExt && !TmpZext)
22910 return X;
22911 return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VecVT,
22912 DAG.getConstant(0, DL, VecVT), X,
22913 DAG.getVectorIdxConstant(0, DL));
22914 };
22915
22916 SDValue Cmp;
22917 if (IsOrXorXorTreeCCZero) {
22918 // This is a bitwise-combined equality comparison of 2 pairs of vectors:
22919 // setcc i128 (or (xor A, B), (xor C, D)), 0, eq|ne
22920 // Use 2 vector equality compares and 'and' the results before doing a
22921 // MOVMSK.
22922 Cmp = emitOrXorXorTree(X, DL, DAG, VecVT, CmpVT, HasPT, ScalarToVector);
22923 } else {
22924 SDValue VecX = ScalarToVector(X);
22925 SDValue VecY = ScalarToVector(Y);
22926 if (VecVT != CmpVT) {
22927 Cmp = DAG.getSetCC(DL, CmpVT, VecX, VecY, ISD::SETNE);
22928 } else if (HasPT) {
22929 Cmp = DAG.getNode(ISD::XOR, DL, VecVT, VecX, VecY);
22930 } else {
22931 Cmp = DAG.getSetCC(DL, CmpVT, VecX, VecY, ISD::SETEQ);
22932 }
22933 }
22934 // AVX512 should emit a setcc that will lower to kortest.
22935 if (VecVT != CmpVT) {
22936 EVT KRegVT = CmpVT == MVT::v64i1 ? MVT::i64
22937 : CmpVT == MVT::v32i1 ? MVT::i32
22938 : MVT::i16;
22939 return DAG.getSetCC(DL, VT, DAG.getBitcast(KRegVT, Cmp),
22940 DAG.getConstant(0, DL, KRegVT), CC);
22941 }
22942 if (HasPT) {
22943 SDValue BCCmp =
22944 DAG.getBitcast(OpSize == 256 ? MVT::v4i64 : MVT::v2i64, Cmp);
22945 SDValue PT = DAG.getNode(X86ISD::PTEST, DL, MVT::i32, BCCmp, BCCmp);
22946 X86::CondCode X86CC = CC == ISD::SETEQ ? X86::COND_E : X86::COND_NE;
22947 SDValue X86SetCC = getSETCC(X86CC, PT, DL, DAG);
22948 return DAG.getNode(ISD::TRUNCATE, DL, VT, X86SetCC.getValue(0));
22949 }
22950 // If all bytes match (bitmask is 0x(FFFF)FFFF), that's equality.
22951 // setcc i128 X, Y, eq --> setcc (pmovmskb (pcmpeqb X, Y)), 0xFFFF, eq
22952 // setcc i128 X, Y, ne --> setcc (pmovmskb (pcmpeqb X, Y)), 0xFFFF, ne
22953 assert(Cmp.getValueType() == MVT::v16i8 &&
22954 "Non 128-bit vector on pre-SSE41 target");
22955 SDValue MovMsk = DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, Cmp);
22956 SDValue FFFFs = DAG.getConstant(0xFFFF, DL, MVT::i32);
22957 return DAG.getSetCC(DL, VT, MovMsk, FFFFs, CC);
22958 }
22959
22960 return SDValue();
22961}
22962
22963/// Helper for matching BINOP(EXTRACTELT(X,0),BINOP(EXTRACTELT(X,1),...))
22964/// style scalarized (associative) reduction patterns. Partial reductions
22965/// are supported when the pointer SrcMask is non-null.
22966/// TODO - move this to SelectionDAG?
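/// For example, or(extractelt(X,0), or(extractelt(X,1),
/// or(extractelt(X,2), extractelt(X,3)))) matches with SrcOps = {X} when X
/// has 4 elements.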
22967 static bool matchScalarReduction(SDValue Op, ISD::NodeType BinOp,
22968 SmallVectorImpl<SDValue> &SrcOps,
22969 SmallVectorImpl<APInt> *SrcMask = nullptr) {
22970 SmallVector<SDValue, 8> Opnds;
22971 DenseMap<SDValue, APInt> SrcOpMap;
22972 EVT VT = MVT::Other;
22973
22974 // Recognize a special case where a vector is cast into a wide integer to
22975 // test all 0s.
22976 assert(Op.getOpcode() == unsigned(BinOp) &&
22977 "Unexpected bit reduction opcode");
22978 Opnds.push_back(Op.getOperand(0));
22979 Opnds.push_back(Op.getOperand(1));
22980
22981 for (unsigned Slot = 0, e = Opnds.size(); Slot < e; ++Slot) {
22982 SDValue I = Opnds[Slot];
22983 // BFS traverse all BinOp operands.
22984 if (I->getOpcode() == unsigned(BinOp)) {
22985 Opnds.push_back(I->getOperand(0));
22986 Opnds.push_back(I->getOperand(1));
22987 // Re-evaluate the number of nodes to be traversed.
22988 e += 2; // 2 more nodes (LHS and RHS) are pushed.
22989 continue;
22990 }
22991
22992 // Quit if not an EXTRACT_VECTOR_ELT.
22993 if (I->getOpcode() != ISD::EXTRACT_VECTOR_ELT)
22994 return false;
22995
22996 // Quit if there is no constant index.
22997 auto *Idx = dyn_cast<ConstantSDNode>(I->getOperand(1));
22998 if (!Idx)
22999 return false;
23000
23001 SDValue Src = I->getOperand(0);
23002 DenseMap<SDValue, APInt>::iterator M = SrcOpMap.find(Src);
23003 if (M == SrcOpMap.end()) {
23004 VT = Src.getValueType();
23005 // Quit if not the same type.
23006 if (!SrcOpMap.empty() && VT != SrcOpMap.begin()->first.getValueType())
23007 return false;
23008 unsigned NumElts = VT.getVectorNumElements();
23009 APInt EltCount = APInt::getZero(NumElts);
23010 M = SrcOpMap.insert(std::make_pair(Src, EltCount)).first;
23011 SrcOps.push_back(Src);
23012 }
23013
23014 // Quit if element already used.
23015 unsigned CIdx = Idx->getZExtValue();
23016 if (M->second[CIdx])
23017 return false;
23018 M->second.setBit(CIdx);
23019 }
23020
23021 if (SrcMask) {
23022 // Collect the source partial masks.
23023 for (SDValue &SrcOp : SrcOps)
23024 SrcMask->push_back(SrcOpMap[SrcOp]);
23025 } else {
23026 // Quit if not all elements are used.
23027 for (const auto &I : SrcOpMap)
23028 if (!I.second.isAllOnes())
23029 return false;
23030 }
23031
23032 return true;
23033}
23034
23035// Helper function for comparing all bits of two vectors.
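// The result is a flag-producing node (CMP/PTEST/KORTEST) whose ZF is set
// exactly when every bit selected by OriginalMask is equal in LHS and RHS;
// X86CC is set to COND_E or COND_NE to match the requested CC.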
23036 static SDValue LowerVectorAllEqual(const SDLoc &DL, SDValue LHS, SDValue RHS,
23037 ISD::CondCode CC, const APInt &OriginalMask,
23038 const X86Subtarget &Subtarget,
23039 SelectionDAG &DAG, X86::CondCode &X86CC) {
23040 EVT VT = LHS.getValueType();
23041 unsigned ScalarSize = VT.getScalarSizeInBits();
23042 if (OriginalMask.getBitWidth() != ScalarSize) {
23043 assert(ScalarSize == 1 && "Element Mask vs Vector bitwidth mismatch");
23044 return SDValue();
23045 }
23046
23047 // Quit if not convertible to a legal scalar or 128/256-bit vector.
23048 if (!llvm::has_single_bit<uint32_t>(VT.getSizeInBits()))
23049 return SDValue();
23050
23051 // FCMP may use ISD::SETNE when nnan - early out if we manage to get here.
23052 if (VT.isFloatingPoint())
23053 return SDValue();
23054
23055 assert((CC == ISD::SETEQ || CC == ISD::SETNE) && "Unsupported ISD::CondCode");
23056 X86CC = (CC == ISD::SETEQ ? X86::COND_E : X86::COND_NE);
23057
23058 APInt Mask = OriginalMask;
23059
23060 auto MaskBits = [&](SDValue Src) {
23061 if (Mask.isAllOnes())
23062 return Src;
23063 EVT SrcVT = Src.getValueType();
23064 SDValue MaskValue = DAG.getConstant(Mask, DL, SrcVT);
23065 return DAG.getNode(ISD::AND, DL, SrcVT, Src, MaskValue);
23066 };
23067
23068 // For sub-128-bit vector, cast to (legal) integer and compare with zero.
23069 if (VT.getSizeInBits() < 128) {
23070 EVT IntVT = EVT::getIntegerVT(*DAG.getContext(), VT.getSizeInBits());
23071 if (!DAG.getTargetLoweringInfo().isTypeLegal(IntVT)) {
23072 if (IntVT != MVT::i64)
23073 return SDValue();
23074 auto SplitLHS = DAG.SplitScalar(DAG.getBitcast(IntVT, MaskBits(LHS)), DL,
23075 MVT::i32, MVT::i32);
23076 auto SplitRHS = DAG.SplitScalar(DAG.getBitcast(IntVT, MaskBits(RHS)), DL,
23077 MVT::i32, MVT::i32);
23078 SDValue Lo =
23079 DAG.getNode(ISD::XOR, DL, MVT::i32, SplitLHS.first, SplitRHS.first);
23080 SDValue Hi =
23081 DAG.getNode(ISD::XOR, DL, MVT::i32, SplitLHS.second, SplitRHS.second);
23082 return DAG.getNode(X86ISD::CMP, DL, MVT::i32,
23083 DAG.getNode(ISD::OR, DL, MVT::i32, Lo, Hi),
23084 DAG.getConstant(0, DL, MVT::i32));
23085 }
23086 return DAG.getNode(X86ISD::CMP, DL, MVT::i32,
23087 DAG.getBitcast(IntVT, MaskBits(LHS)),
23088 DAG.getBitcast(IntVT, MaskBits(RHS)));
23089 }
23090
23091 // Without PTEST, a masked v2i64 or-reduction is not faster than
23092 // scalarization.
23093 bool UseKORTEST = Subtarget.useAVX512Regs();
23094 bool UsePTEST = Subtarget.hasSSE41();
23095 if (!UsePTEST && !Mask.isAllOnes() && ScalarSize > 32)
23096 return SDValue();
23097
23098 // Split down to 128/256/512-bit vector.
23099 unsigned TestSize = UseKORTEST ? 512 : (Subtarget.hasAVX() ? 256 : 128);
23100
23101 // If the input vector has vector elements wider than the target test size,
23102 // then cast to <X x i64> so it will safely split.
23103 if (ScalarSize > TestSize) {
23104 if (!Mask.isAllOnes())
23105 return SDValue();
23106 VT = EVT::getVectorVT(*DAG.getContext(), MVT::i64, VT.getSizeInBits() / 64);
23107 LHS = DAG.getBitcast(VT, LHS);
23108 RHS = DAG.getBitcast(VT, RHS);
23109 Mask = APInt::getAllOnes(64);
23110 }
23111
23112 if (VT.getSizeInBits() > TestSize) {
23113 KnownBits KnownRHS = DAG.computeKnownBits(RHS);
23114 if (KnownRHS.isConstant() && KnownRHS.getConstant() == Mask) {
23115 // If ICMP(AND(LHS,MASK),MASK) - reduce using AND splits.
23116 while (VT.getSizeInBits() > TestSize) {
23117 auto Split = DAG.SplitVector(LHS, DL);
23118 VT = Split.first.getValueType();
23119 LHS = DAG.getNode(ISD::AND, DL, VT, Split.first, Split.second);
23120 }
23121 RHS = DAG.getAllOnesConstant(DL, VT);
23122 } else if (!UsePTEST && !KnownRHS.isZero()) {
23123 // MOVMSK Special Case:
23124 // ALLOF(CMPEQ(X,Y)) -> AND(CMPEQ(X[0],Y[0]),CMPEQ(X[1],Y[1]),....)
23125 MVT SVT = ScalarSize >= 32 ? MVT::i32 : MVT::i8;
23126 VT = MVT::getVectorVT(SVT, VT.getSizeInBits() / SVT.getSizeInBits());
23127 LHS = DAG.getBitcast(VT, MaskBits(LHS));
23128 RHS = DAG.getBitcast(VT, MaskBits(RHS));
23129 EVT BoolVT = VT.changeVectorElementType(MVT::i1);
23130 SDValue V = DAG.getSetCC(DL, BoolVT, LHS, RHS, ISD::SETEQ);
23131 V = DAG.getSExtOrTrunc(V, DL, VT);
23132 while (VT.getSizeInBits() > TestSize) {
23133 auto Split = DAG.SplitVector(V, DL);
23134 VT = Split.first.getValueType();
23135 V = DAG.getNode(ISD::AND, DL, VT, Split.first, Split.second);
23136 }
23137 V = DAG.getNOT(DL, V, VT);
23138 V = DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, V);
23139 return DAG.getNode(X86ISD::CMP, DL, MVT::i32, V,
23140 DAG.getConstant(0, DL, MVT::i32));
23141 } else {
23142 // Convert to a ICMP_EQ(XOR(LHS,RHS),0) pattern.
23143 SDValue V = DAG.getNode(ISD::XOR, DL, VT, LHS, RHS);
23144 while (VT.getSizeInBits() > TestSize) {
23145 auto Split = DAG.SplitVector(V, DL);
23146 VT = Split.first.getValueType();
23147 V = DAG.getNode(ISD::OR, DL, VT, Split.first, Split.second);
23148 }
23149 LHS = V;
23150 RHS = DAG.getConstant(0, DL, VT);
23151 }
23152 }
23153
23154 if (UseKORTEST && VT.is512BitVector()) {
23155 MVT TestVT = MVT::getVectorVT(MVT::i32, VT.getSizeInBits() / 32);
23156 MVT BoolVT = TestVT.changeVectorElementType(MVT::i1);
23157 LHS = DAG.getBitcast(TestVT, MaskBits(LHS));
23158 RHS = DAG.getBitcast(TestVT, MaskBits(RHS));
23159 SDValue V = DAG.getSetCC(DL, BoolVT, LHS, RHS, ISD::SETNE);
23160 return DAG.getNode(X86ISD::KORTEST, DL, MVT::i32, V, V);
23161 }
23162
23163 if (UsePTEST) {
23164 MVT TestVT = MVT::getVectorVT(MVT::i64, VT.getSizeInBits() / 64);
23165 LHS = DAG.getBitcast(TestVT, MaskBits(LHS));
23166 RHS = DAG.getBitcast(TestVT, MaskBits(RHS));
23167 SDValue V = DAG.getNode(ISD::XOR, DL, TestVT, LHS, RHS);
23168 return DAG.getNode(X86ISD::PTEST, DL, MVT::i32, V, V);
23169 }
23170
23171 assert(VT.getSizeInBits() == 128 && "Failure to split to 128-bits");
23172 MVT MaskVT = ScalarSize >= 32 ? MVT::v4i32 : MVT::v16i8;
23173 LHS = DAG.getBitcast(MaskVT, MaskBits(LHS));
23174 RHS = DAG.getBitcast(MaskVT, MaskBits(RHS));
23175 SDValue V = DAG.getNode(X86ISD::PCMPEQ, DL, MaskVT, LHS, RHS);
23176 V = DAG.getNOT(DL, V, MaskVT);
23177 V = DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, V);
23178 return DAG.getNode(X86ISD::CMP, DL, MVT::i32, V,
23179 DAG.getConstant(0, DL, MVT::i32));
23180}
23181
23182 // Check whether an AND/OR'd reduction tree is PTEST-able, or if we can fall
23183 // back to CMP(MOVMSK(PCMPEQB(X,Y))).
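// Typical inputs are scalarized or/and trees over extractelement chains, and
// all-of/any-of reductions such as (roughly):
//   icmp eq i8 (bitcast (<8 x i1> icmp eq <8 x i16> %x, %y) to i8), -1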
23184 static SDValue MatchVectorAllEqualTest(SDValue OrigLHS, SDValue OrigRHS,
23185 ISD::CondCode CC, const SDLoc &DL,
23186 const X86Subtarget &Subtarget,
23187 SelectionDAG &DAG,
23188 X86::CondCode &X86CC) {
23189 SDValue Op = OrigLHS;
23190
23191 bool CmpNull;
23192 APInt Mask;
23193 if (CC == ISD::SETEQ || CC == ISD::SETNE) {
23194 CmpNull = isNullConstant(OrigRHS);
23195 if (!CmpNull && !isAllOnesConstant(OrigRHS))
23196 return SDValue();
23197
23198 if (!Subtarget.hasSSE2() || !Op->hasOneUse())
23199 return SDValue();
23200
23201 // Check whether we're masking/truncating an OR-reduction result, in which
23202 // case track the masked bits.
23203 // TODO: Add CmpAllOnes support.
23204 Mask = APInt::getAllOnes(Op.getScalarValueSizeInBits());
23205 if (CmpNull) {
23206 switch (Op.getOpcode()) {
23207 case ISD::TRUNCATE: {
23208 SDValue Src = Op.getOperand(0);
23209 Mask = APInt::getLowBitsSet(Src.getScalarValueSizeInBits(),
23210 Op.getScalarValueSizeInBits());
23211 Op = Src;
23212 break;
23213 }
23214 case ISD::AND: {
23215 if (auto *Cst = dyn_cast<ConstantSDNode>(Op.getOperand(1))) {
23216 Mask = Cst->getAPIntValue();
23217 Op = Op.getOperand(0);
23218 }
23219 break;
23220 }
23221 }
23222 }
23223 } else if (CC == ISD::SETGT && isAllOnesConstant(OrigRHS)) {
23224 CC = ISD::SETEQ;
23225 CmpNull = true;
23226 Mask = APInt::getSignMask(Op.getScalarValueSizeInBits());
23227 } else {
23228 return SDValue();
23229 }
23230
23231 ISD::NodeType LogicOp = CmpNull ? ISD::OR : ISD::AND;
23232
23233 // Match icmp(or(extract(X,0),extract(X,1)),0) anyof reduction patterns.
23234 // Match icmp(and(extract(X,0),extract(X,1)),-1) allof reduction patterns.
23235 SmallVector<SDValue, 8> VecIns;
23236 if (Op.getOpcode() == LogicOp && matchScalarReduction(Op, LogicOp, VecIns)) {
23237 EVT VT = VecIns[0].getValueType();
23238 assert(llvm::all_of(VecIns,
23239 [VT](SDValue V) { return VT == V.getValueType(); }) &&
23240 "Reduction source vector mismatch");
23241
23242 // Quit if not splittable to scalar/128/256/512-bit vector.
23243 if (!llvm::has_single_bit<uint32_t>(VT.getSizeInBits()))
23244 return SDValue();
23245
23246 // If more than one full vector is evaluated, AND/OR them first before
23247 // PTEST.
23248 for (unsigned Slot = 0, e = VecIns.size(); e - Slot > 1;
23249 Slot += 2, e += 1) {
23250 // Each iteration will AND/OR 2 nodes and append the result until there is
23251 // only 1 node left, i.e. the final value of all vectors.
23252 SDValue LHS = VecIns[Slot];
23253 SDValue RHS = VecIns[Slot + 1];
23254 VecIns.push_back(DAG.getNode(LogicOp, DL, VT, LHS, RHS));
23255 }
23256
23257 return LowerVectorAllEqual(DL, VecIns.back(),
23258 CmpNull ? DAG.getConstant(0, DL, VT)
23259 : DAG.getAllOnesConstant(DL, VT),
23260 CC, Mask, Subtarget, DAG, X86CC);
23261 }
23262
23263 // Match icmp(reduce_or(X),0) anyof reduction patterns.
23264 // Match icmp(reduce_and(X),-1) allof reduction patterns.
23265 if (Op.getOpcode() == ISD::EXTRACT_VECTOR_ELT) {
23266 ISD::NodeType BinOp;
23267 if (SDValue Match =
23268 DAG.matchBinOpReduction(Op.getNode(), BinOp, {LogicOp})) {
23269 EVT MatchVT = Match.getValueType();
23270 return LowerVectorAllEqual(DL, Match,
23271 CmpNull ? DAG.getConstant(0, DL, MatchVT)
23272 : DAG.getAllOnesConstant(DL, MatchVT),
23273 CC, Mask, Subtarget, DAG, X86CC);
23274 }
23275 }
23276
23277 if (Mask.isAllOnes()) {
23278 assert(!Op.getValueType().isVector() &&
23279 "Illegal vector type for reduction pattern");
23280 SDValue Src = peekThroughBitcasts(Op);
23281 if (Src.getValueType().isFixedLengthVector() &&
23282 Src.getValueType().getScalarType() == MVT::i1) {
23283 // Match icmp(bitcast(icmp_ne(X,Y)),0) reduction patterns.
23284 // Match icmp(bitcast(icmp_eq(X,Y)),-1) reduction patterns.
23285 if (Src.getOpcode() == ISD::SETCC) {
23286 SDValue LHS = Src.getOperand(0);
23287 SDValue RHS = Src.getOperand(1);
23288 EVT LHSVT = LHS.getValueType();
23289 ISD::CondCode SrcCC = cast<CondCodeSDNode>(Src.getOperand(2))->get();
23290 if (SrcCC == (CmpNull ? ISD::SETNE : ISD::SETEQ) &&
23292 APInt SrcMask = APInt::getAllOnes(LHSVT.getScalarSizeInBits());
23293 return LowerVectorAllEqual(DL, LHS, RHS, CC, SrcMask, Subtarget, DAG,
23294 X86CC);
23295 }
23296 }
23297 // Match icmp(bitcast(vXi1 trunc(Y)),0) reduction patterns.
23298 // Match icmp(bitcast(vXi1 trunc(Y)),-1) reduction patterns.
23299 // Peek through truncation, mask the LSB and compare against zero/LSB.
23300 if (Src.getOpcode() == ISD::TRUNCATE) {
23301 SDValue Inner = Src.getOperand(0);
23302 EVT InnerVT = Inner.getValueType();
23304 unsigned BW = InnerVT.getScalarSizeInBits();
23305 APInt SrcMask = APInt(BW, 1);
23306 APInt Cmp = CmpNull ? APInt::getZero(BW) : SrcMask;
23307 return LowerVectorAllEqual(DL, Inner,
23308 DAG.getConstant(Cmp, DL, InnerVT), CC,
23309 SrcMask, Subtarget, DAG, X86CC);
23310 }
23311 }
23312 }
23313 }
23314
23315 return SDValue();
23316}
23317
23318 /// Return true if \c Op has a use that doesn't just read flags.
23319 static bool hasNonFlagsUse(SDValue Op) {
23320 for (SDUse &Use : Op->uses()) {
23321 SDNode *User = Use.getUser();
23322 unsigned UOpNo = Use.getOperandNo();
23323 if (User->getOpcode() == ISD::TRUNCATE && User->hasOneUse()) {
23324 // Look past truncate.
23325 UOpNo = User->use_begin()->getOperandNo();
23326 User = User->use_begin()->getUser();
23327 }
23328
23329 if (User->getOpcode() != ISD::BRCOND && User->getOpcode() != ISD::SETCC &&
23330 !(User->getOpcode() == ISD::SELECT && UOpNo == 0))
23331 return true;
23332 }
23333 return false;
23334}
23335
23336// Transform to an x86-specific ALU node with flags if there is a chance of
23337// using an RMW op or only the flags are used. Otherwise, leave
23338// the node alone and emit a 'cmp' or 'test' instruction.
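// e.g. an in-memory 'add' (RMW) already sets ZF/SF, so a branch on the result
// can reuse EFLAGS instead of emitting a separate TEST.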
23339 static bool isProfitableToUseFlagOp(SDValue Op) {
23340 for (SDNode *U : Op->users())
23341 if (U->getOpcode() != ISD::CopyToReg &&
23342 U->getOpcode() != ISD::SETCC &&
23343 U->getOpcode() != ISD::STORE)
23344 return false;
23345
23346 return true;
23347}
23348
23349/// Emit nodes that will be selected as "test Op0,Op0", or something
23350/// equivalent.
23351 static SDValue EmitTest(SDValue Op, unsigned X86CC, const SDLoc &dl,
23352 SelectionDAG &DAG, const X86Subtarget &Subtarget) {
23353 // CF and OF aren't always set the way we want. Determine which
23354 // of these we need.
23355 bool NeedCF = false;
23356 bool NeedOF = false;
23357 switch (X86CC) {
23358 default: break;
23359 case X86::COND_A: case X86::COND_AE:
23360 case X86::COND_B: case X86::COND_BE:
23361 NeedCF = true;
23362 break;
23363 case X86::COND_G: case X86::COND_GE:
23364 case X86::COND_L: case X86::COND_LE:
23365 case X86::COND_O: case X86::COND_NO: {
23366 // Check if we really need to set the
23367 // Overflow flag. If NoSignedWrap is present
23368 // that is not actually needed.
23369 switch (Op->getOpcode()) {
23370 case ISD::ADD:
23371 case ISD::SUB:
23372 case ISD::MUL:
23373 case ISD::SHL:
23374 if (Op.getNode()->getFlags().hasNoSignedWrap())
23375 break;
23376 [[fallthrough]];
23377 default:
23378 NeedOF = true;
23379 break;
23380 }
23381 break;
23382 }
23383 }
23384 // See if we can use the EFLAGS value from the operand instead of
23385 // doing a separate TEST. TEST always sets OF and CF to 0, so unless
23386 // we prove that the arithmetic won't overflow, we can't use OF or CF.
23387 if (Op.getResNo() != 0 || NeedOF || NeedCF) {
23388 // Emit a CMP with 0, which is the TEST pattern.
23389 return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op,
23390 DAG.getConstant(0, dl, Op.getValueType()));
23391 }
23392 unsigned Opcode = 0;
23393 unsigned NumOperands = 0;
23394
23395 SDValue ArithOp = Op;
23396
23397 // NOTICE: In the code below we use ArithOp to hold the arithmetic operation
23398 // which may be the result of a CAST. We use the variable 'Op', which is the
23399 // non-casted variable when we check for possible users.
23400 switch (ArithOp.getOpcode()) {
23401 case ISD::AND:
23402 // If the primary 'and' result isn't used, don't bother using X86ISD::AND,
23403 // because a TEST instruction will be better.
23404 if (!hasNonFlagsUse(Op))
23405 break;
23406
23407 [[fallthrough]];
23408 case ISD::ADD:
23409 case ISD::SUB:
23410 case ISD::OR:
23411 case ISD::XOR:
23412 if (!isProfitableToUseFlagOp(Op))
23413 break;
23414
23415 // Otherwise use a regular EFLAGS-setting instruction.
23416 switch (ArithOp.getOpcode()) {
23417 // clang-format off
23418 default: llvm_unreachable("unexpected operator!");
23419 case ISD::ADD: Opcode = X86ISD::ADD; break;
23420 case ISD::SUB: Opcode = X86ISD::SUB; break;
23421 case ISD::XOR: Opcode = X86ISD::XOR; break;
23422 case ISD::AND: Opcode = X86ISD::AND; break;
23423 case ISD::OR: Opcode = X86ISD::OR; break;
23424 // clang-format on
23425 }
23426
23427 NumOperands = 2;
23428 break;
23429 case X86ISD::ADD:
23430 case X86ISD::SUB:
23431 case X86ISD::OR:
23432 case X86ISD::XOR:
23433 case X86ISD::AND:
23434 return SDValue(Op.getNode(), 1);
23435 case ISD::SSUBO:
23436 case ISD::USUBO: {
23437 // USUBO/SSUBO will become an X86ISD::SUB and we can use its Z flag.
23438 SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::i32);
23439 return DAG.getNode(X86ISD::SUB, dl, VTs, Op->getOperand(0),
23440 Op->getOperand(1)).getValue(1);
23441 }
23442 default:
23443 break;
23444 }
23445
23446 if (Opcode == 0) {
23447 // Emit a CMP with 0, which is the TEST pattern.
23448 return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op,
23449 DAG.getConstant(0, dl, Op.getValueType()));
23450 }
23451 SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::i32);
23452 SmallVector<SDValue, 4> Ops(Op->ops().take_front(NumOperands));
23453
23454 SDValue New = DAG.getNode(Opcode, dl, VTs, Ops);
23455 DAG.ReplaceAllUsesOfValueWith(SDValue(Op.getNode(), 0), New);
23456 return SDValue(New.getNode(), 1);
23457}
23458
23459/// Emit nodes that will be selected as "cmp Op0,Op1", or something
23460/// equivalent.
23461 static SDValue EmitCmp(SDValue Op0, SDValue Op1, unsigned X86CC,
23462 const SDLoc &dl, SelectionDAG &DAG,
23463 const X86Subtarget &Subtarget) {
23464 if (isNullConstant(Op1))
23465 return EmitTest(Op0, X86CC, dl, DAG, Subtarget);
23466
23467 EVT CmpVT = Op0.getValueType();
23468
23469 assert((CmpVT == MVT::i8 || CmpVT == MVT::i16 ||
23470 CmpVT == MVT::i32 || CmpVT == MVT::i64) && "Unexpected VT!");
23471
23472 // Only promote the compare up to i32 if it is a 16-bit operation with an
23473 // immediate. 16-bit immediates are to be avoided unless the target isn't
23474 // slowed down by length-changing prefixes, we're optimizing for code size,
23475 // or the comparison is with a folded load.
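// e.g. "cmpw $0x1234, %ax" needs a 0x66 prefix that changes the length of the
// immediate (a length-changing prefix), which stalls the pre-decoder on many
// Intel cores; "cmpl $0x1234, %eax" does not.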
23476 if (CmpVT == MVT::i16 && !Subtarget.hasFastImm16() &&
23477 !X86::mayFoldLoad(Op0, Subtarget) && !X86::mayFoldLoad(Op1, Subtarget) &&
23478 !DAG.getMachineFunction().getFunction().hasMinSize()) {
23479 auto *COp0 = dyn_cast<ConstantSDNode>(Op0);
23480 auto *COp1 = dyn_cast<ConstantSDNode>(Op1);
23481 // Don't do this if the immediate can fit in 8-bits.
23482 if ((COp0 && !COp0->getAPIntValue().isSignedIntN(8)) ||
23483 (COp1 && !COp1->getAPIntValue().isSignedIntN(8))) {
23484 unsigned ExtendOp =
23485 isX86CCSigned(X86CC) ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
23486 if (X86CC == X86::COND_E || X86CC == X86::COND_NE) {
23487 // For equality comparisons try to use SIGN_EXTEND if the input was
23488 // truncate from something with enough sign bits.
23489 if (Op0.getOpcode() == ISD::TRUNCATE) {
23490 if (DAG.ComputeMaxSignificantBits(Op0.getOperand(0)) <= 16)
23491 ExtendOp = ISD::SIGN_EXTEND;
23492 } else if (Op1.getOpcode() == ISD::TRUNCATE) {
23493 if (DAG.ComputeMaxSignificantBits(Op1.getOperand(0)) <= 16)
23494 ExtendOp = ISD::SIGN_EXTEND;
23495 }
23496 }
23497
23498 CmpVT = MVT::i32;
23499 Op0 = DAG.getNode(ExtendOp, dl, CmpVT, Op0);
23500 Op1 = DAG.getNode(ExtendOp, dl, CmpVT, Op1);
23501 }
23502 }
23503
23504 // Try to shrink i64 compares if the input has enough zero bits.
23505 if (CmpVT == MVT::i64 && !isX86CCSigned(X86CC) &&
23506 Op0.hasOneUse() && // Hacky way to not break CSE opportunities with sub.
23507 DAG.MaskedValueIsZero(Op1, APInt::getHighBitsSet(64, 32)) &&
23508 DAG.MaskedValueIsZero(Op0, APInt::getHighBitsSet(64, 32))) {
23509 CmpVT = MVT::i32;
23510 Op0 = DAG.getNode(ISD::TRUNCATE, dl, CmpVT, Op0);
23511 Op1 = DAG.getNode(ISD::TRUNCATE, dl, CmpVT, Op1);
23512 }
23513
23514 // Try to shrink all i64 compares if the inputs are representable as signed
23515 // i32.
23516 if (CmpVT == MVT::i64 &&
23517 Op0.hasOneUse() && // Hacky way to not break CSE opportunities with sub.
23518 DAG.ComputeNumSignBits(Op1) > 32 && DAG.ComputeNumSignBits(Op0) > 32) {
23519 CmpVT = MVT::i32;
23520 Op0 = DAG.getNode(ISD::TRUNCATE, dl, CmpVT, Op0);
23521 Op1 = DAG.getNode(ISD::TRUNCATE, dl, CmpVT, Op1);
23522 }
23523
23524 // 0-x == y --> x+y == 0
23525 // 0-x != y --> x+y != 0
23526 if (Op0.getOpcode() == ISD::SUB && isNullConstant(Op0.getOperand(0)) &&
23527 Op0.hasOneUse() && (X86CC == X86::COND_E || X86CC == X86::COND_NE)) {
23528 SDVTList VTs = DAG.getVTList(CmpVT, MVT::i32);
23529 SDValue Add = DAG.getNode(X86ISD::ADD, dl, VTs, Op0.getOperand(1), Op1);
23530 return Add.getValue(1);
23531 }
23532
23533 // x == 0-y --> x+y == 0
23534 // x != 0-y --> x+y != 0
23535 if (Op1.getOpcode() == ISD::SUB && isNullConstant(Op1.getOperand(0)) &&
23536 Op1.hasOneUse() && (X86CC == X86::COND_E || X86CC == X86::COND_NE)) {
23537 SDVTList VTs = DAG.getVTList(CmpVT, MVT::i32);
23538 SDValue Add = DAG.getNode(X86ISD::ADD, dl, VTs, Op0, Op1.getOperand(1));
23539 return Add.getValue(1);
23540 }
23541
23542 // If we already have an XOR of the ops, use that to check for equality.
23543 // Else use SUB instead of CMP to enable CSE between SUB and CMP.
23544 unsigned X86Opc = X86ISD::SUB;
23545 if ((X86CC == X86::COND_E || X86CC == X86::COND_NE) &&
23546 (DAG.doesNodeExist(ISD::XOR, DAG.getVTList({CmpVT}), {Op0, Op1}) ||
23547 DAG.doesNodeExist(ISD::XOR, DAG.getVTList({CmpVT}), {Op1, Op0})))
23548 X86Opc = X86ISD::XOR;
23549
23550 SDVTList VTs = DAG.getVTList(CmpVT, MVT::i32);
23551 SDValue CmpOp = DAG.getNode(X86Opc, dl, VTs, Op0, Op1);
23552 return CmpOp.getValue(1);
23553}
23554
23559
23560bool X86TargetLowering::optimizeFMulOrFDivAsShiftAddBitcast(
23561 SDNode *N, SDValue, SDValue IntPow2) const {
23562 if (N->getOpcode() == ISD::FDIV)
23563 return true;
23564
23565 EVT FPVT = N->getValueType(0);
23566 EVT IntVT = IntPow2.getValueType();
23567
23568 // This indicates a non-free bitcast.
23569 // TODO: This is probably overly conservative as we will need to scale the
23570 // integer vector anyways for the int->fp cast.
23571 if (FPVT.isVector() &&
23572 FPVT.getScalarSizeInBits() != IntVT.getScalarSizeInBits())
23573 return false;
23574
23575 return true;
23576}
23577
23578/// Check if replacement of SQRT with RSQRT should be disabled.
23579bool X86TargetLowering::isFsqrtCheap(SDValue Op, SelectionDAG &DAG) const {
23580 EVT VT = Op.getValueType();
23581
23582 // We don't need to replace SQRT with RSQRT for half type.
23583 if (VT.getScalarType() == MVT::f16)
23584 return true;
23585
23586 // We never want to use both SQRT and RSQRT instructions for the same input.
23587 if (DAG.doesNodeExist(X86ISD::FRSQRT, DAG.getVTList(VT), Op))
23588 return false;
23589
23590 if (VT.isVector())
23591 return Subtarget.hasFastVectorFSQRT();
23592 return Subtarget.hasFastScalarFSQRT();
23593}
23594
23595/// The minimum architected relative accuracy is 2^-12. We need one
23596/// Newton-Raphson step to have a good float result (24 bits of precision).
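/// Each Newton-Raphson step refines the estimate roughly as:
///   Est' = Est * (1.5 - 0.5 * Op * Est * Est)
/// (for sqrt itself, rather than rsqrt, the final estimate is additionally
/// multiplied by Op).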
23597SDValue X86TargetLowering::getSqrtEstimate(SDValue Op,
23598 SelectionDAG &DAG, int Enabled,
23599 int &RefinementSteps,
23600 bool &UseOneConstNR,
23601 bool Reciprocal) const {
23602 SDLoc DL(Op);
23603 EVT VT = Op.getValueType();
23604
23605 // SSE1 has rsqrtss and rsqrtps. AVX adds a 256-bit variant for rsqrtps.
23606 // It is likely not profitable to do this for f64 because a double-precision
23607 // rsqrt estimate with refinement on x86 prior to FMA requires at least 16
23608 // instructions: convert to single, rsqrtss, convert back to double, refine
23609 // (3 steps = at least 13 insts). If an 'rsqrtsd' variant was added to the ISA
23610 // along with FMA, this could be a throughput win.
23611 // TODO: SQRT requires SSE2 to prevent the introduction of an illegal v4i32
23612 // after legalize types.
23613 if ((VT == MVT::f32 && Subtarget.hasSSE1()) ||
23614 (VT == MVT::v4f32 && Subtarget.hasSSE1() && Reciprocal) ||
23615 (VT == MVT::v4f32 && Subtarget.hasSSE2() && !Reciprocal) ||
23616 (VT == MVT::v8f32 && Subtarget.hasAVX()) ||
23617 (VT == MVT::v16f32 && Subtarget.useAVX512Regs())) {
23618 if (RefinementSteps == ReciprocalEstimate::Unspecified)
23619 RefinementSteps = 1;
23620
23621 UseOneConstNR = false;
23622 // There is no FSQRT for 512-bits, but there is RSQRT14.
23623 unsigned Opcode = VT == MVT::v16f32 ? X86ISD::RSQRT14 : X86ISD::FRSQRT;
23624 SDValue Estimate = DAG.getNode(Opcode, DL, VT, Op);
23625 if (RefinementSteps == 0 && !Reciprocal)
23626 Estimate = DAG.getNode(ISD::FMUL, DL, VT, Op, Estimate);
23627 return Estimate;
23628 }
23629
23630 if (VT.getScalarType() == MVT::f16 && isTypeLegal(VT) &&
23631 Subtarget.hasFP16()) {
23632 assert(Reciprocal && "Don't replace SQRT with RSQRT for half type");
23633 if (RefinementSteps == ReciprocalEstimate::Unspecified)
23634 RefinementSteps = 0;
23635
23636 if (VT == MVT::f16) {
23638 SDValue Undef = DAG.getUNDEF(MVT::v8f16);
23639 Op = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v8f16, Op);
23640 Op = DAG.getNode(X86ISD::RSQRT14S, DL, MVT::v8f16, Undef, Op);
23641 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f16, Op, Zero);
23642 }
23643
23644 return DAG.getNode(X86ISD::RSQRT14, DL, VT, Op);
23645 }
23646 return SDValue();
23647}
23648
23649/// The minimum architected relative accuracy is 2^-12. We need one
23650/// Newton-Raphson step to have a good float result (24 bits of precision).
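/// Each Newton-Raphson step refines the estimate roughly as:
///   Est' = Est * (2.0 - Op * Est)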
23651SDValue X86TargetLowering::getRecipEstimate(SDValue Op, SelectionDAG &DAG,
23652 int Enabled,
23653 int &RefinementSteps) const {
23654 SDLoc DL(Op);
23655 EVT VT = Op.getValueType();
23656
23657 // SSE1 has rcpss and rcpps. AVX adds a 256-bit variant for rcpps.
23658 // It is likely not profitable to do this for f64 because a double-precision
23659 // reciprocal estimate with refinement on x86 prior to FMA requires
23660 // 15 instructions: convert to single, rcpss, convert back to double, refine
23661 // (3 steps = 12 insts). If an 'rcpsd' variant was added to the ISA
23662 // along with FMA, this could be a throughput win.
23663
23664 if ((VT == MVT::f32 && Subtarget.hasSSE1()) ||
23665 (VT == MVT::v4f32 && Subtarget.hasSSE1()) ||
23666 (VT == MVT::v8f32 && Subtarget.hasAVX()) ||
23667 (VT == MVT::v16f32 && Subtarget.useAVX512Regs())) {
23668 // Enable estimate codegen with 1 refinement step for vector division.
23669 // Scalar division estimates are disabled because they break too much
23670 // real-world code. These defaults are intended to match GCC behavior.
23671 if (VT == MVT::f32 && Enabled == ReciprocalEstimate::Unspecified)
23672 return SDValue();
23673
23674 if (RefinementSteps == ReciprocalEstimate::Unspecified)
23675 RefinementSteps = 1;
23676
23677 // There is no FSQRT for 512-bits, but there is RCP14.
23678 unsigned Opcode = VT == MVT::v16f32 ? X86ISD::RCP14 : X86ISD::FRCP;
23679 return DAG.getNode(Opcode, DL, VT, Op);
23680 }
23681
23682 if (VT.getScalarType() == MVT::f16 && isTypeLegal(VT) &&
23683 Subtarget.hasFP16()) {
23684 if (RefinementSteps == ReciprocalEstimate::Unspecified)
23685 RefinementSteps = 0;
23686
23687 if (VT == MVT::f16) {
23689 SDValue Undef = DAG.getUNDEF(MVT::v8f16);
23690 Op = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v8f16, Op);
23691 Op = DAG.getNode(X86ISD::RCP14S, DL, MVT::v8f16, Undef, Op);
23692 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f16, Op, Zero);
23693 }
23694
23695 return DAG.getNode(X86ISD::RCP14, DL, VT, Op);
23696 }
23697 return SDValue();
23698}
23699
23700/// If we have at least two divisions that use the same divisor, convert to
23701/// multiplication by a reciprocal. This may need to be adjusted for a given
23702/// CPU if a division's cost is not at least twice the cost of a multiplication.
23703/// This is because we still need one division to calculate the reciprocal and
23704/// then we need two multiplies by that reciprocal as replacements for the
23705/// original divisions.
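/// The returned 2 is the minimum number of divisions by the same divisor
/// before the combine fires: e.g. "a/d; b/d" becomes "t = 1.0/d; a*t; b*t",
/// one divide plus two multiplies instead of two divides.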
23706 unsigned X86TargetLowering::combineRepeatedFPDivisors() const {
23707 return 2;
23708}
23709
23710SDValue
23711X86TargetLowering::BuildSDIVPow2(SDNode *N, const APInt &Divisor,
23712 SelectionDAG &DAG,
23713 SmallVectorImpl<SDNode *> &Created) const {
23714 AttributeList Attr = DAG.getMachineFunction().getFunction().getAttributes();
23715 if (isIntDivCheap(N->getValueType(0), Attr))
23716 return SDValue(N,0); // Lower SDIV as SDIV
23717
23718 assert((Divisor.isPowerOf2() || Divisor.isNegatedPowerOf2()) &&
23719 "Unexpected divisor!");
23720
23721 // Only perform this transform if CMOV is supported otherwise the select
23722 // below will become a branch.
23723 if (!Subtarget.canUseCMOV())
23724 return SDValue();
23725
23726 // fold (sdiv X, pow2)
23727 EVT VT = N->getValueType(0);
23728 // FIXME: Support i8.
23729 if (VT != MVT::i16 && VT != MVT::i32 &&
23730 !(Subtarget.is64Bit() && VT == MVT::i64))
23731 return SDValue();
23732
23733 // If the divisor is 2 or -2, the default expansion is better.
23734 if (Divisor == 2 ||
23735 Divisor == APInt(Divisor.getBitWidth(), -2, /*isSigned*/ true))
23736 return SDValue();
23737
23738 return TargetLowering::buildSDIVPow2WithCMov(N, Divisor, DAG, Created);
23739}
23740
23741/// Result of 'and' is compared against zero. Change to a BT node if possible.
23742/// Returns the BT node and the condition code needed to use it.
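/// e.g. ((x >> n) & 1) != 0 and (x & (1 << n)) != 0 both become "BT x, n";
/// the tested bit lands in CF, so the condition becomes COND_B (for SETNE)
/// or COND_AE (for SETEQ).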
23743 static SDValue LowerAndToBT(SDValue And, ISD::CondCode CC, const SDLoc &dl,
23744 SelectionDAG &DAG, X86::CondCode &X86CC) {
23745 assert(And.getOpcode() == ISD::AND && "Expected AND node!");
23746 SDValue Op0 = And.getOperand(0);
23747 SDValue Op1 = And.getOperand(1);
23748 if (Op0.getOpcode() == ISD::TRUNCATE)
23749 Op0 = Op0.getOperand(0);
23750 if (Op1.getOpcode() == ISD::TRUNCATE)
23751 Op1 = Op1.getOperand(0);
23752
23753 SDValue Src, BitNo;
23754 if (Op1.getOpcode() == ISD::SHL)
23755 std::swap(Op0, Op1);
23756 if (Op0.getOpcode() == ISD::SHL) {
23757 if (isOneConstant(Op0.getOperand(0))) {
23758 // If we looked past a truncate, check that it's only truncating away
23759 // known zeros.
23760 unsigned BitWidth = Op0.getValueSizeInBits();
23761 unsigned AndBitWidth = And.getValueSizeInBits();
23762 if (BitWidth > AndBitWidth) {
23763 KnownBits Known = DAG.computeKnownBits(Op0);
23764 if (Known.countMinLeadingZeros() < BitWidth - AndBitWidth)
23765 return SDValue();
23766 }
23767 Src = Op1;
23768 BitNo = Op0.getOperand(1);
23769 }
23770 } else if (Op1.getOpcode() == ISD::Constant) {
23771 ConstantSDNode *AndRHS = cast<ConstantSDNode>(Op1);
23772 uint64_t AndRHSVal = AndRHS->getZExtValue();
23773 SDValue AndLHS = Op0;
23774
23775 if (AndRHSVal == 1 && AndLHS.getOpcode() == ISD::SRL) {
23776 Src = AndLHS.getOperand(0);
23777 BitNo = AndLHS.getOperand(1);
23778 } else {
23779 // Use BT if the immediate can't be encoded in a TEST instruction or we
23780 // are optimizing for size and the immediate won't fit in a byte.
23781 bool OptForSize = DAG.shouldOptForSize();
23782 if ((!isUInt<32>(AndRHSVal) || (OptForSize && !isUInt<8>(AndRHSVal))) &&
23783 isPowerOf2_64(AndRHSVal)) {
23784 Src = AndLHS;
23785 BitNo = DAG.getConstant(Log2_64_Ceil(AndRHSVal), dl,
23786 Src.getValueType());
23787 }
23788 }
23789 }
23790
23791 // No patterns found, give up.
23792 if (!Src.getNode())
23793 return SDValue();
23794
23795 // Remove any bit flip.
23796 if (isBitwiseNot(Src)) {
23797 Src = Src.getOperand(0);
23798 CC = CC == ISD::SETEQ ? ISD::SETNE : ISD::SETEQ;
23799 }
23800
23801 // Attempt to create the X86ISD::BT node.
23802 if (SDValue BT = getBT(Src, BitNo, dl, DAG)) {
23803 X86CC = CC == ISD::SETEQ ? X86::COND_AE : X86::COND_B;
23804 return BT;
23805 }
23806
23807 return SDValue();
23808}
23809
23810// Check if pre-AVX condcode can be performed by a single FCMP op.
23811static bool cheapX86FSETCC_SSE(ISD::CondCode SetCCOpcode) {
23812 return (SetCCOpcode != ISD::SETONE) && (SetCCOpcode != ISD::SETUEQ);
23813}
23814
23815/// Turns an ISD::CondCode into a value suitable for SSE floating-point mask
23816/// CMPs.
23817static unsigned translateX86FSETCC(ISD::CondCode SetCCOpcode, SDValue &Op0,
23818 SDValue &Op1, bool &IsAlwaysSignaling) {
23819 unsigned SSECC;
23820 bool Swap = false;
23821
23822 // SSE Condition code mapping:
23823 // 0 - EQ
23824 // 1 - LT
23825 // 2 - LE
23826 // 3 - UNORD
23827 // 4 - NEQ
23828 // 5 - NLT
23829 // 6 - NLE
23830 // 7 - ORD
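// Immediates 8 (EQ_UQ) and 12 (NEQ_OQ) only have AVX encodings; pre-AVX,
// callers check cheapX86FSETCC_SSE() and expand SETUEQ/SETONE into two
// compares combined with FOR/FAND.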
23831 switch (SetCCOpcode) {
23832 // clang-format off
23833 default: llvm_unreachable("Unexpected SETCC condition");
23834 case ISD::SETOEQ:
23835 case ISD::SETEQ: SSECC = 0; break;
23836 case ISD::SETOGT:
23837 case ISD::SETGT: Swap = true; [[fallthrough]];
23838 case ISD::SETLT:
23839 case ISD::SETOLT: SSECC = 1; break;
23840 case ISD::SETOGE:
23841 case ISD::SETGE: Swap = true; [[fallthrough]];
23842 case ISD::SETLE:
23843 case ISD::SETOLE: SSECC = 2; break;
23844 case ISD::SETUO: SSECC = 3; break;
23845 case ISD::SETUNE:
23846 case ISD::SETNE: SSECC = 4; break;
23847 case ISD::SETULE: Swap = true; [[fallthrough]];
23848 case ISD::SETUGE: SSECC = 5; break;
23849 case ISD::SETULT: Swap = true; [[fallthrough]];
23850 case ISD::SETUGT: SSECC = 6; break;
23851 case ISD::SETO: SSECC = 7; break;
23852 case ISD::SETUEQ: SSECC = 8; break;
23853 case ISD::SETONE: SSECC = 12; break;
23854 // clang-format on
23855 }
23856 if (Swap)
23857 std::swap(Op0, Op1);
23858
23859 switch (SetCCOpcode) {
23860 default:
23861 IsAlwaysSignaling = true;
23862 break;
23863 case ISD::SETEQ:
23864 case ISD::SETOEQ:
23865 case ISD::SETUEQ:
23866 case ISD::SETNE:
23867 case ISD::SETONE:
23868 case ISD::SETUNE:
23869 case ISD::SETO:
23870 case ISD::SETUO:
23871 IsAlwaysSignaling = false;
23872 break;
23873 }
23874
23875 return SSECC;
23876}
23877
23878/// Break a VSETCC 256/512-bit vector into two new 128/256 ones and then
23879/// concatenate the result back.
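/// e.g. a v8i32 compare on a target without AVX2 becomes two v4i32 compares
/// whose results are concatenated back into a v8i32.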
23880 static SDValue splitVSETCC(EVT VT, SDValue LHS, SDValue RHS, ISD::CondCode Cond,
23881 SelectionDAG &DAG, const SDLoc &dl) {
23882 assert(VT.isInteger() && LHS.getValueType() == RHS.getValueType() &&
23883 "Unsupported VTs!");
23884 SDValue CC = DAG.getCondCode(Cond);
23885
23886 // Extract the LHS Lo/Hi vectors
23887 SDValue LHS1, LHS2;
23888 std::tie(LHS1, LHS2) = splitVector(LHS, DAG, dl);
23889
23890 // Extract the RHS Lo/Hi vectors
23891 SDValue RHS1, RHS2;
23892 std::tie(RHS1, RHS2) = splitVector(RHS, DAG, dl);
23893
23894 // Issue the operation on the smaller types and concatenate the result back
23895 EVT LoVT, HiVT;
23896 std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(VT);
23897 return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT,
23898 DAG.getNode(ISD::SETCC, dl, LoVT, LHS1, RHS1, CC),
23899 DAG.getNode(ISD::SETCC, dl, HiVT, LHS2, RHS2, CC));
23900}
23901
23902 static SDValue LowerIntVSETCC_AVX512(SDValue Op, const SDLoc &dl,
23903 SelectionDAG &DAG) {
23904 SDValue Op0 = Op.getOperand(0);
23905 SDValue Op1 = Op.getOperand(1);
23906 SDValue CC = Op.getOperand(2);
23907 MVT VT = Op.getSimpleValueType();
23908 assert(VT.getVectorElementType() == MVT::i1 &&
23909 "Cannot set masked compare for this operation");
23910
23911 ISD::CondCode SetCCOpcode = cast<CondCodeSDNode>(CC)->get();
23912
23913 // Prefer SETGT over SETLT.
23914 if (SetCCOpcode == ISD::SETLT) {
23915 SetCCOpcode = ISD::getSetCCSwappedOperands(SetCCOpcode);
23916 std::swap(Op0, Op1);
23917 }
23918
23919 return DAG.getSetCC(dl, VT, Op0, Op1, SetCCOpcode);
23920}
23921
23922/// Given a buildvector constant, return a new vector constant with each element
23923/// incremented or decremented. If incrementing or decrementing would result in
23924/// unsigned overflow or underflow or this is not a simple vector constant,
23925/// return an empty value.
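/// e.g. <4 x i32> <1, 2, 3, 4> incremented gives <2, 3, 4, 5>, while
/// decrementing <0, 1, 2, 3> returns an empty SDValue (it would underflow).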
23926 static SDValue incDecVectorConstant(SDValue V, SelectionDAG &DAG, bool IsInc,
23927 bool NSW) {
23928 auto *BV = dyn_cast<BuildVectorSDNode>(V.getNode());
23929 if (!BV || !V.getValueType().isSimple())
23930 return SDValue();
23931
23932 MVT VT = V.getSimpleValueType();
23933 MVT EltVT = VT.getVectorElementType();
23934 unsigned NumElts = VT.getVectorNumElements();
23935 SmallVector<SDValue, 8> NewVecC;
23936 SDLoc DL(V);
23937 for (unsigned i = 0; i < NumElts; ++i) {
23938 auto *Elt = dyn_cast<ConstantSDNode>(BV->getOperand(i));
23939 if (!Elt || Elt->isOpaque() || Elt->getSimpleValueType(0) != EltVT)
23940 return SDValue();
23941
23942 // Avoid overflow/underflow.
23943 const APInt &EltC = Elt->getAPIntValue();
23944 if ((IsInc && EltC.isMaxValue()) || (!IsInc && EltC.isZero()))
23945 return SDValue();
23946 if (NSW && ((IsInc && EltC.isMaxSignedValue()) ||
23947 (!IsInc && EltC.isMinSignedValue())))
23948 return SDValue();
23949
23950 NewVecC.push_back(DAG.getConstant(EltC + (IsInc ? 1 : -1), DL, EltVT));
23951 }
23952
23953 return DAG.getBuildVector(VT, DL, NewVecC);
23954}
23955
23956/// As another special case, use PSUBUS[BW] when it's profitable. E.g. for
23957/// Op0 u<= Op1:
23958/// t = psubus Op0, Op1
23959/// pcmpeq t, <0..0>
23960 static SDValue LowerVSETCCWithSUBUS(SDValue Op0, SDValue Op1, MVT VT,
23961 ISD::CondCode Cond, const SDLoc &dl,
23962 const X86Subtarget &Subtarget,
23963 SelectionDAG &DAG) {
23964 if (!Subtarget.hasSSE2())
23965 return SDValue();
23966
23967 MVT VET = VT.getVectorElementType();
23968 if (VET != MVT::i8 && VET != MVT::i16)
23969 return SDValue();
23970
23971 switch (Cond) {
23972 default:
23973 return SDValue();
23974 case ISD::SETULT: {
23975 // If the comparison is against a constant we can turn this into a
23976 // setule. With psubus, setule does not require a swap. This is
23977 // beneficial because the constant in the register is no longer
23978 // clobbered as the destination, so it can be hoisted out of a loop.
23979 // Only do this pre-AVX, since AVX's vpcmp* is no longer destructive.
23980 if (Subtarget.hasAVX())
23981 return SDValue();
23982 SDValue ULEOp1 =
23983 incDecVectorConstant(Op1, DAG, /*IsInc*/ false, /*NSW*/ false);
23984 if (!ULEOp1)
23985 return SDValue();
23986 Op1 = ULEOp1;
23987 break;
23988 }
23989 case ISD::SETUGT: {
23990 // If the comparison is against a constant, we can turn this into a setuge.
23991 // This is beneficial because materializing a constant 0 for the PCMPEQ is
23992 // probably cheaper than XOR+PCMPGT using 2 different vector constants:
23993 // cmpgt (xor X, SignMaskC) CmpC --> cmpeq (usubsat (CmpC+1), X), 0
23994 SDValue UGEOp1 =
23995 incDecVectorConstant(Op1, DAG, /*IsInc*/ true, /*NSW*/ false);
23996 if (!UGEOp1)
23997 return SDValue();
23998 Op1 = Op0;
23999 Op0 = UGEOp1;
24000 break;
24001 }
24002 // Psubus is better than flip-sign because it requires no inversion.
24003 case ISD::SETUGE:
24004 std::swap(Op0, Op1);
24005 break;
24006 case ISD::SETULE:
24007 break;
24008 }
24009
24010 SDValue Result = DAG.getNode(ISD::USUBSAT, dl, VT, Op0, Op1);
24011 return DAG.getNode(X86ISD::PCMPEQ, dl, VT, Result,
24012 DAG.getConstant(0, dl, VT));
24013}
24014
24015static SDValue LowerVSETCC(SDValue Op, const X86Subtarget &Subtarget,
24016 SelectionDAG &DAG) {
24017 bool IsStrict = Op.getOpcode() == ISD::STRICT_FSETCC ||
24018 Op.getOpcode() == ISD::STRICT_FSETCCS;
24019 SDValue Op0 = Op.getOperand(IsStrict ? 1 : 0);
24020 SDValue Op1 = Op.getOperand(IsStrict ? 2 : 1);
24021 SDValue CC = Op.getOperand(IsStrict ? 3 : 2);
24022 MVT VT = Op->getSimpleValueType(0);
24023 ISD::CondCode Cond = cast<CondCodeSDNode>(CC)->get();
24024 MVT OpVT = Op0.getSimpleValueType();
24025 SDLoc dl(Op);
24026
24027 if (OpVT.isFloatingPoint()) {
24028 MVT EltVT = OpVT.getVectorElementType();
24029 assert(EltVT == MVT::bf16 || EltVT == MVT::f16 || EltVT == MVT::f32 ||
24030 EltVT == MVT::f64);
24031
24032 SDValue Chain = IsStrict ? Op.getOperand(0) : SDValue();
24033 if (isSoftF16(EltVT, Subtarget)) {
24034 if (Subtarget.hasAVX512() && !Subtarget.hasVLX())
24035 return SDValue();
24036
24037 // Break 256-bit FP vector compare into smaller ones.
24038 if (OpVT.is256BitVector() && !Subtarget.useAVX512Regs())
24039 return splitVSETCC(VT, Op0, Op1, Cond, DAG, dl);
24040
24041 // Break 512-bit FP vector compare into smaller ones.
24042 if (OpVT.is512BitVector())
24043 return splitVSETCC(VT, Op0, Op1, Cond, DAG, dl);
24044
24045 MVT NVT = OpVT.changeVectorElementType(MVT::f32);
24046 if (IsStrict) {
24047 Op0 = DAG.getNode(ISD::STRICT_FP_EXTEND, dl, {NVT, MVT::Other},
24048 {Chain, Op0});
24049 Op1 = DAG.getNode(ISD::STRICT_FP_EXTEND, dl, {NVT, MVT::Other},
24050 {Chain, Op1});
24051 return DAG.getNode(Op.getOpcode(), dl, {VT, MVT::Other},
24052 {Chain, Op0, Op1, CC});
24053 }
24054 MVT DVT = VT.getVectorElementType() == MVT::i16
24055 ? VT.changeVectorElementType(MVT::i32)
24056 : VT;
24057 SDValue Cmp = DAG.getNode(Op.getOpcode(), dl, DVT,
24058 DAG.getNode(ISD::FP_EXTEND, dl, NVT, Op0),
24059 DAG.getNode(ISD::FP_EXTEND, dl, NVT, Op1), CC);
24060 return DVT == VT ? Cmp : DAG.getNode(ISD::TRUNCATE, dl, VT, Cmp);
24061 }
24062
24063 bool IsSignaling = Op.getOpcode() == ISD::STRICT_FSETCCS;
24064
24065 // If we have a strict compare with a vXi1 result and the input is 128/256
24066 // bits we can't use a masked compare unless we have VLX. If we use a wider
24067 // compare like we do for non-strict, we might trigger spurious exceptions
24068 // from the upper elements. Instead emit an AVX compare and convert to a mask.
24069 unsigned Opc;
24070 if (Subtarget.hasAVX512() && VT.getVectorElementType() == MVT::i1 &&
24071 (!IsStrict || Subtarget.hasVLX() ||
24072 Op0.getSimpleValueType().is512BitVector())) {
24073 #ifndef NDEBUG
24074 unsigned Num = VT.getVectorNumElements();
24075 assert(Num <= 16 ||
24076 (Num == 32 && (EltVT == MVT::f16 || EltVT == MVT::bf16)));
24077#endif
24078 Opc = IsStrict ? X86ISD::STRICT_CMPM : X86ISD::CMPM;
24079 } else {
24080 Opc = IsStrict ? X86ISD::STRICT_CMPP : X86ISD::CMPP;
24081 // The SSE/AVX packed FP comparison nodes are defined with a
24082 // floating-point vector result that matches the operand type. This allows
24083 // them to work with an SSE1 target (integer vector types are not legal).
24084 VT = Op0.getSimpleValueType();
24085 }
24086
24087 SDValue Cmp;
24088 bool IsAlwaysSignaling;
24089 unsigned SSECC = translateX86FSETCC(Cond, Op0, Op1, IsAlwaysSignaling);
24090 if (!Subtarget.hasAVX()) {
24091 // TODO: We could use following steps to handle a quiet compare with
24092 // signaling encodings.
24093 // 1. Get ordered masks from a quiet ISD::SETO
24094 // 2. Use the masks to mask potential unordered elements in operand A, B
24095 // 3. Get the compare results of masked A, B
24096 // 4. Calculate the final result using the mask and the result from 3
24097 // But currently, we just fall back to scalar operations.
24098 if (IsStrict && IsAlwaysSignaling && !IsSignaling)
24099 return SDValue();
24100
24101 // Insert an extra signaling instruction to raise exception.
24102 if (IsStrict && !IsAlwaysSignaling && IsSignaling) {
24103 SDValue SignalCmp = DAG.getNode(
24104 Opc, dl, {VT, MVT::Other},
24105 {Chain, Op0, Op1, DAG.getTargetConstant(1, dl, MVT::i8)}); // LT_OS
24106 // FIXME: It seems we need to update the flags of all new strict nodes.
24107 // Otherwise, mayRaiseFPException in MI will return false due to
24108 // NoFPExcept = false by default. However, I didn't find it in other
24109 // patches.
24110 SignalCmp->setFlags(Op->getFlags());
24111 Chain = SignalCmp.getValue(1);
24112 }
24113
24114 // In the two cases not handled by SSE compare predicates (SETUEQ/SETONE),
24115 // emit two comparisons and a logic op to tie them together.
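// e.g. x ueq y -> or(cmpunordps(x,y), cmpeqps(x,y))
//      x one y -> and(cmpordps(x,y),  cmpneqps(x,y))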
24116 if (!cheapX86FSETCC_SSE(Cond)) {
24117 // LLVM predicate is SETUEQ or SETONE.
24118 unsigned CC0, CC1;
24119 unsigned CombineOpc;
24120 if (Cond == ISD::SETUEQ) {
24121 CC0 = 3; // UNORD
24122 CC1 = 0; // EQ
24123 CombineOpc = X86ISD::FOR;
24124 } else {
24125 assert(Cond == ISD::SETONE);
24126 CC0 = 7; // ORD
24127 CC1 = 4; // NEQ
24128 CombineOpc = X86ISD::FAND;
24129 }
24130
24131 SDValue Cmp0, Cmp1;
24132 if (IsStrict) {
24133 Cmp0 = DAG.getNode(
24134 Opc, dl, {VT, MVT::Other},
24135 {Chain, Op0, Op1, DAG.getTargetConstant(CC0, dl, MVT::i8)});
24136 Cmp1 = DAG.getNode(
24137 Opc, dl, {VT, MVT::Other},
24138 {Chain, Op0, Op1, DAG.getTargetConstant(CC1, dl, MVT::i8)});
24139 Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Cmp0.getValue(1),
24140 Cmp1.getValue(1));
24141 } else {
24142 Cmp0 = DAG.getNode(
24143 Opc, dl, VT, Op0, Op1, DAG.getTargetConstant(CC0, dl, MVT::i8));
24144 Cmp1 = DAG.getNode(
24145 Opc, dl, VT, Op0, Op1, DAG.getTargetConstant(CC1, dl, MVT::i8));
24146 }
24147 Cmp = DAG.getNode(CombineOpc, dl, VT, Cmp0, Cmp1);
24148 } else {
24149 if (IsStrict) {
24150 Cmp = DAG.getNode(
24151 Opc, dl, {VT, MVT::Other},
24152 {Chain, Op0, Op1, DAG.getTargetConstant(SSECC, dl, MVT::i8)});
24153 Chain = Cmp.getValue(1);
24154 } else
24155 Cmp = DAG.getNode(
24156 Opc, dl, VT, Op0, Op1, DAG.getTargetConstant(SSECC, dl, MVT::i8));
24157 }
24158 } else {
24159 // Handle all other FP comparisons here.
24160 if (IsStrict) {
24161 // Make a flip on already signaling CCs before setting bit 4 of AVX CC.
24162 SSECC |= (IsAlwaysSignaling ^ IsSignaling) << 4;
24163 Cmp = DAG.getNode(
24164 Opc, dl, {VT, MVT::Other},
24165 {Chain, Op0, Op1, DAG.getTargetConstant(SSECC, dl, MVT::i8)});
24166 Chain = Cmp.getValue(1);
24167 } else
24168 Cmp = DAG.getNode(
24169 Opc, dl, VT, Op0, Op1, DAG.getTargetConstant(SSECC, dl, MVT::i8));
24170 }
24171
24172 if (VT.getFixedSizeInBits() >
24173 Op.getSimpleValueType().getFixedSizeInBits()) {
24174 // We emitted a compare with an XMM/YMM result. Finish converting to a
24175 // mask register using a vptestm.
24177 Cmp = DAG.getBitcast(CastVT, Cmp);
24178 Cmp = DAG.getSetCC(dl, Op.getSimpleValueType(), Cmp,
24179 DAG.getConstant(0, dl, CastVT), ISD::SETNE);
24180 } else {
24181 // If this is SSE/AVX CMPP, bitcast the result back to integer to match
24182 // the result type of SETCC. The bitcast is expected to be optimized
24183 // away during combining/isel.
24184 Cmp = DAG.getBitcast(Op.getSimpleValueType(), Cmp);
24185 }
24186
24187 if (IsStrict)
24188 return DAG.getMergeValues({Cmp, Chain}, dl);
24189
24190 return Cmp;
24191 }
24192
24193 assert(!IsStrict && "Strict SETCC only handles FP operands.");
24194
24195 [[maybe_unused]] MVT VTOp0 = Op0.getSimpleValueType();
24196 assert(VTOp0 == Op1.getSimpleValueType() &&
24197 "Expected operands with same type!");
24198 assert(VT.getVectorNumElements() == VTOp0.getVectorNumElements() &&
24199 "Invalid number of packed elements for source and destination!");
24200
24201 // The non-AVX512 code below works under the assumption that source and
24202 // destination types are the same.
24203 assert((Subtarget.hasAVX512() || (VT == VTOp0)) &&
24204 "Value types for source and destination must be the same!");
24205
24206 // The result is boolean, but operands are int/float
24207 if (VT.getVectorElementType() == MVT::i1) {
24208 // In AVX-512 architecture setcc returns mask with i1 elements,
24209 // But there is no compare instruction for i8 and i16 elements in KNL.
24210 assert((VTOp0.getScalarSizeInBits() >= 32 || Subtarget.hasBWI()) &&
24211 "Unexpected operand type");
24212 return LowerIntVSETCC_AVX512(Op, dl, DAG);
24213 }
24214
24215 // Lower using XOP integer comparisons.
24216 if (VT.is128BitVector() && Subtarget.hasXOP()) {
24217 // Translate compare code to XOP PCOM compare mode.
24218 unsigned CmpMode = 0;
24219 switch (Cond) {
24220 // clang-format off
24221 default: llvm_unreachable("Unexpected SETCC condition");
24222 case ISD::SETULT:
24223 case ISD::SETLT: CmpMode = 0x00; break;
24224 case ISD::SETULE:
24225 case ISD::SETLE: CmpMode = 0x01; break;
24226 case ISD::SETUGT:
24227 case ISD::SETGT: CmpMode = 0x02; break;
24228 case ISD::SETUGE:
24229 case ISD::SETGE: CmpMode = 0x03; break;
24230 case ISD::SETEQ: CmpMode = 0x04; break;
24231 case ISD::SETNE: CmpMode = 0x05; break;
24232 // clang-format on
24233 }
24234
24235 // Are we comparing unsigned or signed integers?
24236     unsigned Opc =
24237         ISD::isUnsignedIntSetCC(Cond) ? X86ISD::VPCOMU : X86ISD::VPCOM;
24238 
24239 return DAG.getNode(Opc, dl, VT, Op0, Op1,
24240 DAG.getTargetConstant(CmpMode, dl, MVT::i8));
24241 }
24242
24243 // (X & Y) != 0 --> (X & Y) == Y iff Y is power-of-2.
24244 // Revert part of the simplifySetCCWithAnd combine, to avoid an invert.
24245   if (Cond == ISD::SETNE && ISD::isBuildVectorAllZeros(Op1.getNode())) {
24246     SDValue BC0 = peekThroughBitcasts(Op0);
24247     if (BC0.getOpcode() == ISD::AND &&
24248         isConstantPowerOf2(BC0.getOperand(1), VT.getScalarSizeInBits(),
24249                            /*AllowUndefs=*/false)) {
24250 Cond = ISD::SETEQ;
24251 Op1 = DAG.getBitcast(VT, BC0.getOperand(1));
24252 }
24253 }
24254
24255 // ICMP_EQ(AND(X,C),C) -> SRA(SHL(X,LOG2(C)),BW-1) iff C is power-of-2.
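  // For example (illustrative), with 32-bit elements and C == 4:
  //   (X & 4) == 4  ->  ashr(shl(X, 29), 31)
  // which broadcasts bit 2 of X, giving all-ones when the bit is set and zero
  // otherwise - exactly the setcc result.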
24256 if (Cond == ISD::SETEQ && Op0.getOpcode() == ISD::AND &&
24257 Op0.getOperand(1) == Op1 && Op0.hasOneUse()) {
24258     ConstantSDNode *C1 = isConstOrConstSplat(Op1);
24259     if (C1 && C1->getAPIntValue().isPowerOf2()) {
24260 unsigned BitWidth = VT.getScalarSizeInBits();
24261 unsigned ShiftAmt = BitWidth - C1->getAPIntValue().logBase2() - 1;
24262
24263 SDValue Result = Op0.getOperand(0);
24264 Result = DAG.getNode(ISD::SHL, dl, VT, Result,
24265 DAG.getConstant(ShiftAmt, dl, VT));
24266 Result = DAG.getNode(ISD::SRA, dl, VT, Result,
24267 DAG.getConstant(BitWidth - 1, dl, VT));
24268 return Result;
24269 }
24270 }
24271
24272 // Break 256-bit integer vector compare into smaller ones.
24273 if (VT.is256BitVector() && !Subtarget.hasInt256())
24274 return splitVSETCC(VT, Op0, Op1, Cond, DAG, dl);
24275
24276 // Break 512-bit integer vector compare into smaller ones.
24277 // TODO: Try harder to use VPCMPx + VPMOV2x?
24278 if (VT.is512BitVector())
24279 return splitVSETCC(VT, Op0, Op1, Cond, DAG, dl);
24280
24281 // If we have a limit constant, try to form PCMPGT (signed cmp) to avoid
24282 // not-of-PCMPEQ:
24283 // X != INT_MIN --> X >s INT_MIN
24284 // X != INT_MAX --> X <s INT_MAX --> INT_MAX >s X
24285 // +X != 0 --> +X >s 0
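  // (SSE only provides PCMPEQ/PCMPGT, so a direct SETNE would otherwise cost
  // an extra vector NOT of the PCMPEQ result.)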
24286 APInt ConstValue;
24287 if (Cond == ISD::SETNE &&
24288 ISD::isConstantSplatVector(Op1.getNode(), ConstValue)) {
24289 if (ConstValue.isMinSignedValue())
24290 Cond = ISD::SETGT;
24291 else if (ConstValue.isMaxSignedValue())
24292 Cond = ISD::SETLT;
24293 else if (ConstValue.isZero() && DAG.SignBitIsZero(Op0))
24294 Cond = ISD::SETGT;
24295 }
24296
24297 // If both operands are known non-negative, then an unsigned compare is the
24298 // same as a signed compare and there's no need to flip signbits.
24299 // TODO: We could check for more general simplifications here since we're
24300 // computing known bits.
24301 bool FlipSigns = ISD::isUnsignedIntSetCC(Cond) &&
24302 !(DAG.SignBitIsZero(Op0) && DAG.SignBitIsZero(Op1));
24303
24304 // Special case: Use min/max operations for unsigned compares.
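  // The underlying identity: X <=u Y iff X == umin(X, Y) (and X >=u Y iff
  // X == umax(X, Y)), so an unsigned compare becomes UMIN/UMAX + PCMPEQ;
  // strict predicates are handled by adjusting a constant operand below or by
  // inverting the result.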
24305 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
24306   if (ISD::isUnsignedIntSetCC(Cond) &&
24307       (FlipSigns || ISD::isTrueWhenEqual(Cond)) &&
24308 TLI.isOperationLegal(ISD::UMIN, VT)) {
24309 // If we have a constant operand, increment/decrement it and change the
24310 // condition to avoid an invert.
24311 if (Cond == ISD::SETUGT) {
24312 // X > C --> X >= (C+1) --> X == umax(X, C+1)
24313 if (SDValue UGTOp1 =
24314 incDecVectorConstant(Op1, DAG, /*IsInc*/ true, /*NSW*/ false)) {
24315 Op1 = UGTOp1;
24316 Cond = ISD::SETUGE;
24317 }
24318 }
24319 if (Cond == ISD::SETULT) {
24320 // X < C --> X <= (C-1) --> X == umin(X, C-1)
24321 if (SDValue ULTOp1 =
24322 incDecVectorConstant(Op1, DAG, /*IsInc*/ false, /*NSW*/ false)) {
24323 Op1 = ULTOp1;
24324 Cond = ISD::SETULE;
24325 }
24326 }
24327 bool Invert = false;
24328 unsigned Opc;
24329 switch (Cond) {
24330 // clang-format off
24331 default: llvm_unreachable("Unexpected condition code");
24332 case ISD::SETUGT: Invert = true; [[fallthrough]];
24333 case ISD::SETULE: Opc = ISD::UMIN; break;
24334 case ISD::SETULT: Invert = true; [[fallthrough]];
24335 case ISD::SETUGE: Opc = ISD::UMAX; break;
24336 // clang-format on
24337 }
24338
24339 SDValue Result = DAG.getNode(Opc, dl, VT, Op0, Op1);
24340 Result = DAG.getNode(X86ISD::PCMPEQ, dl, VT, Op0, Result);
24341
24342 // If the logical-not of the result is required, perform that now.
24343 if (Invert)
24344 Result = DAG.getNOT(dl, Result, VT);
24345
24346 return Result;
24347 }
24348
24349 // Try to use SUBUS and PCMPEQ.
24350 if (FlipSigns)
24351 if (SDValue V =
24352 LowerVSETCCWithSUBUS(Op0, Op1, VT, Cond, dl, Subtarget, DAG))
24353 return V;
24354
24355 // We are handling one of the integer comparisons here. Since SSE only has
24356 // GT and EQ comparisons for integer, swapping operands and multiple
24357 // operations may be required for some comparisons.
24358 unsigned Opc = (Cond == ISD::SETEQ || Cond == ISD::SETNE) ? X86ISD::PCMPEQ
24359                                                             : X86ISD::PCMPGT;
24360   bool Swap = Cond == ISD::SETLT || Cond == ISD::SETULT ||
24361               Cond == ISD::SETGE || Cond == ISD::SETUGE;
24362   bool Invert = Cond == ISD::SETNE ||
24363                 (Cond != ISD::SETEQ && ISD::isTrueWhenEqual(Cond));
24364 
24365 if (Swap)
24366 std::swap(Op0, Op1);
24367
24368 // Check that the operation in question is available (most are plain SSE2,
24369 // but PCMPGTQ and PCMPEQQ have different requirements).
24370 if (VT == MVT::v2i64) {
24371 if (Opc == X86ISD::PCMPGT && !Subtarget.hasSSE42()) {
24372 assert(Subtarget.hasSSE2() && "Don't know how to lower!");
24373
24374 // Special case for sign bit test. We can use a v4i32 PCMPGT and shuffle
24375 // the odd elements over the even elements.
24376 if (!FlipSigns && !Invert && ISD::isBuildVectorAllZeros(Op0.getNode())) {
24377 Op0 = DAG.getConstant(0, dl, MVT::v4i32);
24378 Op1 = DAG.getBitcast(MVT::v4i32, Op1);
24379
24380 SDValue GT = DAG.getNode(X86ISD::PCMPGT, dl, MVT::v4i32, Op0, Op1);
24381 static const int MaskHi[] = { 1, 1, 3, 3 };
24382 SDValue Result = DAG.getVectorShuffle(MVT::v4i32, dl, GT, GT, MaskHi);
24383
24384 return DAG.getBitcast(VT, Result);
24385 }
24386
24387 if (!FlipSigns && !Invert && ISD::isBuildVectorAllOnes(Op1.getNode())) {
24388 Op0 = DAG.getBitcast(MVT::v4i32, Op0);
24389 Op1 = DAG.getAllOnesConstant(dl, MVT::v4i32);
24390
24391 SDValue GT = DAG.getNode(X86ISD::PCMPGT, dl, MVT::v4i32, Op0, Op1);
24392 static const int MaskHi[] = { 1, 1, 3, 3 };
24393 SDValue Result = DAG.getVectorShuffle(MVT::v4i32, dl, GT, GT, MaskHi);
24394
24395 return DAG.getBitcast(VT, Result);
24396 }
24397
24398 // If the i64 elements are sign-extended enough to be representable as i32
24399 // then we can compare the lower i32 bits and splat.
24400 if (!FlipSigns && !Invert && DAG.ComputeNumSignBits(Op0) > 32 &&
24401 DAG.ComputeNumSignBits(Op1) > 32) {
24402 Op0 = DAG.getBitcast(MVT::v4i32, Op0);
24403 Op1 = DAG.getBitcast(MVT::v4i32, Op1);
24404
24405 SDValue GT = DAG.getNode(X86ISD::PCMPGT, dl, MVT::v4i32, Op0, Op1);
24406 static const int MaskLo[] = {0, 0, 2, 2};
24407 SDValue Result = DAG.getVectorShuffle(MVT::v4i32, dl, GT, GT, MaskLo);
24408
24409 return DAG.getBitcast(VT, Result);
24410 }
24411
24412 // Since SSE has no unsigned integer comparisons, we need to flip the sign
24413 // bits of the inputs before performing those operations. The lower
24414 // compare is always unsigned.
24415 SDValue SB = DAG.getConstant(FlipSigns ? 0x8000000080000000ULL
24416 : 0x0000000080000000ULL,
24417 dl, MVT::v2i64);
24418
24419 Op0 = DAG.getNode(ISD::XOR, dl, MVT::v2i64, Op0, SB);
24420 Op1 = DAG.getNode(ISD::XOR, dl, MVT::v2i64, Op1, SB);
24421
24422 // Cast everything to the right type.
24423 Op0 = DAG.getBitcast(MVT::v4i32, Op0);
24424 Op1 = DAG.getBitcast(MVT::v4i32, Op1);
24425
24426 // Emulate PCMPGTQ with (hi1 > hi2) | ((hi1 == hi2) & (lo1 > lo2))
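    // (The XOR above flipped the sign bit of every low dword, so the signed
    // v4i32 PCMPGT acts as an unsigned compare on the low halves; the high
    // dwords were flipped only when FlipSigns was set.)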
24427 SDValue GT = DAG.getNode(X86ISD::PCMPGT, dl, MVT::v4i32, Op0, Op1);
24428 SDValue EQ = DAG.getNode(X86ISD::PCMPEQ, dl, MVT::v4i32, Op0, Op1);
24429
24430 // Create masks for only the low parts/high parts of the 64 bit integers.
24431 static const int MaskHi[] = { 1, 1, 3, 3 };
24432 static const int MaskLo[] = { 0, 0, 2, 2 };
24433 SDValue EQHi = DAG.getVectorShuffle(MVT::v4i32, dl, EQ, EQ, MaskHi);
24434 SDValue GTLo = DAG.getVectorShuffle(MVT::v4i32, dl, GT, GT, MaskLo);
24435 SDValue GTHi = DAG.getVectorShuffle(MVT::v4i32, dl, GT, GT, MaskHi);
24436
24437 SDValue Result = DAG.getNode(ISD::AND, dl, MVT::v4i32, EQHi, GTLo);
24438 Result = DAG.getNode(ISD::OR, dl, MVT::v4i32, Result, GTHi);
24439
24440 if (Invert)
24441 Result = DAG.getNOT(dl, Result, MVT::v4i32);
24442
24443 return DAG.getBitcast(VT, Result);
24444 }
24445
24446 if (Opc == X86ISD::PCMPEQ && !Subtarget.hasSSE41()) {
24447 // If pcmpeqq is missing but pcmpeqd is available synthesize pcmpeqq with
24448 // pcmpeqd + pshufd + pand.
24449 assert(Subtarget.hasSSE2() && !FlipSigns && "Don't know how to lower!");
24450
24451 // First cast everything to the right type.
24452 Op0 = DAG.getBitcast(MVT::v4i32, Op0);
24453 Op1 = DAG.getBitcast(MVT::v4i32, Op1);
24454
24455 // Do the compare.
24456 SDValue Result = DAG.getNode(Opc, dl, MVT::v4i32, Op0, Op1);
24457
24458 // Make sure the lower and upper halves are both all-ones.
24459 static const int Mask[] = { 1, 0, 3, 2 };
24460 SDValue Shuf = DAG.getVectorShuffle(MVT::v4i32, dl, Result, Result, Mask);
24461 Result = DAG.getNode(ISD::AND, dl, MVT::v4i32, Result, Shuf);
24462
24463 if (Invert)
24464 Result = DAG.getNOT(dl, Result, MVT::v4i32);
24465
24466 return DAG.getBitcast(VT, Result);
24467 }
24468 }
24469
24470 // Since SSE has no unsigned integer comparisons, we need to flip the sign
24471 // bits of the inputs before performing those operations.
24472 if (FlipSigns) {
24473 MVT EltVT = VT.getVectorElementType();
24474     SDValue SM = DAG.getConstant(APInt::getSignMask(EltVT.getSizeInBits()), dl,
24475                                  VT);
24476 Op0 = DAG.getNode(ISD::XOR, dl, VT, Op0, SM);
24477 Op1 = DAG.getNode(ISD::XOR, dl, VT, Op1, SM);
24478 }
24479
24480 SDValue Result = DAG.getNode(Opc, dl, VT, Op0, Op1);
24481
24482 // If the logical-not of the result is required, perform that now.
24483 if (Invert)
24484 Result = DAG.getNOT(dl, Result, VT);
24485
24486 return Result;
24487}
24488
24489// Try to select this as a KORTEST+SETCC or KTEST+SETCC if possible.
24490 static SDValue EmitAVX512Test(SDValue Op0, SDValue Op1, ISD::CondCode CC,
24491                               const SDLoc &dl, SelectionDAG &DAG,
24492 const X86Subtarget &Subtarget,
24493 SDValue &X86CC) {
24494 assert((CC == ISD::SETEQ || CC == ISD::SETNE) && "Unsupported ISD::CondCode");
24495
24496 // Must be a bitcast from vXi1.
24497 if (Op0.getOpcode() != ISD::BITCAST)
24498 return SDValue();
24499
24500 Op0 = Op0.getOperand(0);
24501 MVT VT = Op0.getSimpleValueType();
24502 if (!(Subtarget.hasAVX512() && VT == MVT::v16i1) &&
24503 !(Subtarget.hasDQI() && VT == MVT::v8i1) &&
24504 !(Subtarget.hasBWI() && (VT == MVT::v32i1 || VT == MVT::v64i1)))
24505 return SDValue();
24506
24507 X86::CondCode X86Cond;
24508 if (isNullConstant(Op1)) {
24509 X86Cond = CC == ISD::SETEQ ? X86::COND_E : X86::COND_NE;
24510 } else if (isAllOnesConstant(Op1)) {
24511 // C flag is set for all ones.
24512 X86Cond = CC == ISD::SETEQ ? X86::COND_B : X86::COND_AE;
24513 } else
24514 return SDValue();
24515
24516   // If the input is an AND, we can combine its operands into the KTEST.
24517 bool KTestable = false;
24518 if (Subtarget.hasDQI() && (VT == MVT::v8i1 || VT == MVT::v16i1))
24519 KTestable = true;
24520 if (Subtarget.hasBWI() && (VT == MVT::v32i1 || VT == MVT::v64i1))
24521 KTestable = true;
24522 if (!isNullConstant(Op1))
24523 KTestable = false;
24524 if (KTestable && Op0.getOpcode() == ISD::AND && Op0.hasOneUse()) {
24525 SDValue LHS = Op0.getOperand(0);
24526 SDValue RHS = Op0.getOperand(1);
24527 X86CC = DAG.getTargetConstant(X86Cond, dl, MVT::i8);
24528 return DAG.getNode(X86ISD::KTEST, dl, MVT::i32, LHS, RHS);
24529 }
24530
24531   // If the input is an OR, we can combine its operands into the KORTEST.
24532 SDValue LHS = Op0;
24533 SDValue RHS = Op0;
24534 if (Op0.getOpcode() == ISD::OR && Op0.hasOneUse()) {
24535 LHS = Op0.getOperand(0);
24536 RHS = Op0.getOperand(1);
24537 }
24538
24539 X86CC = DAG.getTargetConstant(X86Cond, dl, MVT::i8);
24540 return DAG.getNode(X86ISD::KORTEST, dl, MVT::i32, LHS, RHS);
24541}
24542
24543/// Emit flags for the given setcc condition and operands. Also returns the
24544/// corresponding X86 condition code constant in X86CC.
24545SDValue X86TargetLowering::emitFlagsForSetcc(SDValue Op0, SDValue Op1,
24546 ISD::CondCode CC, const SDLoc &dl,
24547 SelectionDAG &DAG,
24548 SDValue &X86CC) const {
24549 // Equality Combines.
24550 if (CC == ISD::SETEQ || CC == ISD::SETNE) {
24551 X86::CondCode X86CondCode;
24552
24553 // Optimize to BT if possible.
24554 // Lower (X & (1 << N)) == 0 to BT(X, N).
24555 // Lower ((X >>u N) & 1) != 0 to BT(X, N).
24556 // Lower ((X >>s N) & 1) != 0 to BT(X, N).
24557 if (Op0.getOpcode() == ISD::AND && Op0.hasOneUse() && isNullConstant(Op1)) {
24558 if (SDValue BT = LowerAndToBT(Op0, CC, dl, DAG, X86CondCode)) {
24559 X86CC = DAG.getTargetConstant(X86CondCode, dl, MVT::i8);
24560 return BT;
24561 }
24562 }
24563
24564 // Try to use PTEST/PMOVMSKB for a tree AND/ORs equality compared with -1/0.
24565 if (SDValue CmpZ = MatchVectorAllEqualTest(Op0, Op1, CC, dl, Subtarget, DAG,
24566 X86CondCode)) {
24567 X86CC = DAG.getTargetConstant(X86CondCode, dl, MVT::i8);
24568 return CmpZ;
24569 }
24570
24571 // Try to lower using KORTEST or KTEST.
24572 if (SDValue Test = EmitAVX512Test(Op0, Op1, CC, dl, DAG, Subtarget, X86CC))
24573 return Test;
24574
24575 // Look for X == 0, X == 1, X != 0, or X != 1. We can simplify some forms
24576 // of these.
24577 if (isOneConstant(Op1) || isNullConstant(Op1)) {
24578 // If the input is a setcc, then reuse the input setcc or use a new one
24579 // with the inverted condition.
24580 if (Op0.getOpcode() == X86ISD::SETCC) {
24581 bool Invert = (CC == ISD::SETNE) ^ isNullConstant(Op1);
24582
24583 X86CC = Op0.getOperand(0);
24584 if (Invert) {
24585 X86CondCode = (X86::CondCode)Op0.getConstantOperandVal(0);
24586 X86CondCode = X86::GetOppositeBranchCondition(X86CondCode);
24587 X86CC = DAG.getTargetConstant(X86CondCode, dl, MVT::i8);
24588 }
24589
24590 return Op0.getOperand(1);
24591 }
24592 }
24593
24594 // Look for X == INT_MIN or X != INT_MIN. We can use NEG and test for
24595 // overflow.
24596 if (isMinSignedConstant(Op1)) {
24597 EVT VT = Op0.getValueType();
24598 if (VT == MVT::i32 || VT == MVT::i64 || Op0->hasOneUse()) {
24599 SDVTList CmpVTs = DAG.getVTList(VT, MVT::i32);
24600         X86::CondCode CondCode = CC == ISD::SETEQ ? X86::COND_O : X86::COND_NO;
24601       X86CC = DAG.getTargetConstant(CondCode, dl, MVT::i8);
24602 SDValue Neg = DAG.getNode(X86ISD::SUB, dl, CmpVTs,
24603 DAG.getConstant(0, dl, VT), Op0);
24604 return SDValue(Neg.getNode(), 1);
24605 }
24606 }
24607
24608     // Try to use the carry flag from the add in place of a separate CMP for:
24609 // (seteq (add X, -1), -1). Similar for setne.
24610 if (isAllOnesConstant(Op1) && Op0.getOpcode() == ISD::ADD &&
24611 Op0.getOperand(1) == Op1) {
24612 if (isProfitableToUseFlagOp(Op0)) {
24613 SDVTList VTs = DAG.getVTList(Op0.getValueType(), MVT::i32);
24614
24615 SDValue New = DAG.getNode(X86ISD::ADD, dl, VTs, Op0.getOperand(0),
24616 Op0.getOperand(1));
24617 DAG.ReplaceAllUsesOfValueWith(SDValue(Op0.getNode(), 0), New);
24618 X86CondCode = CC == ISD::SETEQ ? X86::COND_AE : X86::COND_B;
24619 X86CC = DAG.getTargetConstant(X86CondCode, dl, MVT::i8);
24620 return SDValue(New.getNode(), 1);
24621 }
24622 }
24623 }
24624
24625   X86::CondCode CondCode =
24626       TranslateX86CC(CC, dl, /*IsFP*/ false, Op0, Op1, DAG);
24627 assert(CondCode != X86::COND_INVALID && "Unexpected condition code!");
24628
24629 SDValue EFLAGS = EmitCmp(Op0, Op1, CondCode, dl, DAG, Subtarget);
24630 X86CC = DAG.getTargetConstant(CondCode, dl, MVT::i8);
24631 return EFLAGS;
24632}
24633
24634SDValue X86TargetLowering::LowerSETCC(SDValue Op, SelectionDAG &DAG) const {
24635
24636 bool IsStrict = Op.getOpcode() == ISD::STRICT_FSETCC ||
24637 Op.getOpcode() == ISD::STRICT_FSETCCS;
24638 MVT VT = Op->getSimpleValueType(0);
24639
24640 if (VT.isVector()) return LowerVSETCC(Op, Subtarget, DAG);
24641
24642 assert(VT == MVT::i8 && "SetCC type must be 8-bit integer");
24643 SDValue Chain = IsStrict ? Op.getOperand(0) : SDValue();
24644 SDValue Op0 = Op.getOperand(IsStrict ? 1 : 0);
24645 SDValue Op1 = Op.getOperand(IsStrict ? 2 : 1);
24646 SDLoc dl(Op);
24647 ISD::CondCode CC =
24648 cast<CondCodeSDNode>(Op.getOperand(IsStrict ? 3 : 2))->get();
24649
24650 if (isSoftF16(Op0.getValueType(), Subtarget))
24651 return SDValue();
24652
24653 // Handle f128 first, since one possible outcome is a normal integer
24654 // comparison which gets handled by emitFlagsForSetcc.
24655 if (Op0.getValueType() == MVT::f128) {
24656 softenSetCCOperands(DAG, MVT::f128, Op0, Op1, CC, dl, Op0, Op1, Chain,
24657 Op.getOpcode() == ISD::STRICT_FSETCCS);
24658
24659 // If softenSetCCOperands returned a scalar, use it.
24660 if (!Op1.getNode()) {
24661 assert(Op0.getValueType() == Op.getValueType() &&
24662 "Unexpected setcc expansion!");
24663 if (IsStrict)
24664 return DAG.getMergeValues({Op0, Chain}, dl);
24665 return Op0;
24666 }
24667 }
24668
24669 if (Op0.getSimpleValueType().isInteger()) {
24670 // Attempt to canonicalize SGT/UGT -> SGE/UGE compares with constant which
24671 // reduces the number of EFLAGs bit reads (the GE conditions don't read ZF),
24672     // this may translate to fewer uops depending on uarch implementation. The
24673 // equivalent for SLE/ULE -> SLT/ULT isn't likely to happen as we already
24674 // canonicalize to that CondCode.
24675 // NOTE: Only do this if incrementing the constant doesn't increase the bit
24676     // encoding size - so it must either already be an i8 or i32 immediate, or it
24677 // shrinks down to that. We don't do this for any i64's to avoid additional
24678 // constant materializations.
24679 // TODO: Can we move this to TranslateX86CC to handle jumps/branches too?
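    // For example (illustrative): (x >s 9) becomes (x >=s 10); SETGE/JGE only
    // read SF and OF, whereas SETG/JG also read ZF.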
24680 if (auto *Op1C = dyn_cast<ConstantSDNode>(Op1)) {
24681 const APInt &Op1Val = Op1C->getAPIntValue();
24682 if (!Op1Val.isZero()) {
24683 // Ensure the constant+1 doesn't overflow.
24684 if ((CC == ISD::CondCode::SETGT && !Op1Val.isMaxSignedValue()) ||
24685 (CC == ISD::CondCode::SETUGT && !Op1Val.isMaxValue())) {
24686 APInt Op1ValPlusOne = Op1Val + 1;
24687 if (Op1ValPlusOne.isSignedIntN(32) &&
24688 (!Op1Val.isSignedIntN(8) || Op1ValPlusOne.isSignedIntN(8))) {
24689 Op1 = DAG.getConstant(Op1ValPlusOne, dl, Op0.getValueType());
24690             CC = CC == ISD::CondCode::SETGT ? ISD::CondCode::SETGE
24691                                             : ISD::CondCode::SETUGE;
24692           }
24693 }
24694 }
24695 }
24696
24697 SDValue X86CC;
24698 SDValue EFLAGS = emitFlagsForSetcc(Op0, Op1, CC, dl, DAG, X86CC);
24699 SDValue Res = DAG.getNode(X86ISD::SETCC, dl, MVT::i8, X86CC, EFLAGS);
24700 return IsStrict ? DAG.getMergeValues({Res, Chain}, dl) : Res;
24701 }
24702
24703 if (Subtarget.hasAVX10_2()) {
24704 if (CC == ISD::SETOEQ || CC == ISD::SETUNE) {
24705 auto NewCC = (CC == ISD::SETOEQ) ? X86::COND_E : (X86::COND_NE);
24706 assert(Op0.getSimpleValueType() != MVT::bf16 && "Unsupported Type");
24707 if (Op0.getSimpleValueType() != MVT::f80) {
24708 SDValue Res = getSETCC(
24709 NewCC, DAG.getNode(X86ISD::UCOMX, dl, MVT::i32, Op0, Op1), dl, DAG);
24710 return IsStrict ? DAG.getMergeValues({Res, Chain}, dl) : Res;
24711 }
24712 }
24713 }
24714 // Handle floating point.
24715 X86::CondCode CondCode = TranslateX86CC(CC, dl, /*IsFP*/ true, Op0, Op1, DAG);
24716 if (CondCode == X86::COND_INVALID)
24717 return SDValue();
24718
24719 SDValue EFLAGS;
24720 if (IsStrict) {
24721 bool IsSignaling = Op.getOpcode() == ISD::STRICT_FSETCCS;
24722 EFLAGS =
24723         DAG.getNode(IsSignaling ? X86ISD::STRICT_FCMPS : X86ISD::STRICT_FCMP,
24724                     dl, {MVT::i32, MVT::Other}, {Chain, Op0, Op1});
24725 Chain = EFLAGS.getValue(1);
24726 } else {
24727 EFLAGS = DAG.getNode(X86ISD::FCMP, dl, MVT::i32, Op0, Op1);
24728 }
24729
24730 SDValue X86CC = DAG.getTargetConstant(CondCode, dl, MVT::i8);
24731 SDValue Res = DAG.getNode(X86ISD::SETCC, dl, MVT::i8, X86CC, EFLAGS);
24732 return IsStrict ? DAG.getMergeValues({Res, Chain}, dl) : Res;
24733}
24734
24735SDValue X86TargetLowering::LowerSETCCCARRY(SDValue Op, SelectionDAG &DAG) const {
24736 SDValue LHS = Op.getOperand(0);
24737 SDValue RHS = Op.getOperand(1);
24738 SDValue Carry = Op.getOperand(2);
24739 SDValue Cond = Op.getOperand(3);
24740 SDLoc DL(Op);
24741
24742 assert(LHS.getSimpleValueType().isInteger() && "SETCCCARRY is integer only.");
24743   X86::CondCode CC = (X86::CondCode)Cond->getAsZExtVal();
24744 
24745 // Recreate the carry if needed.
24746 EVT CarryVT = Carry.getValueType();
24747 Carry = DAG.getNode(X86ISD::ADD, DL, DAG.getVTList(CarryVT, MVT::i32),
24748 Carry, DAG.getAllOnesConstant(DL, CarryVT));
24749
24750 SDVTList VTs = DAG.getVTList(LHS.getValueType(), MVT::i32);
24751 SDValue Cmp = DAG.getNode(X86ISD::SBB, DL, VTs, LHS, RHS, Carry.getValue(1));
24752 return getSETCC(CC, Cmp.getValue(1), DL, DAG);
24753}
24754
24755// This function returns three things: the arithmetic computation itself
24756// (Value), an EFLAGS result (Overflow), and a condition code (Cond). The
24757// flag and the condition code define the case in which the arithmetic
24758// computation overflows.
24759static std::pair<SDValue, SDValue>
24760 getX86XALUOOp(X86::CondCode &Cond, SDValue Op, SelectionDAG &DAG) {
24761   assert(Op.getResNo() == 0 && "Unexpected result number!");
24762 SDValue Value, Overflow;
24763 SDValue LHS = Op.getOperand(0);
24764 SDValue RHS = Op.getOperand(1);
24765 unsigned BaseOp = 0;
24766 SDLoc DL(Op);
24767 switch (Op.getOpcode()) {
24768 default: llvm_unreachable("Unknown ovf instruction!");
24769 case ISD::SADDO:
24770 BaseOp = X86ISD::ADD;
24771 Cond = X86::COND_O;
24772 break;
24773 case ISD::UADDO:
24774 BaseOp = X86ISD::ADD;
24775     Cond = X86::COND_B;
24776     break;
24777 case ISD::SSUBO:
24778 BaseOp = X86ISD::SUB;
24779 Cond = X86::COND_O;
24780 break;
24781 case ISD::USUBO:
24782 BaseOp = X86ISD::SUB;
24783 Cond = X86::COND_B;
24784 break;
24785 case ISD::SMULO:
24786 BaseOp = X86ISD::SMUL;
24787 Cond = X86::COND_O;
24788 break;
24789 case ISD::UMULO:
24790 BaseOp = X86ISD::UMUL;
24791 Cond = X86::COND_O;
24792 break;
24793 }
24794
24795 if (BaseOp) {
24796 // Also sets EFLAGS.
24797 SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::i32);
24798 Value = DAG.getNode(BaseOp, DL, VTs, LHS, RHS);
24799 Overflow = Value.getValue(1);
24800 }
24801
24802 return std::make_pair(Value, Overflow);
24803}
24804
24805 static SDValue LowerXALUO(SDValue Op, SelectionDAG &DAG) {
24806   // Lower the "add/sub/mul with overflow" instruction into a regular ins plus
24807 // a "setcc" instruction that checks the overflow flag. The "brcond" lowering
24808 // looks for this combo and may remove the "setcc" instruction if the "setcc"
24809 // has only one use.
24810 SDLoc DL(Op);
24811   X86::CondCode Cond;
24812   SDValue Value, Overflow;
24813 std::tie(Value, Overflow) = getX86XALUOOp(Cond, Op, DAG);
24814
24815 SDValue SetCC = getSETCC(Cond, Overflow, DL, DAG);
24816 assert(Op->getValueType(1) == MVT::i8 && "Unexpected VT!");
24817 return DAG.getNode(ISD::MERGE_VALUES, DL, Op->getVTList(), Value, SetCC);
24818}
24819
24820 /// Return true if opcode is an X86 logical comparison.
24821 static bool isX86LogicalCmp(SDValue Op) {
24822   unsigned Opc = Op.getOpcode();
24823 if (Opc == X86ISD::CMP || Opc == X86ISD::COMI || Opc == X86ISD::UCOMI ||
24824 Opc == X86ISD::FCMP)
24825 return true;
24826 if (Op.getResNo() == 1 &&
24827 (Opc == X86ISD::ADD || Opc == X86ISD::SUB || Opc == X86ISD::ADC ||
24828        Opc == X86ISD::SBB || Opc == X86ISD::SMUL || Opc == X86ISD::UMUL ||
24829        Opc == X86ISD::OR || Opc == X86ISD::XOR || Opc == X86ISD::AND))
24830 return true;
24831
24832 return false;
24833}
24834
24835 static bool isTruncWithZeroHighBitsInput(SDValue V, SelectionDAG &DAG) {
24836   if (V.getOpcode() != ISD::TRUNCATE)
24837 return false;
24838
24839 SDValue VOp0 = V.getOperand(0);
24840 unsigned InBits = VOp0.getValueSizeInBits();
24841 unsigned Bits = V.getValueSizeInBits();
24842 return DAG.MaskedValueIsZero(VOp0, APInt::getHighBitsSet(InBits,InBits-Bits));
24843}
24844
24845// Lower various (select (icmp CmpVal, 0), LHS, RHS) custom patterns.
24846 static SDValue LowerSELECTWithCmpZero(SDValue CmpVal, SDValue LHS, SDValue RHS,
24847                                       unsigned X86CC, const SDLoc &DL,
24848 SelectionDAG &DAG,
24849 const X86Subtarget &Subtarget) {
24850 EVT CmpVT = CmpVal.getValueType();
24851 EVT VT = LHS.getValueType();
24852 if (!CmpVT.isScalarInteger() || !VT.isScalarInteger())
24853 return SDValue();
24854
24855 if (X86CC == X86::COND_E && CmpVal.getOpcode() == ISD::AND &&
24856 isOneConstant(CmpVal.getOperand(1))) {
24857 auto SplatLSB = [&](EVT SplatVT) {
24858       // We need a mask of all zeros or all ones with the same size as the
24859       // other operands.
24860 SDValue Neg = CmpVal;
24861 if (CmpVT.bitsGT(SplatVT))
24862 Neg = DAG.getNode(ISD::TRUNCATE, DL, SplatVT, CmpVal);
24863 else if (CmpVT.bitsLT(SplatVT))
24864 Neg = DAG.getNode(
24865 ISD::AND, DL, SplatVT,
24866 DAG.getNode(ISD::ANY_EXTEND, DL, SplatVT, CmpVal.getOperand(0)),
24867 DAG.getConstant(1, DL, SplatVT));
24868 return DAG.getNegative(Neg, DL, SplatVT); // -(and (x, 0x1))
24869 };
24870
24871 // SELECT (AND(X,1) == 0), 0, -1 -> NEG(AND(X,1))
24872     if (isNullConstant(LHS) && isAllOnesConstant(RHS))
24873       return SplatLSB(VT);
24874
24875 // SELECT (AND(X,1) == 0), C1, C2 -> XOR(C1,AND(NEG(AND(X,1)),XOR(C1,C2))
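    // When the LSB is 1 the mask below is all-ones, so C1 ^ (C1 ^ C2) == C2;
    // when it is 0 the mask is zero and the result stays C1.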
24876 if (!Subtarget.canUseCMOV() && isa<ConstantSDNode>(LHS) &&
24877         isa<ConstantSDNode>(RHS)) {
24878       SDValue Mask = SplatLSB(VT);
24879 SDValue Diff = DAG.getNode(ISD::XOR, DL, VT, LHS, RHS);
24880 SDValue Flip = DAG.getNode(ISD::AND, DL, VT, Mask, Diff);
24881 return DAG.getNode(ISD::XOR, DL, VT, LHS, Flip);
24882 }
24883
24884 SDValue Src1, Src2;
24885 auto isIdentityPatternZero = [&]() {
24886 switch (RHS.getOpcode()) {
24887 default:
24888 break;
24889 case ISD::OR:
24890 case ISD::XOR:
24891 case ISD::ADD:
24892 if (RHS.getOperand(0) == LHS || RHS.getOperand(1) == LHS) {
24893 Src1 = RHS.getOperand(RHS.getOperand(0) == LHS ? 1 : 0);
24894 Src2 = LHS;
24895 return true;
24896 }
24897 break;
24898 case ISD::SHL:
24899 case ISD::SRA:
24900 case ISD::SRL:
24901 case ISD::SUB:
24902 if (RHS.getOperand(0) == LHS) {
24903 Src1 = RHS.getOperand(1);
24904 Src2 = LHS;
24905 return true;
24906 }
24907 break;
24908 }
24909 return false;
24910 };
24911
24912 auto isIdentityPatternOnes = [&]() {
24913 switch (LHS.getOpcode()) {
24914 default:
24915 break;
24916 case ISD::AND:
24917 if (LHS.getOperand(0) == RHS || LHS.getOperand(1) == RHS) {
24918 Src1 = LHS.getOperand(LHS.getOperand(0) == RHS ? 1 : 0);
24919 Src2 = RHS;
24920 return true;
24921 }
24922 break;
24923 }
24924 return false;
24925 };
24926
24927 // Convert 'identity' patterns (iff X is 0 or 1):
24928 // SELECT (AND(X,1) == 0), Y, (OR Y, Z) -> (OR Y, (AND NEG(AND(X,1)), Z))
24929 // SELECT (AND(X,1) == 0), Y, (XOR Y, Z) -> (XOR Y, (AND NEG(AND(X,1)), Z))
24930 // SELECT (AND(X,1) == 0), Y, (ADD Y, Z) -> (ADD Y, (AND NEG(AND(X,1)), Z))
24931 // SELECT (AND(X,1) == 0), Y, (SUB Y, Z) -> (SUB Y, (AND NEG(AND(X,1)), Z))
24932 // SELECT (AND(X,1) == 0), Y, (SHL Y, Z) -> (SHL Y, (AND NEG(AND(X,1)), Z))
24933 // SELECT (AND(X,1) == 0), Y, (SRA Y, Z) -> (SRA Y, (AND NEG(AND(X,1)), Z))
24934 // SELECT (AND(X,1) == 0), Y, (SRL Y, Z) -> (SRL Y, (AND NEG(AND(X,1)), Z))
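    // e.g. for the ADD form: the mask -(X & 1) is 0 or all-ones, so
    // Y + (mask & Z) evaluates to Y when X's LSB is 0 and to Y + Z otherwise,
    // matching the two select arms.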
24935 if (!Subtarget.canUseCMOV() && isIdentityPatternZero()) {
24936 SDValue Mask = SplatLSB(Src1.getValueType());
24937 SDValue And = DAG.getNode(ISD::AND, DL, Src1.getValueType(), Mask,
24938 Src1); // Mask & z
24939 return DAG.getNode(RHS.getOpcode(), DL, VT, Src2, And); // y Op And
24940 }
24941 // SELECT (AND(X,1) == 0), (AND Y, Z), Y -> (AND Y, (OR NEG(AND(X, 1)), Z))
24942 if (!Subtarget.canUseCMOV() && isIdentityPatternOnes()) {
24943 SDValue Mask = SplatLSB(VT);
24944 SDValue Or = DAG.getNode(ISD::OR, DL, VT, Mask, Src1); // Mask | z
24945 return DAG.getNode(LHS.getOpcode(), DL, VT, Src2, Or); // y Op Or
24946 }
24947 }
24948
24949 if ((X86CC == X86::COND_E || X86CC == X86::COND_NE) &&
24950       (isAllOnesConstant(LHS) || isAllOnesConstant(RHS))) {
24951     SDValue Y = isAllOnesConstant(RHS) ? LHS : RHS;
24952     SDVTList CmpVTs = DAG.getVTList(CmpVT, MVT::i32);
24953
24954 // 'X - 1' sets the carry flag if X == 0.
24955 // '0 - X' sets the carry flag if X != 0.
24956 // Convert the carry flag to a -1/0 mask with sbb:
24957 // select (X != 0), -1, Y --> 0 - X; or (sbb), Y
24958 // select (X == 0), Y, -1 --> 0 - X; or (sbb), Y
24959 // select (X != 0), Y, -1 --> X - 1; or (sbb), Y
24960 // select (X == 0), -1, Y --> X - 1; or (sbb), Y
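    // (SETCC_CARRY with COND_B materializes the carry flag as an all-zeros /
    // all-ones value, essentially an SBB of a register with itself.)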
24961 SDValue Sub;
24962 if (isAllOnesConstant(LHS) == (X86CC == X86::COND_NE)) {
24963 SDValue Zero = DAG.getConstant(0, DL, CmpVT);
24964 Sub = DAG.getNode(X86ISD::SUB, DL, CmpVTs, Zero, CmpVal);
24965 } else {
24966 SDValue One = DAG.getConstant(1, DL, CmpVT);
24967 Sub = DAG.getNode(X86ISD::SUB, DL, CmpVTs, CmpVal, One);
24968 }
24969 SDValue SBB = DAG.getNode(X86ISD::SETCC_CARRY, DL, VT,
24970 DAG.getTargetConstant(X86::COND_B, DL, MVT::i8),
24971 Sub.getValue(1));
24972 return DAG.getNode(ISD::OR, DL, VT, SBB, Y);
24973 }
24974
24975 return SDValue();
24976}
24977
24978SDValue X86TargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const {
24979 bool AddTest = true;
24980 SDValue Cond = Op.getOperand(0);
24981 SDValue Op1 = Op.getOperand(1);
24982 SDValue Op2 = Op.getOperand(2);
24983 SDLoc DL(Op);
24984 MVT VT = Op1.getSimpleValueType();
24985 SDValue CC;
24986
24987 if (isSoftF16(VT, Subtarget)) {
24988 MVT NVT = VT.changeTypeToInteger();
24989 return DAG.getBitcast(VT, DAG.getNode(ISD::SELECT, DL, NVT, Cond,
24990 DAG.getBitcast(NVT, Op1),
24991 DAG.getBitcast(NVT, Op2)));
24992 }
24993
24994 // Lower FP selects into a CMP/AND/ANDN/OR sequence when the necessary SSE ops
24995 // are available or VBLENDV if AVX is available.
24996 // Otherwise FP cmovs get lowered into a less efficient branch sequence later.
24997 if (Cond.getOpcode() == ISD::SETCC && isScalarFPTypeInSSEReg(VT) &&
24998 VT == Cond.getOperand(0).getSimpleValueType() && Cond->hasOneUse()) {
24999 SDValue CondOp0 = Cond.getOperand(0), CondOp1 = Cond.getOperand(1);
25000 bool IsAlwaysSignaling;
25001 unsigned SSECC =
25002 translateX86FSETCC(cast<CondCodeSDNode>(Cond.getOperand(2))->get(),
25003 CondOp0, CondOp1, IsAlwaysSignaling);
25004
25005 if (Subtarget.hasAVX512()) {
25006 SDValue Cmp =
25007 DAG.getNode(X86ISD::FSETCCM, DL, MVT::v1i1, CondOp0, CondOp1,
25008 DAG.getTargetConstant(SSECC, DL, MVT::i8));
25009 assert(!VT.isVector() && "Not a scalar type?");
25010 return DAG.getNode(X86ISD::SELECTS, DL, VT, Cmp, Op1, Op2);
25011 }
25012
25013 if (SSECC < 8 || Subtarget.hasAVX()) {
25014 SDValue Cmp = DAG.getNode(X86ISD::FSETCC, DL, VT, CondOp0, CondOp1,
25015 DAG.getTargetConstant(SSECC, DL, MVT::i8));
25016
25017 // If we have SSE41/AVX, we can use a variable vector select (VBLENDV)
25018 // instead of 3 logic instructions for size savings and potentially speed.
25019 // Unfortunately, there is no scalar form of VBLENDV.
25020 //
25021 // If either operand is a +0.0 constant, don't try this. We can expect to
25022 // optimize away at least one of the logic instructions later in that
25023 // case, so that sequence would be faster than a variable blend.
25024 if (Subtarget.hasSSE41() && !isNullFPConstant(Op1) &&
25025 !isNullFPConstant(Op2)) {
25026 // Convert to vectors, do a VSELECT, and convert back to scalar.
25027 // All of the conversions should be optimized away.
25028 MVT VecVT = VT == MVT::f32 ? MVT::v4f32 : MVT::v2f64;
25029 SDValue VOp1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecVT, Op1);
25030 SDValue VOp2 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecVT, Op2);
25031 SDValue VCmp = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecVT, Cmp);
25032
25033 MVT VCmpVT = VT == MVT::f32 ? MVT::v4i32 : MVT::v2i64;
25034 VCmp = DAG.getBitcast(VCmpVT, VCmp);
25035
25036 SDValue VSel = DAG.getSelect(DL, VecVT, VCmp, VOp1, VOp2);
25037
25038 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, VSel,
25039 DAG.getVectorIdxConstant(0, DL));
25040 }
25041 SDValue AndN = DAG.getNode(X86ISD::FANDN, DL, VT, Cmp, Op2);
25042 SDValue And = DAG.getNode(X86ISD::FAND, DL, VT, Cmp, Op1);
25043 return DAG.getNode(X86ISD::FOR, DL, VT, AndN, And);
25044 }
25045 }
25046
25047 // AVX512 fallback is to lower selects of scalar floats to masked moves.
25048 if (isScalarFPTypeInSSEReg(VT) && Subtarget.hasAVX512()) {
25049 SDValue Cmp = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v1i1, Cond);
25050 return DAG.getNode(X86ISD::SELECTS, DL, VT, Cmp, Op1, Op2);
25051 }
25052
25053 if (Cond.getOpcode() == ISD::SETCC &&
25054 !isSoftF16(Cond.getOperand(0).getSimpleValueType(), Subtarget)) {
25055 if (SDValue NewCond = LowerSETCC(Cond, DAG)) {
25056 Cond = NewCond;
25057 // If the condition was updated, it's possible that the operands of the
25058 // select were also updated (for example, EmitTest has a RAUW). Refresh
25059 // the local references to the select operands in case they got stale.
25060 Op1 = Op.getOperand(1);
25061 Op2 = Op.getOperand(2);
25062 }
25063 }
25064
25065 // (select (x == 0), -1, y) -> (sign_bit (x - 1)) | y
25066 // (select (x == 0), y, -1) -> ~(sign_bit (x - 1)) | y
25067 // (select (x != 0), y, -1) -> (sign_bit (x - 1)) | y
25068 // (select (x != 0), -1, y) -> ~(sign_bit (x - 1)) | y
25069 // (select (and (x , 0x1) == 0), y, (z ^ y) ) -> (-(and (x , 0x1)) & z ) ^ y
25070 // (select (and (x , 0x1) == 0), y, (z | y) ) -> (-(and (x , 0x1)) & z ) | y
25071 // (select (x > 0), x, 0) -> (~(x >> (size_in_bits(x)-1))) & x
25072 // (select (x < 0), x, 0) -> ((x >> (size_in_bits(x)-1))) & x
25073 if (Cond.getOpcode() == X86ISD::SETCC &&
25074 Cond.getOperand(1).getOpcode() == X86ISD::CMP &&
25075 isNullConstant(Cond.getOperand(1).getOperand(1))) {
25076 SDValue Cmp = Cond.getOperand(1);
25077 SDValue CmpOp0 = Cmp.getOperand(0);
25078 unsigned CondCode = Cond.getConstantOperandVal(0);
25079
25080 // Special handling for __builtin_ffs(X) - 1 pattern which looks like
25081 // (select (seteq X, 0), -1, (cttz_zero_undef X)). Disable the special
25082     // handling to keep the CMP with 0. This should be removed by
25083 // optimizeCompareInst by using the flags from the BSR/TZCNT used for the
25084 // cttz_zero_undef.
25085 auto MatchFFSMinus1 = [&](SDValue Op1, SDValue Op2) {
25086 return (Op1.getOpcode() == ISD::CTTZ_ZERO_UNDEF && Op1.hasOneUse() &&
25087 Op1.getOperand(0) == CmpOp0 && isAllOnesConstant(Op2));
25088 };
25089 if (Subtarget.canUseCMOV() && (VT == MVT::i32 || VT == MVT::i64) &&
25090 ((CondCode == X86::COND_NE && MatchFFSMinus1(Op1, Op2)) ||
25091 (CondCode == X86::COND_E && MatchFFSMinus1(Op2, Op1)))) {
25092 // Keep Cmp.
25093 } else if (SDValue R = LowerSELECTWithCmpZero(CmpOp0, Op1, Op2, CondCode,
25094 DL, DAG, Subtarget)) {
25095 return R;
25096 } else if ((VT == MVT::i32 || VT == MVT::i64) && isNullConstant(Op2) &&
25097 Cmp.getNode()->hasOneUse() && (CmpOp0 == Op1) &&
25098 ((CondCode == X86::COND_S) || // smin(x, 0)
25099 (CondCode == X86::COND_G && hasAndNot(Op1)))) { // smax(x, 0)
25100 // (select (x < 0), x, 0) -> ((x >> (size_in_bits(x)-1))) & x
25101 //
25102 // If the comparison is testing for a positive value, we have to invert
25103 // the sign bit mask, so only do that transform if the target has a
25104 // bitwise 'and not' instruction (the invert is free).
25105 // (select (x > 0), x, 0) -> (~(x >> (size_in_bits(x)-1))) & x
25106 unsigned ShCt = VT.getSizeInBits() - 1;
25107 SDValue ShiftAmt = DAG.getConstant(ShCt, DL, VT);
25108 SDValue Shift = DAG.getNode(ISD::SRA, DL, VT, Op1, ShiftAmt);
25109 if (CondCode == X86::COND_G)
25110 Shift = DAG.getNOT(DL, Shift, VT);
25111 return DAG.getNode(ISD::AND, DL, VT, Shift, Op1);
25112 }
25113 }
25114
25115 // Look past (and (setcc_carry (cmp ...)), 1).
25116 if (Cond.getOpcode() == ISD::AND &&
25117 Cond.getOperand(0).getOpcode() == X86ISD::SETCC_CARRY &&
25118 isOneConstant(Cond.getOperand(1)))
25119 Cond = Cond.getOperand(0);
25120
25121 // Attempt to fold "raw cond" cases by treating them as:
25122 // (select (and X, 1), Op1, Op2 --> (select (icmpeq (and X, 1), 0), Op2, Op1)
25123 if (Cond.getOpcode() == ISD::AND && isOneConstant(Cond.getOperand(1)))
25124 if (SDValue R = LowerSELECTWithCmpZero(Cond, Op2, Op1, X86::COND_E, DL, DAG,
25125 Subtarget))
25126 return R;
25127
25128 // If condition flag is set by a X86ISD::CMP, then use it as the condition
25129 // setting operand in place of the X86ISD::SETCC.
25130 unsigned CondOpcode = Cond.getOpcode();
25131 if (CondOpcode == X86ISD::SETCC ||
25132 CondOpcode == X86ISD::SETCC_CARRY) {
25133 CC = Cond.getOperand(0);
25134
25135 SDValue Cmp = Cond.getOperand(1);
25136 bool IllegalFPCMov = false;
25137 if (VT.isFloatingPoint() && !VT.isVector() &&
25138 !isScalarFPTypeInSSEReg(VT) && Subtarget.canUseCMOV()) // FPStack?
25139 IllegalFPCMov = !hasFPCMov(cast<ConstantSDNode>(CC)->getSExtValue());
25140
25141 if ((isX86LogicalCmp(Cmp) && !IllegalFPCMov) ||
25142 Cmp.getOpcode() == X86ISD::BT) { // FIXME
25143 Cond = Cmp;
25144 AddTest = false;
25145 }
25146 } else if (CondOpcode == ISD::USUBO || CondOpcode == ISD::SSUBO ||
25147 CondOpcode == ISD::UADDO || CondOpcode == ISD::SADDO ||
25148 CondOpcode == ISD::UMULO || CondOpcode == ISD::SMULO) {
25149 SDValue Value;
25150 X86::CondCode X86Cond;
25151 std::tie(Value, Cond) = getX86XALUOOp(X86Cond, Cond.getValue(0), DAG);
25152
25153 CC = DAG.getTargetConstant(X86Cond, DL, MVT::i8);
25154 AddTest = false;
25155 }
25156
25157 if (AddTest) {
25158 // Look past the truncate if the high bits are known zero.
25159     if (isTruncWithZeroHighBitsInput(Cond, DAG))
25160       Cond = Cond.getOperand(0);
25161
25162 // We know the result of AND is compared against zero. Try to match
25163 // it to BT.
25164 if (Cond.getOpcode() == ISD::AND && Cond.hasOneUse()) {
25165 X86::CondCode X86CondCode;
25166 if (SDValue BT = LowerAndToBT(Cond, ISD::SETNE, DL, DAG, X86CondCode)) {
25167 CC = DAG.getTargetConstant(X86CondCode, DL, MVT::i8);
25168 Cond = BT;
25169 AddTest = false;
25170 }
25171 }
25172 }
25173
25174 if (AddTest) {
25175 CC = DAG.getTargetConstant(X86::COND_NE, DL, MVT::i8);
25176 Cond = EmitTest(Cond, X86::COND_NE, DL, DAG, Subtarget);
25177 }
25178
25179 // a < b ? -1 : 0 -> RES = ~setcc_carry
25180 // a < b ? 0 : -1 -> RES = setcc_carry
25181 // a >= b ? -1 : 0 -> RES = setcc_carry
25182 // a >= b ? 0 : -1 -> RES = ~setcc_carry
25183 if (Cond.getOpcode() == X86ISD::SUB) {
25184 unsigned CondCode = CC->getAsZExtVal();
25185
25186 if ((CondCode == X86::COND_AE || CondCode == X86::COND_B) &&
25187 (isAllOnesConstant(Op1) || isAllOnesConstant(Op2)) &&
25188 (isNullConstant(Op1) || isNullConstant(Op2))) {
25189 SDValue Res =
25190 DAG.getNode(X86ISD::SETCC_CARRY, DL, Op.getValueType(),
25191 DAG.getTargetConstant(X86::COND_B, DL, MVT::i8), Cond);
25192 if (isAllOnesConstant(Op1) != (CondCode == X86::COND_B))
25193 return DAG.getNOT(DL, Res, Res.getValueType());
25194 return Res;
25195 }
25196 }
25197
25198 // X86 doesn't have an i8 cmov. If both operands are the result of a truncate
25199 // widen the cmov and push the truncate through. This avoids introducing a new
25200 // branch during isel and doesn't add any extensions.
25201 if (Op.getValueType() == MVT::i8 &&
25202 Op1.getOpcode() == ISD::TRUNCATE && Op2.getOpcode() == ISD::TRUNCATE) {
25203 SDValue T1 = Op1.getOperand(0), T2 = Op2.getOperand(0);
25204 if (T1.getValueType() == T2.getValueType() &&
25205 // Exclude CopyFromReg to avoid partial register stalls.
25206 T1.getOpcode() != ISD::CopyFromReg && T2.getOpcode()!=ISD::CopyFromReg){
25207 SDValue Cmov = DAG.getNode(X86ISD::CMOV, DL, T1.getValueType(), T2, T1,
25208 CC, Cond);
25209 return DAG.getNode(ISD::TRUNCATE, DL, Op.getValueType(), Cmov);
25210 }
25211 }
25212
25213 // Or finally, promote i8 cmovs if we have CMOV,
25214 // or i16 cmovs if it won't prevent folding a load.
25215 // FIXME: we should not limit promotion of i8 case to only when the CMOV is
25216 // legal, but EmitLoweredSelect() can not deal with these extensions
25217 // being inserted between two CMOV's. (in i16 case too TBN)
25218 // https://bugs.llvm.org/show_bug.cgi?id=40974
25219 if ((Op.getValueType() == MVT::i8 && Subtarget.canUseCMOV()) ||
25220 (Op.getValueType() == MVT::i16 && !X86::mayFoldLoad(Op1, Subtarget) &&
25221 !X86::mayFoldLoad(Op2, Subtarget))) {
25222 Op1 = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, Op1);
25223 Op2 = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, Op2);
25224 SDValue Ops[] = { Op2, Op1, CC, Cond };
25225 SDValue Cmov = DAG.getNode(X86ISD::CMOV, DL, MVT::i32, Ops);
25226 return DAG.getNode(ISD::TRUNCATE, DL, Op.getValueType(), Cmov);
25227 }
25228
25229 // X86ISD::CMOV means set the result (which is operand 1) to the RHS if
25230 // condition is true.
25231 SDValue Ops[] = { Op2, Op1, CC, Cond };
25232 return DAG.getNode(X86ISD::CMOV, DL, Op.getValueType(), Ops, Op->getFlags());
25233}
25234
25235 static SDValue LowerSIGN_EXTEND_Mask(SDValue Op, const SDLoc &dl,
25236                                      const X86Subtarget &Subtarget,
25237 SelectionDAG &DAG) {
25238 MVT VT = Op->getSimpleValueType(0);
25239 SDValue In = Op->getOperand(0);
25240 MVT InVT = In.getSimpleValueType();
25241 assert(InVT.getVectorElementType() == MVT::i1 && "Unexpected input type!");
25242 MVT VTElt = VT.getVectorElementType();
25243 unsigned NumElts = VT.getVectorNumElements();
25244
25245 // Extend VT if the scalar type is i8/i16 and BWI is not supported.
25246 MVT ExtVT = VT;
25247 if (!Subtarget.hasBWI() && VTElt.getSizeInBits() <= 16) {
25248 // If v16i32 is to be avoided, we'll need to split and concatenate.
25249 if (NumElts == 16 && !Subtarget.canExtendTo512DQ())
25250 return SplitAndExtendv16i1(Op.getOpcode(), VT, In, dl, DAG);
25251
25252 ExtVT = MVT::getVectorVT(MVT::i32, NumElts);
25253 }
25254
25255 // Widen to 512-bits if VLX is not supported.
25256 MVT WideVT = ExtVT;
25257 if (!ExtVT.is512BitVector() && !Subtarget.hasVLX()) {
25258 NumElts *= 512 / ExtVT.getSizeInBits();
25259 InVT = MVT::getVectorVT(MVT::i1, NumElts);
25260 In = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, InVT, DAG.getUNDEF(InVT), In,
25261 DAG.getVectorIdxConstant(0, dl));
25262 WideVT = MVT::getVectorVT(ExtVT.getVectorElementType(), NumElts);
25263 }
25264
25265 SDValue V;
25266 MVT WideEltVT = WideVT.getVectorElementType();
25267 if ((Subtarget.hasDQI() && WideEltVT.getSizeInBits() >= 32) ||
25268 (Subtarget.hasBWI() && WideEltVT.getSizeInBits() <= 16)) {
25269 V = DAG.getNode(Op.getOpcode(), dl, WideVT, In);
25270 } else {
25271 SDValue NegOne = DAG.getAllOnesConstant(dl, WideVT);
25272 SDValue Zero = DAG.getConstant(0, dl, WideVT);
25273 V = DAG.getSelect(dl, WideVT, In, NegOne, Zero);
25274 }
25275
25276 // Truncate if we had to extend i16/i8 above.
25277 if (VT != ExtVT) {
25278 WideVT = MVT::getVectorVT(VTElt, NumElts);
25279 V = DAG.getNode(ISD::TRUNCATE, dl, WideVT, V);
25280 }
25281
25282 // Extract back to 128/256-bit if we widened.
25283 if (WideVT != VT)
25284 V = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, V,
25285 DAG.getVectorIdxConstant(0, dl));
25286
25287 return V;
25288}
25289
25290 static SDValue LowerANY_EXTEND(SDValue Op, const X86Subtarget &Subtarget,
25291                                SelectionDAG &DAG) {
25292 SDValue In = Op->getOperand(0);
25293 MVT InVT = In.getSimpleValueType();
25294 SDLoc DL(Op);
25295
25296 if (InVT.getVectorElementType() == MVT::i1)
25297 return LowerSIGN_EXTEND_Mask(Op, DL, Subtarget, DAG);
25298
25299 assert(Subtarget.hasAVX() && "Expected AVX support");
25300 return LowerAVXExtend(Op, DL, DAG, Subtarget);
25301}
25302
25303// Lowering for SIGN_EXTEND_VECTOR_INREG and ZERO_EXTEND_VECTOR_INREG.
25304// For sign extend this needs to handle all vector sizes and SSE4.1 and
25305// non-SSE4.1 targets. For zero extend this should only handle inputs of
25306// MVT::v64i8 when BWI is not supported, but AVX512 is.
25307 static SDValue LowerEXTEND_VECTOR_INREG(SDValue Op,
25308                                         const X86Subtarget &Subtarget,
25309 SelectionDAG &DAG) {
25310 SDValue In = Op->getOperand(0);
25311 MVT VT = Op->getSimpleValueType(0);
25312 MVT InVT = In.getSimpleValueType();
25313
25314 MVT SVT = VT.getVectorElementType();
25315 MVT InSVT = InVT.getVectorElementType();
25317
25318 if (SVT != MVT::i64 && SVT != MVT::i32 && SVT != MVT::i16)
25319 return SDValue();
25320 if (InSVT != MVT::i32 && InSVT != MVT::i16 && InSVT != MVT::i8)
25321 return SDValue();
25322 if (!(VT.is128BitVector() && Subtarget.hasSSE2()) &&
25323 !(VT.is256BitVector() && Subtarget.hasAVX()) &&
25324 !(VT.is512BitVector() && Subtarget.hasAVX512()))
25325 return SDValue();
25326
25327 SDLoc dl(Op);
25328 unsigned Opc = Op.getOpcode();
25329 unsigned NumElts = VT.getVectorNumElements();
25330
25331 // For 256-bit vectors, we only need the lower (128-bit) half of the input.
25332 // For 512-bit vectors, we need 128-bits or 256-bits.
25333 if (InVT.getSizeInBits() > 128) {
25334 // Input needs to be at least the same number of elements as output, and
25335 // at least 128-bits.
25336 int InSize = InSVT.getSizeInBits() * NumElts;
25337 In = extractSubVector(In, 0, DAG, dl, std::max(InSize, 128));
25338 InVT = In.getSimpleValueType();
25339 }
25340
25341 // SSE41 targets can use the pmov[sz]x* instructions directly for 128-bit results,
25342   // so those are legal and shouldn't occur here. AVX2/AVX512 pmovsx* instructions still
25343 // need to be handled here for 256/512-bit results.
25344 if (Subtarget.hasInt256()) {
25345 assert(VT.getSizeInBits() > 128 && "Unexpected 128-bit vector extension");
25346
25347 if (InVT.getVectorNumElements() != NumElts)
25348 return DAG.getNode(Op.getOpcode(), dl, VT, In);
25349
25350 // FIXME: Apparently we create inreg operations that could be regular
25351 // extends.
25352 unsigned ExtOpc =
25353         Opc == ISD::SIGN_EXTEND_VECTOR_INREG ? ISD::SIGN_EXTEND
25354                                              : ISD::ZERO_EXTEND;
25355     return DAG.getNode(ExtOpc, dl, VT, In);
25356 }
25357
25358 // pre-AVX2 256-bit extensions need to be split into 128-bit instructions.
25359 if (Subtarget.hasAVX()) {
25360 assert(VT.is256BitVector() && "256-bit vector expected");
25361 MVT HalfVT = VT.getHalfNumVectorElementsVT();
25362 int HalfNumElts = HalfVT.getVectorNumElements();
25363
25364 unsigned NumSrcElts = InVT.getVectorNumElements();
25365 SmallVector<int, 16> HiMask(NumSrcElts, SM_SentinelUndef);
25366 for (int i = 0; i != HalfNumElts; ++i)
25367 HiMask[i] = HalfNumElts + i;
25368
25369 SDValue Lo = DAG.getNode(Opc, dl, HalfVT, In);
25370 SDValue Hi = DAG.getVectorShuffle(InVT, dl, In, DAG.getUNDEF(InVT), HiMask);
25371 Hi = DAG.getNode(Opc, dl, HalfVT, Hi);
25372 return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, Lo, Hi);
25373 }
25374
25375 // We should only get here for sign extend.
25376 assert(Opc == ISD::SIGN_EXTEND_VECTOR_INREG && "Unexpected opcode!");
25377 assert(VT.is128BitVector() && InVT.is128BitVector() && "Unexpected VTs");
25378 unsigned InNumElts = InVT.getVectorNumElements();
25379
25380 // If the source elements are already all-signbits, we don't need to extend,
25381 // just splat the elements.
25382 APInt DemandedElts = APInt::getLowBitsSet(InNumElts, NumElts);
25383 if (DAG.ComputeNumSignBits(In, DemandedElts) == InVT.getScalarSizeInBits()) {
25384 unsigned Scale = InNumElts / NumElts;
25385 SmallVector<int, 16> ShuffleMask;
25386 for (unsigned I = 0; I != NumElts; ++I)
25387 ShuffleMask.append(Scale, I);
25388 return DAG.getBitcast(VT,
25389 DAG.getVectorShuffle(InVT, dl, In, In, ShuffleMask));
25390 }
25391
25392 // pre-SSE41 targets unpack lower lanes and then sign-extend using SRAI.
25393 SDValue Curr = In;
25394 SDValue SignExt = Curr;
25395
25396 // As SRAI is only available on i16/i32 types, we expand only up to i32
25397 // and handle i64 separately.
25398 if (InVT != MVT::v4i32) {
25399 MVT DestVT = VT == MVT::v2i64 ? MVT::v4i32 : VT;
25400
25401 unsigned DestWidth = DestVT.getScalarSizeInBits();
25402 unsigned Scale = DestWidth / InSVT.getSizeInBits();
25403 unsigned DestElts = DestVT.getVectorNumElements();
25404
25405 // Build a shuffle mask that takes each input element and places it in the
25406 // MSBs of the new element size.
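    // e.g. (illustrative) for a v16i8 source extended to v4i32, Scale == 4 and
    // the mask becomes {U,U,U,0, U,U,U,1, U,U,U,2, U,U,U,3}: each byte lands in
    // the most significant byte of its 32-bit lane before the arithmetic shift.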
25407 SmallVector<int, 16> Mask(InNumElts, SM_SentinelUndef);
25408 for (unsigned i = 0; i != DestElts; ++i)
25409 Mask[i * Scale + (Scale - 1)] = i;
25410
25411 Curr = DAG.getVectorShuffle(InVT, dl, In, In, Mask);
25412 Curr = DAG.getBitcast(DestVT, Curr);
25413
25414 unsigned SignExtShift = DestWidth - InSVT.getSizeInBits();
25415 SignExt = DAG.getNode(X86ISD::VSRAI, dl, DestVT, Curr,
25416 DAG.getTargetConstant(SignExtShift, dl, MVT::i8));
25417 }
25418
25419 if (VT == MVT::v2i64) {
25420 assert(Curr.getValueType() == MVT::v4i32 && "Unexpected input VT");
25421 SDValue Zero = DAG.getConstant(0, dl, MVT::v4i32);
25422 SDValue Sign = DAG.getSetCC(dl, MVT::v4i32, Zero, Curr, ISD::SETGT);
25423 SignExt = DAG.getVectorShuffle(MVT::v4i32, dl, SignExt, Sign, {0, 4, 1, 5});
25424 SignExt = DAG.getBitcast(VT, SignExt);
25425 }
25426
25427 return SignExt;
25428}
25429
25430 static SDValue LowerSIGN_EXTEND(SDValue Op, const X86Subtarget &Subtarget,
25431                                 SelectionDAG &DAG) {
25432 MVT VT = Op->getSimpleValueType(0);
25433 SDValue In = Op->getOperand(0);
25434 MVT InVT = In.getSimpleValueType();
25435 SDLoc dl(Op);
25436
25437 if (InVT.getVectorElementType() == MVT::i1)
25438 return LowerSIGN_EXTEND_Mask(Op, dl, Subtarget, DAG);
25439
25440 assert(VT.isVector() && InVT.isVector() && "Expected vector type");
25441   assert(VT.getVectorNumElements() == InVT.getVectorNumElements() &&
25442          "Expected same number of elements");
25443 assert((VT.getVectorElementType() == MVT::i16 ||
25444 VT.getVectorElementType() == MVT::i32 ||
25445 VT.getVectorElementType() == MVT::i64) &&
25446 "Unexpected element type");
25447 assert((InVT.getVectorElementType() == MVT::i8 ||
25448 InVT.getVectorElementType() == MVT::i16 ||
25449 InVT.getVectorElementType() == MVT::i32) &&
25450 "Unexpected element type");
25451
25452 if (VT == MVT::v32i16 && !Subtarget.hasBWI()) {
25453 assert(InVT == MVT::v32i8 && "Unexpected VT!");
25454 return splitVectorIntUnary(Op, DAG, dl);
25455 }
25456
25457 if (Subtarget.hasInt256())
25458 return Op;
25459
25460 // Optimize vectors in AVX mode
25461 // Sign extend v8i16 to v8i32 and
25462 // v4i32 to v4i64
25463 //
25464 // Divide input vector into two parts
25465 // for v4i32 the high shuffle mask will be {2, 3, -1, -1}
25466 // use vpmovsx instruction to extend v4i32 -> v2i64; v8i16 -> v4i32
25467 // concat the vectors to original VT
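  // e.g. v8i16 -> v8i32: the low half extends elements 0-3 directly, the
  // shuffle {4,5,6,7,-1,-1,-1,-1} moves the high elements down for a second
  // in-reg extend, and the two v4i32 halves are concatenated.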
25468 MVT HalfVT = VT.getHalfNumVectorElementsVT();
25469 SDValue OpLo = DAG.getNode(ISD::SIGN_EXTEND_VECTOR_INREG, dl, HalfVT, In);
25470
25471 unsigned NumElems = InVT.getVectorNumElements();
25472 SmallVector<int,8> ShufMask(NumElems, -1);
25473 for (unsigned i = 0; i != NumElems/2; ++i)
25474 ShufMask[i] = i + NumElems/2;
25475
25476 SDValue OpHi = DAG.getVectorShuffle(InVT, dl, In, In, ShufMask);
25477 OpHi = DAG.getNode(ISD::SIGN_EXTEND_VECTOR_INREG, dl, HalfVT, OpHi);
25478
25479 return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, OpLo, OpHi);
25480}
25481
25482/// Change a vector store into a pair of half-size vector stores.
25483 static SDValue splitVectorStore(StoreSDNode *Store, SelectionDAG &DAG) {
25484   SDValue StoredVal = Store->getValue();
25485 assert((StoredVal.getValueType().is256BitVector() ||
25486 StoredVal.getValueType().is512BitVector()) &&
25487 "Expecting 256/512-bit op");
25488
25489 // Splitting volatile memory ops is not allowed unless the operation was not
25490 // legal to begin with. Assume the input store is legal (this transform is
25491 // only used for targets with AVX). Note: It is possible that we have an
25492 // illegal type like v2i128, and so we could allow splitting a volatile store
25493 // in that case if that is important.
25494 if (!Store->isSimple())
25495 return SDValue();
25496
25497 SDLoc DL(Store);
25498 SDValue Value0, Value1;
25499 std::tie(Value0, Value1) = splitVector(StoredVal, DAG, DL);
25500 unsigned HalfOffset = Value0.getValueType().getStoreSize();
25501 SDValue Ptr0 = Store->getBasePtr();
25502 SDValue Ptr1 =
25503 DAG.getMemBasePlusOffset(Ptr0, TypeSize::getFixed(HalfOffset), DL);
25504 SDValue Ch0 =
25505 DAG.getStore(Store->getChain(), DL, Value0, Ptr0, Store->getPointerInfo(),
25506 Store->getBaseAlign(), Store->getMemOperand()->getFlags());
25507 SDValue Ch1 =
25508 DAG.getStore(Store->getChain(), DL, Value1, Ptr1,
25509 Store->getPointerInfo().getWithOffset(HalfOffset),
25510 Store->getBaseAlign(), Store->getMemOperand()->getFlags());
25511 return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Ch0, Ch1);
25512}
25513
25514/// Scalarize a vector store, bitcasting to TargetVT to determine the scalar
25515/// type.
25516 static SDValue scalarizeVectorStore(StoreSDNode *Store, MVT StoreVT,
25517                                     SelectionDAG &DAG) {
25518 SDValue StoredVal = Store->getValue();
25519 assert(StoreVT.is128BitVector() &&
25520 StoredVal.getValueType().is128BitVector() && "Expecting 128-bit op");
25521 StoredVal = DAG.getBitcast(StoreVT, StoredVal);
25522
25523 // Splitting volatile memory ops is not allowed unless the operation was not
25524 // legal to begin with. We are assuming the input op is legal (this transform
25525 // is only used for targets with AVX).
25526 if (!Store->isSimple())
25527 return SDValue();
25528
25529 MVT StoreSVT = StoreVT.getScalarType();
25530 unsigned NumElems = StoreVT.getVectorNumElements();
25531 unsigned ScalarSize = StoreSVT.getStoreSize();
25532
25533 SDLoc DL(Store);
25534   SmallVector<SDValue, 4> Stores;
25535   for (unsigned i = 0; i != NumElems; ++i) {
25536 unsigned Offset = i * ScalarSize;
25537 SDValue Ptr = DAG.getMemBasePlusOffset(Store->getBasePtr(),
25538                                            TypeSize::getFixed(Offset), DL);
25539     SDValue Scl = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, StoreSVT, StoredVal,
25540 DAG.getVectorIdxConstant(i, DL));
25541 SDValue Ch =
25542 DAG.getStore(Store->getChain(), DL, Scl, Ptr,
25543 Store->getPointerInfo().getWithOffset(Offset),
25544 Store->getBaseAlign(), Store->getMemOperand()->getFlags());
25545 Stores.push_back(Ch);
25546 }
25547 return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Stores);
25548}
25549
25550static SDValue LowerStore(SDValue Op, const X86Subtarget &Subtarget,
25551 SelectionDAG &DAG) {
25552 StoreSDNode *St = cast<StoreSDNode>(Op.getNode());
25553 SDLoc dl(St);
25554 SDValue StoredVal = St->getValue();
25555
25556 // Without AVX512DQ, we need to use a scalar type for v2i1/v4i1/v8i1 stores.
25557 if (StoredVal.getValueType().isVector() &&
25558 StoredVal.getValueType().getVectorElementType() == MVT::i1) {
25559 unsigned NumElts = StoredVal.getValueType().getVectorNumElements();
25560 assert(NumElts <= 8 && "Unexpected VT");
25561 assert(!St->isTruncatingStore() && "Expected non-truncating store");
25562 assert(Subtarget.hasAVX512() && !Subtarget.hasDQI() &&
25563 "Expected AVX512F without AVX512DQI");
25564
25565 // We must pad with zeros to ensure we store zeroes to any unused bits.
25566 StoredVal = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, MVT::v16i1,
25567 DAG.getUNDEF(MVT::v16i1), StoredVal,
25568 DAG.getVectorIdxConstant(0, dl));
25569 StoredVal = DAG.getBitcast(MVT::i16, StoredVal);
25570 StoredVal = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, StoredVal);
25571 // Make sure we store zeros in the extra bits.
25572 if (NumElts < 8)
25573 StoredVal = DAG.getZeroExtendInReg(
25574 StoredVal, dl, EVT::getIntegerVT(*DAG.getContext(), NumElts));
25575
25576 return DAG.getStore(St->getChain(), dl, StoredVal, St->getBasePtr(),
25577 St->getPointerInfo(), St->getBaseAlign(),
25578 St->getMemOperand()->getFlags());
25579 }
25580
25581 if (St->isTruncatingStore())
25582 return SDValue();
25583
25584 // If this is a 256/512-bit store of concatenated ops, we are better off
25585 // splitting that store into two half-size stores. This avoids spurious use of
25586 // concatenated ops and each half can execute independently. Some cores would
25587 // split the op into halves anyway, so the concat is purely an extra op.
25588 MVT StoreVT = StoredVal.getSimpleValueType();
25589 if (StoreVT.is256BitVector() || StoreVT.is512BitVector()) {
25590 if (StoredVal.hasOneUse() && isFreeToSplitVector(StoredVal, DAG))
25591 return splitVectorStore(St, DAG);
25592 return SDValue();
25593 }
25594
25595 if (StoreVT.is32BitVector())
25596 return SDValue();
25597
25598 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
25599 assert(StoreVT.is64BitVector() && "Unexpected VT");
25600 assert(TLI.getTypeAction(*DAG.getContext(), StoreVT) ==
25601              TargetLowering::TypeWidenVector &&
25602          "Unexpected type action!");
25603
25604 EVT WideVT = TLI.getTypeToTransformTo(*DAG.getContext(), StoreVT);
25605 StoredVal = DAG.getNode(ISD::CONCAT_VECTORS, dl, WideVT, StoredVal,
25606 DAG.getUNDEF(StoreVT));
25607
25608 if (Subtarget.hasSSE2()) {
25609 // Widen the vector, cast to a v2x64 type, extract the single 64-bit element
25610 // and store it.
25611 MVT StVT = Subtarget.is64Bit() && StoreVT.isInteger() ? MVT::i64 : MVT::f64;
25612 MVT CastVT = MVT::getVectorVT(StVT, 2);
25613 StoredVal = DAG.getBitcast(CastVT, StoredVal);
25614 StoredVal = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, StVT, StoredVal,
25615 DAG.getVectorIdxConstant(0, dl));
25616
25617 return DAG.getStore(St->getChain(), dl, StoredVal, St->getBasePtr(),
25618 St->getPointerInfo(), St->getBaseAlign(),
25619 St->getMemOperand()->getFlags());
25620 }
25621 assert(Subtarget.hasSSE1() && "Expected SSE");
25622 SDVTList Tys = DAG.getVTList(MVT::Other);
25623 SDValue Ops[] = {St->getChain(), StoredVal, St->getBasePtr()};
25624 return DAG.getMemIntrinsicNode(X86ISD::VEXTRACT_STORE, dl, Tys, Ops, MVT::i64,
25625 St->getMemOperand());
25626}
25627
25628// Lower vector extended loads using a shuffle. If SSSE3 is not available we
25629// may emit an illegal shuffle but the expansion is still better than scalar
25630 // code. We generate sext/sext_invec for SEXTLOADs if available, otherwise
25631 // we'll emit a shuffle and an arithmetic shift.
25632// FIXME: Is the expansion actually better than scalar code? It doesn't seem so.
25633// TODO: It is possible to support ZExt by zeroing the undef values during
25634// the shuffle phase or after the shuffle.
25635static SDValue LowerLoad(SDValue Op, const X86Subtarget &Subtarget,
25636 SelectionDAG &DAG) {
25637 MVT RegVT = Op.getSimpleValueType();
25638 assert(RegVT.isVector() && "We only custom lower vector loads.");
25639 assert(RegVT.isInteger() &&
25640 "We only custom lower integer vector loads.");
25641
25642 LoadSDNode *Ld = cast<LoadSDNode>(Op.getNode());
25643 SDLoc dl(Ld);
25644
25645 // Without AVX512DQ, we need to use a scalar type for v2i1/v4i1/v8i1 loads.
25646 if (RegVT.getVectorElementType() == MVT::i1) {
25647 assert(EVT(RegVT) == Ld->getMemoryVT() && "Expected non-extending load");
25648 assert(RegVT.getVectorNumElements() <= 8 && "Unexpected VT");
25649 assert(Subtarget.hasAVX512() && !Subtarget.hasDQI() &&
25650 "Expected AVX512F without AVX512DQI");
25651
25652 SDValue NewLd = DAG.getLoad(MVT::i8, dl, Ld->getChain(), Ld->getBasePtr(),
25653 Ld->getPointerInfo(), Ld->getBaseAlign(),
25654 Ld->getMemOperand()->getFlags());
25655
25656 // Replace chain users with the new chain.
25657 assert(NewLd->getNumValues() == 2 && "Loads must carry a chain!");
25658
25659 SDValue Val = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i16, NewLd);
25660 Val = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, RegVT,
25661 DAG.getBitcast(MVT::v16i1, Val),
25662 DAG.getVectorIdxConstant(0, dl));
25663 return DAG.getMergeValues({Val, NewLd.getValue(1)}, dl);
25664 }
25665
25666 return SDValue();
25667}
25668
25669/// Return true if node is an ISD::AND or ISD::OR of two X86ISD::SETCC nodes
25670/// each of which has no other use apart from the AND / OR.
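/// For example, (or (X86ISD::SETCC c0, f0), (X86ISD::SETCC c1, f1)) qualifies
/// only when each SETCC result feeds nothing but this OR.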
25671static bool isAndOrOfSetCCs(SDValue Op, unsigned &Opc) {
25672 Opc = Op.getOpcode();
25673 if (Opc != ISD::OR && Opc != ISD::AND)
25674 return false;
25675 return (Op.getOperand(0).getOpcode() == X86ISD::SETCC &&
25676 Op.getOperand(0).hasOneUse() &&
25677 Op.getOperand(1).getOpcode() == X86ISD::SETCC &&
25678 Op.getOperand(1).hasOneUse());
25679}
25680
25681SDValue X86TargetLowering::LowerBRCOND(SDValue Op, SelectionDAG &DAG) const {
25682 SDValue Chain = Op.getOperand(0);
25683 SDValue Cond = Op.getOperand(1);
25684 SDValue Dest = Op.getOperand(2);
25685 SDLoc dl(Op);
25686
25687 // Bail out when we don't have native compare instructions.
25688 if (Cond.getOpcode() == ISD::SETCC &&
25689 Cond.getOperand(0).getValueType() != MVT::f128 &&
25690 !isSoftF16(Cond.getOperand(0).getValueType(), Subtarget)) {
25691 SDValue LHS = Cond.getOperand(0);
25692 SDValue RHS = Cond.getOperand(1);
25693 ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get();
25694
25695 // Special case for
25696 // setcc([su]{add,sub,mul}o == 0)
25697 // setcc([su]{add,sub,mul}o != 1)
25698 if (ISD::isOverflowIntrOpRes(LHS) &&
25699 (CC == ISD::SETEQ || CC == ISD::SETNE) &&
25700 (isNullConstant(RHS) || isOneConstant(RHS))) {
25701 SDValue Value, Overflow;
25702 X86::CondCode X86Cond;
25703 std::tie(Value, Overflow) = getX86XALUOOp(X86Cond, LHS.getValue(0), DAG);
25704
25705 if ((CC == ISD::SETEQ) == isNullConstant(RHS))
25706 X86Cond = X86::GetOppositeBranchCondition(X86Cond);
25707
25708 SDValue CCVal = DAG.getTargetConstant(X86Cond, dl, MVT::i8);
25709 return DAG.getNode(X86ISD::BRCOND, dl, MVT::Other, Chain, Dest, CCVal,
25710 Overflow, Op->getFlags());
25711 }
25712
25713 if (LHS.getSimpleValueType().isInteger()) {
25714 SDValue CCVal;
25715 SDValue EFLAGS = emitFlagsForSetcc(LHS, RHS, CC, SDLoc(Cond), DAG, CCVal);
25716 return DAG.getNode(X86ISD::BRCOND, dl, MVT::Other, Chain, Dest, CCVal,
25717 EFLAGS, Op->getFlags());
25718 }
25719
25720 if (CC == ISD::SETOEQ) {
25721 // For FCMP_OEQ, we can emit
25722 // two branches instead of an explicit AND instruction with a
25723 // separate test. However, we only do this if this block doesn't
25724 // have a fall-through edge, because this requires an explicit
25725 // jmp when the condition is false.
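// Concretely, OEQ is lowered below as a JNE to the false block followed by a
// JP to the false block (the unordered case), while the following
// unconditional branch is retargeted to the original destination.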
25726 if (Op.getNode()->hasOneUse()) {
25727 SDNode *User = *Op.getNode()->user_begin();
25728 // Look for an unconditional branch following this conditional branch.
25729 // We need this because we need to reverse the successors in order
25730 // to implement FCMP_OEQ.
25731 if (User->getOpcode() == ISD::BR) {
25732 SDValue FalseBB = User->getOperand(1);
25733 SDNode *NewBR =
25734 DAG.UpdateNodeOperands(User, User->getOperand(0), Dest);
25735 assert(NewBR == User);
25736 (void)NewBR;
25737 Dest = FalseBB;
25738
25739 SDValue Cmp =
25740 DAG.getNode(X86ISD::FCMP, SDLoc(Cond), MVT::i32, LHS, RHS);
25741 SDValue CCVal = DAG.getTargetConstant(X86::COND_NE, dl, MVT::i8);
25742 Chain = DAG.getNode(X86ISD::BRCOND, dl, MVT::Other, Chain, Dest,
25743 CCVal, Cmp, Op->getFlags());
25744 CCVal = DAG.getTargetConstant(X86::COND_P, dl, MVT::i8);
25745 return DAG.getNode(X86ISD::BRCOND, dl, MVT::Other, Chain, Dest, CCVal,
25746 Cmp, Op->getFlags());
25747 }
25748 }
25749 } else if (CC == ISD::SETUNE) {
25750 // For FCMP_UNE, we can emit
25751 // two branches instead of an explicit OR instruction with a
25752 // separate test.
25753 SDValue Cmp = DAG.getNode(X86ISD::FCMP, SDLoc(Cond), MVT::i32, LHS, RHS);
25754 SDValue CCVal = DAG.getTargetConstant(X86::COND_NE, dl, MVT::i8);
25755 Chain = DAG.getNode(X86ISD::BRCOND, dl, MVT::Other, Chain, Dest, CCVal,
25756 Cmp, Op->getFlags());
25757 CCVal = DAG.getTargetConstant(X86::COND_P, dl, MVT::i8);
25758 return DAG.getNode(X86ISD::BRCOND, dl, MVT::Other, Chain, Dest, CCVal,
25759 Cmp, Op->getFlags());
25760 } else {
25761 X86::CondCode X86Cond =
25762 TranslateX86CC(CC, dl, /*IsFP*/ true, LHS, RHS, DAG);
25763 SDValue Cmp = DAG.getNode(X86ISD::FCMP, SDLoc(Cond), MVT::i32, LHS, RHS);
25764 SDValue CCVal = DAG.getTargetConstant(X86Cond, dl, MVT::i8);
25765 return DAG.getNode(X86ISD::BRCOND, dl, MVT::Other, Chain, Dest, CCVal,
25766 Cmp, Op->getFlags());
25767 }
25768 }
25769
25770 if (ISD::isOverflowIntrOpRes(Cond)) {
25771 SDValue Value, Overflow;
25772 X86::CondCode X86Cond;
25773 std::tie(Value, Overflow) = getX86XALUOOp(X86Cond, Cond.getValue(0), DAG);
25774
25775 SDValue CCVal = DAG.getTargetConstant(X86Cond, dl, MVT::i8);
25776 return DAG.getNode(X86ISD::BRCOND, dl, MVT::Other, Chain, Dest, CCVal,
25777 Overflow, Op->getFlags());
25778 }
25779
25780 // Look past the truncate if the high bits are known zero.
25781 if (isTruncWithZeroHighBitsInput(Cond, DAG))
25782 Cond = Cond.getOperand(0);
25783
25784 EVT CondVT = Cond.getValueType();
25785
25786 // Add an AND with 1 if we don't already have one.
25787 if (!(Cond.getOpcode() == ISD::AND && isOneConstant(Cond.getOperand(1))))
25788 Cond =
25789 DAG.getNode(ISD::AND, dl, CondVT, Cond, DAG.getConstant(1, dl, CondVT));
25790
25791 SDValue LHS = Cond;
25792 SDValue RHS = DAG.getConstant(0, dl, CondVT);
25793
25794 SDValue CCVal;
25795 SDValue EFLAGS = emitFlagsForSetcc(LHS, RHS, ISD::SETNE, dl, DAG, CCVal);
25796 return DAG.getNode(X86ISD::BRCOND, dl, MVT::Other, Chain, Dest, CCVal, EFLAGS,
25797 Op->getFlags());
25798}
25799
25800// Lower dynamic stack allocation to _alloca call for Cygwin/Mingw targets.
25801// Calls to _alloca are needed to probe the stack when allocating more than 4k
25802// bytes in one go. Touching the stack at 4K increments is necessary to ensure
25803// that the guard pages used by the OS virtual memory manager are allocated in
25804// correct sequence.
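// For example, a single 16K allocation must touch each of its four pages in
// order; a direct store 16K below the stack pointer could land past the guard
// page and fault without the OS growing the stack.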
25805SDValue
25806X86TargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op,
25807 SelectionDAG &DAG) const {
25808 MachineFunction &MF = DAG.getMachineFunction();
25809 bool SplitStack = MF.shouldSplitStack();
25810 bool EmitStackProbeCall = hasStackProbeSymbol(MF);
25811 bool Lower = (Subtarget.isOSWindows() && !Subtarget.isTargetMachO()) ||
25812 SplitStack || EmitStackProbeCall;
25813 SDLoc dl(Op);
25814
25815 // Get the inputs.
25816 SDNode *Node = Op.getNode();
25817 SDValue Chain = Op.getOperand(0);
25818 SDValue Size = Op.getOperand(1);
25819 MaybeAlign Alignment(Op.getConstantOperandVal(2));
25820 EVT VT = Node->getValueType(0);
25821
25822 // Chain the dynamic stack allocation so that it doesn't modify the stack
25823 // pointer when other instructions are using the stack.
25824 Chain = DAG.getCALLSEQ_START(Chain, 0, 0, dl);
25825
25826 bool Is64Bit = Subtarget.is64Bit();
25827 MVT SPTy = Op.getValueType().getSimpleVT();
25828
25829 SDValue Result;
25830 if (!Lower) {
25831 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
25832 Register SPReg = TLI.getStackPointerRegisterToSaveRestore();
25833 assert(SPReg && "Target cannot require DYNAMIC_STACKALLOC expansion and"
25834 " not tell us which reg is the stack pointer!");
25835
25836 const TargetFrameLowering &TFI = *Subtarget.getFrameLowering();
25837 const Align StackAlign = TFI.getStackAlign();
25838 if (hasInlineStackProbe(MF)) {
25839 Result = DAG.getNode(X86ISD::PROBED_ALLOCA, dl, {SPTy, MVT::Other},
25840 {Chain, Size});
25841 Chain = Result.getValue(1);
25842 } else {
25843 SDValue SP = DAG.getCopyFromReg(Chain, dl, SPReg, VT);
25844 Chain = SP.getValue(1);
25845 Result = DAG.getNode(ISD::SUB, dl, VT, SP, Size); // Value
25846 }
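// When the requested alignment exceeds the target's stack alignment, round the
// new SP down: e.g. a 32-byte request produces the mask ~31, clearing the low
// five bits (the stack grows down, so masking only increases alignment).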
25847 if (Alignment && *Alignment > StackAlign)
25848 Result = DAG.getNode(
25849 ISD::AND, dl, VT, Result,
25850 DAG.getSignedConstant(~(Alignment->value() - 1ULL), dl, VT));
25851 Chain = DAG.getCopyToReg(Chain, dl, SPReg, Result); // Output chain
25852 } else if (SplitStack) {
25853 if (Is64Bit) {
25854 // The 64-bit implementation of segmented stacks needs to clobber both r10
25855 // and r11. This makes it impossible to use it along with nested parameters.
25856 const Function &F = MF.getFunction();
25857 for (const auto &A : F.args()) {
25858 if (A.hasNestAttr())
25859 report_fatal_error("Cannot use segmented stacks with functions that "
25860 "have nested arguments.");
25861 }
25862 }
25863
25864 Result =
25865 DAG.getNode(X86ISD::SEG_ALLOCA, dl, {SPTy, MVT::Other}, {Chain, Size});
25866 Chain = Result.getValue(1);
25867 } else {
25868 SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
25869 Chain = DAG.getNode(X86ISD::DYN_ALLOCA, dl, NodeTys, Chain, Size);
25870 MF.getInfo<X86MachineFunctionInfo>()->setHasDynAlloca(true);
25871
25872 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
25873 Register SPReg = RegInfo->getStackRegister();
25874 SDValue SP = DAG.getCopyFromReg(Chain, dl, SPReg, SPTy);
25875 Chain = SP.getValue(1);
25876
25877 if (Alignment) {
25878 SP = DAG.getNode(
25879 ISD::AND, dl, VT, SP.getValue(0),
25880 DAG.getSignedConstant(~(Alignment->value() - 1ULL), dl, VT));
25881 Chain = DAG.getCopyToReg(Chain, dl, SPReg, SP);
25882 }
25883
25884 Result = SP;
25885 }
25886
25887 Chain = DAG.getCALLSEQ_END(Chain, 0, 0, SDValue(), dl);
25888
25889 SDValue Ops[2] = {Result, Chain};
25890 return DAG.getMergeValues(Ops, dl);
25891}
25892
25893SDValue X86TargetLowering::LowerVASTART(SDValue Op, SelectionDAG &DAG) const {
25894 MachineFunction &MF = DAG.getMachineFunction();
25895 SDValue Ptr = Op.getOperand(1);
25896 EVT PtrVT = Ptr.getValueType();
25897
25898 X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
25899
25900 const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue();
25901 SDLoc DL(Op);
25902
25903 if (!Subtarget.is64Bit() ||
25904 Subtarget.isCallingConvWin64(MF.getFunction().getCallingConv())) {
25905 // vastart just stores the address of the VarArgsFrameIndex slot into the
25906 // memory location argument.
25907 SDValue FR = DAG.getFrameIndex(FuncInfo->getVarArgsFrameIndex(), PtrVT);
25908 return DAG.getStore(Op.getOperand(0), DL, FR, Ptr, MachinePointerInfo(SV));
25909 }
25910
25911 // __va_list_tag:
25912 // gp_offset (0 - 6 * 8)
25913 // fp_offset (48 - 48 + 8 * 16)
25914 // overflow_arg_area (point to parameters coming in memory).
25915 // reg_save_area
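// With the LP64 layout these fields live at byte offsets 0, 4, 8 and 16 of the
// va_list object, which is why the stores below use pointer-info offsets 0, 4,
// 8 and 16 (12 for the reg_save_area pointer on x32).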
25916 SmallVector<SDValue, 8> MemOps;
25917 SDValue FIN = Op.getOperand(1);
25918 // Store gp_offset
25919 SDValue Store = DAG.getStore(
25920 Op.getOperand(0), DL,
25921 DAG.getConstant(FuncInfo->getVarArgsGPOffset(), DL, MVT::i32), FIN,
25922 MachinePointerInfo(SV));
25923 MemOps.push_back(Store);
25924
25925 // Store fp_offset
25926 FIN = DAG.getMemBasePlusOffset(FIN, TypeSize::getFixed(4), DL);
25927 Store = DAG.getStore(
25928 Op.getOperand(0), DL,
25929 DAG.getConstant(FuncInfo->getVarArgsFPOffset(), DL, MVT::i32), FIN,
25930 MachinePointerInfo(SV, 4));
25931 MemOps.push_back(Store);
25932
25933 // Store ptr to overflow_arg_area
25934 FIN = DAG.getNode(ISD::ADD, DL, PtrVT, FIN, DAG.getIntPtrConstant(4, DL));
25935 SDValue OVFIN = DAG.getFrameIndex(FuncInfo->getVarArgsFrameIndex(), PtrVT);
25936 Store =
25937 DAG.getStore(Op.getOperand(0), DL, OVFIN, FIN, MachinePointerInfo(SV, 8));
25938 MemOps.push_back(Store);
25939
25940 // Store ptr to reg_save_area.
25941 FIN = DAG.getNode(ISD::ADD, DL, PtrVT, FIN, DAG.getIntPtrConstant(
25942 Subtarget.isTarget64BitLP64() ? 8 : 4, DL));
25943 SDValue RSFIN = DAG.getFrameIndex(FuncInfo->getRegSaveFrameIndex(), PtrVT);
25944 Store = DAG.getStore(
25945 Op.getOperand(0), DL, RSFIN, FIN,
25946 MachinePointerInfo(SV, Subtarget.isTarget64BitLP64() ? 16 : 12));
25947 MemOps.push_back(Store);
25948 return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, MemOps);
25949}
25950
25951SDValue X86TargetLowering::LowerVAARG(SDValue Op, SelectionDAG &DAG) const {
25952 assert(Subtarget.is64Bit() &&
25953 "LowerVAARG only handles 64-bit va_arg!");
25954 assert(Op.getNumOperands() == 4);
25955
25956 MachineFunction &MF = DAG.getMachineFunction();
25957 if (Subtarget.isCallingConvWin64(MF.getFunction().getCallingConv()))
25958 // The Win64 ABI uses char* instead of a structure.
25959 return DAG.expandVAArg(Op.getNode());
25960
25961 SDValue Chain = Op.getOperand(0);
25962 SDValue SrcPtr = Op.getOperand(1);
25963 const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue();
25964 unsigned Align = Op.getConstantOperandVal(3);
25965 SDLoc dl(Op);
25966
25967 EVT ArgVT = Op.getNode()->getValueType(0);
25968 Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext());
25969 uint32_t ArgSize = DAG.getDataLayout().getTypeAllocSize(ArgTy);
25970 uint8_t ArgMode;
25971
25972 // Decide which area this value should be read from.
25973 // TODO: Implement the AMD64 ABI in its entirety. This simple
25974 // selection mechanism works only for the basic types.
25975 assert(ArgVT != MVT::f80 && "va_arg for f80 not yet implemented");
25976 if (ArgVT.isFloatingPoint() && ArgSize <= 16 /*bytes*/) {
25977 ArgMode = 2; // Argument passed in XMM register. Use fp_offset.
25978 } else {
25979 assert(ArgVT.isInteger() && ArgSize <= 32 /*bytes*/ &&
25980 "Unhandled argument type in LowerVAARG");
25981 ArgMode = 1; // Argument passed in GPR64 register(s). Use gp_offset.
25982 }
25983
25984 if (ArgMode == 2) {
25985 // Make sure using fp_offset makes sense.
25986 assert(!Subtarget.useSoftFloat() &&
25987 !(MF.getFunction().hasFnAttribute(Attribute::NoImplicitFloat)) &&
25988 Subtarget.hasSSE1());
25989 }
25990
25991 // Insert VAARG node into the DAG
25992 // VAARG returns two values: Variable Argument Address, Chain
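// The ArgMode operand below tells the VAARG_64/VAARG_X32 expansion whether to
// consult gp_offset (mode 1) or fp_offset (mode 2) before spilling over to
// overflow_arg_area.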
25993 SDValue InstOps[] = {Chain, SrcPtr,
25994 DAG.getTargetConstant(ArgSize, dl, MVT::i32),
25995 DAG.getTargetConstant(ArgMode, dl, MVT::i8),
25996 DAG.getTargetConstant(Align, dl, MVT::i32)};
25997 SDVTList VTs = DAG.getVTList(SrcPtr.getValueType(), MVT::Other);
25998 SDValue VAARG = DAG.getMemIntrinsicNode(
25999 Subtarget.isTarget64BitLP64() ? X86ISD::VAARG_64 : X86ISD::VAARG_X32, dl,
26000 VTs, InstOps, MVT::i64, MachinePointerInfo(SV),
26001 /*Alignment=*/std::nullopt,
26002 MachineMemOperand::MOLoad | MachineMemOperand::MOStore);
26003 Chain = VAARG.getValue(1);
26004
26005 // Load the next argument and return it
26006 return DAG.getLoad(ArgVT, dl, Chain, VAARG, MachinePointerInfo());
26007}
26008
26009static SDValue LowerVACOPY(SDValue Op, const X86Subtarget &Subtarget,
26010 SelectionDAG &DAG) {
26011 // X86-64 va_list is a struct { i32, i32, i8*, i8* }, except on Windows,
26012 // where a va_list is still an i8*.
26013 assert(Subtarget.is64Bit() && "This code only handles 64-bit va_copy!");
26014 if (Subtarget.isCallingConvWin64(
26015 DAG.getMachineFunction().getFunction().getCallingConv()))
26016 // Probably a Win64 va_copy.
26017 return DAG.expandVACopy(Op.getNode());
26018
26019 SDValue Chain = Op.getOperand(0);
26020 SDValue DstPtr = Op.getOperand(1);
26021 SDValue SrcPtr = Op.getOperand(2);
26022 const Value *DstSV = cast<SrcValueSDNode>(Op.getOperand(3))->getValue();
26023 const Value *SrcSV = cast<SrcValueSDNode>(Op.getOperand(4))->getValue();
26024 SDLoc DL(Op);
26025
26026 return DAG.getMemcpy(
26027 Chain, DL, DstPtr, SrcPtr,
26028 DAG.getIntPtrConstant(Subtarget.isTarget64BitLP64() ? 24 : 16, DL),
26029 Align(Subtarget.isTarget64BitLP64() ? 8 : 4), /*isVolatile*/ false, false,
26030 /*CI=*/nullptr, std::nullopt, MachinePointerInfo(DstSV),
26031 MachinePointerInfo(SrcSV));
26032}
26033
26034// Helper to get immediate/variable SSE shift opcode from other shift opcodes.
26035static unsigned getTargetVShiftUniformOpcode(unsigned Opc, bool IsVariable) {
26036 switch (Opc) {
26037 case ISD::SHL:
26038 case X86ISD::VSHL:
26039 case X86ISD::VSHLI:
26040 return IsVariable ? X86ISD::VSHL : X86ISD::VSHLI;
26041 case ISD::SRL:
26042 case X86ISD::VSRL:
26043 case X86ISD::VSRLI:
26044 return IsVariable ? X86ISD::VSRL : X86ISD::VSRLI;
26045 case ISD::SRA:
26046 case X86ISD::VSRA:
26047 case X86ISD::VSRAI:
26048 return IsVariable ? X86ISD::VSRA : X86ISD::VSRAI;
26049 }
26050 llvm_unreachable("Unknown target vector shift node");
26051}
26052
26053/// Handle vector element shifts where the shift amount is a constant.
26054/// Takes immediate version of shift as input.
26055static SDValue getTargetVShiftByConstNode(unsigned Opc, const SDLoc &dl, MVT VT,
26056 SDValue SrcOp, uint64_t ShiftAmt,
26057 SelectionDAG &DAG) {
26058 MVT ElementType = VT.getVectorElementType();
26059
26060 // Bitcast the source vector to the output type, this is mainly necessary for
26061 // vXi8/vXi64 shifts.
26062 if (VT != SrcOp.getSimpleValueType())
26063 SrcOp = DAG.getBitcast(VT, SrcOp);
26064
26065 // Fold this packed shift into its first operand if ShiftAmt is 0.
26066 if (ShiftAmt == 0)
26067 return SrcOp;
26068
26069 // Check for ShiftAmt >= element width
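// (An arithmetic shift is clamped to ElemBits - 1 so the sign fills the lane,
// e.g. a vXi32 SRA by 40 becomes SRA by 31, while logical shifts fold to 0.)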
26070 if (ShiftAmt >= ElementType.getSizeInBits()) {
26071 if (Opc == X86ISD::VSRAI)
26072 ShiftAmt = ElementType.getSizeInBits() - 1;
26073 else
26074 return DAG.getConstant(0, dl, VT);
26075 }
26076
26077 assert((Opc == X86ISD::VSHLI || Opc == X86ISD::VSRLI || Opc == X86ISD::VSRAI)
26078 && "Unknown target vector shift-by-constant node");
26079
26080 // Fold this packed vector shift into a build vector if SrcOp is a
26081 // vector of Constants or UNDEFs.
26082 if (ISD::isBuildVectorOfConstantSDNodes(SrcOp.getNode())) {
26083 unsigned ShiftOpc;
26084 switch (Opc) {
26085 default: llvm_unreachable("Unknown opcode!");
26086 case X86ISD::VSHLI:
26087 ShiftOpc = ISD::SHL;
26088 break;
26089 case X86ISD::VSRLI:
26090 ShiftOpc = ISD::SRL;
26091 break;
26092 case X86ISD::VSRAI:
26093 ShiftOpc = ISD::SRA;
26094 break;
26095 }
26096
26097 SDValue Amt = DAG.getConstant(ShiftAmt, dl, VT);
26098 if (SDValue C = DAG.FoldConstantArithmetic(ShiftOpc, dl, VT, {SrcOp, Amt}))
26099 return C;
26100 }
26101
26102 return DAG.getNode(Opc, dl, VT, SrcOp,
26103 DAG.getTargetConstant(ShiftAmt, dl, MVT::i8));
26104}
26105
26106/// Handle vector element shifts by a splat shift amount
26107static SDValue getTargetVShiftNode(unsigned Opc, const SDLoc &dl, MVT VT,
26108 SDValue SrcOp, SDValue ShAmt, int ShAmtIdx,
26109 const X86Subtarget &Subtarget,
26110 SelectionDAG &DAG) {
26111 MVT AmtVT = ShAmt.getSimpleValueType();
26112 assert(AmtVT.isVector() && "Vector shift type mismatch");
26113 assert(0 <= ShAmtIdx && ShAmtIdx < (int)AmtVT.getVectorNumElements() &&
26114 "Illegal vector splat index");
26115
26116 // Move the splat element to the bottom element.
26117 if (ShAmtIdx != 0) {
26118 SmallVector<int> Mask(AmtVT.getVectorNumElements(), -1);
26119 Mask[0] = ShAmtIdx;
26120 ShAmt = DAG.getVectorShuffle(AmtVT, dl, ShAmt, DAG.getUNDEF(AmtVT), Mask);
26121 }
26122
26123 // Peek through any zext node if we can get back to a 128-bit source.
26124 if (AmtVT.getScalarSizeInBits() == 64 &&
26125 (ShAmt.getOpcode() == ISD::ZERO_EXTEND ||
26126 ShAmt.getOpcode() == ISD::ZERO_EXTEND_VECTOR_INREG) &&
26127 ShAmt.getOperand(0).getValueType().isSimple() &&
26128 ShAmt.getOperand(0).getValueType().is128BitVector()) {
26129 ShAmt = ShAmt.getOperand(0);
26130 AmtVT = ShAmt.getSimpleValueType();
26131 }
26132
26133 // See if we can mask off the upper elements using the existing source node.
26134 // The shift uses the entire lower 64-bits of the amount vector, so no need to
26135 // do this for vXi64 types.
26136 bool IsMasked = false;
26137 if (AmtVT.getScalarSizeInBits() < 64) {
26138 if (ShAmt.getOpcode() == ISD::BUILD_VECTOR ||
26139 ShAmt.getOpcode() == ISD::SCALAR_TO_VECTOR) {
26140 // If the shift amount has come from a scalar, then zero-extend the scalar
26141 // before moving to the vector.
26142 ShAmt = DAG.getZExtOrTrunc(ShAmt.getOperand(0), dl, MVT::i32);
26143 ShAmt = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, ShAmt);
26144 ShAmt = DAG.getNode(X86ISD::VZEXT_MOVL, dl, MVT::v4i32, ShAmt);
26145 AmtVT = MVT::v4i32;
26146 IsMasked = true;
26147 } else if (ShAmt.getOpcode() == ISD::AND) {
26148 // See if the shift amount is already masked (e.g. for rotation modulo),
26149 // then we can zero-extend it by setting all the other mask elements to
26150 // zero.
26151 SmallVector<SDValue> MaskElts(
26152 AmtVT.getVectorNumElements(),
26153 DAG.getConstant(0, dl, AmtVT.getScalarType()));
26154 MaskElts[0] = DAG.getAllOnesConstant(dl, AmtVT.getScalarType());
26155 SDValue Mask = DAG.getBuildVector(AmtVT, dl, MaskElts);
26156 if ((Mask = DAG.FoldConstantArithmetic(ISD::AND, dl, AmtVT,
26157 {ShAmt.getOperand(1), Mask}))) {
26158 ShAmt = DAG.getNode(ISD::AND, dl, AmtVT, ShAmt.getOperand(0), Mask);
26159 IsMasked = true;
26160 }
26161 }
26162 }
26163
26164 // Extract if the shift amount vector is larger than 128-bits.
26165 if (AmtVT.getSizeInBits() > 128) {
26166 ShAmt = extract128BitVector(ShAmt, 0, DAG, dl);
26167 AmtVT = ShAmt.getSimpleValueType();
26168 }
26169
26170 // Zero-extend bottom element to v2i64 vector type, either by extension or
26171 // shuffle masking.
26172 if (!IsMasked && AmtVT.getScalarSizeInBits() < 64) {
26173 if (AmtVT == MVT::v4i32 && (ShAmt.getOpcode() == X86ISD::VBROADCAST ||
26174 ShAmt.getOpcode() == X86ISD::VBROADCAST_LOAD)) {
26175 ShAmt = DAG.getNode(X86ISD::VZEXT_MOVL, SDLoc(ShAmt), MVT::v4i32, ShAmt);
26176 } else if (Subtarget.hasSSE41()) {
26177 ShAmt = DAG.getNode(ISD::ZERO_EXTEND_VECTOR_INREG, SDLoc(ShAmt),
26178 MVT::v2i64, ShAmt);
26179 } else {
26180 SDValue ByteShift = DAG.getTargetConstant(
26181 (128 - AmtVT.getScalarSizeInBits()) / 8, SDLoc(ShAmt), MVT::i8);
26182 ShAmt = DAG.getBitcast(MVT::v16i8, ShAmt);
26183 ShAmt = DAG.getNode(X86ISD::VSHLDQ, SDLoc(ShAmt), MVT::v16i8, ShAmt,
26184 ByteShift);
26185 ShAmt = DAG.getNode(X86ISD::VSRLDQ, SDLoc(ShAmt), MVT::v16i8, ShAmt,
26186 ByteShift);
26187 }
26188 }
26189
26190 // Change opcode to non-immediate version.
26191 Opc = getTargetVShiftUniformOpcode(Opc, true);
26192
26193 // The return type has to be a 128-bit type with the same element
26194 // type as the input type.
26195 MVT EltVT = VT.getVectorElementType();
26196 MVT ShVT = MVT::getVectorVT(EltVT, 128 / EltVT.getSizeInBits());
26197
26198 ShAmt = DAG.getBitcast(ShVT, ShAmt);
26199 return DAG.getNode(Opc, dl, VT, SrcOp, ShAmt);
26200}
26201
26202/// Return Mask with the necessary casting or extending
26203/// for \p Mask according to \p MaskVT when lowering masking intrinsics
26204static SDValue getMaskNode(SDValue Mask, MVT MaskVT,
26205 const X86Subtarget &Subtarget, SelectionDAG &DAG,
26206 const SDLoc &dl) {
26207
26208 if (isAllOnesConstant(Mask))
26209 return DAG.getConstant(1, dl, MaskVT);
26210 if (X86::isZeroNode(Mask))
26211 return DAG.getConstant(0, dl, MaskVT);
26212
26213 assert(MaskVT.bitsLE(Mask.getSimpleValueType()) && "Unexpected mask size!");
26214
26215 if (Mask.getSimpleValueType() == MVT::i64 && Subtarget.is32Bit()) {
26216 assert(MaskVT == MVT::v64i1 && "Expected v64i1 mask!");
26217 assert(Subtarget.hasBWI() && "Expected AVX512BW target!");
26218 // In 32-bit mode a bitcast of i64 is illegal; extend/split it instead.
26219 SDValue Lo, Hi;
26220 std::tie(Lo, Hi) = DAG.SplitScalar(Mask, dl, MVT::i32, MVT::i32);
26221 Lo = DAG.getBitcast(MVT::v32i1, Lo);
26222 Hi = DAG.getBitcast(MVT::v32i1, Hi);
26223 return DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v64i1, Lo, Hi);
26224 } else {
26225 MVT BitcastVT = MVT::getVectorVT(MVT::i1,
26226 Mask.getSimpleValueType().getSizeInBits());
26227 // When MaskVT is v2i1 or v4i1, the low 2 or 4 elements are
26228 // extracted by EXTRACT_SUBVECTOR.
26229 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MaskVT,
26230 DAG.getBitcast(BitcastVT, Mask),
26231 DAG.getVectorIdxConstant(0, dl));
26232 }
26233}
26234
26235/// Return (and \p Op, \p Mask) for compare instructions or
26236/// (vselect \p Mask, \p Op, \p PreservedSrc) for others along with the
26237/// necessary casting or extending for \p Mask when lowering masking intrinsics
26238 static SDValue getVectorMaskingNode(SDValue Op, SDValue Mask,
26239 SDValue PreservedSrc,
26240 const X86Subtarget &Subtarget,
26241 SelectionDAG &DAG) {
26242 MVT VT = Op.getSimpleValueType();
26243 MVT MaskVT = MVT::getVectorVT(MVT::i1, VT.getVectorNumElements());
26244 unsigned OpcodeSelect = ISD::VSELECT;
26245 SDLoc dl(Op);
26246
26247 if (isAllOnesConstant(Mask))
26248 return Op;
26249
26250 SDValue VMask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);
26251
26252 if (PreservedSrc.isUndef())
26253 PreservedSrc = getZeroVector(VT, Subtarget, DAG, dl);
26254 return DAG.getNode(OpcodeSelect, dl, VT, VMask, Op, PreservedSrc);
26255}
26256
26257/// Creates an SDNode for a predicated scalar operation.
26258/// \returns (X86vselect \p Mask, \p Op, \p PreservedSrc).
26259/// The mask is coming as MVT::i8 and it should be transformed
26260/// to MVT::v1i1 while lowering masking intrinsics.
26261/// The main difference between ScalarMaskingNode and VectorMaskingNode is using
26262/// "X86select" instead of "vselect". We just can't create the "vselect" node
26263/// for a scalar instruction.
26264 static SDValue getScalarMaskingNode(SDValue Op, SDValue Mask,
26265 SDValue PreservedSrc,
26266 const X86Subtarget &Subtarget,
26267 SelectionDAG &DAG) {
26268 auto *MaskConst = dyn_cast<ConstantSDNode>(Mask);
26269 if (MaskConst && (MaskConst->getZExtValue() & 0x1))
26270 return Op;
26271
26272 MVT VT = Op.getSimpleValueType();
26273 SDLoc dl(Op);
26274
26275 assert(Mask.getValueType() == MVT::i8 && "Unexpected type");
26276 SDValue IMask = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v1i1,
26277 DAG.getBitcast(MVT::v8i1, Mask),
26278 DAG.getVectorIdxConstant(0, dl));
26279 if (Op.getOpcode() == X86ISD::FSETCCM ||
26280 Op.getOpcode() == X86ISD::FSETCCM_SAE ||
26281 Op.getOpcode() == X86ISD::VFPCLASSS)
26282 return DAG.getNode(ISD::AND, dl, VT, Op, IMask);
26283
26284 if (PreservedSrc.isUndef())
26285 PreservedSrc = getZeroVector(VT, Subtarget, DAG, dl);
26286
26287 if (MaskConst) {
26288 assert((MaskConst->getZExtValue() & 0x1) == 0 && "Expected false mask");
26289 // Discard op and blend passthrough with scalar op src/dst.
26290 SmallVector<int> ShuffleMask(VT.getVectorNumElements());
26291 std::iota(ShuffleMask.begin(), ShuffleMask.end(), 0);
26292 ShuffleMask[0] = VT.getVectorNumElements();
26293 return DAG.getVectorShuffle(VT, dl, Op.getOperand(0), PreservedSrc,
26294 ShuffleMask);
26295 }
26296
26297 return DAG.getNode(X86ISD::SELECTS, dl, VT, IMask, Op, PreservedSrc);
26298}
26299
26300 static int getSEHRegistrationNodeSize(const Function *Fn) {
26301 if (!Fn->hasPersonalityFn())
26302 report_fatal_error(
26303 "querying registration node size for function without personality");
26304 // The RegNodeSize is 6 32-bit words for SEH and 4 for C++ EH. See
26305 // WinEHStatePass for the full struct definition.
26306 switch (classifyEHPersonality(Fn->getPersonalityFn())) {
26307 case EHPersonality::MSVC_X86SEH: return 24;
26308 case EHPersonality::MSVC_CXX: return 16;
26309 default: break;
26310 }
26311 report_fatal_error(
26312 "can only recover FP for 32-bit MSVC EH personality functions");
26313}
26314
26315/// When the MSVC runtime transfers control to us, either to an outlined
26316/// function or when returning to a parent frame after catching an exception, we
26317/// recover the parent frame pointer by doing arithmetic on the incoming EBP.
26318/// Here's the math:
26319/// RegNodeBase = EntryEBP - RegNodeSize
26320/// ParentFP = RegNodeBase - ParentFrameOffset
26321/// Subtracting RegNodeSize takes us to the offset of the registration node, and
26322/// subtracting the offset (negative on x86) takes us back to the parent FP.
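/// For example, with hypothetical values for the 32-bit C++ EH personality
/// (RegNodeSize = 16) and ParentFrameOffset = -24:
/// ParentFP = (EntryEBP - 16) - (-24) = EntryEBP + 8.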
26323 static SDValue recoverFramePointer(SelectionDAG &DAG, const Function *Fn,
26324 SDValue EntryEBP) {
26325 MachineFunction &MF = DAG.getMachineFunction();
26326 SDLoc dl;
26327
26328 // It's possible that the parent function no longer has a personality function
26329 // if the exceptional code was optimized away, in which case we just return
26330 // the incoming EBP.
26331 if (!Fn->hasPersonalityFn())
26332 return EntryEBP;
26333
26334 // Get an MCSymbol that will ultimately resolve to the frame offset of the EH
26335 // registration, or the .set_setframe offset.
26336 MCSymbol *OffsetSym = MF.getContext().getOrCreateParentFrameOffsetSymbol(
26337 GlobalValue::dropLLVMManglingEscape(Fn->getName()));
26338 MVT PtrVT = EntryEBP.getValueType().getSimpleVT();
26339 SDValue OffsetSymVal = DAG.getMCSymbol(OffsetSym, PtrVT);
26340 SDValue ParentFrameOffset =
26341 DAG.getNode(ISD::LOCAL_RECOVER, dl, PtrVT, OffsetSymVal);
26342
26343 // Return EntryEBP + ParentFrameOffset for x64. This adjusts from RSP after
26344 // prologue to RBP in the parent function.
26345 const X86Subtarget &Subtarget = DAG.getSubtarget<X86Subtarget>();
26346 if (Subtarget.is64Bit())
26347 return DAG.getNode(ISD::ADD, dl, PtrVT, EntryEBP, ParentFrameOffset);
26348
26349 int RegNodeSize = getSEHRegistrationNodeSize(Fn);
26350 // RegNodeBase = EntryEBP - RegNodeSize
26351 // ParentFP = RegNodeBase - ParentFrameOffset
26352 SDValue RegNodeBase = DAG.getNode(ISD::SUB, dl, PtrVT, EntryEBP,
26353 DAG.getConstant(RegNodeSize, dl, PtrVT));
26354 return DAG.getNode(ISD::SUB, dl, PtrVT, RegNodeBase, ParentFrameOffset);
26355}
26356
26357SDValue X86TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
26358 SelectionDAG &DAG) const {
26359 // Helper to detect if the operand is CUR_DIRECTION rounding mode.
26360 auto isRoundModeCurDirection = [](SDValue Rnd) {
26361 if (auto *C = dyn_cast<ConstantSDNode>(Rnd))
26362 return C->getAPIntValue() == X86::STATIC_ROUNDING::CUR_DIRECTION;
26363
26364 return false;
26365 };
26366 auto isRoundModeSAE = [](SDValue Rnd) {
26367 if (auto *C = dyn_cast<ConstantSDNode>(Rnd)) {
26368 unsigned RC = C->getZExtValue();
26369 if (RC & X86::STATIC_ROUNDING::NO_EXC) {
26370 // Clear the NO_EXC bit and check remaining bits.
26371 RC ^= X86::STATIC_ROUNDING::NO_EXC;
26372 // As a convenience we allow no other bits or explicitly
26373 // current direction.
26374 return RC == 0 || RC == X86::STATIC_ROUNDING::CUR_DIRECTION;
26375 }
26376 }
26377
26378 return false;
26379 };
26380 auto isRoundModeSAEToX = [](SDValue Rnd, unsigned &RC) {
26381 if (auto *C = dyn_cast<ConstantSDNode>(Rnd)) {
26382 RC = C->getZExtValue();
26383 if (RC & X86::STATIC_ROUNDING::NO_EXC) {
26384 // Clear the NO_EXC bit and check remaining bits.
26385 RC ^= X86::STATIC_ROUNDING::NO_EXC;
26386 return RC == X86::STATIC_ROUNDING::TO_NEAREST_INT ||
26387 RC == X86::STATIC_ROUNDING::TO_NEG_INF ||
26388 RC == X86::STATIC_ROUNDING::TO_POS_INF ||
26389 RC == X86::STATIC_ROUNDING::TO_ZERO;
26390 }
26391 }
26392
26393 return false;
26394 };
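// The rounding-control constants mirror the _MM_FROUND_* immediates, with
// NO_EXC as bit 3 (0x08); e.g. (_MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC) ==
// 0x0b reduces to TO_ZERO (0x03) once NO_EXC is cleared.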
26395
26396 SDLoc dl(Op);
26397 unsigned IntNo = Op.getConstantOperandVal(0);
26398 MVT VT = Op.getSimpleValueType();
26399 const IntrinsicData* IntrData = getIntrinsicWithoutChain(IntNo);
26400
26401 // Propagate flags from original node to transformed node(s).
26402 SelectionDAG::FlagInserter FlagsInserter(DAG, Op->getFlags());
26403
26404 if (IntrData) {
26405 switch(IntrData->Type) {
26406 case INTR_TYPE_1OP: {
26407 // We specify 2 possible opcodes for intrinsics with rounding modes.
26408 // First, we check if the intrinsic may have non-default rounding mode,
26409 // (IntrData->Opc1 != 0), then we check the rounding mode operand.
26410 unsigned IntrWithRoundingModeOpcode = IntrData->Opc1;
26411 if (IntrWithRoundingModeOpcode != 0) {
26412 SDValue Rnd = Op.getOperand(2);
26413 unsigned RC = 0;
26414 if (isRoundModeSAEToX(Rnd, RC))
26415 return DAG.getNode(IntrWithRoundingModeOpcode, dl, Op.getValueType(),
26416 Op.getOperand(1),
26417 DAG.getTargetConstant(RC, dl, MVT::i32));
26418 if (!isRoundModeCurDirection(Rnd))
26419 return SDValue();
26420 }
26421 return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(),
26422 Op.getOperand(1));
26423 }
26424 case INTR_TYPE_1OP_SAE: {
26425 SDValue Sae = Op.getOperand(2);
26426
26427 unsigned Opc;
26428 if (isRoundModeCurDirection(Sae))
26429 Opc = IntrData->Opc0;
26430 else if (isRoundModeSAE(Sae))
26431 Opc = IntrData->Opc1;
26432 else
26433 return SDValue();
26434
26435 return DAG.getNode(Opc, dl, Op.getValueType(), Op.getOperand(1));
26436 }
26437 case INTR_TYPE_2OP: {
26438 SDValue Src2 = Op.getOperand(2);
26439
26440 // We specify 2 possible opcodes for intrinsics with rounding modes.
26441 // First, we check if the intrinsic may have non-default rounding mode,
26442 // (IntrData->Opc1 != 0), then we check the rounding mode operand.
26443 unsigned IntrWithRoundingModeOpcode = IntrData->Opc1;
26444 if (IntrWithRoundingModeOpcode != 0) {
26445 SDValue Rnd = Op.getOperand(3);
26446 unsigned RC = 0;
26447 if (isRoundModeSAEToX(Rnd, RC))
26448 return DAG.getNode(IntrWithRoundingModeOpcode, dl, Op.getValueType(),
26449 Op.getOperand(1), Src2,
26450 DAG.getTargetConstant(RC, dl, MVT::i32));
26451 if (!isRoundModeCurDirection(Rnd))
26452 return SDValue();
26453 }
26454
26455 return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(),
26456 Op.getOperand(1), Src2);
26457 }
26458 case INTR_TYPE_2OP_SAE: {
26459 SDValue Sae = Op.getOperand(3);
26460
26461 unsigned Opc;
26462 if (isRoundModeCurDirection(Sae))
26463 Opc = IntrData->Opc0;
26464 else if (isRoundModeSAE(Sae))
26465 Opc = IntrData->Opc1;
26466 else
26467 return SDValue();
26468
26469 return DAG.getNode(Opc, dl, Op.getValueType(), Op.getOperand(1),
26470 Op.getOperand(2));
26471 }
26472 case INTR_TYPE_3OP:
26473 case INTR_TYPE_3OP_IMM8: {
26474 SDValue Src1 = Op.getOperand(1);
26475 SDValue Src2 = Op.getOperand(2);
26476 SDValue Src3 = Op.getOperand(3);
26477
26478 if (IntrData->Type == INTR_TYPE_3OP_IMM8 &&
26479 Src3.getValueType() != MVT::i8) {
26480 Src3 = DAG.getTargetConstant(Src3->getAsZExtVal() & 0xff, dl, MVT::i8);
26481 }
26482
26483 // We specify 2 possible opcodes for intrinsics with rounding modes.
26484 // First, we check if the intrinsic may have non-default rounding mode,
26485 // (IntrData->Opc1 != 0), then we check the rounding mode operand.
26486 unsigned IntrWithRoundingModeOpcode = IntrData->Opc1;
26487 if (IntrWithRoundingModeOpcode != 0) {
26488 SDValue Rnd = Op.getOperand(4);
26489 unsigned RC = 0;
26490 if (isRoundModeSAEToX(Rnd, RC))
26491 return DAG.getNode(IntrWithRoundingModeOpcode, dl, Op.getValueType(),
26492 Src1, Src2, Src3,
26493 DAG.getTargetConstant(RC, dl, MVT::i32));
26494 if (!isRoundModeCurDirection(Rnd))
26495 return SDValue();
26496 }
26497
26498 return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(),
26499 {Src1, Src2, Src3});
26500 }
26501 case INTR_TYPE_4OP_IMM8: {
26502 assert(Op.getOperand(4)->getOpcode() == ISD::TargetConstant);
26503 SDValue Src4 = Op.getOperand(4);
26504 if (Src4.getValueType() != MVT::i8) {
26505 Src4 = DAG.getTargetConstant(Src4->getAsZExtVal() & 0xff, dl, MVT::i8);
26506 }
26507
26508 return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(),
26509 Op.getOperand(1), Op.getOperand(2), Op.getOperand(3),
26510 Src4);
26511 }
26512 case INTR_TYPE_1OP_MASK: {
26513 SDValue Src = Op.getOperand(1);
26514 SDValue PassThru = Op.getOperand(2);
26515 SDValue Mask = Op.getOperand(3);
26516 // We add rounding mode to the Node when
26517 // - RC Opcode is specified and
26518 // - RC is not "current direction".
26519 unsigned IntrWithRoundingModeOpcode = IntrData->Opc1;
26520 if (IntrWithRoundingModeOpcode != 0) {
26521 SDValue Rnd = Op.getOperand(4);
26522 unsigned RC = 0;
26523 if (isRoundModeSAEToX(Rnd, RC))
26524 return getVectorMaskingNode(
26525 DAG.getNode(IntrWithRoundingModeOpcode, dl, Op.getValueType(),
26526 Src, DAG.getTargetConstant(RC, dl, MVT::i32)),
26527 Mask, PassThru, Subtarget, DAG);
26528 if (!isRoundModeCurDirection(Rnd))
26529 return SDValue();
26530 }
26531 return getVectorMaskingNode(
26532 DAG.getNode(IntrData->Opc0, dl, VT, Src), Mask, PassThru,
26533 Subtarget, DAG);
26534 }
26535 case INTR_TYPE_1OP_MASK_SAE: {
26536 SDValue Src = Op.getOperand(1);
26537 SDValue PassThru = Op.getOperand(2);
26538 SDValue Mask = Op.getOperand(3);
26539 SDValue Rnd = Op.getOperand(4);
26540
26541 unsigned Opc;
26542 if (isRoundModeCurDirection(Rnd))
26543 Opc = IntrData->Opc0;
26544 else if (isRoundModeSAE(Rnd))
26545 Opc = IntrData->Opc1;
26546 else
26547 return SDValue();
26548
26549 return getVectorMaskingNode(DAG.getNode(Opc, dl, VT, Src), Mask, PassThru,
26550 Subtarget, DAG);
26551 }
26552 case INTR_TYPE_SCALAR_MASK: {
26553 SDValue Src1 = Op.getOperand(1);
26554 SDValue Src2 = Op.getOperand(2);
26555 SDValue passThru = Op.getOperand(3);
26556 SDValue Mask = Op.getOperand(4);
26557 unsigned IntrWithRoundingModeOpcode = IntrData->Opc1;
26558 // There are 2 kinds of intrinsics in this group:
26559 // (1) With suppress-all-exceptions (sae) or rounding mode - 6 operands
26560 // (2) With rounding mode and sae - 7 operands.
26561 bool HasRounding = IntrWithRoundingModeOpcode != 0;
26562 if (Op.getNumOperands() == (5U + HasRounding)) {
26563 if (HasRounding) {
26564 SDValue Rnd = Op.getOperand(5);
26565 unsigned RC = 0;
26566 if (isRoundModeSAEToX(Rnd, RC))
26567 return getScalarMaskingNode(
26568 DAG.getNode(IntrWithRoundingModeOpcode, dl, VT, Src1, Src2,
26569 DAG.getTargetConstant(RC, dl, MVT::i32)),
26570 Mask, passThru, Subtarget, DAG);
26571 if (!isRoundModeCurDirection(Rnd))
26572 return SDValue();
26573 }
26574 return getScalarMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, Src1,
26575 Src2),
26576 Mask, passThru, Subtarget, DAG);
26577 }
26578
26579 assert(Op.getNumOperands() == (6U + HasRounding) &&
26580 "Unexpected intrinsic form");
26581 SDValue RoundingMode = Op.getOperand(5);
26582 unsigned Opc = IntrData->Opc0;
26583 if (HasRounding) {
26584 SDValue Sae = Op.getOperand(6);
26585 if (isRoundModeSAE(Sae))
26586 Opc = IntrWithRoundingModeOpcode;
26587 else if (!isRoundModeCurDirection(Sae))
26588 return SDValue();
26589 }
26590 return getScalarMaskingNode(DAG.getNode(Opc, dl, VT, Src1,
26591 Src2, RoundingMode),
26592 Mask, passThru, Subtarget, DAG);
26593 }
26594 case INTR_TYPE_SCALAR_MASK_RND: {
26595 SDValue Src1 = Op.getOperand(1);
26596 SDValue Src2 = Op.getOperand(2);
26597 SDValue passThru = Op.getOperand(3);
26598 SDValue Mask = Op.getOperand(4);
26599 SDValue Rnd = Op.getOperand(5);
26600
26601 SDValue NewOp;
26602 unsigned RC = 0;
26603 if (isRoundModeCurDirection(Rnd))
26604 NewOp = DAG.getNode(IntrData->Opc0, dl, VT, Src1, Src2);
26605 else if (isRoundModeSAEToX(Rnd, RC))
26606 NewOp = DAG.getNode(IntrData->Opc1, dl, VT, Src1, Src2,
26607 DAG.getTargetConstant(RC, dl, MVT::i32));
26608 else
26609 return SDValue();
26610
26611 return getScalarMaskingNode(NewOp, Mask, passThru, Subtarget, DAG);
26612 }
26613 case INTR_TYPE_SCALAR_MASK_SAE: {
26614 SDValue Src1 = Op.getOperand(1);
26615 SDValue Src2 = Op.getOperand(2);
26616 SDValue passThru = Op.getOperand(3);
26617 SDValue Mask = Op.getOperand(4);
26618 SDValue Sae = Op.getOperand(5);
26619 unsigned Opc;
26620 if (isRoundModeCurDirection(Sae))
26621 Opc = IntrData->Opc0;
26622 else if (isRoundModeSAE(Sae))
26623 Opc = IntrData->Opc1;
26624 else
26625 return SDValue();
26626
26627 return getScalarMaskingNode(DAG.getNode(Opc, dl, VT, Src1, Src2),
26628 Mask, passThru, Subtarget, DAG);
26629 }
26630 case INTR_TYPE_2OP_MASK: {
26631 SDValue Src1 = Op.getOperand(1);
26632 SDValue Src2 = Op.getOperand(2);
26633 SDValue PassThru = Op.getOperand(3);
26634 SDValue Mask = Op.getOperand(4);
26635 SDValue NewOp;
26636 if (IntrData->Opc1 != 0) {
26637 SDValue Rnd = Op.getOperand(5);
26638 unsigned RC = 0;
26639 if (isRoundModeSAEToX(Rnd, RC))
26640 NewOp = DAG.getNode(IntrData->Opc1, dl, VT, Src1, Src2,
26641 DAG.getTargetConstant(RC, dl, MVT::i32));
26642 else if (!isRoundModeCurDirection(Rnd))
26643 return SDValue();
26644 }
26645 if (!NewOp)
26646 NewOp = DAG.getNode(IntrData->Opc0, dl, VT, Src1, Src2);
26647 return getVectorMaskingNode(NewOp, Mask, PassThru, Subtarget, DAG);
26648 }
26649 case INTR_TYPE_2OP_MASK_SAE: {
26650 SDValue Src1 = Op.getOperand(1);
26651 SDValue Src2 = Op.getOperand(2);
26652 SDValue PassThru = Op.getOperand(3);
26653 SDValue Mask = Op.getOperand(4);
26654
26655 unsigned Opc = IntrData->Opc0;
26656 if (IntrData->Opc1 != 0) {
26657 SDValue Sae = Op.getOperand(5);
26658 if (isRoundModeSAE(Sae))
26659 Opc = IntrData->Opc1;
26660 else if (!isRoundModeCurDirection(Sae))
26661 return SDValue();
26662 }
26663
26664 return getVectorMaskingNode(DAG.getNode(Opc, dl, VT, Src1, Src2),
26665 Mask, PassThru, Subtarget, DAG);
26666 }
26667 case INTR_TYPE_3OP_SCALAR_MASK_SAE: {
26668 SDValue Src1 = Op.getOperand(1);
26669 SDValue Src2 = Op.getOperand(2);
26670 SDValue Src3 = Op.getOperand(3);
26671 SDValue PassThru = Op.getOperand(4);
26672 SDValue Mask = Op.getOperand(5);
26673 SDValue Sae = Op.getOperand(6);
26674 unsigned Opc;
26675 if (isRoundModeCurDirection(Sae))
26676 Opc = IntrData->Opc0;
26677 else if (isRoundModeSAE(Sae))
26678 Opc = IntrData->Opc1;
26679 else
26680 return SDValue();
26681
26682 return getScalarMaskingNode(DAG.getNode(Opc, dl, VT, Src1, Src2, Src3),
26683 Mask, PassThru, Subtarget, DAG);
26684 }
26685 case INTR_TYPE_3OP_MASK_SAE: {
26686 SDValue Src1 = Op.getOperand(1);
26687 SDValue Src2 = Op.getOperand(2);
26688 SDValue Src3 = Op.getOperand(3);
26689 SDValue PassThru = Op.getOperand(4);
26690 SDValue Mask = Op.getOperand(5);
26691
26692 unsigned Opc = IntrData->Opc0;
26693 if (IntrData->Opc1 != 0) {
26694 SDValue Sae = Op.getOperand(6);
26695 if (isRoundModeSAE(Sae))
26696 Opc = IntrData->Opc1;
26697 else if (!isRoundModeCurDirection(Sae))
26698 return SDValue();
26699 }
26700 return getVectorMaskingNode(DAG.getNode(Opc, dl, VT, Src1, Src2, Src3),
26701 Mask, PassThru, Subtarget, DAG);
26702 }
26703 case BLENDV: {
26704 SDValue Src1 = Op.getOperand(1);
26705 SDValue Src2 = Op.getOperand(2);
26706 SDValue Src3 = Op.getOperand(3);
26707
26708 EVT MaskVT = Src3.getValueType().changeVectorElementTypeToInteger();
26709 Src3 = DAG.getBitcast(MaskVT, Src3);
26710
26711 // Reverse the operands to match VSELECT order.
26712 return DAG.getNode(IntrData->Opc0, dl, VT, Src3, Src2, Src1);
26713 }
26714 case VPERM_2OP : {
26715 SDValue Src1 = Op.getOperand(1);
26716 SDValue Src2 = Op.getOperand(2);
26717
26718 // Swap Src1 and Src2 in the node creation
26719 return DAG.getNode(IntrData->Opc0, dl, VT,Src2, Src1);
26720 }
26721 case CFMA_OP_MASKZ:
26722 case CFMA_OP_MASK: {
26723 SDValue Src1 = Op.getOperand(1);
26724 SDValue Src2 = Op.getOperand(2);
26725 SDValue Src3 = Op.getOperand(3);
26726 SDValue Mask = Op.getOperand(4);
26727 MVT VT = Op.getSimpleValueType();
26728
26729 SDValue PassThru = Src3;
26730 if (IntrData->Type == CFMA_OP_MASKZ)
26731 PassThru = getZeroVector(VT, Subtarget, DAG, dl);
26732
26733 // We add rounding mode to the Node when
26734 // - RC Opcode is specified and
26735 // - RC is not "current direction".
26736 SDValue NewOp;
26737 if (IntrData->Opc1 != 0) {
26738 SDValue Rnd = Op.getOperand(5);
26739 unsigned RC = 0;
26740 if (isRoundModeSAEToX(Rnd, RC))
26741 NewOp = DAG.getNode(IntrData->Opc1, dl, VT, Src1, Src2, Src3,
26742 DAG.getTargetConstant(RC, dl, MVT::i32));
26743 else if (!isRoundModeCurDirection(Rnd))
26744 return SDValue();
26745 }
26746 if (!NewOp)
26747 NewOp = DAG.getNode(IntrData->Opc0, dl, VT, Src1, Src2, Src3);
26748 if (IntrData->Opc0 == X86ISD::VFMADDCSH ||
26749 IntrData->Opc0 == X86ISD::VFCMADDCSH)
26750 return getScalarMaskingNode(NewOp, Mask, PassThru, Subtarget, DAG);
26751 return getVectorMaskingNode(NewOp, Mask, PassThru, Subtarget, DAG);
26752 }
26753 case IFMA_OP:
26754 // NOTE: We need to swizzle the operands to pass the multiply operands
26755 // first.
26756 return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(),
26757 Op.getOperand(2), Op.getOperand(3), Op.getOperand(1));
26758 case FPCLASSS: {
26759 SDValue Src1 = Op.getOperand(1);
26760 SDValue Imm = Op.getOperand(2);
26761 SDValue Mask = Op.getOperand(3);
26762 SDValue FPclass = DAG.getNode(IntrData->Opc0, dl, MVT::v1i1, Src1, Imm);
26763 SDValue FPclassMask = getScalarMaskingNode(FPclass, Mask, SDValue(),
26764 Subtarget, DAG);
26765 // Need to fill with zeros to ensure the bitcast will produce zeroes
26766 // for the upper bits. An EXTRACT_ELEMENT here wouldn't guarantee that.
26767 SDValue Ins = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, MVT::v8i1,
26768 DAG.getConstant(0, dl, MVT::v8i1), FPclassMask,
26769 DAG.getVectorIdxConstant(0, dl));
26770 return DAG.getBitcast(MVT::i8, Ins);
26771 }
26772
26773 case CMP_MASK_CC: {
26774 MVT MaskVT = Op.getSimpleValueType();
26775 SDValue CC = Op.getOperand(3);
26776 SDValue Mask = Op.getOperand(4);
26777 // We specify 2 possible opcodes for intrinsics with rounding modes.
26778 // First, we check if the intrinsic may have non-default rounding mode,
26779 // (IntrData->Opc1 != 0), then we check the rounding mode operand.
26780 if (IntrData->Opc1 != 0) {
26781 SDValue Sae = Op.getOperand(5);
26782 if (isRoundModeSAE(Sae))
26783 return DAG.getNode(IntrData->Opc1, dl, MaskVT, Op.getOperand(1),
26784 Op.getOperand(2), CC, Mask, Sae);
26785 if (!isRoundModeCurDirection(Sae))
26786 return SDValue();
26787 }
26788 // Default rounding mode.
26789 return DAG.getNode(IntrData->Opc0, dl, MaskVT,
26790 {Op.getOperand(1), Op.getOperand(2), CC, Mask});
26791 }
26792 case CMP_MASK_SCALAR_CC: {
26793 SDValue Src1 = Op.getOperand(1);
26794 SDValue Src2 = Op.getOperand(2);
26795 SDValue CC = Op.getOperand(3);
26796 SDValue Mask = Op.getOperand(4);
26797
26798 SDValue Cmp;
26799 if (IntrData->Opc1 != 0) {
26800 SDValue Sae = Op.getOperand(5);
26801 if (isRoundModeSAE(Sae))
26802 Cmp = DAG.getNode(IntrData->Opc1, dl, MVT::v1i1, Src1, Src2, CC, Sae);
26803 else if (!isRoundModeCurDirection(Sae))
26804 return SDValue();
26805 }
26806 // Default rounding mode.
26807 if (!Cmp.getNode())
26808 Cmp = DAG.getNode(IntrData->Opc0, dl, MVT::v1i1, Src1, Src2, CC);
26809
26810 SDValue CmpMask = getScalarMaskingNode(Cmp, Mask, SDValue(),
26811 Subtarget, DAG);
26812 // Need to fill with zeros to ensure the bitcast will produce zeroes
26813 // for the upper bits. An EXTRACT_ELEMENT here wouldn't guarantee that.
26814 SDValue Ins = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, MVT::v8i1,
26815 DAG.getConstant(0, dl, MVT::v8i1), CmpMask,
26816 DAG.getVectorIdxConstant(0, dl));
26817 return DAG.getBitcast(MVT::i8, Ins);
26818 }
26819 case COMI: { // Comparison intrinsics
26820 ISD::CondCode CC = (ISD::CondCode)IntrData->Opc1;
26821 SDValue LHS = Op.getOperand(1);
26822 SDValue RHS = Op.getOperand(2);
26823 // Some conditions require the operands to be swapped.
26824 if (CC == ISD::SETLT || CC == ISD::SETLE)
26825 std::swap(LHS, RHS);
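// (U)COMI sets CF/ZF as for an unsigned compare of LHS against RHS, so LT/LE
// are checked as GT/GE on the swapped operands via COND_A/COND_AE below.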
26826
26827 // For AVX10.2, support EQ and NE.
26828 bool HasAVX10_2_COMX =
26829 Subtarget.hasAVX10_2() && (CC == ISD::SETEQ || CC == ISD::SETNE);
26830
26831 // AVX10.2 COMPARE supports only v2f64, v4f32 or v8f16.
26832 // For BF type we need to fall back.
26833 bool HasAVX10_2_COMX_Ty = (LHS.getSimpleValueType() != MVT::v8bf16);
26834
26835 auto ComiOpCode = IntrData->Opc0;
26836 auto isUnordered = (ComiOpCode == X86ISD::UCOMI);
26837
26838 if (HasAVX10_2_COMX && HasAVX10_2_COMX_Ty)
26839 ComiOpCode = isUnordered ? X86ISD::UCOMX : X86ISD::COMX;
26840
26841 SDValue Comi = DAG.getNode(ComiOpCode, dl, MVT::i32, LHS, RHS);
26842
26843 SDValue SetCC;
26844 switch (CC) {
26845 case ISD::SETEQ: {
26846 SetCC = getSETCC(X86::COND_E, Comi, dl, DAG);
26847 if (HasAVX10_2_COMX && HasAVX10_2_COMX_Ty) // ZF == 1
26848 break;
26849 // (ZF = 1 and PF = 0)
26850 SDValue SetNP = getSETCC(X86::COND_NP, Comi, dl, DAG);
26851 SetCC = DAG.getNode(ISD::AND, dl, MVT::i8, SetCC, SetNP);
26852 break;
26853 }
26854 case ISD::SETNE: {
26855 SetCC = getSETCC(X86::COND_NE, Comi, dl, DAG);
26856 if (HasAVX10_2_COMX && HasAVX10_2_COMX_Ty) // ZF == 0
26857 break;
26858 // (ZF = 0 or PF = 1)
26859 SDValue SetP = getSETCC(X86::COND_P, Comi, dl, DAG);
26860 SetCC = DAG.getNode(ISD::OR, dl, MVT::i8, SetCC, SetP);
26861 break;
26862 }
26863 case ISD::SETGT: // (CF = 0 and ZF = 0)
26864 case ISD::SETLT: { // Condition opposite to GT. Operands swapped above.
26865 SetCC = getSETCC(X86::COND_A, Comi, dl, DAG);
26866 break;
26867 }
26868 case ISD::SETGE: // CF = 0
26869 case ISD::SETLE: // Condition opposite to GE. Operands swapped above.
26870 SetCC = getSETCC(X86::COND_AE, Comi, dl, DAG);
26871 break;
26872 default:
26873 llvm_unreachable("Unexpected illegal condition!");
26874 }
26875 return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, SetCC);
26876 }
26877 case COMI_RM: { // Comparison intrinsics with Sae
26878 SDValue LHS = Op.getOperand(1);
26879 SDValue RHS = Op.getOperand(2);
26880 unsigned CondVal = Op.getConstantOperandVal(3);
26881 SDValue Sae = Op.getOperand(4);
26882
26883 SDValue FCmp;
26884 if (isRoundModeCurDirection(Sae))
26885 FCmp = DAG.getNode(X86ISD::FSETCCM, dl, MVT::v1i1, LHS, RHS,
26886 DAG.getTargetConstant(CondVal, dl, MVT::i8));
26887 else if (isRoundModeSAE(Sae))
26888 FCmp = DAG.getNode(X86ISD::FSETCCM_SAE, dl, MVT::v1i1, LHS, RHS,
26889 DAG.getTargetConstant(CondVal, dl, MVT::i8), Sae);
26890 else
26891 return SDValue();
26892 // Need to fill with zeros to ensure the bitcast will produce zeroes
26893 // for the upper bits. An EXTRACT_ELEMENT here wouldn't guarantee that.
26894 SDValue Ins = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, MVT::v16i1,
26895 DAG.getConstant(0, dl, MVT::v16i1), FCmp,
26896 DAG.getVectorIdxConstant(0, dl));
26897 return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32,
26898 DAG.getBitcast(MVT::i16, Ins));
26899 }
26900 case VSHIFT: {
26901 SDValue SrcOp = Op.getOperand(1);
26902 SDValue ShAmt = Op.getOperand(2);
26903 assert(ShAmt.getValueType() == MVT::i32 &&
26904 "Unexpected VSHIFT amount type");
26905
26906 // Catch shift-by-constant.
26907 if (auto *CShAmt = dyn_cast<ConstantSDNode>(ShAmt))
26908 return getTargetVShiftByConstNode(IntrData->Opc0, dl,
26909 Op.getSimpleValueType(), SrcOp,
26910 CShAmt->getZExtValue(), DAG);
26911
26912 ShAmt = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, ShAmt);
26913 return getTargetVShiftNode(IntrData->Opc0, dl, Op.getSimpleValueType(),
26914 SrcOp, ShAmt, 0, Subtarget, DAG);
26915 }
26916 case COMPRESS_EXPAND_IN_REG: {
26917 SDValue Mask = Op.getOperand(3);
26918 SDValue DataToCompress = Op.getOperand(1);
26919 SDValue PassThru = Op.getOperand(2);
26920 if (ISD::isBuildVectorAllOnes(Mask.getNode())) // return data as is
26921 return Op.getOperand(1);
26922
26923 // Avoid false dependency.
26924 if (PassThru.isUndef())
26925 PassThru = getZeroVector(VT, Subtarget, DAG, dl);
26926
26927 return DAG.getNode(IntrData->Opc0, dl, VT, DataToCompress, PassThru,
26928 Mask);
26929 }
26930 case FIXUPIMM:
26931 case FIXUPIMM_MASKZ: {
26932 SDValue Src1 = Op.getOperand(1);
26933 SDValue Src2 = Op.getOperand(2);
26934 SDValue Src3 = Op.getOperand(3);
26935 SDValue Imm = Op.getOperand(4);
26936 SDValue Mask = Op.getOperand(5);
26937 SDValue Passthru = (IntrData->Type == FIXUPIMM)
26938 ? Src1
26939 : getZeroVector(VT, Subtarget, DAG, dl);
26940
26941 unsigned Opc = IntrData->Opc0;
26942 if (IntrData->Opc1 != 0) {
26943 SDValue Sae = Op.getOperand(6);
26944 if (isRoundModeSAE(Sae))
26945 Opc = IntrData->Opc1;
26946 else if (!isRoundModeCurDirection(Sae))
26947 return SDValue();
26948 }
26949
26950 SDValue FixupImm = DAG.getNode(Opc, dl, VT, Src1, Src2, Src3, Imm);
26951
26952 if (VT.isVector())
26953 return getVectorMaskingNode(FixupImm, Mask, Passthru, Subtarget, DAG);
26954
26955 return getScalarMaskingNode(FixupImm, Mask, Passthru, Subtarget, DAG);
26956 }
26957 case ROUNDP: {
26958 assert(IntrData->Opc0 == X86ISD::VRNDSCALE && "Unexpected opcode");
26959 // Clear the upper bits of the rounding immediate so that the legacy
26960 // intrinsic can't trigger the scaling behavior of VRNDSCALE.
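// (Bits 7:4 of the VRNDSCALE immediate request rounding to a multiple of
// 2^-M; the legacy ROUND* immediates only define bits 3:0, hence the mask.)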
26961 uint64_t Round = Op.getConstantOperandVal(2);
26962 SDValue RoundingMode = DAG.getTargetConstant(Round & 0xf, dl, MVT::i32);
26963 return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(),
26964 Op.getOperand(1), RoundingMode);
26965 }
26966 case ROUNDS: {
26967 assert(IntrData->Opc0 == X86ISD::VRNDSCALES && "Unexpected opcode");
26968 // Clear the upper bits of the rounding immediate so that the legacy
26969 // intrinsic can't trigger the scaling behavior of VRNDSCALE.
26970 uint64_t Round = Op.getConstantOperandVal(3);
26971 SDValue RoundingMode = DAG.getTargetConstant(Round & 0xf, dl, MVT::i32);
26972 return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(),
26973 Op.getOperand(1), Op.getOperand(2), RoundingMode);
26974 }
26975 case BEXTRI: {
26976 assert(IntrData->Opc0 == X86ISD::BEXTRI && "Unexpected opcode");
26977
26978 uint64_t Imm = Op.getConstantOperandVal(2);
26979 SDValue Control = DAG.getTargetConstant(Imm & 0xffff, dl,
26980 Op.getValueType());
26981 return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(),
26982 Op.getOperand(1), Control);
26983 }
26984 // ADC/SBB
26985 case ADX: {
26986 SDVTList CFVTs = DAG.getVTList(Op->getValueType(0), MVT::i32);
26987 SDVTList VTs = DAG.getVTList(Op.getOperand(2).getValueType(), MVT::i32);
26988
26989 SDValue Res;
26990 // If the carry in is zero, then we should just use ADD/SUB instead of
26991 // ADC/SBB.
26992 if (isNullConstant(Op.getOperand(1))) {
26993 Res = DAG.getNode(IntrData->Opc1, dl, VTs, Op.getOperand(2),
26994 Op.getOperand(3));
26995 } else {
26996 SDValue GenCF = DAG.getNode(X86ISD::ADD, dl, CFVTs, Op.getOperand(1),
26997 DAG.getAllOnesConstant(dl, MVT::i8));
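// Adding all-ones to a non-zero i8 carry-in wraps around and sets CF; the
// ADC/SBB below then consumes that flag through GenCF's second result.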
26998 Res = DAG.getNode(IntrData->Opc0, dl, VTs, Op.getOperand(2),
26999 Op.getOperand(3), GenCF.getValue(1));
27000 }
27001 SDValue SetCC = getSETCC(X86::COND_B, Res.getValue(1), dl, DAG);
27002 SDValue Results[] = { SetCC, Res };
27003 return DAG.getMergeValues(Results, dl);
27004 }
27005 case CVTPD2PS_MASK:
27006 case CVTPD2DQ_MASK:
27007 case CVTQQ2PS_MASK:
27008 case TRUNCATE_TO_REG: {
27009 SDValue Src = Op.getOperand(1);
27010 SDValue PassThru = Op.getOperand(2);
27011 SDValue Mask = Op.getOperand(3);
27012
27013 if (isAllOnesConstant(Mask))
27014 return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(), Src);
27015
27016 MVT SrcVT = Src.getSimpleValueType();
27017 MVT MaskVT = MVT::getVectorVT(MVT::i1, SrcVT.getVectorNumElements());
27018 Mask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);
27019 return DAG.getNode(IntrData->Opc1, dl, Op.getValueType(),
27020 {Src, PassThru, Mask});
27021 }
27022 case TRUNCATE2_TO_REG: {
27023 SDValue Src = Op.getOperand(1);
27024 SDValue Src2 = Op.getOperand(2);
27025 SDValue PassThru = Op.getOperand(3);
27026 SDValue Mask = Op.getOperand(4);
27027
27028 if (isAllOnesConstant(Mask))
27029 return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(), {Src, Src2});
27030
27031 MVT Src2VT = Src2.getSimpleValueType();
27032 MVT MaskVT = MVT::getVectorVT(MVT::i1, Src2VT.getVectorNumElements());
27033 Mask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);
27034 return DAG.getNode(IntrData->Opc1, dl, Op.getValueType(),
27035 {Src, Src2, PassThru, Mask});
27036 }
27037 case CVTPS2PH_MASK: {
27038 SDValue Src = Op.getOperand(1);
27039 SDValue Rnd = Op.getOperand(2);
27040 SDValue PassThru = Op.getOperand(3);
27041 SDValue Mask = Op.getOperand(4);
27042
27043 unsigned RC = 0;
27044 unsigned Opc = IntrData->Opc0;
27045 bool SAE = Src.getValueType().is512BitVector() &&
27046 (isRoundModeSAEToX(Rnd, RC) || isRoundModeSAE(Rnd));
27047 if (SAE) {
27049 Rnd = DAG.getTargetConstant(RC, dl, MVT::i32);
27050 }
27051
27052 if (isAllOnesConstant(Mask))
27053 return DAG.getNode(Opc, dl, Op.getValueType(), Src, Rnd);
27054
27055 if (SAE)
27057 else
27058 Opc = IntrData->Opc1;
27059 MVT SrcVT = Src.getSimpleValueType();
27060 MVT MaskVT = MVT::getVectorVT(MVT::i1, SrcVT.getVectorNumElements());
27061 Mask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);
27062 return DAG.getNode(Opc, dl, Op.getValueType(), Src, Rnd, PassThru, Mask);
27063 }
27064 case CVTNEPS2BF16_MASK: {
27065 SDValue Src = Op.getOperand(1);
27066 SDValue PassThru = Op.getOperand(2);
27067 SDValue Mask = Op.getOperand(3);
27068
27069 if (ISD::isBuildVectorAllOnes(Mask.getNode()))
27070 return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(), Src);
27071
27072 // Break false dependency.
27073 if (PassThru.isUndef())
27074 PassThru = DAG.getConstant(0, dl, PassThru.getValueType());
27075
27076 return DAG.getNode(IntrData->Opc1, dl, Op.getValueType(), Src, PassThru,
27077 Mask);
27078 }
27079 default:
27080 break;
27081 }
27082 }
27083
27084 switch (IntNo) {
27085 default: return SDValue(); // Don't custom lower most intrinsics.
27086
27087 // ptest and testp intrinsics. The intrinsic these come from are designed to
27088 // return an integer value, not just an instruction so lower it to the ptest
27089 // or testp pattern and a setcc for the result.
27090 case Intrinsic::x86_avx512_ktestc_b:
27091 case Intrinsic::x86_avx512_ktestc_w:
27092 case Intrinsic::x86_avx512_ktestc_d:
27093 case Intrinsic::x86_avx512_ktestc_q:
27094 case Intrinsic::x86_avx512_ktestz_b:
27095 case Intrinsic::x86_avx512_ktestz_w:
27096 case Intrinsic::x86_avx512_ktestz_d:
27097 case Intrinsic::x86_avx512_ktestz_q:
27098 case Intrinsic::x86_sse41_ptestz:
27099 case Intrinsic::x86_sse41_ptestc:
27100 case Intrinsic::x86_sse41_ptestnzc:
27101 case Intrinsic::x86_avx_ptestz_256:
27102 case Intrinsic::x86_avx_ptestc_256:
27103 case Intrinsic::x86_avx_ptestnzc_256:
27104 case Intrinsic::x86_avx_vtestz_ps:
27105 case Intrinsic::x86_avx_vtestc_ps:
27106 case Intrinsic::x86_avx_vtestnzc_ps:
27107 case Intrinsic::x86_avx_vtestz_pd:
27108 case Intrinsic::x86_avx_vtestc_pd:
27109 case Intrinsic::x86_avx_vtestnzc_pd:
27110 case Intrinsic::x86_avx_vtestz_ps_256:
27111 case Intrinsic::x86_avx_vtestc_ps_256:
27112 case Intrinsic::x86_avx_vtestnzc_ps_256:
27113 case Intrinsic::x86_avx_vtestz_pd_256:
27114 case Intrinsic::x86_avx_vtestc_pd_256:
27115 case Intrinsic::x86_avx_vtestnzc_pd_256: {
27116 unsigned TestOpc = X86ISD::PTEST;
27117 X86::CondCode X86CC;
27118 switch (IntNo) {
27119 default: llvm_unreachable("Bad fallthrough in Intrinsic lowering.");
27120 case Intrinsic::x86_avx512_ktestc_b:
27121 case Intrinsic::x86_avx512_ktestc_w:
27122 case Intrinsic::x86_avx512_ktestc_d:
27123 case Intrinsic::x86_avx512_ktestc_q:
27124 // CF = 1
27125 TestOpc = X86ISD::KTEST;
27126 X86CC = X86::COND_B;
27127 break;
27128 case Intrinsic::x86_avx512_ktestz_b:
27129 case Intrinsic::x86_avx512_ktestz_w:
27130 case Intrinsic::x86_avx512_ktestz_d:
27131 case Intrinsic::x86_avx512_ktestz_q:
27132 TestOpc = X86ISD::KTEST;
27133 X86CC = X86::COND_E;
27134 break;
27135 case Intrinsic::x86_avx_vtestz_ps:
27136 case Intrinsic::x86_avx_vtestz_pd:
27137 case Intrinsic::x86_avx_vtestz_ps_256:
27138 case Intrinsic::x86_avx_vtestz_pd_256:
27139 TestOpc = X86ISD::TESTP;
27140 [[fallthrough]];
27141 case Intrinsic::x86_sse41_ptestz:
27142 case Intrinsic::x86_avx_ptestz_256:
27143 // ZF = 1
27144 X86CC = X86::COND_E;
27145 break;
27146 case Intrinsic::x86_avx_vtestc_ps:
27147 case Intrinsic::x86_avx_vtestc_pd:
27148 case Intrinsic::x86_avx_vtestc_ps_256:
27149 case Intrinsic::x86_avx_vtestc_pd_256:
27150 TestOpc = X86ISD::TESTP;
27151 [[fallthrough]];
27152 case Intrinsic::x86_sse41_ptestc:
27153 case Intrinsic::x86_avx_ptestc_256:
27154 // CF = 1
27155 X86CC = X86::COND_B;
27156 break;
27157 case Intrinsic::x86_avx_vtestnzc_ps:
27158 case Intrinsic::x86_avx_vtestnzc_pd:
27159 case Intrinsic::x86_avx_vtestnzc_ps_256:
27160 case Intrinsic::x86_avx_vtestnzc_pd_256:
27161 TestOpc = X86ISD::TESTP;
27162 [[fallthrough]];
27163 case Intrinsic::x86_sse41_ptestnzc:
27164 case Intrinsic::x86_avx_ptestnzc_256:
27165 // ZF and CF = 0
27166 X86CC = X86::COND_A;
27167 break;
27168 }
27169
27170 SDValue LHS = Op.getOperand(1);
27171 SDValue RHS = Op.getOperand(2);
27172 SDValue Test = DAG.getNode(TestOpc, dl, MVT::i32, LHS, RHS);
27173 SDValue SetCC = getSETCC(X86CC, Test, dl, DAG);
27174 return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, SetCC);
27175 }
27176
27177 case Intrinsic::x86_sse42_pcmpistria128:
27178 case Intrinsic::x86_sse42_pcmpestria128:
27179 case Intrinsic::x86_sse42_pcmpistric128:
27180 case Intrinsic::x86_sse42_pcmpestric128:
27181 case Intrinsic::x86_sse42_pcmpistrio128:
27182 case Intrinsic::x86_sse42_pcmpestrio128:
27183 case Intrinsic::x86_sse42_pcmpistris128:
27184 case Intrinsic::x86_sse42_pcmpestris128:
27185 case Intrinsic::x86_sse42_pcmpistriz128:
27186 case Intrinsic::x86_sse42_pcmpestriz128: {
27187 unsigned Opcode;
27188 X86::CondCode X86CC;
27189 switch (IntNo) {
27190 default: llvm_unreachable("Impossible intrinsic"); // Can't reach here.
27191 case Intrinsic::x86_sse42_pcmpistria128:
27192 Opcode = X86ISD::PCMPISTR;
27193 X86CC = X86::COND_A;
27194 break;
27195 case Intrinsic::x86_sse42_pcmpestria128:
27196 Opcode = X86ISD::PCMPESTR;
27197 X86CC = X86::COND_A;
27198 break;
27199 case Intrinsic::x86_sse42_pcmpistric128:
27200 Opcode = X86ISD::PCMPISTR;
27201 X86CC = X86::COND_B;
27202 break;
27203 case Intrinsic::x86_sse42_pcmpestric128:
27204 Opcode = X86ISD::PCMPESTR;
27205 X86CC = X86::COND_B;
27206 break;
27207 case Intrinsic::x86_sse42_pcmpistrio128:
27208 Opcode = X86ISD::PCMPISTR;
27209 X86CC = X86::COND_O;
27210 break;
27211 case Intrinsic::x86_sse42_pcmpestrio128:
27212 Opcode = X86ISD::PCMPESTR;
27213 X86CC = X86::COND_O;
27214 break;
27215 case Intrinsic::x86_sse42_pcmpistris128:
27216 Opcode = X86ISD::PCMPISTR;
27217 X86CC = X86::COND_S;
27218 break;
27219 case Intrinsic::x86_sse42_pcmpestris128:
27220 Opcode = X86ISD::PCMPESTR;
27221 X86CC = X86::COND_S;
27222 break;
27223 case Intrinsic::x86_sse42_pcmpistriz128:
27224 Opcode = X86ISD::PCMPISTR;
27225 X86CC = X86::COND_E;
27226 break;
27227 case Intrinsic::x86_sse42_pcmpestriz128:
27228 Opcode = X86ISD::PCMPESTR;
27229 X86CC = X86::COND_E;
27230 break;
27231 }
27232    SmallVector<SDValue, 5> NewOps(llvm::drop_begin(Op->ops()));
27233    SDVTList VTs = DAG.getVTList(MVT::i32, MVT::v16i8, MVT::i32);
27234 SDValue PCMP = DAG.getNode(Opcode, dl, VTs, NewOps).getValue(2);
27235 SDValue SetCC = getSETCC(X86CC, PCMP, dl, DAG);
27236 return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, SetCC);
27237 }
27238
27239 case Intrinsic::x86_sse42_pcmpistri128:
27240 case Intrinsic::x86_sse42_pcmpestri128: {
27241 unsigned Opcode;
27242 if (IntNo == Intrinsic::x86_sse42_pcmpistri128)
27243 Opcode = X86ISD::PCMPISTR;
27244 else
27245 Opcode = X86ISD::PCMPESTR;
27246
27247    SmallVector<SDValue, 5> NewOps(llvm::drop_begin(Op->ops()));
27248    SDVTList VTs = DAG.getVTList(MVT::i32, MVT::v16i8, MVT::i32);
27249 return DAG.getNode(Opcode, dl, VTs, NewOps);
27250 }
27251
27252 case Intrinsic::x86_sse42_pcmpistrm128:
27253 case Intrinsic::x86_sse42_pcmpestrm128: {
27254 unsigned Opcode;
27255 if (IntNo == Intrinsic::x86_sse42_pcmpistrm128)
27256 Opcode = X86ISD::PCMPISTR;
27257 else
27258 Opcode = X86ISD::PCMPESTR;
27259
27260    SmallVector<SDValue, 5> NewOps(llvm::drop_begin(Op->ops()));
27261    SDVTList VTs = DAG.getVTList(MVT::i32, MVT::v16i8, MVT::i32);
27262 return DAG.getNode(Opcode, dl, VTs, NewOps).getValue(1);
27263 }
27264
27265 case Intrinsic::eh_sjlj_lsda: {
27266 MachineFunction &MF = DAG.getMachineFunction();
27267 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
27268 MVT PtrVT = TLI.getPointerTy(DAG.getDataLayout());
27269 auto &Context = MF.getContext();
27270 MCSymbol *S = Context.getOrCreateSymbol(Twine("GCC_except_table") +
27271 Twine(MF.getFunctionNumber()));
27272 return DAG.getNode(getGlobalWrapperKind(nullptr, /*OpFlags=*/0), dl, VT,
27273 DAG.getMCSymbol(S, PtrVT));
27274 }
27275
27276 case Intrinsic::x86_seh_lsda: {
27277 // Compute the symbol for the LSDA. We know it'll get emitted later.
27278 MachineFunction &MF = DAG.getMachineFunction();
27279 SDValue Op1 = Op.getOperand(1);
27280 auto *Fn = cast<Function>(cast<GlobalAddressSDNode>(Op1)->getGlobal());
27281    MCSymbol *LSDASym = MF.getContext().getOrCreateLSDASymbol(
27282        GlobalValue::dropLLVMManglingEscape(Fn->getName()));
27283
27284 // Generate a simple absolute symbol reference. This intrinsic is only
27285 // supported on 32-bit Windows, which isn't PIC.
27286 SDValue Result = DAG.getMCSymbol(LSDASym, VT);
27287 return DAG.getNode(X86ISD::Wrapper, dl, VT, Result);
27288 }
27289
27290 case Intrinsic::eh_recoverfp: {
27291 SDValue FnOp = Op.getOperand(1);
27292 SDValue IncomingFPOp = Op.getOperand(2);
27293 GlobalAddressSDNode *GSD = dyn_cast<GlobalAddressSDNode>(FnOp);
27294 auto *Fn = dyn_cast_or_null<Function>(GSD ? GSD->getGlobal() : nullptr);
27295 if (!Fn)
27296      report_fatal_error(
27297          "llvm.eh.recoverfp must take a function as the first argument");
27298 return recoverFramePointer(DAG, Fn, IncomingFPOp);
27299 }
27300
27301 case Intrinsic::localaddress: {
27302 // Returns one of the stack, base, or frame pointer registers, depending on
27303 // which is used to reference local variables.
27304 MachineFunction &MF = DAG.getMachineFunction();
27305 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
27306 Register Reg;
27307 if (RegInfo->hasBasePointer(MF))
27308 Reg = RegInfo->getBaseRegister();
27309 else { // Handles the SP or FP case.
27310 bool CantUseFP = RegInfo->hasStackRealignment(MF);
27311 if (CantUseFP)
27312 Reg = RegInfo->getPtrSizedStackRegister(MF);
27313 else
27314 Reg = RegInfo->getPtrSizedFrameRegister(MF);
27315 }
27316 return DAG.getCopyFromReg(DAG.getEntryNode(), dl, Reg, VT);
27317 }
27318 case Intrinsic::x86_avx512_vp2intersect_q_512:
27319 case Intrinsic::x86_avx512_vp2intersect_q_256:
27320 case Intrinsic::x86_avx512_vp2intersect_q_128:
27321 case Intrinsic::x86_avx512_vp2intersect_d_512:
27322 case Intrinsic::x86_avx512_vp2intersect_d_256:
27323 case Intrinsic::x86_avx512_vp2intersect_d_128: {
27324 SDLoc DL(Op);
27325 MVT MaskVT = Op.getSimpleValueType();
27326 SDVTList VTs = DAG.getVTList(MVT::Untyped, MVT::Other);
27327    SDValue Operation = DAG.getNode(X86ISD::VP2INTERSECT, DL, VTs,
27328                                    Op.getOperand(1), Op.getOperand(2));
27329 SDValue Result0 =
27330 DAG.getTargetExtractSubreg(X86::sub_mask_0, DL, MaskVT, Operation);
27331 SDValue Result1 =
27332 DAG.getTargetExtractSubreg(X86::sub_mask_1, DL, MaskVT, Operation);
27333 return DAG.getMergeValues({Result0, Result1}, DL);
27334 }
27335 case Intrinsic::x86_mmx_pslli_w:
27336 case Intrinsic::x86_mmx_pslli_d:
27337 case Intrinsic::x86_mmx_pslli_q:
27338 case Intrinsic::x86_mmx_psrli_w:
27339 case Intrinsic::x86_mmx_psrli_d:
27340 case Intrinsic::x86_mmx_psrli_q:
27341 case Intrinsic::x86_mmx_psrai_w:
27342 case Intrinsic::x86_mmx_psrai_d: {
27343 SDLoc DL(Op);
27344 SDValue ShAmt = Op.getOperand(2);
27345 // If the argument is a constant, convert it to a target constant.
27346 if (auto *C = dyn_cast<ConstantSDNode>(ShAmt)) {
27347      // Clamp out-of-bounds shift amounts, since they would otherwise be masked
27348      // to 8 bits, which may make them appear in bounds again.
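      // Worked example (illustrative): a shift amount of 256 truncated to 8 bits
      // becomes 0 and would look in-bounds again; clamping it to 255 preserves
      // the over-shift behavior of the instruction.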
27349 unsigned ShiftAmount = C->getAPIntValue().getLimitedValue(255);
27350 if (ShiftAmount == 0)
27351 return Op.getOperand(1);
27352
27353 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, Op.getValueType(),
27354 Op.getOperand(0), Op.getOperand(1),
27355 DAG.getTargetConstant(ShiftAmount, DL, MVT::i32));
27356 }
27357
27358 unsigned NewIntrinsic;
27359 switch (IntNo) {
27360 default: llvm_unreachable("Impossible intrinsic"); // Can't reach here.
27361 case Intrinsic::x86_mmx_pslli_w:
27362 NewIntrinsic = Intrinsic::x86_mmx_psll_w;
27363 break;
27364 case Intrinsic::x86_mmx_pslli_d:
27365 NewIntrinsic = Intrinsic::x86_mmx_psll_d;
27366 break;
27367 case Intrinsic::x86_mmx_pslli_q:
27368 NewIntrinsic = Intrinsic::x86_mmx_psll_q;
27369 break;
27370 case Intrinsic::x86_mmx_psrli_w:
27371 NewIntrinsic = Intrinsic::x86_mmx_psrl_w;
27372 break;
27373 case Intrinsic::x86_mmx_psrli_d:
27374 NewIntrinsic = Intrinsic::x86_mmx_psrl_d;
27375 break;
27376 case Intrinsic::x86_mmx_psrli_q:
27377 NewIntrinsic = Intrinsic::x86_mmx_psrl_q;
27378 break;
27379 case Intrinsic::x86_mmx_psrai_w:
27380 NewIntrinsic = Intrinsic::x86_mmx_psra_w;
27381 break;
27382 case Intrinsic::x86_mmx_psrai_d:
27383 NewIntrinsic = Intrinsic::x86_mmx_psra_d;
27384 break;
27385 }
27386
27387    // The vector shift intrinsics with scalar amounts use 32-bit shift values,
27388    // but the SSE2/MMX shift instructions read 64 bits. Copy the 32 bits to an
27389 // MMX register.
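    // For example (illustrative): a non-constant llvm.x86.mmx.pslli.w(%v, %amt)
    // is re-emitted here as llvm.x86.mmx.psll.w(%v, MMX_MOVW2D(%amt)), so the
    // 32-bit amount ends up in the 64-bit MMX register that PSLLW reads.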
27390 ShAmt = DAG.getNode(X86ISD::MMX_MOVW2D, DL, MVT::x86mmx, ShAmt);
27391 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, Op.getValueType(),
27392 DAG.getTargetConstant(NewIntrinsic, DL,
27393                                             DAG.getTargetLoweringInfo().getPointerTy(DAG.getDataLayout())),
27394                        Op.getOperand(1), ShAmt);
27395 }
27396 case Intrinsic::thread_pointer: {
27397 if (Subtarget.isTargetELF()) {
27398 SDLoc dl(Op);
27399 EVT PtrVT = Op.getValueType();
27400 // Get the Thread Pointer, which is %gs:0 (32-bit) or %fs:0 (64-bit).
27401      Value *Ptr = Constant::getNullValue(PointerType::get(
27402          *DAG.getContext(), Subtarget.is64Bit() ? X86AS::FS : X86AS::GS));
27403 return DAG.getLoad(PtrVT, dl, DAG.getEntryNode(),
27404 DAG.getIntPtrConstant(0, dl), MachinePointerInfo(Ptr));
27405 }
27406    report_fatal_error(
27407        "Target OS doesn't support __builtin_thread_pointer() yet.");
27408 }
27409 }
27410}
27411
27412 static SDValue getAVX2GatherNode(unsigned Opc, SDValue Op, SelectionDAG &DAG,
27413                                  SDValue Src, SDValue Mask, SDValue Base,
27414 SDValue Index, SDValue ScaleOp, SDValue Chain,
27415 const X86Subtarget &Subtarget) {
27416 SDLoc dl(Op);
27417 auto *C = dyn_cast<ConstantSDNode>(ScaleOp);
27418 // Scale must be constant.
27419 if (!C)
27420 return SDValue();
27421 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
27422 SDValue Scale = DAG.getTargetConstant(C->getZExtValue(), dl,
27423 TLI.getPointerTy(DAG.getDataLayout()));
27424 EVT MaskVT = Mask.getValueType().changeVectorElementTypeToInteger();
27425 SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::Other);
27426 // If source is undef or we know it won't be used, use a zero vector
27427 // to break register dependency.
27428 // TODO: use undef instead and let BreakFalseDeps deal with it?
27429 if (Src.isUndef() || ISD::isBuildVectorAllOnes(Mask.getNode()))
27430 Src = getZeroVector(Op.getSimpleValueType(), Subtarget, DAG, dl);
27431
27432 // Cast mask to an integer type.
27433 Mask = DAG.getBitcast(MaskVT, Mask);
27434
27435   MemIntrinsicSDNode *MemIntr = cast<MemIntrinsicSDNode>(Op);
27436
27437 SDValue Ops[] = {Chain, Src, Mask, Base, Index, Scale };
27438 SDValue Res =
27439       DAG.getMemIntrinsicNode(Opc, dl, VTs, Ops,
27440                               MemIntr->getMemoryVT(), MemIntr->getMemOperand());
27441 return DAG.getMergeValues({Res, Res.getValue(1)}, dl);
27442}
27443
27444 static SDValue getGatherNode(SDValue Op, SelectionDAG &DAG,
27445                              SDValue Src, SDValue Mask, SDValue Base,
27446 SDValue Index, SDValue ScaleOp, SDValue Chain,
27447 const X86Subtarget &Subtarget) {
27448 MVT VT = Op.getSimpleValueType();
27449 SDLoc dl(Op);
27450 auto *C = dyn_cast<ConstantSDNode>(ScaleOp);
27451 // Scale must be constant.
27452 if (!C)
27453 return SDValue();
27454 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
27455 SDValue Scale = DAG.getTargetConstant(C->getZExtValue(), dl,
27456 TLI.getPointerTy(DAG.getDataLayout()));
27457 unsigned MinElts = std::min(Index.getSimpleValueType().getVectorNumElements(),
27458                                  VT.getVectorNumElements());
27459   MVT MaskVT = MVT::getVectorVT(MVT::i1, MinElts);
27460
27461 // We support two versions of the gather intrinsics. One with scalar mask and
27462 // one with vXi1 mask. Convert scalar to vXi1 if necessary.
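  // For example (illustrative): an i8 mask for an 8-element gather is turned by
  // getMaskNode into a v8i1 value, so both intrinsic flavors feed the same
  // masked gather node below.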
27463 if (Mask.getValueType() != MaskVT)
27464 Mask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);
27465
27466 SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::Other);
27467 // If source is undef or we know it won't be used, use a zero vector
27468 // to break register dependency.
27469 // TODO: use undef instead and let BreakFalseDeps deal with it?
27470 if (Src.isUndef() || ISD::isBuildVectorAllOnes(Mask.getNode()))
27471 Src = getZeroVector(Op.getSimpleValueType(), Subtarget, DAG, dl);
27472
27473   MemIntrinsicSDNode *MemIntr = cast<MemIntrinsicSDNode>(Op);
27474
27475 SDValue Ops[] = {Chain, Src, Mask, Base, Index, Scale };
27476 SDValue Res =
27477       DAG.getMemIntrinsicNode(X86ISD::MGATHER, dl, VTs, Ops,
27478                               MemIntr->getMemoryVT(), MemIntr->getMemOperand());
27479 return DAG.getMergeValues({Res, Res.getValue(1)}, dl);
27480}
27481
27482 static SDValue getScatterNode(unsigned Opc, SDValue Op, SelectionDAG &DAG,
27483                               SDValue Src, SDValue Mask, SDValue Base,
27484 SDValue Index, SDValue ScaleOp, SDValue Chain,
27485 const X86Subtarget &Subtarget) {
27486 SDLoc dl(Op);
27487 auto *C = dyn_cast<ConstantSDNode>(ScaleOp);
27488 // Scale must be constant.
27489 if (!C)
27490 return SDValue();
27491 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
27492 SDValue Scale = DAG.getTargetConstant(C->getZExtValue(), dl,
27493 TLI.getPointerTy(DAG.getDataLayout()));
27494 unsigned MinElts = std::min(Index.getSimpleValueType().getVectorNumElements(),
27495 Src.getSimpleValueType().getVectorNumElements());
27496 MVT MaskVT = MVT::getVectorVT(MVT::i1, MinElts);
27497
27498 // We support two versions of the scatter intrinsics. One with scalar mask and
27499 // one with vXi1 mask. Convert scalar to vXi1 if necessary.
27500 if (Mask.getValueType() != MaskVT)
27501 Mask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);
27502
27503   MemIntrinsicSDNode *MemIntr = cast<MemIntrinsicSDNode>(Op);
27504
27505 SDVTList VTs = DAG.getVTList(MVT::Other);
27506 SDValue Ops[] = {Chain, Src, Mask, Base, Index, Scale};
27507 SDValue Res =
27508       DAG.getMemIntrinsicNode(Opc, dl, VTs, Ops,
27509                               MemIntr->getMemoryVT(), MemIntr->getMemOperand());
27510 return Res;
27511}
27512
27513 static SDValue getPrefetchNode(unsigned Opc, SDValue Op, SelectionDAG &DAG,
27514                                SDValue Mask, SDValue Base, SDValue Index,
27515 SDValue ScaleOp, SDValue Chain,
27516 const X86Subtarget &Subtarget) {
27517 SDLoc dl(Op);
27518 auto *C = dyn_cast<ConstantSDNode>(ScaleOp);
27519 // Scale must be constant.
27520 if (!C)
27521 return SDValue();
27522 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
27523 SDValue Scale = DAG.getTargetConstant(C->getZExtValue(), dl,
27524 TLI.getPointerTy(DAG.getDataLayout()));
27525 SDValue Disp = DAG.getTargetConstant(0, dl, MVT::i32);
27526 SDValue Segment = DAG.getRegister(0, MVT::i32);
27527 MVT MaskVT =
27528 MVT::getVectorVT(MVT::i1, Index.getSimpleValueType().getVectorNumElements());
27529 SDValue VMask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);
27530 SDValue Ops[] = {VMask, Base, Scale, Index, Disp, Segment, Chain};
27531 SDNode *Res = DAG.getMachineNode(Opc, dl, MVT::Other, Ops);
27532 return SDValue(Res, 0);
27533}
27534
27535/// Handles the lowering of builtin intrinsics with chain that return their
27536/// value into registers EDX:EAX.
27537/// If operand SrcReg is a valid register identifier, then operand 2 of N is
27538/// copied to SrcReg. The assumption is that SrcReg is an implicit input to
27539/// TargetOpcode.
27540/// Returns a Glue value which can be used to add extra copy-from-reg if the
27541/// expanded intrinsic implicitly defines extra registers (i.e. not just
27542/// EDX:EAX).
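// For example (illustrative): for a 64-bit RDTSC expansion this helper copies
// RAX and RDX out of the machine node and pushes (RDX << 32) | RAX plus the
// updated chain into Results; 32-bit targets get a BUILD_PAIR of EAX/EDX
// instead.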
27543 static SDValue expandIntrinsicWChainHelper(SDNode *N, const SDLoc &DL,
27544                                            SelectionDAG &DAG,
27545 unsigned TargetOpcode,
27546 unsigned SrcReg,
27547 const X86Subtarget &Subtarget,
27548                                            SmallVectorImpl<SDValue> &Results) {
27549   SDValue Chain = N->getOperand(0);
27550 SDValue Glue;
27551
27552 if (SrcReg) {
27553 assert(N->getNumOperands() == 3 && "Unexpected number of operands!");
27554 Chain = DAG.getCopyToReg(Chain, DL, SrcReg, N->getOperand(2), Glue);
27555 Glue = Chain.getValue(1);
27556 }
27557
27558 SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Glue);
27559 SDValue N1Ops[] = {Chain, Glue};
27560 SDNode *N1 = DAG.getMachineNode(
27561 TargetOpcode, DL, Tys, ArrayRef<SDValue>(N1Ops, Glue.getNode() ? 2 : 1));
27562 Chain = SDValue(N1, 0);
27563
27564 // Reads the content of XCR and returns it in registers EDX:EAX.
27565 SDValue LO, HI;
27566 if (Subtarget.is64Bit()) {
27567 LO = DAG.getCopyFromReg(Chain, DL, X86::RAX, MVT::i64, SDValue(N1, 1));
27568 HI = DAG.getCopyFromReg(LO.getValue(1), DL, X86::RDX, MVT::i64,
27569 LO.getValue(2));
27570 } else {
27571 LO = DAG.getCopyFromReg(Chain, DL, X86::EAX, MVT::i32, SDValue(N1, 1));
27572 HI = DAG.getCopyFromReg(LO.getValue(1), DL, X86::EDX, MVT::i32,
27573 LO.getValue(2));
27574 }
27575 Chain = HI.getValue(1);
27576 Glue = HI.getValue(2);
27577
27578 if (Subtarget.is64Bit()) {
27579 // Merge the two 32-bit values into a 64-bit one.
27580 SDValue Tmp = DAG.getNode(ISD::SHL, DL, MVT::i64, HI,
27581 DAG.getConstant(32, DL, MVT::i8));
27582 Results.push_back(DAG.getNode(ISD::OR, DL, MVT::i64, LO, Tmp));
27583 Results.push_back(Chain);
27584 return Glue;
27585 }
27586
27587 // Use a buildpair to merge the two 32-bit values into a 64-bit one.
27588 SDValue Ops[] = { LO, HI };
27589 SDValue Pair = DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i64, Ops);
27590 Results.push_back(Pair);
27591 Results.push_back(Chain);
27592 return Glue;
27593}
27594
27595/// Handles the lowering of builtin intrinsics that read the time stamp counter
27596/// (x86_rdtsc and x86_rdtscp). This function is also used to custom lower
27597/// READCYCLECOUNTER nodes.
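// For example (illustrative): lowering llvm.x86.rdtscp produces the merged
// EDX:EAX counter value from the helper above and additionally copies ECX
// (IA32_TSC_AUX) off the chain, so callers see { TSC, TSC_AUX, chain }.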
27598static void getReadTimeStampCounter(SDNode *N, const SDLoc &DL, unsigned Opcode,
27599 SelectionDAG &DAG,
27600 const X86Subtarget &Subtarget,
27601                                     SmallVectorImpl<SDValue> &Results) {
27602   // The processor's time-stamp counter (a 64-bit MSR) is stored into the
27603 // EDX:EAX registers. EDX is loaded with the high-order 32 bits of the MSR
27604 // and the EAX register is loaded with the low-order 32 bits.
27605 SDValue Glue = expandIntrinsicWChainHelper(N, DL, DAG, Opcode,
27606 /* NoRegister */0, Subtarget,
27607 Results);
27608 if (Opcode != X86::RDTSCP)
27609 return;
27610
27611 SDValue Chain = Results[1];
27612 // Instruction RDTSCP loads the IA32:TSC_AUX_MSR (address C000_0103H) into
27613 // the ECX register. Add 'ecx' explicitly to the chain.
27614 SDValue ecx = DAG.getCopyFromReg(Chain, DL, X86::ECX, MVT::i32, Glue);
27615 Results[1] = ecx;
27616 Results.push_back(ecx.getValue(1));
27617}
27618
27619 static SDValue LowerREADCYCLECOUNTER(SDValue Op, const X86Subtarget &Subtarget,
27620                                      SelectionDAG &DAG) {
27621   SmallVector<SDValue, 3> Results;
27622   SDLoc DL(Op);
27623 getReadTimeStampCounter(Op.getNode(), DL, X86::RDTSC, DAG, Subtarget,
27624 Results);
27625 return DAG.getMergeValues(Results, DL);
27626}
27627
27628 static SDValue MarkEHRegistrationNode(SDValue Op, SelectionDAG &DAG) {
27629   MachineFunction &MF = DAG.getMachineFunction();
27630   SDValue Chain = Op.getOperand(0);
27631 SDValue RegNode = Op.getOperand(2);
27632 WinEHFuncInfo *EHInfo = MF.getWinEHFuncInfo();
27633 if (!EHInfo)
27634 report_fatal_error("EH registrations only live in functions using WinEH");
27635
27636 // Cast the operand to an alloca, and remember the frame index.
27637 auto *FINode = dyn_cast<FrameIndexSDNode>(RegNode);
27638 if (!FINode)
27639 report_fatal_error("llvm.x86.seh.ehregnode expects a static alloca");
27640 EHInfo->EHRegNodeFrameIndex = FINode->getIndex();
27641
27642 // Return the chain operand without making any DAG nodes.
27643 return Chain;
27644}
27645
27646 static SDValue MarkEHGuard(SDValue Op, SelectionDAG &DAG) {
27647   MachineFunction &MF = DAG.getMachineFunction();
27648   SDValue Chain = Op.getOperand(0);
27649 SDValue EHGuard = Op.getOperand(2);
27650 WinEHFuncInfo *EHInfo = MF.getWinEHFuncInfo();
27651 if (!EHInfo)
27652 report_fatal_error("EHGuard only live in functions using WinEH");
27653
27654 // Cast the operand to an alloca, and remember the frame index.
27655 auto *FINode = dyn_cast<FrameIndexSDNode>(EHGuard);
27656 if (!FINode)
27657 report_fatal_error("llvm.x86.seh.ehguard expects a static alloca");
27658 EHInfo->EHGuardFrameIndex = FINode->getIndex();
27659
27660 // Return the chain operand without making any DAG nodes.
27661 return Chain;
27662}
27663
27664/// Emit Truncating Store with signed or unsigned saturation.
27665static SDValue
27666EmitTruncSStore(bool SignedSat, SDValue Chain, const SDLoc &DL, SDValue Val,
27667 SDValue Ptr, EVT MemVT, MachineMemOperand *MMO,
27668 SelectionDAG &DAG) {
27669 SDVTList VTs = DAG.getVTList(MVT::Other);
27670 SDValue Undef = DAG.getUNDEF(Ptr.getValueType());
27671 SDValue Ops[] = { Chain, Val, Ptr, Undef };
27672 unsigned Opc = SignedSat ? X86ISD::VTRUNCSTORES : X86ISD::VTRUNCSTOREUS;
27673 return DAG.getMemIntrinsicNode(Opc, DL, VTs, Ops, MemVT, MMO);
27674}
27675
27676/// Emit Masked Truncating Store with signed or unsigned saturation.
27677static SDValue EmitMaskedTruncSStore(bool SignedSat, SDValue Chain,
27678 const SDLoc &DL,
27679 SDValue Val, SDValue Ptr, SDValue Mask, EVT MemVT,
27680 MachineMemOperand *MMO, SelectionDAG &DAG) {
27681 SDVTList VTs = DAG.getVTList(MVT::Other);
27682 SDValue Ops[] = { Chain, Val, Ptr, Mask };
27683 unsigned Opc = SignedSat ? X86ISD::VMTRUNCSTORES : X86ISD::VMTRUNCSTOREUS;
27684 return DAG.getMemIntrinsicNode(Opc, DL, VTs, Ops, MemVT, MMO);
27685}
27686
27687 bool X86::isExtendedSwiftAsyncFrameSupported(const X86Subtarget &Subtarget,
27688                                              const MachineFunction &MF) {
27689 if (!Subtarget.is64Bit())
27690 return false;
27691 // 64-bit targets support extended Swift async frame setup,
27692 // except for targets that use the windows 64 prologue.
27693 return !MF.getTarget().getMCAsmInfo()->usesWindowsCFI();
27694}
27695
27696 static SDValue LowerINTRINSIC_W_CHAIN(SDValue Op, const X86Subtarget &Subtarget,
27697                                       SelectionDAG &DAG) {
27698 unsigned IntNo = Op.getConstantOperandVal(1);
27699 const IntrinsicData *IntrData = getIntrinsicWithChain(IntNo);
27700 if (!IntrData) {
27701 switch (IntNo) {
27702
27703 case Intrinsic::swift_async_context_addr: {
27704 SDLoc dl(Op);
27705 auto &MF = DAG.getMachineFunction();
27706 auto *X86FI = MF.getInfo<X86MachineFunctionInfo>();
27707 if (X86::isExtendedSwiftAsyncFrameSupported(Subtarget, MF)) {
27708         MF.getFrameInfo().setFrameAddressIsTaken(true);
27709         X86FI->setHasSwiftAsyncContext(true);
27710 SDValue Chain = Op->getOperand(0);
27711 SDValue CopyRBP = DAG.getCopyFromReg(Chain, dl, X86::RBP, MVT::i64);
27712 SDValue Result =
27713 SDValue(DAG.getMachineNode(X86::SUB64ri32, dl, MVT::i64, CopyRBP,
27714 DAG.getTargetConstant(8, dl, MVT::i32)),
27715 0);
27716 // Return { result, chain }.
27717 return DAG.getNode(ISD::MERGE_VALUES, dl, Op->getVTList(), Result,
27718 CopyRBP.getValue(1));
27719 } else {
27720 // No special extended frame, create or reuse an existing stack slot.
27721 int PtrSize = Subtarget.is64Bit() ? 8 : 4;
27722 if (!X86FI->getSwiftAsyncContextFrameIdx())
27723 X86FI->setSwiftAsyncContextFrameIdx(
27724 MF.getFrameInfo().CreateStackObject(PtrSize, Align(PtrSize),
27725 false));
27726 SDValue Result =
27727 DAG.getFrameIndex(*X86FI->getSwiftAsyncContextFrameIdx(),
27728 PtrSize == 8 ? MVT::i64 : MVT::i32);
27729 // Return { result, chain }.
27730 return DAG.getNode(ISD::MERGE_VALUES, dl, Op->getVTList(), Result,
27731 Op->getOperand(0));
27732 }
27733 }
27734
27735 case llvm::Intrinsic::x86_seh_ehregnode:
27736 return MarkEHRegistrationNode(Op, DAG);
27737 case llvm::Intrinsic::x86_seh_ehguard:
27738 return MarkEHGuard(Op, DAG);
27739 case llvm::Intrinsic::x86_rdpkru: {
27740 SDLoc dl(Op);
27741 SDVTList VTs = DAG.getVTList(MVT::i32, MVT::Other);
27742 // Create a RDPKRU node and pass 0 to the ECX parameter.
27743 return DAG.getNode(X86ISD::RDPKRU, dl, VTs, Op.getOperand(0),
27744 DAG.getConstant(0, dl, MVT::i32));
27745 }
27746 case llvm::Intrinsic::x86_wrpkru: {
27747 SDLoc dl(Op);
27748 // Create a WRPKRU node, pass the input to the EAX parameter, and pass 0
27749 // to the EDX and ECX parameters.
27750 return DAG.getNode(X86ISD::WRPKRU, dl, MVT::Other,
27751 Op.getOperand(0), Op.getOperand(2),
27752 DAG.getConstant(0, dl, MVT::i32),
27753 DAG.getConstant(0, dl, MVT::i32));
27754 }
27755 case llvm::Intrinsic::asan_check_memaccess: {
27756 // Mark this as adjustsStack because it will be lowered to a call.
27757       DAG.getMachineFunction().getFrameInfo().setAdjustsStack(true);
27758       // Don't do anything here, we will expand these intrinsics out later.
27759 return Op;
27760 }
27761 case llvm::Intrinsic::x86_flags_read_u32:
27762 case llvm::Intrinsic::x86_flags_read_u64:
27763 case llvm::Intrinsic::x86_flags_write_u32:
27764 case llvm::Intrinsic::x86_flags_write_u64: {
27765 // We need a frame pointer because this will get lowered to a PUSH/POP
27766 // sequence.
27767       MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
27768       MFI.setHasCopyImplyingStackAdjustment(true);
27769       // Don't do anything here, we will expand these intrinsics out later
27770 // during FinalizeISel in EmitInstrWithCustomInserter.
27771 return Op;
27772 }
27773 case Intrinsic::x86_lwpins32:
27774 case Intrinsic::x86_lwpins64:
27775 case Intrinsic::x86_umwait:
27776 case Intrinsic::x86_tpause: {
27777 SDLoc dl(Op);
27778 SDValue Chain = Op->getOperand(0);
27779 SDVTList VTs = DAG.getVTList(MVT::i32, MVT::Other);
27780 unsigned Opcode;
27781
27782 switch (IntNo) {
27783 default: llvm_unreachable("Impossible intrinsic");
27784 case Intrinsic::x86_umwait:
27785 Opcode = X86ISD::UMWAIT;
27786 break;
27787 case Intrinsic::x86_tpause:
27788 Opcode = X86ISD::TPAUSE;
27789 break;
27790 case Intrinsic::x86_lwpins32:
27791 case Intrinsic::x86_lwpins64:
27792 Opcode = X86ISD::LWPINS;
27793 break;
27794 }
27795
27796       SDValue Operation =
27797           DAG.getNode(Opcode, dl, VTs, Chain, Op->getOperand(2),
27798 Op->getOperand(3), Op->getOperand(4));
27799 SDValue SetCC = getSETCC(X86::COND_B, Operation.getValue(0), dl, DAG);
27800 return DAG.getNode(ISD::MERGE_VALUES, dl, Op->getVTList(), SetCC,
27801 Operation.getValue(1));
27802 }
27803 case Intrinsic::x86_enqcmd:
27804 case Intrinsic::x86_enqcmds: {
27805 SDLoc dl(Op);
27806 SDValue Chain = Op.getOperand(0);
27807 SDVTList VTs = DAG.getVTList(MVT::i32, MVT::Other);
27808 unsigned Opcode;
27809 switch (IntNo) {
27810 default: llvm_unreachable("Impossible intrinsic!");
27811 case Intrinsic::x86_enqcmd:
27812 Opcode = X86ISD::ENQCMD;
27813 break;
27814 case Intrinsic::x86_enqcmds:
27815 Opcode = X86ISD::ENQCMDS;
27816 break;
27817 }
27818 SDValue Operation = DAG.getNode(Opcode, dl, VTs, Chain, Op.getOperand(2),
27819 Op.getOperand(3));
27820 SDValue SetCC = getSETCC(X86::COND_E, Operation.getValue(0), dl, DAG);
27821 return DAG.getNode(ISD::MERGE_VALUES, dl, Op->getVTList(), SetCC,
27822 Operation.getValue(1));
27823 }
27824 case Intrinsic::x86_aesenc128kl:
27825 case Intrinsic::x86_aesdec128kl:
27826 case Intrinsic::x86_aesenc256kl:
27827 case Intrinsic::x86_aesdec256kl: {
27828 SDLoc DL(Op);
27829 SDVTList VTs = DAG.getVTList(MVT::v2i64, MVT::i32, MVT::Other);
27830 SDValue Chain = Op.getOperand(0);
27831 unsigned Opcode;
27832
27833 switch (IntNo) {
27834 default: llvm_unreachable("Impossible intrinsic");
27835 case Intrinsic::x86_aesenc128kl:
27836 Opcode = X86ISD::AESENC128KL;
27837 break;
27838 case Intrinsic::x86_aesdec128kl:
27839 Opcode = X86ISD::AESDEC128KL;
27840 break;
27841 case Intrinsic::x86_aesenc256kl:
27842 Opcode = X86ISD::AESENC256KL;
27843 break;
27844 case Intrinsic::x86_aesdec256kl:
27845 Opcode = X86ISD::AESDEC256KL;
27846 break;
27847 }
27848
27849       MemIntrinsicSDNode *MemIntr = cast<MemIntrinsicSDNode>(Op);
27850       MachineMemOperand *MMO = MemIntr->getMemOperand();
27851 EVT MemVT = MemIntr->getMemoryVT();
27852       SDValue Operation = DAG.getMemIntrinsicNode(
27853           Opcode, DL, VTs, {Chain, Op.getOperand(2), Op.getOperand(3)}, MemVT,
27854 MMO);
27855 SDValue ZF = getSETCC(X86::COND_E, Operation.getValue(1), DL, DAG);
27856
27857 return DAG.getNode(ISD::MERGE_VALUES, DL, Op->getVTList(),
27858 {ZF, Operation.getValue(0), Operation.getValue(2)});
27859 }
27860 case Intrinsic::x86_aesencwide128kl:
27861 case Intrinsic::x86_aesdecwide128kl:
27862 case Intrinsic::x86_aesencwide256kl:
27863 case Intrinsic::x86_aesdecwide256kl: {
27864 SDLoc DL(Op);
27865 SDVTList VTs = DAG.getVTList(
27866 {MVT::i32, MVT::v2i64, MVT::v2i64, MVT::v2i64, MVT::v2i64, MVT::v2i64,
27867 MVT::v2i64, MVT::v2i64, MVT::v2i64, MVT::Other});
27868 SDValue Chain = Op.getOperand(0);
27869 unsigned Opcode;
27870
27871 switch (IntNo) {
27872 default: llvm_unreachable("Impossible intrinsic");
27873 case Intrinsic::x86_aesencwide128kl:
27874 Opcode = X86ISD::AESENCWIDE128KL;
27875 break;
27876 case Intrinsic::x86_aesdecwide128kl:
27877 Opcode = X86ISD::AESDECWIDE128KL;
27878 break;
27879 case Intrinsic::x86_aesencwide256kl:
27880 Opcode = X86ISD::AESENCWIDE256KL;
27881 break;
27882 case Intrinsic::x86_aesdecwide256kl:
27883 Opcode = X86ISD::AESDECWIDE256KL;
27884 break;
27885 }
27886
27887       MemIntrinsicSDNode *MemIntr = cast<MemIntrinsicSDNode>(Op);
27888       MachineMemOperand *MMO = MemIntr->getMemOperand();
27889 EVT MemVT = MemIntr->getMemoryVT();
27890       SDValue Operation = DAG.getMemIntrinsicNode(
27891           Opcode, DL, VTs,
27892 {Chain, Op.getOperand(2), Op.getOperand(3), Op.getOperand(4),
27893 Op.getOperand(5), Op.getOperand(6), Op.getOperand(7),
27894 Op.getOperand(8), Op.getOperand(9), Op.getOperand(10)},
27895 MemVT, MMO);
27896 SDValue ZF = getSETCC(X86::COND_E, Operation.getValue(0), DL, DAG);
27897
27898 return DAG.getNode(ISD::MERGE_VALUES, DL, Op->getVTList(),
27899 {ZF, Operation.getValue(1), Operation.getValue(2),
27900 Operation.getValue(3), Operation.getValue(4),
27901 Operation.getValue(5), Operation.getValue(6),
27902 Operation.getValue(7), Operation.getValue(8),
27903 Operation.getValue(9)});
27904 }
27905 case Intrinsic::x86_testui: {
27906 SDLoc dl(Op);
27907 SDValue Chain = Op.getOperand(0);
27908 SDVTList VTs = DAG.getVTList(MVT::i32, MVT::Other);
27909 SDValue Operation = DAG.getNode(X86ISD::TESTUI, dl, VTs, Chain);
27910 SDValue SetCC = getSETCC(X86::COND_B, Operation.getValue(0), dl, DAG);
27911 return DAG.getNode(ISD::MERGE_VALUES, dl, Op->getVTList(), SetCC,
27912 Operation.getValue(1));
27913 }
27914 case Intrinsic::x86_t2rpntlvwz0rs_internal:
27915 case Intrinsic::x86_t2rpntlvwz0rst1_internal:
27916 case Intrinsic::x86_t2rpntlvwz1rs_internal:
27917 case Intrinsic::x86_t2rpntlvwz1rst1_internal:
27918 case Intrinsic::x86_t2rpntlvwz0_internal:
27919 case Intrinsic::x86_t2rpntlvwz0t1_internal:
27920 case Intrinsic::x86_t2rpntlvwz1_internal:
27921 case Intrinsic::x86_t2rpntlvwz1t1_internal: {
27922 auto *X86MFI = DAG.getMachineFunction().getInfo<X86MachineFunctionInfo>();
27923       X86MFI->setAMXProgModel(AMXProgModelEnum::ManagedRA);
27924       unsigned IntNo = Op.getConstantOperandVal(1);
27925 unsigned Opc = 0;
27926 switch (IntNo) {
27927 default:
27928 llvm_unreachable("Unexpected intrinsic!");
27929 case Intrinsic::x86_t2rpntlvwz0_internal:
27930 Opc = X86::PT2RPNTLVWZ0V;
27931 break;
27932 case Intrinsic::x86_t2rpntlvwz0t1_internal:
27933 Opc = X86::PT2RPNTLVWZ0T1V;
27934 break;
27935 case Intrinsic::x86_t2rpntlvwz1_internal:
27936 Opc = X86::PT2RPNTLVWZ1V;
27937 break;
27938 case Intrinsic::x86_t2rpntlvwz1t1_internal:
27939 Opc = X86::PT2RPNTLVWZ1T1V;
27940 break;
27941 case Intrinsic::x86_t2rpntlvwz0rs_internal:
27942 Opc = X86::PT2RPNTLVWZ0RSV;
27943 break;
27944 case Intrinsic::x86_t2rpntlvwz0rst1_internal:
27945 Opc = X86::PT2RPNTLVWZ0RST1V;
27946 break;
27947 case Intrinsic::x86_t2rpntlvwz1rs_internal:
27948 Opc = X86::PT2RPNTLVWZ1RSV;
27949 break;
27950 case Intrinsic::x86_t2rpntlvwz1rst1_internal:
27951 Opc = X86::PT2RPNTLVWZ1RST1V;
27952 break;
27953 }
27954
27955 SDLoc DL(Op);
27956 SDVTList VTs = DAG.getVTList(MVT::Untyped, MVT::Other);
27957
27958 SDValue Ops[] = {Op.getOperand(2), // Row
27959 Op.getOperand(3), // Col0
27960 Op.getOperand(4), // Col1
27961 Op.getOperand(5), // Base
27962 DAG.getTargetConstant(1, DL, MVT::i8), // Scale
27963 Op.getOperand(6), // Index
27964 DAG.getTargetConstant(0, DL, MVT::i32), // Disp
27965 DAG.getRegister(0, MVT::i16), // Segment
27966 Op.getOperand(0)}; // Chain
27967
27968 MachineSDNode *Res = DAG.getMachineNode(Opc, DL, VTs, Ops);
27969 SDValue Res0 = DAG.getTargetExtractSubreg(X86::sub_t0, DL, MVT::x86amx,
27970 SDValue(Res, 0));
27971 SDValue Res1 = DAG.getTargetExtractSubreg(X86::sub_t1, DL, MVT::x86amx,
27972 SDValue(Res, 0));
27973 return DAG.getMergeValues({Res0, Res1, SDValue(Res, 1)}, DL);
27974 }
27975 case Intrinsic::x86_atomic_bts_rm:
27976 case Intrinsic::x86_atomic_btc_rm:
27977 case Intrinsic::x86_atomic_btr_rm: {
27978 SDLoc DL(Op);
27979 MVT VT = Op.getSimpleValueType();
27980 SDValue Chain = Op.getOperand(0);
27981 SDValue Op1 = Op.getOperand(2);
27982 SDValue Op2 = Op.getOperand(3);
27983 unsigned Opc = IntNo == Intrinsic::x86_atomic_bts_rm ? X86ISD::LBTS_RM
27984 : IntNo == Intrinsic::x86_atomic_btc_rm ? X86ISD::LBTC_RM
27985                                                              : X86ISD::LBTR_RM;
27986       MachineMemOperand *MMO = cast<MemIntrinsicSDNode>(Op)->getMemOperand();
27987 SDValue Res =
27988 DAG.getMemIntrinsicNode(Opc, DL, DAG.getVTList(MVT::i32, MVT::Other),
27989 {Chain, Op1, Op2}, VT, MMO);
27990 Chain = Res.getValue(1);
27991 Res = DAG.getZExtOrTrunc(getSETCC(X86::COND_B, Res, DL, DAG), DL, VT);
27992 return DAG.getNode(ISD::MERGE_VALUES, DL, Op->getVTList(), Res, Chain);
27993 }
27994 case Intrinsic::x86_atomic_bts:
27995 case Intrinsic::x86_atomic_btc:
27996 case Intrinsic::x86_atomic_btr: {
27997 SDLoc DL(Op);
27998 MVT VT = Op.getSimpleValueType();
27999 SDValue Chain = Op.getOperand(0);
28000 SDValue Op1 = Op.getOperand(2);
28001 SDValue Op2 = Op.getOperand(3);
28002 unsigned Opc = IntNo == Intrinsic::x86_atomic_bts ? X86ISD::LBTS
28003 : IntNo == Intrinsic::x86_atomic_btc ? X86ISD::LBTC
28004 : X86ISD::LBTR;
28005 SDValue Size = DAG.getConstant(VT.getScalarSizeInBits(), DL, MVT::i32);
28006 MachineMemOperand *MMO = cast<MemIntrinsicSDNode>(Op)->getMemOperand();
28007 SDValue Res =
28008 DAG.getMemIntrinsicNode(Opc, DL, DAG.getVTList(MVT::i32, MVT::Other),
28009 {Chain, Op1, Op2, Size}, VT, MMO);
28010 Chain = Res.getValue(1);
28011 Res = DAG.getZExtOrTrunc(getSETCC(X86::COND_B, Res, DL, DAG), DL, VT);
28012 unsigned Imm = Op2->getAsZExtVal();
28013 if (Imm)
28014 Res = DAG.getNode(ISD::SHL, DL, VT, Res,
28015 DAG.getShiftAmountConstant(Imm, VT, DL));
28016 return DAG.getNode(ISD::MERGE_VALUES, DL, Op->getVTList(), Res, Chain);
28017 }
28018 case Intrinsic::x86_cmpccxadd32:
28019 case Intrinsic::x86_cmpccxadd64: {
28020 SDLoc DL(Op);
28021 SDValue Chain = Op.getOperand(0);
28022 SDValue Addr = Op.getOperand(2);
28023 SDValue Src1 = Op.getOperand(3);
28024 SDValue Src2 = Op.getOperand(4);
28025 SDValue CC = Op.getOperand(5);
28026 MachineMemOperand *MMO = cast<MemIntrinsicSDNode>(Op)->getMemOperand();
28027       SDValue Operation = DAG.getMemIntrinsicNode(
28028           X86ISD::CMPCCXADD, DL, Op->getVTList(), {Chain, Addr, Src1, Src2, CC},
28029 MVT::i32, MMO);
28030 return Operation;
28031 }
28032 case Intrinsic::x86_aadd32:
28033 case Intrinsic::x86_aadd64:
28034 case Intrinsic::x86_aand32:
28035 case Intrinsic::x86_aand64:
28036 case Intrinsic::x86_aor32:
28037 case Intrinsic::x86_aor64:
28038 case Intrinsic::x86_axor32:
28039 case Intrinsic::x86_axor64: {
28040 SDLoc DL(Op);
28041 SDValue Chain = Op.getOperand(0);
28042 SDValue Op1 = Op.getOperand(2);
28043 SDValue Op2 = Op.getOperand(3);
28044 MVT VT = Op2.getSimpleValueType();
28045 unsigned Opc = 0;
28046 switch (IntNo) {
28047 default:
28048 llvm_unreachable("Unknown Intrinsic");
28049 case Intrinsic::x86_aadd32:
28050 case Intrinsic::x86_aadd64:
28051 Opc = X86ISD::AADD;
28052 break;
28053 case Intrinsic::x86_aand32:
28054 case Intrinsic::x86_aand64:
28055 Opc = X86ISD::AAND;
28056 break;
28057 case Intrinsic::x86_aor32:
28058 case Intrinsic::x86_aor64:
28059 Opc = X86ISD::AOR;
28060 break;
28061 case Intrinsic::x86_axor32:
28062 case Intrinsic::x86_axor64:
28063 Opc = X86ISD::AXOR;
28064 break;
28065 }
28066 MachineMemOperand *MMO = cast<MemSDNode>(Op)->getMemOperand();
28067 return DAG.getMemIntrinsicNode(Opc, DL, Op->getVTList(),
28068 {Chain, Op1, Op2}, VT, MMO);
28069 }
28070 case Intrinsic::x86_atomic_add_cc:
28071 case Intrinsic::x86_atomic_sub_cc:
28072 case Intrinsic::x86_atomic_or_cc:
28073 case Intrinsic::x86_atomic_and_cc:
28074 case Intrinsic::x86_atomic_xor_cc: {
28075 SDLoc DL(Op);
28076 SDValue Chain = Op.getOperand(0);
28077 SDValue Op1 = Op.getOperand(2);
28078 SDValue Op2 = Op.getOperand(3);
28079 X86::CondCode CC = (X86::CondCode)Op.getConstantOperandVal(4);
28080 MVT VT = Op2.getSimpleValueType();
28081 unsigned Opc = 0;
28082 switch (IntNo) {
28083 default:
28084 llvm_unreachable("Unknown Intrinsic");
28085 case Intrinsic::x86_atomic_add_cc:
28086 Opc = X86ISD::LADD;
28087 break;
28088 case Intrinsic::x86_atomic_sub_cc:
28089 Opc = X86ISD::LSUB;
28090 break;
28091 case Intrinsic::x86_atomic_or_cc:
28092 Opc = X86ISD::LOR;
28093 break;
28094 case Intrinsic::x86_atomic_and_cc:
28095 Opc = X86ISD::LAND;
28096 break;
28097 case Intrinsic::x86_atomic_xor_cc:
28098 Opc = X86ISD::LXOR;
28099 break;
28100 }
28101 MachineMemOperand *MMO = cast<MemIntrinsicSDNode>(Op)->getMemOperand();
28102 SDValue LockArith =
28103 DAG.getMemIntrinsicNode(Opc, DL, DAG.getVTList(MVT::i32, MVT::Other),
28104 {Chain, Op1, Op2}, VT, MMO);
28105 Chain = LockArith.getValue(1);
28106 return DAG.getMergeValues({getSETCC(CC, LockArith, DL, DAG), Chain}, DL);
28107 }
28108 }
28109 return SDValue();
28110 }
28111
28112 SDLoc dl(Op);
28113 switch(IntrData->Type) {
28114 default: llvm_unreachable("Unknown Intrinsic Type");
28115 case RDSEED:
28116 case RDRAND: {
28117 // Emit the node with the right value type.
28118 SDVTList VTs = DAG.getVTList(Op->getValueType(0), MVT::i32, MVT::Other);
28119 SDValue Result = DAG.getNode(IntrData->Opc0, dl, VTs, Op.getOperand(0));
28120
28121 // If the value returned by RDRAND/RDSEED was valid (CF=1), return 1.
28122 // Otherwise return the value from Rand, which is always 0, casted to i32.
28123 SDValue Ops[] = {DAG.getZExtOrTrunc(Result, dl, Op->getValueType(1)),
28124 DAG.getConstant(1, dl, Op->getValueType(1)),
28125 DAG.getTargetConstant(X86::COND_B, dl, MVT::i8),
28126 SDValue(Result.getNode(), 1)};
28127 SDValue isValid = DAG.getNode(X86ISD::CMOV, dl, Op->getValueType(1), Ops);
28128
28129 // Return { result, isValid, chain }.
28130 return DAG.getNode(ISD::MERGE_VALUES, dl, Op->getVTList(), Result, isValid,
28131 SDValue(Result.getNode(), 2));
28132 }
28133 case GATHER_AVX2: {
28134 SDValue Chain = Op.getOperand(0);
28135 SDValue Src = Op.getOperand(2);
28136 SDValue Base = Op.getOperand(3);
28137 SDValue Index = Op.getOperand(4);
28138 SDValue Mask = Op.getOperand(5);
28139 SDValue Scale = Op.getOperand(6);
28140 return getAVX2GatherNode(IntrData->Opc0, Op, DAG, Src, Mask, Base, Index,
28141 Scale, Chain, Subtarget);
28142 }
28143 case GATHER: {
28144 //gather(v1, mask, index, base, scale);
28145 SDValue Chain = Op.getOperand(0);
28146 SDValue Src = Op.getOperand(2);
28147 SDValue Base = Op.getOperand(3);
28148 SDValue Index = Op.getOperand(4);
28149 SDValue Mask = Op.getOperand(5);
28150 SDValue Scale = Op.getOperand(6);
28151 return getGatherNode(Op, DAG, Src, Mask, Base, Index, Scale,
28152 Chain, Subtarget);
28153 }
28154 case SCATTER: {
28155 //scatter(base, mask, index, v1, scale);
28156 SDValue Chain = Op.getOperand(0);
28157 SDValue Base = Op.getOperand(2);
28158 SDValue Mask = Op.getOperand(3);
28159 SDValue Index = Op.getOperand(4);
28160 SDValue Src = Op.getOperand(5);
28161 SDValue Scale = Op.getOperand(6);
28162 return getScatterNode(IntrData->Opc0, Op, DAG, Src, Mask, Base, Index,
28163 Scale, Chain, Subtarget);
28164 }
28165 case PREFETCH: {
28166 const APInt &HintVal = Op.getConstantOperandAPInt(6);
28167 assert((HintVal == 2 || HintVal == 3) &&
28168 "Wrong prefetch hint in intrinsic: should be 2 or 3");
28169 unsigned Opcode = (HintVal == 2 ? IntrData->Opc1 : IntrData->Opc0);
28170 SDValue Chain = Op.getOperand(0);
28171 SDValue Mask = Op.getOperand(2);
28172 SDValue Index = Op.getOperand(3);
28173 SDValue Base = Op.getOperand(4);
28174 SDValue Scale = Op.getOperand(5);
28175 return getPrefetchNode(Opcode, Op, DAG, Mask, Base, Index, Scale, Chain,
28176 Subtarget);
28177 }
28178 // Read Time Stamp Counter (RDTSC) and Processor ID (RDTSCP).
28179 case RDTSC: {
28180     SmallVector<SDValue, 2> Results;
28181     getReadTimeStampCounter(Op.getNode(), dl, IntrData->Opc0, DAG, Subtarget,
28182 Results);
28183 return DAG.getMergeValues(Results, dl);
28184 }
28185 // Read Performance Monitoring Counters.
28186 case RDPMC:
28187 // Read Processor Register.
28188 case RDPRU:
28189 // GetExtended Control Register.
28190 case XGETBV: {
28191     SmallVector<SDValue, 2> Results;
28192
28193 // RDPMC uses ECX to select the index of the performance counter to read.
28194 // RDPRU uses ECX to select the processor register to read.
28195 // XGETBV uses ECX to select the index of the XCR register to return.
28196 // The result is stored into registers EDX:EAX.
28197 expandIntrinsicWChainHelper(Op.getNode(), dl, DAG, IntrData->Opc0, X86::ECX,
28198 Subtarget, Results);
28199 return DAG.getMergeValues(Results, dl);
28200 }
28201 // XTEST intrinsics.
28202 case XTEST: {
28203 SDVTList VTs = DAG.getVTList(Op->getValueType(0), MVT::Other);
28204 SDValue InTrans = DAG.getNode(IntrData->Opc0, dl, VTs, Op.getOperand(0));
28205
28206 SDValue SetCC = getSETCC(X86::COND_NE, InTrans, dl, DAG);
28207 SDValue Ret = DAG.getNode(ISD::ZERO_EXTEND, dl, Op->getValueType(0), SetCC);
28208 return DAG.getNode(ISD::MERGE_VALUES, dl, Op->getVTList(),
28209 Ret, SDValue(InTrans.getNode(), 1));
28210 }
28211   case TRUNCATE_TO_MEM_VI8:
28212   case TRUNCATE_TO_MEM_VI16:
28213   case TRUNCATE_TO_MEM_VI32: {
28214 SDValue Mask = Op.getOperand(4);
28215 SDValue DataToTruncate = Op.getOperand(3);
28216 SDValue Addr = Op.getOperand(2);
28217 SDValue Chain = Op.getOperand(0);
28218
28219     MemIntrinsicSDNode *MemIntr = dyn_cast<MemIntrinsicSDNode>(Op);
28220     assert(MemIntr && "Expected MemIntrinsicSDNode!");
28221
28222 EVT MemVT = MemIntr->getMemoryVT();
28223
28224 uint16_t TruncationOp = IntrData->Opc0;
28225 switch (TruncationOp) {
28226 case X86ISD::VTRUNC: {
28227 if (isAllOnesConstant(Mask)) // return just a truncate store
28228 return DAG.getTruncStore(Chain, dl, DataToTruncate, Addr, MemVT,
28229 MemIntr->getMemOperand());
28230
28231 MVT MaskVT = MVT::getVectorVT(MVT::i1, MemVT.getVectorNumElements());
28232 SDValue VMask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);
28233 SDValue Offset = DAG.getUNDEF(VMask.getValueType());
28234
28235 return DAG.getMaskedStore(Chain, dl, DataToTruncate, Addr, Offset, VMask,
28236 MemVT, MemIntr->getMemOperand(), ISD::UNINDEXED,
28237 true /* truncating */);
28238 }
28239 case X86ISD::VTRUNCUS:
28240 case X86ISD::VTRUNCS: {
28241 bool IsSigned = (TruncationOp == X86ISD::VTRUNCS);
28242 if (isAllOnesConstant(Mask))
28243 return EmitTruncSStore(IsSigned, Chain, dl, DataToTruncate, Addr, MemVT,
28244 MemIntr->getMemOperand(), DAG);
28245
28246 MVT MaskVT = MVT::getVectorVT(MVT::i1, MemVT.getVectorNumElements());
28247 SDValue VMask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);
28248
28249 return EmitMaskedTruncSStore(IsSigned, Chain, dl, DataToTruncate, Addr,
28250 VMask, MemVT, MemIntr->getMemOperand(), DAG);
28251 }
28252 default:
28253 llvm_unreachable("Unsupported truncstore intrinsic");
28254 }
28255 }
28256 case INTR_TYPE_CAST_MMX:
28257 return SDValue(); // handled in combineINTRINSIC_*
28258 }
28259}
28260
28261SDValue X86TargetLowering::LowerRETURNADDR(SDValue Op,
28262 SelectionDAG &DAG) const {
28263 MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
28264 MFI.setReturnAddressIsTaken(true);
28265
28266 unsigned Depth = Op.getConstantOperandVal(0);
28267 SDLoc dl(Op);
28268 EVT PtrVT = Op.getValueType();
28269
28270 if (Depth > 0) {
28271 SDValue FrameAddr = LowerFRAMEADDR(Op, DAG);
28272 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
28273 SDValue Offset = DAG.getConstant(RegInfo->getSlotSize(), dl, PtrVT);
28274 return DAG.getLoad(PtrVT, dl, DAG.getEntryNode(),
28275 DAG.getNode(ISD::ADD, dl, PtrVT, FrameAddr, Offset),
28276 MachinePointerInfo());
28277 }
28278
28279 // Just load the return address.
28280 SDValue RetAddrFI = getReturnAddressFrameIndex(DAG);
28281 return DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), RetAddrFI,
28282 MachinePointerInfo());
28283}
28284
28285SDValue X86TargetLowering::LowerADDROFRETURNADDR(SDValue Op,
28286 SelectionDAG &DAG) const {
28288 return getReturnAddressFrameIndex(DAG);
28289}
28290
28291SDValue X86TargetLowering::LowerFRAMEADDR(SDValue Op, SelectionDAG &DAG) const {
28292 MachineFunction &MF = DAG.getMachineFunction();
28293 MachineFrameInfo &MFI = MF.getFrameInfo();
28294 X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
28295 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
28296 EVT VT = Op.getValueType();
28297
28298 MFI.setFrameAddressIsTaken(true);
28299
28300 if (MF.getTarget().getMCAsmInfo()->usesWindowsCFI()) {
28301 // Depth > 0 makes no sense on targets which use Windows unwind codes. It
28302 // is not possible to crawl up the stack without looking at the unwind codes
28303 // simultaneously.
28304 int FrameAddrIndex = FuncInfo->getFAIndex();
28305 if (!FrameAddrIndex) {
28306 // Set up a frame object for the return address.
28307 unsigned SlotSize = RegInfo->getSlotSize();
28308 FrameAddrIndex = MF.getFrameInfo().CreateFixedObject(
28309 SlotSize, /*SPOffset=*/0, /*IsImmutable=*/false);
28310 FuncInfo->setFAIndex(FrameAddrIndex);
28311 }
28312 return DAG.getFrameIndex(FrameAddrIndex, VT);
28313 }
28314
28315 Register FrameReg =
28316 RegInfo->getPtrSizedFrameRegister(DAG.getMachineFunction());
28317 SDLoc dl(Op); // FIXME probably not meaningful
28318 unsigned Depth = Op.getConstantOperandVal(0);
28319 assert(((FrameReg == X86::RBP && VT == MVT::i64) ||
28320 (FrameReg == X86::EBP && VT == MVT::i32)) &&
28321 "Invalid Frame Register!");
28322 SDValue FrameAddr = DAG.getCopyFromReg(DAG.getEntryNode(), dl, FrameReg, VT);
28323 while (Depth--)
28324 FrameAddr = DAG.getLoad(VT, dl, DAG.getEntryNode(), FrameAddr,
28325 MachinePointerInfo());
28326 return FrameAddr;
28327}
28328
28329// FIXME? Maybe this could be a TableGen attribute on some registers and
28330// this table could be generated automatically from RegInfo.
28331 Register X86TargetLowering::getRegisterByName(const char* RegName, LLT VT,
28332                                               const MachineFunction &MF) const {
28333 const TargetFrameLowering &TFI = *Subtarget.getFrameLowering();
28334
28336 .Case("esp", X86::ESP)
28337 .Case("rsp", X86::RSP)
28338 .Case("ebp", X86::EBP)
28339 .Case("rbp", X86::RBP)
28340 .Case("r14", X86::R14)
28341 .Case("r15", X86::R15)
28342 .Default(0);
28343
28344 if (Reg == X86::EBP || Reg == X86::RBP) {
28345 if (!TFI.hasFP(MF))
28346 report_fatal_error("register " + StringRef(RegName) +
28347 " is allocatable: function has no frame pointer");
28348#ifndef NDEBUG
28349 else {
28350 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
28351 Register FrameReg = RegInfo->getPtrSizedFrameRegister(MF);
28352 assert((FrameReg == X86::EBP || FrameReg == X86::RBP) &&
28353 "Invalid Frame Register!");
28354 }
28355#endif
28356 }
28357
28358 return Reg;
28359}
28360
28361SDValue X86TargetLowering::LowerFRAME_TO_ARGS_OFFSET(SDValue Op,
28362 SelectionDAG &DAG) const {
28363 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
28364 return DAG.getIntPtrConstant(2 * RegInfo->getSlotSize(), SDLoc(Op));
28365}
28366
28368 const Constant *PersonalityFn) const {
28369 if (classifyEHPersonality(PersonalityFn) == EHPersonality::CoreCLR)
28370 return Subtarget.isTarget64BitLP64() ? X86::RDX : X86::EDX;
28371
28372 return Subtarget.isTarget64BitLP64() ? X86::RAX : X86::EAX;
28373}
28374
28376 const Constant *PersonalityFn) const {
28377 // Funclet personalities don't use selectors (the runtime does the selection).
28379 return X86::NoRegister;
28380 return Subtarget.isTarget64BitLP64() ? X86::RDX : X86::EDX;
28381}
28382
28384 return Subtarget.isTargetWin64();
28385}
28386
28387SDValue X86TargetLowering::LowerEH_RETURN(SDValue Op, SelectionDAG &DAG) const {
28388 SDValue Chain = Op.getOperand(0);
28389 SDValue Offset = Op.getOperand(1);
28390 SDValue Handler = Op.getOperand(2);
28391 SDLoc dl (Op);
28392
28393 EVT PtrVT = getPointerTy(DAG.getDataLayout());
28394 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
28395 Register FrameReg = RegInfo->getFrameRegister(DAG.getMachineFunction());
28396 assert(((FrameReg == X86::RBP && PtrVT == MVT::i64) ||
28397 (FrameReg == X86::EBP && PtrVT == MVT::i32)) &&
28398 "Invalid Frame Register!");
28399 SDValue Frame = DAG.getCopyFromReg(DAG.getEntryNode(), dl, FrameReg, PtrVT);
28400 Register StoreAddrReg = (PtrVT == MVT::i64) ? X86::RCX : X86::ECX;
28401
28402 SDValue StoreAddr = DAG.getNode(ISD::ADD, dl, PtrVT, Frame,
28403 DAG.getIntPtrConstant(RegInfo->getSlotSize(),
28404 dl));
28405 StoreAddr = DAG.getNode(ISD::ADD, dl, PtrVT, StoreAddr, Offset);
28406 Chain = DAG.getStore(Chain, dl, Handler, StoreAddr, MachinePointerInfo());
28407 Chain = DAG.getCopyToReg(Chain, dl, StoreAddrReg, StoreAddr);
28408
28409 return DAG.getNode(X86ISD::EH_RETURN, dl, MVT::Other, Chain,
28410 DAG.getRegister(StoreAddrReg, PtrVT));
28411}
28412
28413SDValue X86TargetLowering::lowerEH_SJLJ_SETJMP(SDValue Op,
28414 SelectionDAG &DAG) const {
28415 SDLoc DL(Op);
28416 // If the subtarget is not 64bit, we may need the global base reg
28417 // after isel expand pseudo, i.e., after CGBR pass ran.
28418 // Therefore, ask for the GlobalBaseReg now, so that the pass
28419 // inserts the code for us in case we need it.
28420 // Otherwise, we will end up in a situation where we will
28421 // reference a virtual register that is not defined!
28422 if (!Subtarget.is64Bit()) {
28423 const X86InstrInfo *TII = Subtarget.getInstrInfo();
28424 (void)TII->getGlobalBaseReg(&DAG.getMachineFunction());
28425 }
28426 return DAG.getNode(X86ISD::EH_SJLJ_SETJMP, DL,
28427 DAG.getVTList(MVT::i32, MVT::Other),
28428 Op.getOperand(0), Op.getOperand(1));
28429}
28430
28431SDValue X86TargetLowering::lowerEH_SJLJ_LONGJMP(SDValue Op,
28432 SelectionDAG &DAG) const {
28433 SDLoc DL(Op);
28434 return DAG.getNode(X86ISD::EH_SJLJ_LONGJMP, DL, MVT::Other,
28435 Op.getOperand(0), Op.getOperand(1));
28436}
28437
28438SDValue X86TargetLowering::lowerEH_SJLJ_SETUP_DISPATCH(SDValue Op,
28439 SelectionDAG &DAG) const {
28440 SDLoc DL(Op);
28441 return DAG.getNode(X86ISD::EH_SJLJ_SETUP_DISPATCH, DL, MVT::Other,
28442 Op.getOperand(0));
28443}
28444
28446 return Op.getOperand(0);
28447}
28448
28449SDValue X86TargetLowering::LowerINIT_TRAMPOLINE(SDValue Op,
28450 SelectionDAG &DAG) const {
28451 SDValue Root = Op.getOperand(0);
28452 SDValue Trmp = Op.getOperand(1); // trampoline
28453 SDValue FPtr = Op.getOperand(2); // nested function
28454 SDValue Nest = Op.getOperand(3); // 'nest' parameter value
28455 SDLoc dl (Op);
28456
28457 const Value *TrmpAddr = cast<SrcValueSDNode>(Op.getOperand(4))->getValue();
28458 const TargetRegisterInfo *TRI = Subtarget.getRegisterInfo();
28459
28460 if (Subtarget.is64Bit()) {
28461 SDValue OutChains[6];
28462
28463 // Large code-model.
28464 const unsigned char JMP64r = 0xFF; // 64-bit jmp through register opcode.
28465 const unsigned char MOV64ri = 0xB8; // X86::MOV64ri opcode.
28466
28467 const unsigned char N86R10 = TRI->getEncodingValue(X86::R10) & 0x7;
28468 const unsigned char N86R11 = TRI->getEncodingValue(X86::R11) & 0x7;
28469
28470 const unsigned char REX_WB = 0x40 | 0x08 | 0x01; // REX prefix
28471
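    // Byte-layout sketch (illustrative; offsets match the stores emitted below):
    //   +0:  49 BB <fptr:imm64>   movabsq $fptr, %r11
    //   +10: 49 BA <nest:imm64>   movabsq $nest, %r10
    //   +20: 49 FF E3             jmpq   *%r11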
28472 // Load the pointer to the nested function into R11.
28473 unsigned OpCode = ((MOV64ri | N86R11) << 8) | REX_WB; // movabsq r11
28474 SDValue Addr = Trmp;
28475 OutChains[0] = DAG.getStore(Root, dl, DAG.getConstant(OpCode, dl, MVT::i16),
28476 Addr, MachinePointerInfo(TrmpAddr));
28477
28478 Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp,
28479 DAG.getConstant(2, dl, MVT::i64));
28480 OutChains[1] = DAG.getStore(Root, dl, FPtr, Addr,
28481 MachinePointerInfo(TrmpAddr, 2), Align(2));
28482
28483 // Load the 'nest' parameter value into R10.
28484 // R10 is specified in X86CallingConv.td
28485 OpCode = ((MOV64ri | N86R10) << 8) | REX_WB; // movabsq r10
28486 Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp,
28487 DAG.getConstant(10, dl, MVT::i64));
28488 OutChains[2] = DAG.getStore(Root, dl, DAG.getConstant(OpCode, dl, MVT::i16),
28489 Addr, MachinePointerInfo(TrmpAddr, 10));
28490
28491 Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp,
28492 DAG.getConstant(12, dl, MVT::i64));
28493 OutChains[3] = DAG.getStore(Root, dl, Nest, Addr,
28494 MachinePointerInfo(TrmpAddr, 12), Align(2));
28495
28496 // Jump to the nested function.
28497 OpCode = (JMP64r << 8) | REX_WB; // jmpq *...
28498 Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp,
28499 DAG.getConstant(20, dl, MVT::i64));
28500 OutChains[4] = DAG.getStore(Root, dl, DAG.getConstant(OpCode, dl, MVT::i16),
28501 Addr, MachinePointerInfo(TrmpAddr, 20));
28502
28503 unsigned char ModRM = N86R11 | (4 << 3) | (3 << 6); // ...r11
28504 Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp,
28505 DAG.getConstant(22, dl, MVT::i64));
28506 OutChains[5] = DAG.getStore(Root, dl, DAG.getConstant(ModRM, dl, MVT::i8),
28507 Addr, MachinePointerInfo(TrmpAddr, 22));
28508
28509 return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, OutChains);
28510 } else {
28511 const Function *Func =
28512 cast<Function>(cast<SrcValueSDNode>(Op.getOperand(5))->getValue());
28513 CallingConv::ID CC = Func->getCallingConv();
28514 unsigned NestReg;
28515
28516 switch (CC) {
28517 default:
28518 llvm_unreachable("Unsupported calling convention");
28519 case CallingConv::C:
28520     case CallingConv::X86_StdCall: {
28521       // Pass 'nest' parameter in ECX.
28522 // Must be kept in sync with X86CallingConv.td
28523 NestReg = X86::ECX;
28524
28525 // Check that ECX wasn't needed by an 'inreg' parameter.
28526 FunctionType *FTy = Func->getFunctionType();
28527 const AttributeList &Attrs = Func->getAttributes();
28528
28529 if (!Attrs.isEmpty() && !Func->isVarArg()) {
28530 unsigned InRegCount = 0;
28531 unsigned Idx = 0;
28532
28533 for (FunctionType::param_iterator I = FTy->param_begin(),
28534 E = FTy->param_end(); I != E; ++I, ++Idx)
28535 if (Attrs.hasParamAttr(Idx, Attribute::InReg)) {
28536 const DataLayout &DL = DAG.getDataLayout();
28537 // FIXME: should only count parameters that are lowered to integers.
28538 InRegCount += (DL.getTypeSizeInBits(*I) + 31) / 32;
28539 }
28540
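        // Worked example (illustrative): two i64 'inreg' parameters count as
        // 4 x 32-bit registers, so InRegCount = 4 > 2 and ECX would already be
        // taken, leaving no register for the 'nest' parameter.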
28541 if (InRegCount > 2) {
28542 report_fatal_error("Nest register in use - reduce number of inreg"
28543 " parameters!");
28544 }
28545 }
28546 break;
28547 }
28548     case CallingConv::X86_FastCall:
28549     case CallingConv::X86_ThisCall:
28550     case CallingConv::Fast:
28551 case CallingConv::Tail:
28552     case CallingConv::SwiftTail:
28553       // Pass 'nest' parameter in EAX.
28554 // Must be kept in sync with X86CallingConv.td
28555 NestReg = X86::EAX;
28556 break;
28557 }
28558
28559 SDValue OutChains[4];
28560 SDValue Addr, Disp;
28561
28562 Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp,
28563 DAG.getConstant(10, dl, MVT::i32));
28564 Disp = DAG.getNode(ISD::SUB, dl, MVT::i32, FPtr, Addr);
28565
28566 // This is storing the opcode for MOV32ri.
28567 const unsigned char MOV32ri = 0xB8; // X86::MOV32ri's opcode byte.
28568 const unsigned char N86Reg = TRI->getEncodingValue(NestReg) & 0x7;
28569 OutChains[0] =
28570 DAG.getStore(Root, dl, DAG.getConstant(MOV32ri | N86Reg, dl, MVT::i8),
28571 Trmp, MachinePointerInfo(TrmpAddr));
28572
28573 Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp,
28574 DAG.getConstant(1, dl, MVT::i32));
28575 OutChains[1] = DAG.getStore(Root, dl, Nest, Addr,
28576 MachinePointerInfo(TrmpAddr, 1), Align(1));
28577
28578 const unsigned char JMP = 0xE9; // jmp <32bit dst> opcode.
28579 Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp,
28580 DAG.getConstant(5, dl, MVT::i32));
28581 OutChains[2] =
28582 DAG.getStore(Root, dl, DAG.getConstant(JMP, dl, MVT::i8), Addr,
28583 MachinePointerInfo(TrmpAddr, 5), Align(1));
28584
28585 Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp,
28586 DAG.getConstant(6, dl, MVT::i32));
28587 OutChains[3] = DAG.getStore(Root, dl, Disp, Addr,
28588 MachinePointerInfo(TrmpAddr, 6), Align(1));
28589
28590 return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, OutChains);
28591 }
28592}
28593
28594SDValue X86TargetLowering::LowerGET_ROUNDING(SDValue Op,
28595 SelectionDAG &DAG) const {
28596 /*
28597 The rounding mode is in bits 11:10 of FPSR, and has the following
28598 settings:
28599 00 Round to nearest
28600 01 Round to -inf
28601 10 Round to +inf
28602 11 Round to 0
28603
28604 GET_ROUNDING, on the other hand, expects the following:
28605 -1 Undefined
28606 0 Round to 0
28607 1 Round to nearest
28608 2 Round to +inf
28609 3 Round to -inf
28610
28611 To perform the conversion, we use a packed lookup table of the four 2-bit
28612  values that we can index by FPSR[11:10]
28613 0x2d --> (0b00,10,11,01) --> (0,2,3,1) >> FPSR[11:10]
28614
28615 (0x2d >> ((FPSR & 0xc00) >> 9)) & 3
28616 */
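  // Worked example (illustrative): with FPSR[11:10] = 01 (round toward -inf),
  // FPSR & 0xc00 = 0x400, shifting right by 9 gives 2, and
  // (0x2d >> 2) & 3 = 0b1011 & 3 = 3, i.e. GET_ROUNDING's "round to -inf".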
28617
28618 MachineFunction &MF = DAG.getMachineFunction();
28619 MVT VT = Op.getSimpleValueType();
28620 SDLoc DL(Op);
28621
28622 // Save FP Control Word to stack slot
28623 int SSFI = MF.getFrameInfo().CreateStackObject(2, Align(2), false);
28624 SDValue StackSlot =
28625 DAG.getFrameIndex(SSFI, getPointerTy(DAG.getDataLayout()));
28626
28627 MachinePointerInfo MPI = MachinePointerInfo::getFixedStack(MF, SSFI);
28628
28629 SDValue Chain = Op.getOperand(0);
28630 SDValue Ops[] = {Chain, StackSlot};
28631   Chain = DAG.getMemIntrinsicNode(X86ISD::FNSTCW16m, DL,
28632                                   DAG.getVTList(MVT::Other), Ops, MVT::i16, MPI,
28633                                   Align(2), MachineMemOperand::MOStore);
28634
28635 // Load FP Control Word from stack slot
28636 SDValue CWD = DAG.getLoad(MVT::i16, DL, Chain, StackSlot, MPI, Align(2));
28637 Chain = CWD.getValue(1);
28638
28639 // Mask and turn the control bits into a shift for the lookup table.
28640 SDValue Shift =
28641 DAG.getNode(ISD::SRL, DL, MVT::i16,
28642 DAG.getNode(ISD::AND, DL, MVT::i16,
28643 CWD, DAG.getConstant(0xc00, DL, MVT::i16)),
28644 DAG.getConstant(9, DL, MVT::i8));
28645 Shift = DAG.getNode(ISD::TRUNCATE, DL, MVT::i8, Shift);
28646
28647 SDValue LUT = DAG.getConstant(0x2d, DL, MVT::i32);
28648 SDValue RetVal =
28649 DAG.getNode(ISD::AND, DL, MVT::i32,
28650 DAG.getNode(ISD::SRL, DL, MVT::i32, LUT, Shift),
28651 DAG.getConstant(3, DL, MVT::i32));
28652
28653 RetVal = DAG.getZExtOrTrunc(RetVal, DL, VT);
28654
28655 return DAG.getMergeValues({RetVal, Chain}, DL);
28656}
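// Illustrative sketch (not part of the lowering above): the same lookup-table
// conversion applied to a scalar control-word value. FpcwToGetRounding is a
// hypothetical helper name; the constants mirror the comment and DAG nodes
// above.
#include <cstdint>
constexpr unsigned FpcwToGetRounding(uint16_t CW) {
  // 0x2d packs the results (0,2,3,1) as four 2-bit fields indexed by RC.
  return (0x2d >> ((CW & 0xc00) >> 9)) & 3;
}
static_assert(FpcwToGetRounding(0x0000) == 1, "RC=00 (nearest) -> 1");
static_assert(FpcwToGetRounding(0x0400) == 3, "RC=01 (-inf)    -> 3");
static_assert(FpcwToGetRounding(0x0800) == 2, "RC=10 (+inf)    -> 2");
static_assert(FpcwToGetRounding(0x0c00) == 0, "RC=11 (zero)    -> 0");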
28657
28658SDValue X86TargetLowering::LowerSET_ROUNDING(SDValue Op,
28659 SelectionDAG &DAG) const {
28660 MachineFunction &MF = DAG.getMachineFunction();
28661 SDLoc DL(Op);
28662 SDValue Chain = Op.getNode()->getOperand(0);
28663
28664 // FP control word may be set only from data in memory. So we need to allocate
28665 // stack space to save/load FP control word.
28666 int OldCWFrameIdx = MF.getFrameInfo().CreateStackObject(4, Align(4), false);
28667 SDValue StackSlot =
28668 DAG.getFrameIndex(OldCWFrameIdx, getPointerTy(DAG.getDataLayout()));
28669 MachinePointerInfo MPI = MachinePointerInfo::getFixedStack(MF, OldCWFrameIdx);
28670 MachineMemOperand *MMO =
28671 MF.getMachineMemOperand(MPI, MachineMemOperand::MOStore, 2, Align(2));
28672
28673 // Store FP control word into memory.
28674 SDValue Ops[] = {Chain, StackSlot};
28675 Chain = DAG.getMemIntrinsicNode(
28676 X86ISD::FNSTCW16m, DL, DAG.getVTList(MVT::Other), Ops, MVT::i16, MMO);
28677
28678 // Load FP Control Word from stack slot and clear RM field (bits 11:10).
28679 SDValue CWD = DAG.getLoad(MVT::i16, DL, Chain, StackSlot, MPI);
28680 Chain = CWD.getValue(1);
28681 CWD = DAG.getNode(ISD::AND, DL, MVT::i16, CWD.getValue(0),
28682 DAG.getConstant(0xf3ff, DL, MVT::i16));
28683
28684 // Calculate new rounding mode.
28685 SDValue NewRM = Op.getNode()->getOperand(1);
28686 SDValue RMBits;
28687 if (auto *CVal = dyn_cast<ConstantSDNode>(NewRM)) {
28688 uint64_t RM = CVal->getZExtValue();
28689 int FieldVal;
28690 switch (static_cast<RoundingMode>(RM)) {
28691 // clang-format off
28692 case RoundingMode::NearestTiesToEven: FieldVal = X86::rmToNearest; break;
28693 case RoundingMode::TowardNegative: FieldVal = X86::rmDownward; break;
28694 case RoundingMode::TowardPositive: FieldVal = X86::rmUpward; break;
28695 case RoundingMode::TowardZero: FieldVal = X86::rmTowardZero; break;
28696 default:
28697 llvm_unreachable("rounding mode is not supported by X86 hardware");
28698 // clang-format on
28699 }
28700 RMBits = DAG.getConstant(FieldVal, DL, MVT::i16);
28701 } else {
28702 // Need to convert argument into bits of control word:
28703 // 0 Round to 0 -> 11
28704 // 1 Round to nearest -> 00
28705 // 2 Round to +inf -> 10
28706 // 3 Round to -inf -> 01
28707 // The 2-bit value then needs to be shifted so that it occupies bits 11:10.
28708 // To make the conversion, put all these values into a value 0xc9 and shift
28709 // it left depending on the rounding mode:
28710 // (0xc9 << 4) & 0xc00 = X86::rmTowardZero
28711 // (0xc9 << 6) & 0xc00 = X86::rmToNearest
28712 // ...
28713 // (0xc9 << (2 * NewRM + 4)) & 0xc00
28714 SDValue ShiftValue =
28715 DAG.getNode(ISD::TRUNCATE, DL, MVT::i8,
28716 DAG.getNode(ISD::ADD, DL, MVT::i32,
28717 DAG.getNode(ISD::SHL, DL, MVT::i32, NewRM,
28718 DAG.getConstant(1, DL, MVT::i8)),
28719 DAG.getConstant(4, DL, MVT::i32)));
28720 SDValue Shifted =
28721 DAG.getNode(ISD::SHL, DL, MVT::i16, DAG.getConstant(0xc9, DL, MVT::i16),
28722 ShiftValue);
28723 RMBits = DAG.getNode(ISD::AND, DL, MVT::i16, Shifted,
28724 DAG.getConstant(0xc00, DL, MVT::i16));
28725 }
28726
28727 // Update rounding mode bits and store the new FP Control Word into stack.
28728 CWD = DAG.getNode(ISD::OR, DL, MVT::i16, CWD, RMBits);
28729 Chain = DAG.getStore(Chain, DL, CWD, StackSlot, MPI, Align(2));
28730
28731 // Load FP control word from the slot.
28732 SDValue OpsLD[] = {Chain, StackSlot};
28733 MachineMemOperand *MMOL =
28734 MF.getMachineMemOperand(MPI, MachineMemOperand::MOLoad, 2, Align(2));
28735 Chain = DAG.getMemIntrinsicNode(
28736 X86ISD::FLDCW16m, DL, DAG.getVTList(MVT::Other), OpsLD, MVT::i16, MMOL);
28737
28738 // If target supports SSE, set MXCSR as well. Rounding mode is encoded in the
28739 // same way but in bits 14:13.
28740 if (Subtarget.hasSSE1()) {
28741 // Store MXCSR into memory.
28742 Chain = DAG.getNode(
28743 ISD::INTRINSIC_VOID, DL, DAG.getVTList(MVT::Other), Chain,
28744 DAG.getTargetConstant(Intrinsic::x86_sse_stmxcsr, DL, MVT::i32),
28745 StackSlot);
28746
28747 // Load MXCSR from stack slot and clear RM field (bits 14:13).
28748 SDValue CWD = DAG.getLoad(MVT::i32, DL, Chain, StackSlot, MPI);
28749 Chain = CWD.getValue(1);
28750 CWD = DAG.getNode(ISD::AND, DL, MVT::i32, CWD.getValue(0),
28751 DAG.getConstant(0xffff9fff, DL, MVT::i32));
28752
28753 // Shift X87 RM bits from 11:10 to 14:13.
28754 RMBits = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i32, RMBits);
28755 RMBits = DAG.getNode(ISD::SHL, DL, MVT::i32, RMBits,
28756 DAG.getConstant(3, DL, MVT::i8));
28757
28758 // Update rounding mode bits and store the new FP Control Word into stack.
28759 CWD = DAG.getNode(ISD::OR, DL, MVT::i32, CWD, RMBits);
28760 Chain = DAG.getStore(Chain, DL, CWD, StackSlot, MPI, Align(4));
28761
28762 // Load MXCSR from the slot.
28763 Chain = DAG.getNode(
28764 ISD::INTRINSIC_VOID, DL, DAG.getVTList(MVT::Other), Chain,
28765 DAG.getTargetConstant(Intrinsic::x86_sse_ldmxcsr, DL, MVT::i32),
28766 StackSlot);
28767 }
28768
28769 return Chain;
28770}
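// Illustrative sketch (not part of the lowering above): the inverse mapping
// used by the non-constant path of LowerSET_ROUNDING, checked on scalars.
// RoundingToRMBits is a hypothetical helper name.
#include <cstdint>
constexpr uint16_t RoundingToRMBits(unsigned NewRM) {
  // (0xc9 << (2 * NewRM + 4)) & 0xc00, as derived in the comment above.
  return uint16_t((0xc9u << (2 * NewRM + 4)) & 0xc00u);
}
static_assert(RoundingToRMBits(0) == 0xc00, "round to zero    -> RC = 11");
static_assert(RoundingToRMBits(1) == 0x000, "round to nearest -> RC = 00");
static_assert(RoundingToRMBits(2) == 0x800, "round to +inf    -> RC = 10");
static_assert(RoundingToRMBits(3) == 0x400, "round to -inf    -> RC = 01");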
28771
28772const unsigned X87StateSize = 28;
28773const unsigned FPStateSize = 32;
28774[[maybe_unused]] const unsigned FPStateSizeInBits = FPStateSize * 8;
28775
28776SDValue X86TargetLowering::LowerGET_FPENV_MEM(SDValue Op,
28777 SelectionDAG &DAG) const {
28778 MachineFunction &MF = DAG.getMachineFunction();
28779 SDLoc DL(Op);
28780 SDValue Chain = Op->getOperand(0);
28781 SDValue Ptr = Op->getOperand(1);
28782 auto *Node = cast<FPStateAccessSDNode>(Op);
28783 EVT MemVT = Node->getMemoryVT();
28785 MachineMemOperand *MMO = cast<FPStateAccessSDNode>(Op)->getMemOperand();
28786
28787 // Get x87 state, if it is present.
28788 if (Subtarget.hasX87()) {
28789 Chain =
28790 DAG.getMemIntrinsicNode(X86ISD::FNSTENVm, DL, DAG.getVTList(MVT::Other),
28791 {Chain, Ptr}, MemVT, MMO);
28792
28793 // FNSTENV changes the exception mask, so load back the stored environment.
28794 MachineMemOperand::Flags NewFlags =
28797 MMO = MF.getMachineMemOperand(MMO, NewFlags);
28798 Chain =
28799 DAG.getMemIntrinsicNode(X86ISD::FLDENVm, DL, DAG.getVTList(MVT::Other),
28800 {Chain, Ptr}, MemVT, MMO);
28801 }
28802
28803 // If target supports SSE, get MXCSR as well.
28804 if (Subtarget.hasSSE1()) {
28805 // Get pointer to the MXCSR location in memory.
28807 SDValue MXCSRAddr = DAG.getNode(ISD::ADD, DL, PtrVT, Ptr,
28808 DAG.getConstant(X87StateSize, DL, PtrVT));
28809 // Store MXCSR into memory.
28810 Chain = DAG.getNode(
28811 ISD::INTRINSIC_VOID, DL, DAG.getVTList(MVT::Other), Chain,
28812 DAG.getTargetConstant(Intrinsic::x86_sse_stmxcsr, DL, MVT::i32),
28813 MXCSRAddr);
28814 }
28815
28816 return Chain;
28817}
28818
28819 static SDValue createSetFPEnvNodes(SDValue Ptr, SDValue Chain, const SDLoc &DL,
28820 EVT MemVT, MachineMemOperand *MMO,
28821 SelectionDAG &DAG,
28822 const X86Subtarget &Subtarget) {
28823 // Set x87 state, if it is present.
28824 if (Subtarget.hasX87())
28825 Chain =
28826 DAG.getMemIntrinsicNode(X86ISD::FLDENVm, DL, DAG.getVTList(MVT::Other),
28827 {Chain, Ptr}, MemVT, MMO);
28828 // If target supports SSE, set MXCSR as well.
28829 if (Subtarget.hasSSE1()) {
28830 // Get pointer to the MXCSR location in memory.
28832 SDValue MXCSRAddr = DAG.getNode(ISD::ADD, DL, PtrVT, Ptr,
28833 DAG.getConstant(X87StateSize, DL, PtrVT));
28834 // Load MXCSR from memory.
28835 Chain = DAG.getNode(
28836 ISD::INTRINSIC_VOID, DL, DAG.getVTList(MVT::Other), Chain,
28837 DAG.getTargetConstant(Intrinsic::x86_sse_ldmxcsr, DL, MVT::i32),
28838 MXCSRAddr);
28839 }
28840 return Chain;
28841}
28842
28843SDValue X86TargetLowering::LowerSET_FPENV_MEM(SDValue Op,
28844 SelectionDAG &DAG) const {
28845 SDLoc DL(Op);
28846 SDValue Chain = Op->getOperand(0);
28847 SDValue Ptr = Op->getOperand(1);
28848 auto *Node = cast<FPStateAccessSDNode>(Op);
28849 EVT MemVT = Node->getMemoryVT();
28851 MachineMemOperand *MMO = cast<FPStateAccessSDNode>(Op)->getMemOperand();
28852 return createSetFPEnvNodes(Ptr, Chain, DL, MemVT, MMO, DAG, Subtarget);
28853}
28854
28855SDValue X86TargetLowering::LowerRESET_FPENV(SDValue Op,
28856 SelectionDAG &DAG) const {
28857 MachineFunction &MF = DAG.getMachineFunction();
28858 SDLoc DL(Op);
28859 SDValue Chain = Op.getNode()->getOperand(0);
28860
28861 IntegerType *ItemTy = Type::getInt32Ty(*DAG.getContext());
28862 ArrayType *FPEnvTy = ArrayType::get(ItemTy, 8);
28864
28865 // x87 FPU Control Word: mask all floating-point exceptions, set rounding to
28866 // nearest. FPU precision is set to 53 bits on Windows and 64 bits otherwise
28867 // for compatibility with glibc.
28868 unsigned X87CW = Subtarget.isTargetWindowsMSVC() ? 0x27F : 0x37F;
28869 FPEnvVals.push_back(ConstantInt::get(ItemTy, X87CW));
28870 Constant *Zero = ConstantInt::get(ItemTy, 0);
28871 for (unsigned I = 0; I < 6; ++I)
28872 FPEnvVals.push_back(Zero);
28873
28874 // MXCSR: mask all floating-point exceptions, set rounding to nearest, clear
28875 // all exception flags, and set DAZ and FTZ to 0.
28876 FPEnvVals.push_back(ConstantInt::get(ItemTy, 0x1F80));
28877 Constant *FPEnvBits = ConstantArray::get(FPEnvTy, FPEnvVals);
28878 MVT PtrVT = DAG.getTargetLoweringInfo().getPointerTy(DAG.getDataLayout());
28879 SDValue Env = DAG.getConstantPool(FPEnvBits, PtrVT);
28880 MachinePointerInfo MPI =
28882 MachineMemOperand *MMO = MF.getMachineMemOperand(
28884
28885 return createSetFPEnvNodes(Env, Chain, DL, MVT::i32, MMO, DAG, Subtarget);
28886}
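// Illustrative sketch (not part of the lowering above): the 32-byte constant
// LowerRESET_FPENV places in the constant pool, assuming the FNSTENV/FLDENV
// layout with the control word in the first i32 and MXCSR appended at byte
// offset X87StateSize (28). DefaultFPEnvSketch is a hypothetical helper name.
#include <array>
#include <cstdint>
constexpr std::array<uint32_t, 8> DefaultFPEnvSketch(bool IsWindowsMSVC) {
  // Element 0: x87 control word; elements 1..6: zeroed x87 state words;
  // element 7: default MXCSR (exceptions masked, round to nearest).
  return {IsWindowsMSVC ? 0x27Fu : 0x37Fu, 0, 0, 0, 0, 0, 0, 0x1F80u};
}
static_assert(DefaultFPEnvSketch(false)[7] == 0x1F80u, "MXCSR sits at byte offset 28");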
28887
28888// Generate a GFNI gf2p8affine bitmask for vXi8 bitreverse/shift/rotate.
28889uint64_t getGFNICtrlImm(unsigned Opcode, unsigned Amt = 0) {
28890 assert((Amt < 8) && "Shift/Rotation amount out of range");
28891 switch (Opcode) {
28892 case ISD::BITREVERSE:
28893 return 0x8040201008040201ULL;
28894 case ISD::SHL:
28895 return ((0x0102040810204080ULL >> (Amt)) &
28896 (0x0101010101010101ULL * (0xFF >> (Amt))));
28897 case ISD::SRL:
28898 return ((0x0102040810204080ULL << (Amt)) &
28899 (0x0101010101010101ULL * ((0xFF << (Amt)) & 0xFF)));
28900 case ISD::SRA:
28901 return (getGFNICtrlImm(ISD::SRL, Amt) |
28902 (0x8080808080808080ULL >> (64 - (8 * Amt))));
28903 case ISD::ROTL:
28904 return getGFNICtrlImm(ISD::SRL, 8 - Amt) | getGFNICtrlImm(ISD::SHL, Amt);
28905 case ISD::ROTR:
28906 return getGFNICtrlImm(ISD::SHL, 8 - Amt) | getGFNICtrlImm(ISD::SRL, Amt);
28907 }
28908 llvm_unreachable("Unsupported GFNI opcode");
28909}
28910
28911// Generate a GFNI gf2p8affine bitmask for vXi8 bitreverse/shift/rotate.
28912SDValue getGFNICtrlMask(unsigned Opcode, SelectionDAG &DAG, const SDLoc &DL,
28913 MVT VT, unsigned Amt = 0) {
28914 assert(VT.getVectorElementType() == MVT::i8 &&
28915 (VT.getSizeInBits() % 64) == 0 && "Illegal GFNI control type");
28916 uint64_t Imm = getGFNICtrlImm(Opcode, Amt);
28917 SmallVector<SDValue> MaskBits;
28918 for (unsigned I = 0, E = VT.getSizeInBits(); I != E; I += 8) {
28919 uint64_t Bits = (Imm >> (I % 64)) & 255;
28920 MaskBits.push_back(DAG.getConstant(Bits, DL, MVT::i8));
28921 }
28922 return DAG.getBuildVector(VT, DL, MaskBits);
28923}
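// Illustrative sketch (not part of the lowering above): how the 64-bit GFNI
// control immediate is replicated byte-by-byte into each 64-bit lane of the
// vector constant built above. SplatCtrlImm is a hypothetical helper name.
#include <cstdint>
#include <vector>
inline std::vector<uint8_t> SplatCtrlImm(uint64_t Imm, unsigned VectorBits) {
  std::vector<uint8_t> Bytes;
  for (unsigned I = 0; I != VectorBits; I += 8)
    Bytes.push_back(uint8_t((Imm >> (I % 64)) & 0xFF)); // wraps every 64 bits
  return Bytes;
}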
28924
28925 /// Lower a vector CTLZ using the natively supported vector CTLZ instruction.
28926//
28927// i8/i16 vector implemented using dword LZCNT vector instruction
28928// ( sub(trunc(lzcnt(zext32(x)))) ). In case zext32(x) is illegal,
28929 // split the vector, perform the operation on its Lo and Hi parts and
28930// concatenate the results.
28931 static SDValue LowerVectorCTLZ_AVX512CDI(SDValue Op, SelectionDAG &DAG,
28932 const X86Subtarget &Subtarget) {
28933 assert(Op.getOpcode() == ISD::CTLZ);
28934 SDLoc dl(Op);
28935 MVT VT = Op.getSimpleValueType();
28936 MVT EltVT = VT.getVectorElementType();
28937 unsigned NumElems = VT.getVectorNumElements();
28938
28939 assert((EltVT == MVT::i8 || EltVT == MVT::i16) &&
28940 "Unsupported element type");
28941
28942 // Split the vector; its Lo and Hi parts will be handled in the next iteration.
28943 if (NumElems > 16 ||
28944 (NumElems == 16 && !Subtarget.canExtendTo512DQ()))
28945 return splitVectorIntUnary(Op, DAG, dl);
28946
28947 MVT NewVT = MVT::getVectorVT(MVT::i32, NumElems);
28948 assert((NewVT.is256BitVector() || NewVT.is512BitVector()) &&
28949 "Unsupported value type for operation");
28950
28951 // Use the natively supported vector instruction vplzcntd.
28952 Op = DAG.getNode(ISD::ZERO_EXTEND, dl, NewVT, Op.getOperand(0));
28953 SDValue CtlzNode = DAG.getNode(ISD::CTLZ, dl, NewVT, Op);
28954 SDValue TruncNode = DAG.getNode(ISD::TRUNCATE, dl, VT, CtlzNode);
28955 SDValue Delta = DAG.getConstant(32 - EltVT.getSizeInBits(), dl, VT);
28956
28957 return DAG.getNode(ISD::SUB, dl, VT, TruncNode, Delta);
28958}
28959
28960// Lower CTLZ using a PSHUFB lookup table implementation.
28961 static SDValue LowerVectorCTLZInRegLUT(SDValue Op, const SDLoc &DL,
28962 const X86Subtarget &Subtarget,
28963 SelectionDAG &DAG) {
28964 MVT VT = Op.getSimpleValueType();
28965 int NumElts = VT.getVectorNumElements();
28966 int NumBytes = NumElts * (VT.getScalarSizeInBits() / 8);
28967 MVT CurrVT = MVT::getVectorVT(MVT::i8, NumBytes);
28968
28969 // Per-nibble leading zero PSHUFB lookup table.
28970 const int LUT[16] = {/* 0 */ 4, /* 1 */ 3, /* 2 */ 2, /* 3 */ 2,
28971 /* 4 */ 1, /* 5 */ 1, /* 6 */ 1, /* 7 */ 1,
28972 /* 8 */ 0, /* 9 */ 0, /* a */ 0, /* b */ 0,
28973 /* c */ 0, /* d */ 0, /* e */ 0, /* f */ 0};
28974
28975 SmallVector<SDValue, 64> LUTVec;
28976 for (int i = 0; i < NumBytes; ++i)
28977 LUTVec.push_back(DAG.getConstant(LUT[i % 16], DL, MVT::i8));
28978 SDValue InRegLUT = DAG.getBuildVector(CurrVT, DL, LUTVec);
28979
28980 // Begin by bitcasting the input to byte vector, then split those bytes
28981 // into lo/hi nibbles and use the PSHUFB LUT to perform CTLZ on each of them.
28982 // If the hi input nibble is zero then we add both results together, otherwise
28983 // we just take the hi result (by masking the lo result to zero before the
28984 // add).
28985 SDValue Op0 = DAG.getBitcast(CurrVT, Op.getOperand(0));
28986 SDValue Zero = DAG.getConstant(0, DL, CurrVT);
28987
28988 SDValue NibbleShift = DAG.getConstant(0x4, DL, CurrVT);
28989 SDValue Lo = Op0;
28990 SDValue Hi = DAG.getNode(ISD::SRL, DL, CurrVT, Op0, NibbleShift);
28991 SDValue HiZ;
28992 if (CurrVT.is512BitVector()) {
28993 MVT MaskVT = MVT::getVectorVT(MVT::i1, CurrVT.getVectorNumElements());
28994 HiZ = DAG.getSetCC(DL, MaskVT, Hi, Zero, ISD::SETEQ);
28995 HiZ = DAG.getNode(ISD::SIGN_EXTEND, DL, CurrVT, HiZ);
28996 } else {
28997 HiZ = DAG.getSetCC(DL, CurrVT, Hi, Zero, ISD::SETEQ);
28998 }
28999
29000 Lo = DAG.getNode(X86ISD::PSHUFB, DL, CurrVT, InRegLUT, Lo);
29001 Hi = DAG.getNode(X86ISD::PSHUFB, DL, CurrVT, InRegLUT, Hi);
29002 Lo = DAG.getNode(ISD::AND, DL, CurrVT, Lo, HiZ);
29003 SDValue Res = DAG.getNode(ISD::ADD, DL, CurrVT, Lo, Hi);
29004
29005 // Merge result back from vXi8 back to VT, working on the lo/hi halves
29006 // of the current vector width in the same way we did for the nibbles.
29007 // If the upper half of the input element is zero then add the halves'
29008 // leading zero counts together, otherwise just use the upper half's.
29009 // Double the width of the result until we are at target width.
29010 while (CurrVT != VT) {
29011 int CurrScalarSizeInBits = CurrVT.getScalarSizeInBits();
29012 int CurrNumElts = CurrVT.getVectorNumElements();
29013 MVT NextSVT = MVT::getIntegerVT(CurrScalarSizeInBits * 2);
29014 MVT NextVT = MVT::getVectorVT(NextSVT, CurrNumElts / 2);
29015 SDValue Shift = DAG.getConstant(CurrScalarSizeInBits, DL, NextVT);
29016
29017 // Check if the upper half of the input element is zero.
29018 if (CurrVT.is512BitVector()) {
29019 MVT MaskVT = MVT::getVectorVT(MVT::i1, CurrVT.getVectorNumElements());
29020 HiZ = DAG.getSetCC(DL, MaskVT, DAG.getBitcast(CurrVT, Op0),
29021 DAG.getBitcast(CurrVT, Zero), ISD::SETEQ);
29022 HiZ = DAG.getNode(ISD::SIGN_EXTEND, DL, CurrVT, HiZ);
29023 } else {
29024 HiZ = DAG.getSetCC(DL, CurrVT, DAG.getBitcast(CurrVT, Op0),
29025 DAG.getBitcast(CurrVT, Zero), ISD::SETEQ);
29026 }
29027 HiZ = DAG.getBitcast(NextVT, HiZ);
29028
29029 // Move the upper/lower halves to the lower bits as we'll be extending to
29030 // NextVT. Mask the lower result to zero if HiZ is true and add the results
29031 // together.
29032 SDValue ResNext = Res = DAG.getBitcast(NextVT, Res);
29033 SDValue R0 = DAG.getNode(ISD::SRL, DL, NextVT, ResNext, Shift);
29034 SDValue R1 = DAG.getNode(ISD::SRL, DL, NextVT, HiZ, Shift);
29035 R1 = DAG.getNode(ISD::AND, DL, NextVT, ResNext, R1);
29036 Res = DAG.getNode(ISD::ADD, DL, NextVT, R0, R1);
29037 CurrVT = NextVT;
29038 }
29039
29040 return Res;
29041}
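// Illustrative sketch (not part of the lowering above): the per-byte CTLZ that
// the nibble LUT + PSHUFB sequence computes, modelled on a scalar uint8_t.
// CtlzByteViaNibbleLUT is a hypothetical helper name.
#include <cstdint>
constexpr uint8_t CtlzByteViaNibbleLUT(uint8_t V) {
  constexpr uint8_t LUT[16] = {4, 3, 2, 2, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0};
  uint8_t Hi = LUT[V >> 4];
  uint8_t Lo = LUT[V & 0xF];
  // If the hi nibble is zero, add both counts; otherwise the lo count is
  // masked away, matching the HiZ blend in the vector code.
  return (V >> 4) == 0 ? uint8_t(Hi + Lo) : Hi;
}
static_assert(CtlzByteViaNibbleLUT(0x00) == 8, "ctlz(0) == 8");
static_assert(CtlzByteViaNibbleLUT(0x01) == 7, "ctlz(1) == 7");
static_assert(CtlzByteViaNibbleLUT(0x80) == 0, "ctlz(0x80) == 0");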
29042
29043 static SDValue LowerVectorCTLZ(SDValue Op, const SDLoc &DL,
29044 const X86Subtarget &Subtarget,
29045 SelectionDAG &DAG) {
29046 MVT VT = Op.getSimpleValueType();
29047
29048 if (Subtarget.hasCDI() &&
29049 // vXi8 vectors need to be promoted to 512-bits for vXi32.
29050 (Subtarget.canExtendTo512DQ() || VT.getVectorElementType() != MVT::i8))
29051 return LowerVectorCTLZ_AVX512CDI(Op, DAG, Subtarget);
29052
29053 // Decompose 256-bit ops into smaller 128-bit ops.
29054 if (VT.is256BitVector() && !Subtarget.hasInt256())
29055 return splitVectorIntUnary(Op, DAG, DL);
29056
29057 // Decompose 512-bit ops into smaller 256-bit ops.
29058 if (VT.is512BitVector() && !Subtarget.hasBWI())
29059 return splitVectorIntUnary(Op, DAG, DL);
29060
29061 assert(Subtarget.hasSSSE3() && "Expected SSSE3 support for PSHUFB");
29062 return LowerVectorCTLZInRegLUT(Op, DL, Subtarget, DAG);
29063}
29064
29065 static SDValue LowerVectorCTLZ_GFNI(SDValue Op, const SDLoc &DL,
29066 SelectionDAG &DAG,
29067 const X86Subtarget &Subtarget) {
29068 MVT VT = Op.getSimpleValueType();
29069 SDValue Input = Op.getOperand(0);
29070
29071 assert(VT.isVector() && VT.getVectorElementType() == MVT::i8 &&
29072 "Expected vXi8 input for GFNI-based CTLZ lowering");
29073
29074 SDValue Reversed = DAG.getNode(ISD::BITREVERSE, DL, VT, Input);
29075
29076 SDValue Neg = DAG.getNegative(Reversed, DL, VT);
29077 SDValue Filtered = DAG.getNode(ISD::AND, DL, VT, Reversed, Neg);
29078
29079 MVT VT64 = MVT::getVectorVT(MVT::i64, VT.getSizeInBits() / 64);
29080 SDValue CTTZConst = DAG.getConstant(0xAACCF0FF00000000ULL, DL, VT64);
29081 SDValue CTTZMatrix = DAG.getBitcast(VT, CTTZConst);
29082
29083 SDValue LZCNT =
29084 DAG.getNode(X86ISD::GF2P8AFFINEQB, DL, VT, Filtered, CTTZMatrix,
29085 DAG.getTargetConstant(8, DL, MVT::i8));
29086 return LZCNT;
29087}
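// Illustrative sketch (not part of the lowering above): the identity the GFNI
// path relies on for each i8 lane - ctlz(x) == cttz(bitreverse(x)), with the
// reversed value reduced to its lowest set bit (x & -x) so the affine matrix
// only has to map a one-hot byte (or zero) to a bit index. CtlzByteViaReverse
// is a hypothetical helper name.
#include <cstdint>
constexpr unsigned CtlzByteViaReverse(uint8_t V) {
  uint8_t R = 0;
  for (int I = 0; I < 8; ++I)
    R |= uint8_t(((V >> I) & 1) << (7 - I)); // bitreverse
  uint8_t OneHot = uint8_t(R & -R);          // keep only the lowest set bit
  unsigned N = 0;
  while (N < 8 && ((OneHot >> N) & 1) == 0)  // its position == cttz
    ++N;
  return N;                                  // 8 when the input byte is zero
}
static_assert(CtlzByteViaReverse(0x80) == 0, "");
static_assert(CtlzByteViaReverse(0x01) == 7, "");
static_assert(CtlzByteViaReverse(0x00) == 8, "");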
29088
29089static SDValue LowerCTLZ(SDValue Op, const X86Subtarget &Subtarget,
29090 SelectionDAG &DAG) {
29091 MVT VT = Op.getSimpleValueType();
29092 MVT OpVT = VT;
29093 unsigned NumBits = VT.getSizeInBits();
29094 SDLoc dl(Op);
29095 unsigned Opc = Op.getOpcode();
29096
29097 if (VT.isVector() && VT.getScalarType() == MVT::i8 && Subtarget.hasGFNI())
29098 return LowerVectorCTLZ_GFNI(Op, dl, DAG, Subtarget);
29099
29100 if (VT.isVector())
29101 return LowerVectorCTLZ(Op, dl, Subtarget, DAG);
29102
29103 Op = Op.getOperand(0);
29104 if (VT == MVT::i8) {
29105 // Zero extend to i32 since there is not an i8 bsr.
29106 OpVT = MVT::i32;
29107 Op = DAG.getNode(ISD::ZERO_EXTEND, dl, OpVT, Op);
29108 }
29109
29110 // Check if we can safely pass a result through BSR for zero sources.
29111 SDValue PassThru = DAG.getUNDEF(OpVT);
29112 if (Opc == ISD::CTLZ && Subtarget.hasBitScanPassThrough() &&
29113 !DAG.isKnownNeverZero(Op))
29114 PassThru = DAG.getConstant(NumBits + NumBits - 1, dl, OpVT);
29115
29116 // Issue a bsr (scan bits in reverse) which also sets EFLAGS.
29117 SDVTList VTs = DAG.getVTList(OpVT, MVT::i32);
29118 Op = DAG.getNode(X86ISD::BSR, dl, VTs, PassThru, Op);
29119
29120 // Skip CMOV if we're using a pass through value.
29121 if (Opc == ISD::CTLZ && PassThru.isUndef()) {
29122 // If src is zero (i.e. bsr sets ZF), returns NumBits.
29123 SDValue Ops[] = {Op, DAG.getConstant(NumBits + NumBits - 1, dl, OpVT),
29124 DAG.getTargetConstant(X86::COND_E, dl, MVT::i8),
29125 Op.getValue(1)};
29126 Op = DAG.getNode(X86ISD::CMOV, dl, OpVT, Ops);
29127 }
29128
29129 // Finally xor with NumBits-1.
29130 Op = DAG.getNode(ISD::XOR, dl, OpVT, Op,
29131 DAG.getConstant(NumBits - 1, dl, OpVT));
29132
29133 if (VT == MVT::i8)
29134 Op = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, Op);
29135 return Op;
29136}
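// Illustrative sketch (not part of the lowering above): why the scalar path
// XORs the BSR result with NumBits-1. BSR returns the index of the highest set
// bit, so ctlz(x) == 31 - bsr(x) == bsr(x) ^ 31 for a non-zero i32; the CMOV
// (or the BSR pass-through constant 2*NumBits-1) supplies the x == 0 case,
// which the same XOR folds to NumBits. Ctlz32ViaBsr is a hypothetical helper.
#include <cstdint>
constexpr unsigned Ctlz32ViaBsr(uint32_t X) {
  unsigned Bsr = 2 * 32 - 1;                    // value used when X == 0
  for (int I = 31; I >= 0; --I)
    if ((X >> I) & 1) { Bsr = unsigned(I); break; }
  return Bsr ^ 31;                              // == 31 - Bsr when Bsr <= 31
}
static_assert(Ctlz32ViaBsr(0x80000000u) == 0, "");
static_assert(Ctlz32ViaBsr(1u) == 31, "");
static_assert(Ctlz32ViaBsr(0u) == 32, "");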
29137
29138static SDValue LowerCTTZ(SDValue Op, const X86Subtarget &Subtarget,
29139 SelectionDAG &DAG) {
29140 MVT VT = Op.getSimpleValueType();
29141 unsigned NumBits = VT.getScalarSizeInBits();
29142 SDValue N0 = Op.getOperand(0);
29143 SDLoc dl(Op);
29144 bool NonZeroSrc = DAG.isKnownNeverZero(N0);
29145
29146 assert(!VT.isVector() && Op.getOpcode() == ISD::CTTZ &&
29147 "Only scalar CTTZ requires custom lowering");
29148
29149 // Check if we can safely pass a result through BSF for zero sources.
29150 SDValue PassThru = DAG.getUNDEF(VT);
29151 if (!NonZeroSrc && Subtarget.hasBitScanPassThrough())
29152 PassThru = DAG.getConstant(NumBits, dl, VT);
29153
29154 // Issue a bsf (scan bits forward) which also sets EFLAGS.
29155 SDVTList VTs = DAG.getVTList(VT, MVT::i32);
29156 Op = DAG.getNode(X86ISD::BSF, dl, VTs, PassThru, N0);
29157
29158 // Skip CMOV if src is never zero or we're using a pass through value.
29159 if (NonZeroSrc || !PassThru.isUndef())
29160 return Op;
29161
29162 // If src is zero (i.e. bsf sets ZF), returns NumBits.
29163 SDValue Ops[] = {Op, DAG.getConstant(NumBits, dl, VT),
29164 DAG.getTargetConstant(X86::COND_E, dl, MVT::i8),
29165 Op.getValue(1)};
29166 return DAG.getNode(X86ISD::CMOV, dl, VT, Ops);
29167}
29168
29170 const X86Subtarget &Subtarget) {
29171 MVT VT = Op.getSimpleValueType();
29172 SDLoc DL(Op);
29173
29174 if (VT == MVT::i16 || VT == MVT::i32)
29175 return lowerAddSubToHorizontalOp(Op, DL, DAG, Subtarget);
29176
29177 if (VT == MVT::v32i16 || VT == MVT::v64i8)
29178 return splitVectorIntBinary(Op, DAG, DL);
29179
29180 assert(Op.getSimpleValueType().is256BitVector() &&
29181 Op.getSimpleValueType().isInteger() &&
29182 "Only handle AVX 256-bit vector integer operation");
29183 return splitVectorIntBinary(Op, DAG, DL);
29184}
29185
29187 const X86Subtarget &Subtarget) {
29188 MVT VT = Op.getSimpleValueType();
29189 SDValue X = Op.getOperand(0), Y = Op.getOperand(1);
29190 unsigned Opcode = Op.getOpcode();
29191 SDLoc DL(Op);
29192
29193 if (VT == MVT::v32i16 || VT == MVT::v64i8 ||
29194 (VT.is256BitVector() && !Subtarget.hasInt256())) {
29195 assert(Op.getSimpleValueType().isInteger() &&
29196 "Only handle AVX vector integer operation");
29197 return splitVectorIntBinary(Op, DAG, DL);
29198 }
29199
29200 // Avoid the generic expansion with min/max if we don't have pminu*/pmaxu*.
29201 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
29202 EVT SetCCResultType =
29203 TLI.getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT);
29204
29205 unsigned BitWidth = VT.getScalarSizeInBits();
29206 if (Opcode == ISD::USUBSAT) {
29207 if (!TLI.isOperationLegal(ISD::UMAX, VT) || useVPTERNLOG(Subtarget, VT)) {
29208 // Handle a special-case with a bit-hack instead of cmp+select:
29209 // usubsat X, SMIN --> (X ^ SMIN) & (X s>> BW-1)
29210 // If the target can use VPTERNLOG, DAGToDAG will match this as
29211 // "vpsra + vpternlog" which is better than "vpmax + vpsub" with a
29212 // "broadcast" constant load.
29214 if (C && C->getAPIntValue().isSignMask()) {
29215 SDValue SignMask = DAG.getConstant(C->getAPIntValue(), DL, VT);
29216 SDValue ShiftAmt = DAG.getConstant(BitWidth - 1, DL, VT);
29217 SDValue Xor = DAG.getNode(ISD::XOR, DL, VT, X, SignMask);
29218 SDValue Sra = DAG.getNode(ISD::SRA, DL, VT, X, ShiftAmt);
29219 return DAG.getNode(ISD::AND, DL, VT, Xor, Sra);
29220 }
29221 }
29222 if (!TLI.isOperationLegal(ISD::UMAX, VT)) {
29223 // usubsat X, Y --> (X >u Y) ? X - Y : 0
29224 SDValue Sub = DAG.getNode(ISD::SUB, DL, VT, X, Y);
29225 SDValue Cmp = DAG.getSetCC(DL, SetCCResultType, X, Y, ISD::SETUGT);
29226 // TODO: Move this to DAGCombiner?
29227 if (SetCCResultType == VT &&
29228 DAG.ComputeNumSignBits(Cmp) == VT.getScalarSizeInBits())
29229 return DAG.getNode(ISD::AND, DL, VT, Cmp, Sub);
29230 return DAG.getSelect(DL, VT, Cmp, Sub, DAG.getConstant(0, DL, VT));
29231 }
29232 }
29233
29234 if ((Opcode == ISD::SADDSAT || Opcode == ISD::SSUBSAT) &&
29235 (!VT.isVector() || VT == MVT::v2i64)) {
29236 APInt MinVal = APInt::getSignedMinValue(BitWidth);
29237 APInt MaxVal = APInt::getSignedMaxValue(BitWidth);
29238 SDValue Zero = DAG.getConstant(0, DL, VT);
29239 SDValue Result =
29240 DAG.getNode(Opcode == ISD::SADDSAT ? ISD::SADDO : ISD::SSUBO, DL,
29241 DAG.getVTList(VT, SetCCResultType), X, Y);
29242 SDValue SumDiff = Result.getValue(0);
29243 SDValue Overflow = Result.getValue(1);
29244 SDValue SatMin = DAG.getConstant(MinVal, DL, VT);
29245 SDValue SatMax = DAG.getConstant(MaxVal, DL, VT);
29246 SDValue SumNeg =
29247 DAG.getSetCC(DL, SetCCResultType, SumDiff, Zero, ISD::SETLT);
29248 Result = DAG.getSelect(DL, VT, SumNeg, SatMax, SatMin);
29249 return DAG.getSelect(DL, VT, Overflow, Result, SumDiff);
29250 }
29251
29252 // Use default expansion.
29253 return SDValue();
29254}
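// Illustrative sketch (not part of the lowering above): the USUBSAT special
// case handled with a bit-hack, usubsat(X, SMIN) == (X ^ SMIN) & (X >>s BW-1),
// spot-checked on i8. The helper names are hypothetical.
#include <cstdint>
constexpr uint8_t UsubsatSminBitHack(uint8_t X) {
  uint8_t Sra = (X & 0x80) ? 0xFF : 0x00;  // X >>s 7: all-ones iff sign bit set
  return uint8_t((X ^ 0x80) & Sra);
}
constexpr uint8_t UsubsatSminRef(uint8_t X) {
  return X >= 0x80 ? uint8_t(X - 0x80) : uint8_t(0);
}
static_assert(UsubsatSminBitHack(0x00) == UsubsatSminRef(0x00), "");
static_assert(UsubsatSminBitHack(0x7F) == UsubsatSminRef(0x7F), "");
static_assert(UsubsatSminBitHack(0x80) == UsubsatSminRef(0x80), "");
static_assert(UsubsatSminBitHack(0xFF) == UsubsatSminRef(0xFF), "");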
29255
29256static SDValue LowerABS(SDValue Op, const X86Subtarget &Subtarget,
29257 SelectionDAG &DAG) {
29258 MVT VT = Op.getSimpleValueType();
29259 SDLoc DL(Op);
29260
29261 if (VT == MVT::i16 || VT == MVT::i32 || VT == MVT::i64) {
29262 // Since X86 does not have CMOV for 8-bit integer, we don't convert
29263 // 8-bit integer abs to NEG and CMOV.
29264 SDValue N0 = Op.getOperand(0);
29265 SDValue Neg = DAG.getNode(X86ISD::SUB, DL, DAG.getVTList(VT, MVT::i32),
29266 DAG.getConstant(0, DL, VT), N0);
29267 SDValue Ops[] = {N0, Neg, DAG.getTargetConstant(X86::COND_NS, DL, MVT::i8),
29268 SDValue(Neg.getNode(), 1)};
29269 return DAG.getNode(X86ISD::CMOV, DL, VT, Ops);
29270 }
29271
29272 // ABS(vXi64 X) --> VPBLENDVPD(X, 0-X, X).
29273 if ((VT == MVT::v2i64 || VT == MVT::v4i64) && Subtarget.hasSSE41()) {
29274 SDValue Src = Op.getOperand(0);
29275 SDValue Neg = DAG.getNegative(Src, DL, VT);
29276 return DAG.getNode(X86ISD::BLENDV, DL, VT, Src, Neg, Src);
29277 }
29278
29279 if (VT.is256BitVector() && !Subtarget.hasInt256()) {
29280 assert(VT.isInteger() &&
29281 "Only handle AVX 256-bit vector integer operation");
29282 return splitVectorIntUnary(Op, DAG, DL);
29283 }
29284
29285 if ((VT == MVT::v32i16 || VT == MVT::v64i8) && !Subtarget.hasBWI())
29286 return splitVectorIntUnary(Op, DAG, DL);
29287
29288 // Default to expand.
29289 return SDValue();
29290}
29291
29292static SDValue LowerAVG(SDValue Op, const X86Subtarget &Subtarget,
29293 SelectionDAG &DAG) {
29294 MVT VT = Op.getSimpleValueType();
29295 SDLoc DL(Op);
29296
29297 // For AVX1 cases, split to use legal ops.
29298 if (VT.is256BitVector() && !Subtarget.hasInt256())
29299 return splitVectorIntBinary(Op, DAG, DL);
29300
29301 if (VT == MVT::v32i16 || VT == MVT::v64i8)
29302 return splitVectorIntBinary(Op, DAG, DL);
29303
29304 // Default to expand.
29305 return SDValue();
29306}
29307
29308static SDValue LowerMINMAX(SDValue Op, const X86Subtarget &Subtarget,
29309 SelectionDAG &DAG) {
29310 MVT VT = Op.getSimpleValueType();
29311 SDLoc DL(Op);
29312
29313 // For AVX1 cases, split to use legal ops.
29314 if (VT.is256BitVector() && !Subtarget.hasInt256())
29315 return splitVectorIntBinary(Op, DAG, DL);
29316
29317 if (VT == MVT::v32i16 || VT == MVT::v64i8)
29318 return splitVectorIntBinary(Op, DAG, DL);
29319
29320 // Default to expand.
29321 return SDValue();
29322}
29323
29325 SelectionDAG &DAG) {
29326 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
29327 EVT VT = Op.getValueType();
29328 SDValue X = Op.getOperand(0);
29329 SDValue Y = Op.getOperand(1);
29330 SDLoc DL(Op);
29331 bool IsMaxOp =
29332 Op.getOpcode() == ISD::FMAXIMUM || Op.getOpcode() == ISD::FMAXIMUMNUM;
29333 bool IsNum =
29334 Op.getOpcode() == ISD::FMINIMUMNUM || Op.getOpcode() == ISD::FMAXIMUMNUM;
29335 if (Subtarget.hasAVX10_2() && TLI.isTypeLegal(VT)) {
29336 unsigned Opc = 0;
29337 if (VT.isVector())
29339 else if (VT == MVT::f16 || VT == MVT::f32 || VT == MVT::f64)
29341
29342 if (Opc) {
29343 SDValue Imm =
29344 DAG.getTargetConstant(IsMaxOp + (IsNum ? 16 : 0), DL, MVT::i32);
29345 return DAG.getNode(Opc, DL, VT, X, Y, Imm, Op->getFlags());
29346 }
29347 }
29348
29349 uint64_t SizeInBits = VT.getScalarSizeInBits();
29350 APInt PreferredZero = APInt::getZero(SizeInBits);
29351 APInt OppositeZero = PreferredZero;
29352 EVT IVT = VT.changeTypeToInteger();
29353 X86ISD::NodeType MinMaxOp;
29354 if (IsMaxOp) {
29355 MinMaxOp = X86ISD::FMAX;
29356 OppositeZero.setSignBit();
29357 } else {
29358 PreferredZero.setSignBit();
29359 MinMaxOp = X86ISD::FMIN;
29360 }
29361 EVT SetCCType =
29362 TLI.getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT);
29363
29364 // The tables below show the expected result of Max in cases of NaN and
29365 // signed zeros.
29366 //
29367 // Y Y
29368 // Num xNaN +0 -0
29369 // --------------- ---------------
29370 // Num | Max | Y | +0 | +0 | +0 |
29371 // X --------------- X ---------------
29372 // xNaN | X | X/Y | -0 | +0 | -0 |
29373 // --------------- ---------------
29374 //
29375 // It is achieved by means of FMAX/FMIN with preliminary checks and operand
29376 // reordering.
29377 //
29378 // We check if any of operands is NaN and return NaN. Then we check if any of
29379 // operands is zero or negative zero (for fmaximum and fminimum respectively)
29380 // to ensure the correct zero is returned.
29381 auto MatchesZero = [](SDValue Op, APInt Zero) {
29383 if (auto *CstOp = dyn_cast<ConstantFPSDNode>(Op))
29384 return CstOp->getValueAPF().bitcastToAPInt() == Zero;
29385 if (auto *CstOp = dyn_cast<ConstantSDNode>(Op))
29386 return CstOp->getAPIntValue() == Zero;
29387 if (Op->getOpcode() == ISD::BUILD_VECTOR ||
29388 Op->getOpcode() == ISD::SPLAT_VECTOR) {
29389 for (const SDValue &OpVal : Op->op_values()) {
29390 if (OpVal.isUndef())
29391 continue;
29392 auto *CstOp = dyn_cast<ConstantFPSDNode>(OpVal);
29393 if (!CstOp)
29394 return false;
29395 if (!CstOp->getValueAPF().isZero())
29396 continue;
29397 if (CstOp->getValueAPF().bitcastToAPInt() != Zero)
29398 return false;
29399 }
29400 return true;
29401 }
29402 return false;
29403 };
29404
29405 bool IsXNeverNaN = DAG.isKnownNeverNaN(X);
29406 bool IsYNeverNaN = DAG.isKnownNeverNaN(Y);
29407 bool IgnoreSignedZero = DAG.getTarget().Options.NoSignedZerosFPMath ||
29408 Op->getFlags().hasNoSignedZeros() ||
29409 DAG.isKnownNeverZeroFloat(X) ||
29410 DAG.isKnownNeverZeroFloat(Y);
29411 SDValue NewX, NewY;
29412 if (IgnoreSignedZero || MatchesZero(Y, PreferredZero) ||
29413 MatchesZero(X, OppositeZero)) {
29414 // Operands are already in right order or order does not matter.
29415 NewX = X;
29416 NewY = Y;
29417 } else if (MatchesZero(X, PreferredZero) || MatchesZero(Y, OppositeZero)) {
29418 NewX = Y;
29419 NewY = X;
29420 } else if (!VT.isVector() && (VT == MVT::f16 || Subtarget.hasDQI()) &&
29421 (Op->getFlags().hasNoNaNs() || IsXNeverNaN || IsYNeverNaN)) {
29422 if (IsXNeverNaN)
29423 std::swap(X, Y);
29424 // VFPCLASSS consumes a vector type, so provide a minimal one corresponding
29425 // to an xmm register.
29426 MVT VectorType = MVT::getVectorVT(VT.getSimpleVT(), 128 / SizeInBits);
29428 // Bits of classes:
29429 // Bits Imm8[0] Imm8[1] Imm8[2] Imm8[3] Imm8[4] Imm8[5] Imm8[6] Imm8[7]
29430 // Class QNAN PosZero NegZero PosINF NegINF Denormal Negative SNAN
29431 SDValue Imm = DAG.getTargetConstant(MinMaxOp == X86ISD::FMAX ? 0b11 : 0b101,
29432 DL, MVT::i32);
29433 SDValue IsNanZero = DAG.getNode(X86ISD::VFPCLASSS, DL, MVT::v1i1, VX, Imm);
29434 SDValue Ins = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, MVT::v8i1,
29435 DAG.getConstant(0, DL, MVT::v8i1), IsNanZero,
29436 DAG.getVectorIdxConstant(0, DL));
29437 SDValue NeedSwap = DAG.getBitcast(MVT::i8, Ins);
29438 NewX = DAG.getSelect(DL, VT, NeedSwap, Y, X);
29439 NewY = DAG.getSelect(DL, VT, NeedSwap, X, Y);
29440 return DAG.getNode(MinMaxOp, DL, VT, NewX, NewY, Op->getFlags());
29441 } else {
29442 SDValue IsXSigned;
29443 if (Subtarget.is64Bit() || VT != MVT::f64) {
29444 SDValue XInt = DAG.getNode(ISD::BITCAST, DL, IVT, X);
29445 SDValue ZeroCst = DAG.getConstant(0, DL, IVT);
29446 IsXSigned = DAG.getSetCC(DL, SetCCType, XInt, ZeroCst, ISD::SETLT);
29447 } else {
29448 assert(VT == MVT::f64);
29449 SDValue Ins = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, MVT::v2f64,
29450 DAG.getConstantFP(0, DL, MVT::v2f64), X,
29451 DAG.getVectorIdxConstant(0, DL));
29452 SDValue VX = DAG.getNode(ISD::BITCAST, DL, MVT::v4f32, Ins);
29453 SDValue Hi = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, VX,
29454 DAG.getVectorIdxConstant(1, DL));
29455 Hi = DAG.getBitcast(MVT::i32, Hi);
29456 SDValue ZeroCst = DAG.getConstant(0, DL, MVT::i32);
29457 EVT SetCCType = TLI.getSetCCResultType(DAG.getDataLayout(),
29458 *DAG.getContext(), MVT::i32);
29459 IsXSigned = DAG.getSetCC(DL, SetCCType, Hi, ZeroCst, ISD::SETLT);
29460 }
29461 if (MinMaxOp == X86ISD::FMAX) {
29462 NewX = DAG.getSelect(DL, VT, IsXSigned, X, Y);
29463 NewY = DAG.getSelect(DL, VT, IsXSigned, Y, X);
29464 } else {
29465 NewX = DAG.getSelect(DL, VT, IsXSigned, Y, X);
29466 NewY = DAG.getSelect(DL, VT, IsXSigned, X, Y);
29467 }
29468 }
29469
29470 bool IgnoreNaN = DAG.getTarget().Options.NoNaNsFPMath ||
29471 Op->getFlags().hasNoNaNs() || (IsXNeverNaN && IsYNeverNaN);
29472
29473 // If we did not reorder the operands for signed-zero handling, but we need
29474 // to handle NaN and we know that one of the operands is not NaN, then:
29475 // - For minimum/maximum, put it in the first operand,
29476 // - For minimumnum/maximumnum, put it in the second operand,
29477 // and we will not need to handle NaN separately after the max/min.
29478 if (IgnoreSignedZero && !IgnoreNaN &&
29479 DAG.isKnownNeverNaN(IsNum ? NewX : NewY))
29480 std::swap(NewX, NewY);
29481
29482 SDValue MinMax = DAG.getNode(MinMaxOp, DL, VT, NewX, NewY, Op->getFlags());
29483
29484 if (IgnoreNaN || DAG.isKnownNeverNaN(IsNum ? NewY : NewX))
29485 return MinMax;
29486
29487 if (DAG.isKnownNeverNaN(NewX))
29488 NewX = NewY;
29489
29490 SDValue IsNaN =
29491 DAG.getSetCC(DL, SetCCType, NewX, NewX, IsNum ? ISD::SETO : ISD::SETUO);
29492
29493 return DAG.getSelect(DL, VT, IsNaN, NewX, MinMax);
29494}
29495
29496static SDValue LowerABD(SDValue Op, const X86Subtarget &Subtarget,
29497 SelectionDAG &DAG) {
29498 MVT VT = Op.getSimpleValueType();
29499 SDLoc dl(Op);
29500
29501 // For AVX1 cases, split to use legal ops.
29502 if (VT.is256BitVector() && !Subtarget.hasInt256())
29503 return splitVectorIntBinary(Op, DAG, dl);
29504
29505 if ((VT == MVT::v32i16 || VT == MVT::v64i8) && !Subtarget.useBWIRegs())
29506 return splitVectorIntBinary(Op, DAG, dl);
29507
29508 bool IsSigned = Op.getOpcode() == ISD::ABDS;
29509 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
29510
29511 if (Subtarget.canUseCMOV() && VT.isScalarInteger()) {
29512 X86::CondCode CC = IsSigned ? X86::COND_L : X86::COND_B;
29513 unsigned ExtOpc = IsSigned ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
29514
29515 // abds(lhs, rhs) -> select(slt(lhs,rhs),sub(rhs,lhs),sub(lhs,rhs))
29516 // abdu(lhs, rhs) -> select(ult(lhs,rhs),sub(rhs,lhs),sub(lhs,rhs))
29517 if (VT.bitsGE(MVT::i32)) {
29518 SDVTList VTs = DAG.getVTList(VT, MVT::i32);
29519 SDValue LHS = DAG.getFreeze(Op.getOperand(0));
29520 SDValue RHS = DAG.getFreeze(Op.getOperand(1));
29521 SDValue Diff0 = DAG.getNode(X86ISD::SUB, dl, VTs, LHS, RHS);
29522 SDValue Diff1 = DAG.getNode(X86ISD::SUB, dl, VTs, RHS, LHS);
29523 return DAG.getNode(X86ISD::CMOV, dl, VT, Diff1, Diff0,
29524 DAG.getTargetConstant(CC, dl, MVT::i8),
29525 Diff1.getValue(1));
29526 }
29527
29528 // abds(lhs, rhs) -> trunc(abs(sub(sext(lhs), sext(rhs))))
29529 // abdu(lhs, rhs) -> trunc(abs(sub(zext(lhs), zext(rhs))))
29530 unsigned WideBits = std::max<unsigned>(2 * VT.getScalarSizeInBits(), 32u);
29531 MVT WideVT = MVT::getIntegerVT(WideBits);
29532 if (TLI.isTypeLegal(WideVT)) {
29533 SDVTList WideVTs = DAG.getVTList(WideVT, MVT::i32);
29534 SDValue LHS = DAG.getNode(ExtOpc, dl, WideVT, Op.getOperand(0));
29535 SDValue RHS = DAG.getNode(ExtOpc, dl, WideVT, Op.getOperand(1));
29536 SDValue Diff0 = DAG.getNode(X86ISD::SUB, dl, WideVTs, LHS, RHS);
29537 SDValue Diff1 = DAG.getNode(X86ISD::SUB, dl, WideVTs, RHS, LHS);
29538 SDValue AbsDiff = DAG.getNode(X86ISD::CMOV, dl, WideVT, Diff1, Diff0,
29539 DAG.getTargetConstant(CC, dl, MVT::i8),
29540 Diff1.getValue(1));
29541 return DAG.getNode(ISD::TRUNCATE, dl, VT, AbsDiff);
29542 }
29543 }
29544
29545 // Default to expand.
29546 return SDValue();
29547}
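// Illustrative sketch (not part of the lowering above): the scalar ABDS
// pattern selected above - compute both differences and pick one with the
// condition from the comparison, i.e. abds(x, y) == (x < y) ? y - x : x - y.
// AbdsRef is a hypothetical helper name.
#include <cstdint>
constexpr uint32_t AbdsRef(int32_t X, int32_t Y) {
  return X < Y ? uint32_t(Y) - uint32_t(X) : uint32_t(X) - uint32_t(Y);
}
static_assert(AbdsRef(3, 10) == 7u, "");
static_assert(AbdsRef(-5, 5) == 10u, "");
static_assert(AbdsRef(INT32_MIN, 0) == 0x80000000u, "no overflow in unsigned");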
29548
29549static SDValue LowerMUL(SDValue Op, const X86Subtarget &Subtarget,
29550 SelectionDAG &DAG) {
29551 SDLoc dl(Op);
29552 MVT VT = Op.getSimpleValueType();
29553
29554 // Decompose 256-bit ops into 128-bit ops.
29555 if (VT.is256BitVector() && !Subtarget.hasInt256())
29556 return splitVectorIntBinary(Op, DAG, dl);
29557
29558 if ((VT == MVT::v32i16 || VT == MVT::v64i8) && !Subtarget.hasBWI())
29559 return splitVectorIntBinary(Op, DAG, dl);
29560
29561 SDValue A = Op.getOperand(0);
29562 SDValue B = Op.getOperand(1);
29563
29564 // Lower v16i8/v32i8/v64i8 mul as sign-extension to v8i16/v16i16/v32i16
29565 // vector pairs, multiply and truncate.
29566 if (VT == MVT::v16i8 || VT == MVT::v32i8 || VT == MVT::v64i8) {
29567 unsigned NumElts = VT.getVectorNumElements();
29568 unsigned NumLanes = VT.getSizeInBits() / 128;
29569 unsigned NumEltsPerLane = NumElts / NumLanes;
29570
29571 if ((VT == MVT::v16i8 && Subtarget.hasInt256()) ||
29572 (VT == MVT::v32i8 && Subtarget.canExtendTo512BW())) {
29573 MVT ExVT = MVT::getVectorVT(MVT::i16, VT.getVectorNumElements());
29574 return DAG.getNode(
29575 ISD::TRUNCATE, dl, VT,
29576 DAG.getNode(ISD::MUL, dl, ExVT,
29577 DAG.getNode(ISD::ANY_EXTEND, dl, ExVT, A),
29578 DAG.getNode(ISD::ANY_EXTEND, dl, ExVT, B)));
29579 }
29580
29581 MVT ExVT = MVT::getVectorVT(MVT::i16, NumElts / 2);
29582
29583 // For vXi8 mul, try PMADDUBSW to avoid the need for extension.
29584 // Don't do this if we only need to unpack one half.
29585 if (Subtarget.hasSSSE3()) {
29586 bool BIsBuildVector = isa<BuildVectorSDNode>(B);
29587 bool IsLoLaneAllZeroOrUndef = BIsBuildVector;
29588 bool IsHiLaneAllZeroOrUndef = BIsBuildVector;
29589 if (BIsBuildVector) {
29590 for (auto [Idx, Val] : enumerate(B->ops())) {
29591 if ((Idx % NumEltsPerLane) >= (NumEltsPerLane / 2))
29592 IsHiLaneAllZeroOrUndef &= isNullConstantOrUndef(Val);
29593 else
29594 IsLoLaneAllZeroOrUndef &= isNullConstantOrUndef(Val);
29595 }
29596 }
29597 if (!(IsLoLaneAllZeroOrUndef || IsHiLaneAllZeroOrUndef)) {
29598 SDValue Mask = DAG.getBitcast(VT, DAG.getConstant(0x00FF, dl, ExVT));
29599 SDValue BLo = DAG.getNode(ISD::AND, dl, VT, Mask, B);
29600 SDValue BHi = DAG.getNode(X86ISD::ANDNP, dl, VT, Mask, B);
29601 SDValue RLo = DAG.getNode(X86ISD::VPMADDUBSW, dl, ExVT, A, BLo);
29602 SDValue RHi = DAG.getNode(X86ISD::VPMADDUBSW, dl, ExVT, A, BHi);
29603 RLo = DAG.getNode(ISD::AND, dl, VT, DAG.getBitcast(VT, RLo), Mask);
29604 RHi = DAG.getNode(X86ISD::VSHLI, dl, ExVT, RHi,
29605 DAG.getTargetConstant(8, dl, MVT::i8));
29606 return DAG.getNode(ISD::OR, dl, VT, RLo, DAG.getBitcast(VT, RHi));
29607 }
29608 }
29609
29610 // Extract the lo/hi parts to any extend to i16.
29611 // We're going to mask off the low byte of each result element of the
29612 // pmullw, so it doesn't matter what's in the high byte of each 16-bit
29613 // element.
29614 SDValue Undef = DAG.getUNDEF(VT);
29615 SDValue ALo = DAG.getBitcast(ExVT, getUnpackl(DAG, dl, VT, A, Undef));
29616 SDValue AHi = DAG.getBitcast(ExVT, getUnpackh(DAG, dl, VT, A, Undef));
29617
29618 SDValue BLo, BHi;
29619 if (ISD::isBuildVectorOfConstantSDNodes(B.getNode())) {
29620 // If the RHS is a constant, manually unpackl/unpackh.
29621 SmallVector<SDValue, 16> LoOps, HiOps;
29622 for (unsigned i = 0; i != NumElts; i += 16) {
29623 for (unsigned j = 0; j != 8; ++j) {
29624 LoOps.push_back(DAG.getAnyExtOrTrunc(B.getOperand(i + j), dl,
29625 MVT::i16));
29626 HiOps.push_back(DAG.getAnyExtOrTrunc(B.getOperand(i + j + 8), dl,
29627 MVT::i16));
29628 }
29629 }
29630
29631 BLo = DAG.getBuildVector(ExVT, dl, LoOps);
29632 BHi = DAG.getBuildVector(ExVT, dl, HiOps);
29633 } else {
29634 BLo = DAG.getBitcast(ExVT, getUnpackl(DAG, dl, VT, B, Undef));
29635 BHi = DAG.getBitcast(ExVT, getUnpackh(DAG, dl, VT, B, Undef));
29636 }
29637
29638 // Multiply, mask the lower 8bits of the lo/hi results and pack.
29639 SDValue RLo = DAG.getNode(ISD::MUL, dl, ExVT, ALo, BLo);
29640 SDValue RHi = DAG.getNode(ISD::MUL, dl, ExVT, AHi, BHi);
29641 return getPack(DAG, Subtarget, dl, VT, RLo, RHi);
29642 }
29643
29644 // Lower v4i32 mul as 2x shuffle, 2x pmuludq, 2x shuffle.
29645 if (VT == MVT::v4i32) {
29646 assert(Subtarget.hasSSE2() && !Subtarget.hasSSE41() &&
29647 "Should not custom lower when pmulld is available!");
29648
29649 // Extract the odd parts.
29650 static const int UnpackMask[] = {1, 1, 3, 3};
29651 SDValue Aodds = DAG.getVectorShuffle(VT, dl, A, A, UnpackMask);
29652 SDValue Bodds = DAG.getVectorShuffle(VT, dl, B, B, UnpackMask);
29653
29654 // Multiply the even parts.
29655 SDValue Evens = DAG.getNode(X86ISD::PMULUDQ, dl, MVT::v2i64,
29656 DAG.getBitcast(MVT::v2i64, A),
29657 DAG.getBitcast(MVT::v2i64, B));
29658 // Now multiply odd parts.
29659 SDValue Odds = DAG.getNode(X86ISD::PMULUDQ, dl, MVT::v2i64,
29660 DAG.getBitcast(MVT::v2i64, Aodds),
29661 DAG.getBitcast(MVT::v2i64, Bodds));
29662
29663 Evens = DAG.getBitcast(VT, Evens);
29664 Odds = DAG.getBitcast(VT, Odds);
29665
29666 // Merge the two vectors back together with a shuffle. This expands into 2
29667 // shuffles.
29668 static const int ShufMask[] = { 0, 4, 2, 6 };
29669 return DAG.getVectorShuffle(VT, dl, Evens, Odds, ShufMask);
29670 }
29671
29672 assert((VT == MVT::v2i64 || VT == MVT::v4i64 || VT == MVT::v8i64) &&
29673 "Only know how to lower V2I64/V4I64/V8I64 multiply");
29674 assert(!Subtarget.hasDQI() && "DQI should use MULLQ");
29675
29676 // Ahi = psrlqi(a, 32);
29677 // Bhi = psrlqi(b, 32);
29678 //
29679 // AloBlo = pmuludq(a, b);
29680 // AloBhi = pmuludq(a, Bhi);
29681 // AhiBlo = pmuludq(Ahi, b);
29682 //
29683 // Hi = psllqi(AloBhi + AhiBlo, 32);
29684 // return AloBlo + Hi;
29685 KnownBits AKnown = DAG.computeKnownBits(A);
29686 KnownBits BKnown = DAG.computeKnownBits(B);
29687
29688 APInt LowerBitsMask = APInt::getLowBitsSet(64, 32);
29689 bool ALoIsZero = LowerBitsMask.isSubsetOf(AKnown.Zero);
29690 bool BLoIsZero = LowerBitsMask.isSubsetOf(BKnown.Zero);
29691
29692 APInt UpperBitsMask = APInt::getHighBitsSet(64, 32);
29693 bool AHiIsZero = UpperBitsMask.isSubsetOf(AKnown.Zero);
29694 bool BHiIsZero = UpperBitsMask.isSubsetOf(BKnown.Zero);
29695
29696 SDValue Zero = DAG.getConstant(0, dl, VT);
29697
29698 // Only multiply lo/hi halves that aren't known to be zero.
29699 SDValue AloBlo = Zero;
29700 if (!ALoIsZero && !BLoIsZero)
29701 AloBlo = DAG.getNode(X86ISD::PMULUDQ, dl, VT, A, B);
29702
29703 SDValue AloBhi = Zero;
29704 if (!ALoIsZero && !BHiIsZero) {
29705 SDValue Bhi = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, VT, B, 32, DAG);
29706 AloBhi = DAG.getNode(X86ISD::PMULUDQ, dl, VT, A, Bhi);
29707 }
29708
29709 SDValue AhiBlo = Zero;
29710 if (!AHiIsZero && !BLoIsZero) {
29711 SDValue Ahi = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, VT, A, 32, DAG);
29712 AhiBlo = DAG.getNode(X86ISD::PMULUDQ, dl, VT, Ahi, B);
29713 }
29714
29715 SDValue Hi = DAG.getNode(ISD::ADD, dl, VT, AloBhi, AhiBlo);
29716 Hi = getTargetVShiftByConstNode(X86ISD::VSHLI, dl, VT, Hi, 32, DAG);
29717
29718 return DAG.getNode(ISD::ADD, dl, VT, AloBlo, Hi);
29719}
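// Illustrative sketch (not part of the lowering above): the PMULUDQ
// decomposition used for vXi64 - the low 64 bits of a 64x64 product from
// three 32x32->64 multiplies; the hi*hi term only affects bits >= 64 and is
// dropped. Mul64ViaPmuludq is a hypothetical helper name.
#include <cstdint>
constexpr uint64_t Mul64ViaPmuludq(uint64_t A, uint64_t B) {
  uint64_t ALo = A & 0xffffffffu, AHi = A >> 32;
  uint64_t BLo = B & 0xffffffffu, BHi = B >> 32;
  uint64_t AloBlo = ALo * BLo;
  uint64_t AloBhi = ALo * BHi;
  uint64_t AhiBlo = AHi * BLo;
  return AloBlo + ((AloBhi + AhiBlo) << 32);
}
static_assert(Mul64ViaPmuludq(0x123456789abcdef0ull, 0x0fedcba987654321ull) ==
                  0x123456789abcdef0ull * 0x0fedcba987654321ull,
              "matches the wrapping 64-bit product");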
29720
29722 MVT VT, bool IsSigned,
29723 const X86Subtarget &Subtarget,
29724 SelectionDAG &DAG,
29725 SDValue *Low = nullptr) {
29726 unsigned NumElts = VT.getVectorNumElements();
29727
29728 // For vXi8 we will unpack the low and high half of each 128 bit lane to widen
29729 // to a vXi16 type. Do the multiplies, shift the results and pack the half
29730 // lane results back together.
29731
29732 // We'll take different approaches for signed and unsigned.
29733 // For unsigned we'll use punpcklbw/punpckhbw to zero extend the bytes
29734 // and use pmullw to calculate the full 16-bit product.
29735 // For signed we'll use punpcklbw/punpckhbw to extend the bytes to words and
29736 // shift them left into the upper byte of each word. This allows us to use
29737 // pmulhw to calculate the full 16-bit product. This trick means we don't
29738 // need to sign extend the bytes to use pmullw.
29739
29740 MVT ExVT = MVT::getVectorVT(MVT::i16, NumElts / 2);
29741 SDValue Zero = DAG.getConstant(0, dl, VT);
29742
29743 SDValue ALo, AHi;
29744 if (IsSigned) {
29745 ALo = DAG.getBitcast(ExVT, getUnpackl(DAG, dl, VT, Zero, A));
29746 AHi = DAG.getBitcast(ExVT, getUnpackh(DAG, dl, VT, Zero, A));
29747 } else {
29748 ALo = DAG.getBitcast(ExVT, getUnpackl(DAG, dl, VT, A, Zero));
29749 AHi = DAG.getBitcast(ExVT, getUnpackh(DAG, dl, VT, A, Zero));
29750 }
29751
29752 SDValue BLo, BHi;
29753 if (ISD::isBuildVectorOfConstantSDNodes(B.getNode())) {
29754 // If the RHS is a constant, manually unpackl/unpackh and extend.
29755 SmallVector<SDValue, 16> LoOps, HiOps;
29756 for (unsigned i = 0; i != NumElts; i += 16) {
29757 for (unsigned j = 0; j != 8; ++j) {
29758 SDValue LoOp = B.getOperand(i + j);
29759 SDValue HiOp = B.getOperand(i + j + 8);
29760
29761 if (IsSigned) {
29762 LoOp = DAG.getAnyExtOrTrunc(LoOp, dl, MVT::i16);
29763 HiOp = DAG.getAnyExtOrTrunc(HiOp, dl, MVT::i16);
29764 LoOp = DAG.getNode(ISD::SHL, dl, MVT::i16, LoOp,
29765 DAG.getConstant(8, dl, MVT::i16));
29766 HiOp = DAG.getNode(ISD::SHL, dl, MVT::i16, HiOp,
29767 DAG.getConstant(8, dl, MVT::i16));
29768 } else {
29769 LoOp = DAG.getZExtOrTrunc(LoOp, dl, MVT::i16);
29770 HiOp = DAG.getZExtOrTrunc(HiOp, dl, MVT::i16);
29771 }
29772
29773 LoOps.push_back(LoOp);
29774 HiOps.push_back(HiOp);
29775 }
29776 }
29777
29778 BLo = DAG.getBuildVector(ExVT, dl, LoOps);
29779 BHi = DAG.getBuildVector(ExVT, dl, HiOps);
29780 } else if (IsSigned) {
29781 BLo = DAG.getBitcast(ExVT, getUnpackl(DAG, dl, VT, Zero, B));
29782 BHi = DAG.getBitcast(ExVT, getUnpackh(DAG, dl, VT, Zero, B));
29783 } else {
29784 BLo = DAG.getBitcast(ExVT, getUnpackl(DAG, dl, VT, B, Zero));
29785 BHi = DAG.getBitcast(ExVT, getUnpackh(DAG, dl, VT, B, Zero));
29786 }
29787
29788 // Multiply, lshr the upper 8bits to the lower 8bits of the lo/hi results and
29789 // pack back to vXi8.
29790 unsigned MulOpc = IsSigned ? ISD::MULHS : ISD::MUL;
29791 SDValue RLo = DAG.getNode(MulOpc, dl, ExVT, ALo, BLo);
29792 SDValue RHi = DAG.getNode(MulOpc, dl, ExVT, AHi, BHi);
29793
29794 if (Low)
29795 *Low = getPack(DAG, Subtarget, dl, VT, RLo, RHi);
29796
29797 return getPack(DAG, Subtarget, dl, VT, RLo, RHi, /*PackHiHalf*/ true);
29798}
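// Illustrative sketch (not part of the lowering above): the signed trick used
// here - unpacking each byte into the high half of a 16-bit word lets pmulhw
// return the full signed 8x8 product, since ((a*256) * (b*256)) >> 16 == a*b.
// MulI8ViaMulhw is a hypothetical helper name.
#include <cstdint>
constexpr int16_t MulI8ViaMulhw(int8_t A, int8_t B) {
  int32_t WideA = int32_t(A) * 256; // byte placed in the upper half of a word
  int32_t WideB = int32_t(B) * 256;
  // pmulhw keeps bits [31:16] of the 32-bit product; the product is an exact
  // multiple of 65536, so plain division models the lane result exactly.
  return int16_t((WideA * WideB) / 65536);
}
static_assert(MulI8ViaMulhw(-128, 127) == -16256, "");
static_assert(MulI8ViaMulhw(-1, -1) == 1, "");
static_assert(MulI8ViaMulhw(100, 100) == 10000, "");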
29799
29800static SDValue LowerMULH(SDValue Op, const X86Subtarget &Subtarget,
29801 SelectionDAG &DAG) {
29802 SDLoc dl(Op);
29803 MVT VT = Op.getSimpleValueType();
29804 bool IsSigned = Op->getOpcode() == ISD::MULHS;
29805 unsigned NumElts = VT.getVectorNumElements();
29806 SDValue A = Op.getOperand(0);
29807 SDValue B = Op.getOperand(1);
29808
29809 // Decompose 256-bit ops into 128-bit ops.
29810 if (VT.is256BitVector() && !Subtarget.hasInt256())
29811 return splitVectorIntBinary(Op, DAG, dl);
29812
29813 if ((VT == MVT::v32i16 || VT == MVT::v64i8) && !Subtarget.hasBWI())
29814 return splitVectorIntBinary(Op, DAG, dl);
29815
29816 if (VT == MVT::v4i32 || VT == MVT::v8i32 || VT == MVT::v16i32) {
29817 assert((VT == MVT::v4i32 && Subtarget.hasSSE2()) ||
29818 (VT == MVT::v8i32 && Subtarget.hasInt256()) ||
29819 (VT == MVT::v16i32 && Subtarget.hasAVX512()));
29820
29821 // PMULxD operations multiply each even value (starting at 0) of LHS with
29822 // the related value of RHS and produce a widened result.
29823 // E.g., PMULUDQ <4 x i32> <a|b|c|d>, <4 x i32> <e|f|g|h>
29824 // => <2 x i64> <ae|cg>
29825 //
29826 // In other words, to have all the results, we need to perform two PMULxD:
29827 // 1. one with the even values.
29828 // 2. one with the odd values.
29829 // To achieve #2, we need to place the odd values at an even position.
29830 //
29831 // Place the odd value at an even position (basically, shift all values 1
29832 // step to the left):
29833 const int Mask[] = {1, -1, 3, -1, 5, -1, 7, -1,
29834 9, -1, 11, -1, 13, -1, 15, -1};
29835 // <a|b|c|d> => <b|undef|d|undef>
29836 SDValue Odd0 =
29837 DAG.getVectorShuffle(VT, dl, A, A, ArrayRef(&Mask[0], NumElts));
29838 // <e|f|g|h> => <f|undef|h|undef>
29839 SDValue Odd1 =
29840 DAG.getVectorShuffle(VT, dl, B, B, ArrayRef(&Mask[0], NumElts));
29841
29842 // Emit two multiplies, one for the lower 2 ints and one for the higher 2
29843 // ints.
29844 MVT MulVT = MVT::getVectorVT(MVT::i64, NumElts / 2);
29845 unsigned Opcode =
29846 (IsSigned && Subtarget.hasSSE41()) ? X86ISD::PMULDQ : X86ISD::PMULUDQ;
29847 // PMULUDQ <4 x i32> <a|b|c|d>, <4 x i32> <e|f|g|h>
29848 // => <2 x i64> <ae|cg>
29849 SDValue Mul1 = DAG.getBitcast(VT, DAG.getNode(Opcode, dl, MulVT,
29850 DAG.getBitcast(MulVT, A),
29851 DAG.getBitcast(MulVT, B)));
29852 // PMULUDQ <4 x i32> <b|undef|d|undef>, <4 x i32> <f|undef|h|undef>
29853 // => <2 x i64> <bf|dh>
29854 SDValue Mul2 = DAG.getBitcast(VT, DAG.getNode(Opcode, dl, MulVT,
29855 DAG.getBitcast(MulVT, Odd0),
29856 DAG.getBitcast(MulVT, Odd1)));
29857
29858 // Shuffle it back into the right order.
29859 SmallVector<int, 16> ShufMask(NumElts);
29860 for (int i = 0; i != (int)NumElts; ++i)
29861 ShufMask[i] = (i / 2) * 2 + ((i % 2) * NumElts) + 1;
29862
29863 SDValue Res = DAG.getVectorShuffle(VT, dl, Mul1, Mul2, ShufMask);
29864
29865 // If we have a signed multiply but no PMULDQ fix up the result of an
29866 // unsigned multiply.
29867 if (IsSigned && !Subtarget.hasSSE41()) {
29868 SDValue Zero = DAG.getConstant(0, dl, VT);
29869 SDValue T1 = DAG.getNode(ISD::AND, dl, VT,
29870 DAG.getSetCC(dl, VT, Zero, A, ISD::SETGT), B);
29871 SDValue T2 = DAG.getNode(ISD::AND, dl, VT,
29872 DAG.getSetCC(dl, VT, Zero, B, ISD::SETGT), A);
29873
29874 SDValue Fixup = DAG.getNode(ISD::ADD, dl, VT, T1, T2);
29875 Res = DAG.getNode(ISD::SUB, dl, VT, Res, Fixup);
29876 }
29877
29878 return Res;
29879 }
29880
29881 // Only i8 vectors should need custom lowering after this.
29882 assert((VT == MVT::v16i8 || (VT == MVT::v32i8 && Subtarget.hasInt256()) ||
29883 (VT == MVT::v64i8 && Subtarget.hasBWI())) &&
29884 "Unsupported vector type");
29885
29886 // Lower v16i8/v32i8 as extension to v8i16/v16i16 vector pairs, multiply,
29887 // logical shift down the upper half and pack back to i8.
29888
29889 // With SSE41 we can use sign/zero extend, but for pre-SSE41 we unpack
29890 // and then ashr/lshr the upper bits down to the lower bits before multiply.
29891
29892 if ((VT == MVT::v16i8 && Subtarget.hasInt256()) ||
29893 (VT == MVT::v32i8 && Subtarget.canExtendTo512BW())) {
29894 MVT ExVT = MVT::getVectorVT(MVT::i16, NumElts);
29895 unsigned ExAVX = IsSigned ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
29896 SDValue ExA = DAG.getNode(ExAVX, dl, ExVT, A);
29897 SDValue ExB = DAG.getNode(ExAVX, dl, ExVT, B);
29898 SDValue Mul = DAG.getNode(ISD::MUL, dl, ExVT, ExA, ExB);
29899 Mul = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, ExVT, Mul, 8, DAG);
29900 return DAG.getNode(ISD::TRUNCATE, dl, VT, Mul);
29901 }
29902
29903 return LowervXi8MulWithUNPCK(A, B, dl, VT, IsSigned, Subtarget, DAG);
29904}
29905
29906// Custom lowering for SMULO/UMULO.
29907static SDValue LowerMULO(SDValue Op, const X86Subtarget &Subtarget,
29908 SelectionDAG &DAG) {
29909 MVT VT = Op.getSimpleValueType();
29910
29911 // Scalars defer to LowerXALUO.
29912 if (!VT.isVector())
29913 return LowerXALUO(Op, DAG);
29914
29915 SDLoc dl(Op);
29916 bool IsSigned = Op->getOpcode() == ISD::SMULO;
29917 SDValue A = Op.getOperand(0);
29918 SDValue B = Op.getOperand(1);
29919 EVT OvfVT = Op->getValueType(1);
29920
29921 if ((VT == MVT::v32i8 && !Subtarget.hasInt256()) ||
29922 (VT == MVT::v64i8 && !Subtarget.hasBWI())) {
29923 // Extract the LHS Lo/Hi vectors
29924 SDValue LHSLo, LHSHi;
29925 std::tie(LHSLo, LHSHi) = splitVector(A, DAG, dl);
29926
29927 // Extract the RHS Lo/Hi vectors
29928 SDValue RHSLo, RHSHi;
29929 std::tie(RHSLo, RHSHi) = splitVector(B, DAG, dl);
29930
29931 EVT LoOvfVT, HiOvfVT;
29932 std::tie(LoOvfVT, HiOvfVT) = DAG.GetSplitDestVTs(OvfVT);
29933 SDVTList LoVTs = DAG.getVTList(LHSLo.getValueType(), LoOvfVT);
29934 SDVTList HiVTs = DAG.getVTList(LHSHi.getValueType(), HiOvfVT);
29935
29936 // Issue the split operations.
29937 SDValue Lo = DAG.getNode(Op.getOpcode(), dl, LoVTs, LHSLo, RHSLo);
29938 SDValue Hi = DAG.getNode(Op.getOpcode(), dl, HiVTs, LHSHi, RHSHi);
29939
29940 // Join the separate data results and the overflow results.
29941 SDValue Res = DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, Lo, Hi);
29942 SDValue Ovf = DAG.getNode(ISD::CONCAT_VECTORS, dl, OvfVT, Lo.getValue(1),
29943 Hi.getValue(1));
29944
29945 return DAG.getMergeValues({Res, Ovf}, dl);
29946 }
29947
29948 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
29949 EVT SetccVT =
29950 TLI.getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT);
29951
29952 if ((VT == MVT::v16i8 && Subtarget.hasInt256()) ||
29953 (VT == MVT::v32i8 && Subtarget.canExtendTo512BW())) {
29954 unsigned NumElts = VT.getVectorNumElements();
29955 MVT ExVT = MVT::getVectorVT(MVT::i16, NumElts);
29956 unsigned ExAVX = IsSigned ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
29957 SDValue ExA = DAG.getNode(ExAVX, dl, ExVT, A);
29958 SDValue ExB = DAG.getNode(ExAVX, dl, ExVT, B);
29959 SDValue Mul = DAG.getNode(ISD::MUL, dl, ExVT, ExA, ExB);
29960
29961 SDValue Low = DAG.getNode(ISD::TRUNCATE, dl, VT, Mul);
29962
29963 SDValue Ovf;
29964 if (IsSigned) {
29965 SDValue High, LowSign;
29966 if (OvfVT.getVectorElementType() == MVT::i1 &&
29967 (Subtarget.hasBWI() || Subtarget.canExtendTo512DQ())) {
29968 // Rather than truncating, try to do the compare on vXi16 or vXi32.
29969 // Shift the high down filling with sign bits.
29970 High = getTargetVShiftByConstNode(X86ISD::VSRAI, dl, ExVT, Mul, 8, DAG);
29971 // Fill all 16 bits with the sign bit from the low.
29972 LowSign =
29973 getTargetVShiftByConstNode(X86ISD::VSHLI, dl, ExVT, Mul, 8, DAG);
29974 LowSign = getTargetVShiftByConstNode(X86ISD::VSRAI, dl, ExVT, LowSign,
29975 15, DAG);
29976 SetccVT = OvfVT;
29977 if (!Subtarget.hasBWI()) {
29978 // We can't do a vXi16 compare so sign extend to v16i32.
29979 High = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v16i32, High);
29980 LowSign = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v16i32, LowSign);
29981 }
29982 } else {
29983 // Otherwise do the compare at vXi8.
29984 High = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, ExVT, Mul, 8, DAG);
29985 High = DAG.getNode(ISD::TRUNCATE, dl, VT, High);
29986 LowSign =
29987 DAG.getNode(ISD::SRA, dl, VT, Low, DAG.getConstant(7, dl, VT));
29988 }
29989
29990 Ovf = DAG.getSetCC(dl, SetccVT, LowSign, High, ISD::SETNE);
29991 } else {
29992 SDValue High =
29993 getTargetVShiftByConstNode(X86ISD::VSRLI, dl, ExVT, Mul, 8, DAG);
29994 if (OvfVT.getVectorElementType() == MVT::i1 &&
29995 (Subtarget.hasBWI() || Subtarget.canExtendTo512DQ())) {
29996 // Rather than truncating, try to do the compare on vXi16 or vXi32.
29997 SetccVT = OvfVT;
29998 if (!Subtarget.hasBWI()) {
29999 // We can't do a vXi16 compare so sign extend to v16i32.
30000 High = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::v16i32, High);
30001 }
30002 } else {
30003 // Otherwise do the compare at vXi8.
30004 High = DAG.getNode(ISD::TRUNCATE, dl, VT, High);
30005 }
30006
30007 Ovf =
30008 DAG.getSetCC(dl, SetccVT, High,
30009 DAG.getConstant(0, dl, High.getValueType()), ISD::SETNE);
30010 }
30011
30012 Ovf = DAG.getSExtOrTrunc(Ovf, dl, OvfVT);
30013
30014 return DAG.getMergeValues({Low, Ovf}, dl);
30015 }
30016
30017 SDValue Low;
30018 SDValue High =
30019 LowervXi8MulWithUNPCK(A, B, dl, VT, IsSigned, Subtarget, DAG, &Low);
30020
30021 SDValue Ovf;
30022 if (IsSigned) {
30023 // SMULO overflows if the high bits don't match the sign of the low.
30024 SDValue LowSign =
30025 DAG.getNode(ISD::SRA, dl, VT, Low, DAG.getConstant(7, dl, VT));
30026 Ovf = DAG.getSetCC(dl, SetccVT, LowSign, High, ISD::SETNE);
30027 } else {
30028 // UMULO overflows if the high bits are non-zero.
30029 Ovf =
30030 DAG.getSetCC(dl, SetccVT, High, DAG.getConstant(0, dl, VT), ISD::SETNE);
30031 }
30032
30033 Ovf = DAG.getSExtOrTrunc(Ovf, dl, OvfVT);
30034
30035 return DAG.getMergeValues({Low, Ovf}, dl);
30036}
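// Illustrative sketch (not part of the lowering above): the per-lane overflow
// tests used for vXi8 MULO - SMULO overflows when the high byte of the 16-bit
// product is not the sign extension of the low byte, UMULO when the high byte
// is non-zero. The helper names are hypothetical.
#include <cstdint>
constexpr bool Smulo8Overflows(int8_t A, int8_t B) {
  int Full = int(A) * int(B);                         // exact product
  unsigned Bits = unsigned(Full) & 0xFFFFu;           // the 16-bit lane value
  unsigned Low = Bits & 0xFFu;
  unsigned High = (Bits >> 8) & 0xFFu;
  unsigned SignOfLow = (Low & 0x80u) ? 0xFFu : 0x00u; // low >>s 7, per lane
  return High != SignOfLow;                           // mismatch -> overflow
}
constexpr bool Umulo8Overflows(uint8_t A, uint8_t B) {
  unsigned Full = unsigned(A) * unsigned(B);
  return (Full >> 8) != 0;                            // any high bits set
}
static_assert(!Smulo8Overflows(10, 12) && Smulo8Overflows(16, 16), "");
static_assert(!Smulo8Overflows(-128, 1) && Smulo8Overflows(-128, -1), "");
static_assert(!Umulo8Overflows(15, 17) && Umulo8Overflows(16, 16), "");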
30037
30038SDValue X86TargetLowering::LowerWin64_i128OP(SDValue Op, SelectionDAG &DAG) const {
30039 assert(Subtarget.isTargetWin64() && "Unexpected target");
30040 EVT VT = Op.getValueType();
30041 assert(VT.isInteger() && VT.getSizeInBits() == 128 &&
30042 "Unexpected return type for lowering");
30043
30044 if (isa<ConstantSDNode>(Op->getOperand(1))) {
30046 if (expandDIVREMByConstant(Op.getNode(), Result, MVT::i64, DAG))
30047 return DAG.getNode(ISD::BUILD_PAIR, SDLoc(Op), VT, Result[0], Result[1]);
30048 }
30049
30050 RTLIB::Libcall LC;
30051 bool isSigned;
30052 switch (Op->getOpcode()) {
30053 // clang-format off
30054 default: llvm_unreachable("Unexpected request for libcall!");
30055 case ISD::SDIV: isSigned = true; LC = RTLIB::SDIV_I128; break;
30056 case ISD::UDIV: isSigned = false; LC = RTLIB::UDIV_I128; break;
30057 case ISD::SREM: isSigned = true; LC = RTLIB::SREM_I128; break;
30058 case ISD::UREM: isSigned = false; LC = RTLIB::UREM_I128; break;
30059 // clang-format on
30060 }
30061
30062 SDLoc dl(Op);
30063 SDValue InChain = DAG.getEntryNode();
30064
30065  TargetLowering::ArgListTy Args;
30066  for (unsigned i = 0, e = Op->getNumOperands(); i != e; ++i) {
30067 EVT ArgVT = Op->getOperand(i).getValueType();
30068 assert(ArgVT.isInteger() && ArgVT.getSizeInBits() == 128 &&
30069 "Unexpected argument type for lowering");
30070 SDValue StackPtr = DAG.CreateStackTemporary(ArgVT, 16);
30071 int SPFI = cast<FrameIndexSDNode>(StackPtr.getNode())->getIndex();
30072 MachinePointerInfo MPI =
30073        MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SPFI);
30074    InChain =
30075 DAG.getStore(InChain, dl, Op->getOperand(i), StackPtr, MPI, Align(16));
30076 Args.emplace_back(StackPtr, PointerType::get(*DAG.getContext(), 0));
30077 }
30078
30079  SDValue Callee = DAG.getExternalSymbol(getLibcallName(LC),
30080                                         getPointerTy(DAG.getDataLayout()));
30081
30082 TargetLowering::CallLoweringInfo CLI(DAG);
30083 CLI.setDebugLoc(dl)
30084 .setChain(InChain)
30085      .setLibCallee(
30086          getLibcallCallingConv(LC),
30087 static_cast<EVT>(MVT::v2i64).getTypeForEVT(*DAG.getContext()), Callee,
30088 std::move(Args))
30089 .setInRegister()
30090 .setSExtResult(isSigned)
30091 .setZExtResult(!isSigned);
30092
30093 std::pair<SDValue, SDValue> CallInfo = LowerCallTo(CLI);
30094 return DAG.getBitcast(VT, CallInfo.first);
30095}
30096
30097SDValue X86TargetLowering::LowerWin64_FP_TO_INT128(SDValue Op,
30098 SelectionDAG &DAG,
30099 SDValue &Chain) const {
30100 assert(Subtarget.isTargetWin64() && "Unexpected target");
30101 EVT VT = Op.getValueType();
30102 bool IsStrict = Op->isStrictFPOpcode();
30103
30104 SDValue Arg = Op.getOperand(IsStrict ? 1 : 0);
30105 EVT ArgVT = Arg.getValueType();
30106
30107 assert(VT.isInteger() && VT.getSizeInBits() == 128 &&
30108 "Unexpected return type for lowering");
30109
30110 RTLIB::Libcall LC;
30111 if (Op->getOpcode() == ISD::FP_TO_SINT ||
30112 Op->getOpcode() == ISD::STRICT_FP_TO_SINT)
30113 LC = RTLIB::getFPTOSINT(ArgVT, VT);
30114 else
30115 LC = RTLIB::getFPTOUINT(ArgVT, VT);
30116 assert(LC != RTLIB::UNKNOWN_LIBCALL && "Unexpected request for libcall!");
30117
30118 SDLoc dl(Op);
30119 MakeLibCallOptions CallOptions;
30120 Chain = IsStrict ? Op.getOperand(0) : DAG.getEntryNode();
30121
30122  SDValue Result;
30123  // The i128 result comes back as a v2i64 in xmm0; cast it back to the
30124  // expected VT (i128).
30125 std::tie(Result, Chain) =
30126 makeLibCall(DAG, LC, MVT::v2i64, Arg, CallOptions, dl, Chain);
30127 Result = DAG.getBitcast(VT, Result);
30128 return Result;
30129}
30130
30131SDValue X86TargetLowering::LowerWin64_INT128_TO_FP(SDValue Op,
30132 SelectionDAG &DAG) const {
30133 assert(Subtarget.isTargetWin64() && "Unexpected target");
30134 EVT VT = Op.getValueType();
30135 bool IsStrict = Op->isStrictFPOpcode();
30136
30137 SDValue Arg = Op.getOperand(IsStrict ? 1 : 0);
30138 EVT ArgVT = Arg.getValueType();
30139
30140 assert(ArgVT.isInteger() && ArgVT.getSizeInBits() == 128 &&
30141 "Unexpected argument type for lowering");
30142
30143 RTLIB::Libcall LC;
30144 if (Op->getOpcode() == ISD::SINT_TO_FP ||
30145 Op->getOpcode() == ISD::STRICT_SINT_TO_FP)
30146 LC = RTLIB::getSINTTOFP(ArgVT, VT);
30147 else
30148 LC = RTLIB::getUINTTOFP(ArgVT, VT);
30149 assert(LC != RTLIB::UNKNOWN_LIBCALL && "Unexpected request for libcall!");
30150
30151 SDLoc dl(Op);
30152 MakeLibCallOptions CallOptions;
30153 SDValue Chain = IsStrict ? Op.getOperand(0) : DAG.getEntryNode();
30154
30155 // Pass the i128 argument as an indirect argument on the stack.
30156 SDValue StackPtr = DAG.CreateStackTemporary(ArgVT, 16);
30157 int SPFI = cast<FrameIndexSDNode>(StackPtr.getNode())->getIndex();
30158 MachinePointerInfo MPI =
30159      MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SPFI);
30160  Chain = DAG.getStore(Chain, dl, Arg, StackPtr, MPI, Align(16));
30161
30162  SDValue Result;
30163  std::tie(Result, Chain) =
30164 makeLibCall(DAG, LC, VT, StackPtr, CallOptions, dl, Chain);
30165 return IsStrict ? DAG.getMergeValues({Result, Chain}, dl) : Result;
30166}
30167
30168// Return true if the required (according to Opcode) shift-imm form is natively
30169// supported by the Subtarget
30170static bool supportedVectorShiftWithImm(EVT VT, const X86Subtarget &Subtarget,
30171 unsigned Opcode) {
30172 assert((Opcode == ISD::SHL || Opcode == ISD::SRA || Opcode == ISD::SRL) &&
30173 "Unexpected shift opcode");
30174
30175 if (!VT.isSimple())
30176 return false;
30177
30178 if (!(VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector()))
30179 return false;
30180
30181 if (VT.getScalarSizeInBits() < 16)
30182 return false;
30183
30184 if (VT.is512BitVector() && Subtarget.useAVX512Regs() &&
30185 (VT.getScalarSizeInBits() > 16 || Subtarget.hasBWI()))
30186 return true;
30187
30188 bool LShift = (VT.is128BitVector() && Subtarget.hasSSE2()) ||
30189 (VT.is256BitVector() && Subtarget.hasInt256());
30190
30191 bool AShift = LShift && (Subtarget.hasAVX512() ||
30192 (VT != MVT::v2i64 && VT != MVT::v4i64));
30193 return (Opcode == ISD::SRA) ? AShift : LShift;
30194}
30195
30196// The shift amount is a variable, but it is the same for all vector lanes.
30197// These instructions are defined together with shift-immediate.
30198static
30199bool supportedVectorShiftWithBaseAmnt(EVT VT, const X86Subtarget &Subtarget,
30200                                        unsigned Opcode) {
30201 return supportedVectorShiftWithImm(VT, Subtarget, Opcode);
30202}
30203
30204// Return true if the required (according to Opcode) variable-shift form is
30205// natively supported by the Subtarget
30206static bool supportedVectorVarShift(EVT VT, const X86Subtarget &Subtarget,
30207 unsigned Opcode) {
30208 assert((Opcode == ISD::SHL || Opcode == ISD::SRA || Opcode == ISD::SRL) &&
30209 "Unexpected shift opcode");
30210
30211 if (!VT.isSimple())
30212 return false;
30213
30214 if (!(VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector()))
30215 return false;
30216
30217 if (!Subtarget.hasInt256() || VT.getScalarSizeInBits() < 16)
30218 return false;
30219
30220 // vXi16 supported only on AVX-512, BWI
30221 if (VT.getScalarSizeInBits() == 16 && !Subtarget.hasBWI())
30222 return false;
30223
30224 if (Subtarget.hasAVX512() &&
30225 (Subtarget.useAVX512Regs() || !VT.is512BitVector()))
30226 return true;
30227
30228 bool LShift = VT.is128BitVector() || VT.is256BitVector();
30229 bool AShift = LShift && VT != MVT::v2i64 && VT != MVT::v4i64;
30230 return (Opcode == ISD::SRA) ? AShift : LShift;
30231}
30232
30233static SDValue LowerShiftByScalarImmediate(SDValue Op, SelectionDAG &DAG,
30234                                           const X86Subtarget &Subtarget) {
30235 MVT VT = Op.getSimpleValueType();
30236 SDLoc dl(Op);
30237 SDValue R = Op.getOperand(0);
30238 SDValue Amt = Op.getOperand(1);
30239 unsigned X86Opc = getTargetVShiftUniformOpcode(Op.getOpcode(), false);
30240 unsigned EltSizeInBits = VT.getScalarSizeInBits();
30241
30242 auto ArithmeticShiftRight64 = [&](uint64_t ShiftAmt) {
30243 assert((VT == MVT::v2i64 || VT == MVT::v4i64) && "Unexpected SRA type");
30244 MVT ExVT = MVT::getVectorVT(MVT::i32, VT.getVectorNumElements() * 2);
30245 SDValue Ex = DAG.getBitcast(ExVT, R);
30246
30247 // ashr(R, 63) === cmp_slt(R, 0)
30248 if (ShiftAmt == 63 && Subtarget.hasSSE42()) {
30249 assert((VT != MVT::v4i64 || Subtarget.hasInt256()) &&
30250 "Unsupported PCMPGT op");
30251 return DAG.getNode(X86ISD::PCMPGT, dl, VT, DAG.getConstant(0, dl, VT), R);
30252 }
30253
30254 if (ShiftAmt >= 32) {
30255 // Splat sign to upper i32 dst, and SRA upper i32 src to lower i32.
30256 SDValue Upper =
30257 getTargetVShiftByConstNode(X86ISD::VSRAI, dl, ExVT, Ex, 31, DAG);
30258      SDValue Lower = getTargetVShiftByConstNode(X86ISD::VSRAI, dl, ExVT, Ex,
30259                                                 ShiftAmt - 32, DAG);
30260 if (VT == MVT::v2i64)
30261 Ex = DAG.getVectorShuffle(ExVT, dl, Upper, Lower, {5, 1, 7, 3});
30262 if (VT == MVT::v4i64)
30263 Ex = DAG.getVectorShuffle(ExVT, dl, Upper, Lower,
30264 {9, 1, 11, 3, 13, 5, 15, 7});
30265 } else {
30266 // SRA upper i32, SRL whole i64 and select lower i32.
30267      SDValue Upper = getTargetVShiftByConstNode(X86ISD::VSRAI, dl, ExVT, Ex,
30268                                                 ShiftAmt, DAG);
30269 SDValue Lower =
30270 getTargetVShiftByConstNode(X86ISD::VSRLI, dl, VT, R, ShiftAmt, DAG);
30271 Lower = DAG.getBitcast(ExVT, Lower);
30272 if (VT == MVT::v2i64)
30273 Ex = DAG.getVectorShuffle(ExVT, dl, Upper, Lower, {4, 1, 6, 3});
30274 if (VT == MVT::v4i64)
30275 Ex = DAG.getVectorShuffle(ExVT, dl, Upper, Lower,
30276 {8, 1, 10, 3, 12, 5, 14, 7});
30277 }
30278 return DAG.getBitcast(VT, Ex);
30279 };
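  // Illustrative trace of the ShiftAmt >= 32 path above: viewing an i64 lane
  // as 32-bit halves (Hi:Lo), an arithmetic shift by e.g. 40 yields
  // { low half = Hi >>s 8, high half = Hi >>s 31 }; the shuffles stitch those
  // i32 partial results back into the i64 lanes.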
30280
30281 // Optimize shl/srl/sra with constant shift amount.
30282 APInt APIntShiftAmt;
30283 if (!X86::isConstantSplat(Amt, APIntShiftAmt))
30284 return SDValue();
30285
30286 // If the shift amount is out of range, return undef.
30287 if (APIntShiftAmt.uge(EltSizeInBits))
30288 return DAG.getUNDEF(VT);
30289
30290 uint64_t ShiftAmt = APIntShiftAmt.getZExtValue();
30291
30292 if (supportedVectorShiftWithImm(VT, Subtarget, Op.getOpcode())) {
30293 // Hardware support for vector shifts is sparse which makes us scalarize the
30294 // vector operations in many cases. Also, on sandybridge ADD is faster than
30295 // shl: (shl V, 1) -> (add (freeze V), (freeze V))
30296 if (Op.getOpcode() == ISD::SHL && ShiftAmt == 1) {
30297 // R may be undef at run-time, but (shl R, 1) must be an even number (LSB
30298 // must be 0). (add undef, undef) however can be any value. To make this
30299 // safe, we must freeze R to ensure that register allocation uses the same
30300 // register for an undefined value. This ensures that the result will
30301 // still be even and preserves the original semantics.
30302 R = DAG.getFreeze(R);
30303 return DAG.getNode(ISD::ADD, dl, VT, R, R);
30304 }
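    // Illustrative example: if R were undef and not frozen, the two ADD
    // operands could be folded to different values (say 3 and 4), giving the
    // odd result 7, whereas (shl undef, 1) can only ever produce an even
    // value. Freezing pins one concrete value V, so the result is 2*V.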
30305
30306 return getTargetVShiftByConstNode(X86Opc, dl, VT, R, ShiftAmt, DAG);
30307 }
30308
30309 // i64 SRA needs to be performed as partial shifts.
30310 if (((!Subtarget.hasXOP() && VT == MVT::v2i64) ||
30311 (Subtarget.hasInt256() && VT == MVT::v4i64)) &&
30312 Op.getOpcode() == ISD::SRA)
30313 return ArithmeticShiftRight64(ShiftAmt);
30314
30315  // If we're logical shifting an all-signbits value then we can just perform it
30316  // as a mask.
30317 if ((Op.getOpcode() == ISD::SHL || Op.getOpcode() == ISD::SRL) &&
30318 DAG.ComputeNumSignBits(R) == EltSizeInBits) {
30319 SDValue Mask = DAG.getAllOnesConstant(dl, VT);
30320 Mask = DAG.getNode(Op.getOpcode(), dl, VT, Mask, Amt);
30321 return DAG.getNode(ISD::AND, dl, VT, R, Mask);
30322 }
30323
30324 if (VT == MVT::v16i8 || (Subtarget.hasInt256() && VT == MVT::v32i8) ||
30325 (Subtarget.hasBWI() && VT == MVT::v64i8)) {
30326 unsigned NumElts = VT.getVectorNumElements();
30327 MVT ShiftVT = MVT::getVectorVT(MVT::i16, NumElts / 2);
30328
30329 // Simple i8 add case
30330 if (Op.getOpcode() == ISD::SHL && ShiftAmt == 1) {
30331 // R may be undef at run-time, but (shl R, 1) must be an even number (LSB
30332 // must be 0). (add undef, undef) however can be any value. To make this
30333 // safe, we must freeze R to ensure that register allocation uses the same
30334 // register for an undefined value. This ensures that the result will
30335 // still be even and preserves the original semantics.
30336 R = DAG.getFreeze(R);
30337 return DAG.getNode(ISD::ADD, dl, VT, R, R);
30338 }
30339
30340 // ashr(R, 7) === cmp_slt(R, 0)
30341 if (Op.getOpcode() == ISD::SRA && ShiftAmt == 7) {
30342 SDValue Zeros = DAG.getConstant(0, dl, VT);
30343 if (VT.is512BitVector()) {
30344 assert(VT == MVT::v64i8 && "Unexpected element type!");
30345 SDValue CMP = DAG.getSetCC(dl, MVT::v64i1, Zeros, R, ISD::SETGT);
30346 return DAG.getNode(ISD::SIGN_EXTEND, dl, VT, CMP);
30347 }
30348 return DAG.getNode(X86ISD::PCMPGT, dl, VT, Zeros, R);
30349 }
30350
30351 // XOP can shift v16i8 directly instead of as shift v8i16 + mask.
30352 if (VT == MVT::v16i8 && Subtarget.hasXOP())
30353 return SDValue();
30354
30355 if (Subtarget.hasGFNI()) {
30356 SDValue Mask = getGFNICtrlMask(Op.getOpcode(), DAG, dl, VT, ShiftAmt);
30357 return DAG.getNode(X86ISD::GF2P8AFFINEQB, dl, VT, R, Mask,
30358 DAG.getTargetConstant(0, dl, MVT::i8));
30359 }
30360
30361 if (Op.getOpcode() == ISD::SHL) {
30362 // Make a large shift.
30363 SDValue SHL = getTargetVShiftByConstNode(X86ISD::VSHLI, dl, ShiftVT, R,
30364 ShiftAmt, DAG);
30365 SHL = DAG.getBitcast(VT, SHL);
30366 // Zero out the rightmost bits.
30367 APInt Mask = APInt::getHighBitsSet(8, 8 - ShiftAmt);
30368 return DAG.getNode(ISD::AND, dl, VT, SHL, DAG.getConstant(Mask, dl, VT));
30369 }
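    // Worked example for ShiftAmt == 3: the byte pair (lo = 0xF0, hi = 0x12)
    // is the i16 0x12F0; shifting it left by 3 gives 0x9780. The low byte
    // 0x80 is already correct, but the high byte 0x97 has picked up 0x07 from
    // its neighbour; AND with the splatted mask 0xF8 restores 0x90.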
30370 if (Op.getOpcode() == ISD::SRL) {
30371 // Make a large shift.
30372 SDValue SRL = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, ShiftVT, R,
30373 ShiftAmt, DAG);
30374 SRL = DAG.getBitcast(VT, SRL);
30375 // Zero out the leftmost bits.
30376 APInt Mask = APInt::getLowBitsSet(8, 8 - ShiftAmt);
30377 return DAG.getNode(ISD::AND, dl, VT, SRL, DAG.getConstant(Mask, dl, VT));
30378 }
30379 if (Op.getOpcode() == ISD::SRA) {
30380 // ashr(R, Amt) === sub(xor(lshr(R, Amt), Mask), Mask)
30381 SDValue Res = DAG.getNode(ISD::SRL, dl, VT, R, Amt);
30382
30383 SDValue Mask = DAG.getConstant(128 >> ShiftAmt, dl, VT);
30384 Res = DAG.getNode(ISD::XOR, dl, VT, Res, Mask);
30385 Res = DAG.getNode(ISD::SUB, dl, VT, Res, Mask);
30386 return Res;
30387 }
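    // Worked example for ShiftAmt == 4 (Mask == 0x08): R = 0xF0 (-16) gives
    // lshr 0x0F, xor 0x07, sub 0xFF == -1 == -16 >>s 4; R = 0x70 (112) gives
    // lshr 0x07, xor 0x0F, sub 0x07 == 112 >> 4, so the identity holds for
    // both signs.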
30388 llvm_unreachable("Unknown shift opcode.");
30389 }
30390
30391 return SDValue();
30392}
30393
30394static SDValue LowerShiftByScalarVariable(SDValue Op, SelectionDAG &DAG,
30395                                          const X86Subtarget &Subtarget) {
30396 MVT VT = Op.getSimpleValueType();
30397 SDLoc dl(Op);
30398 SDValue R = Op.getOperand(0);
30399 SDValue Amt = Op.getOperand(1);
30400 unsigned Opcode = Op.getOpcode();
30401 unsigned X86OpcI = getTargetVShiftUniformOpcode(Opcode, false);
30402
30403 int BaseShAmtIdx = -1;
30404 if (SDValue BaseShAmt = DAG.getSplatSourceVector(Amt, BaseShAmtIdx)) {
30405 if (supportedVectorShiftWithBaseAmnt(VT, Subtarget, Opcode))
30406 return getTargetVShiftNode(X86OpcI, dl, VT, R, BaseShAmt, BaseShAmtIdx,
30407 Subtarget, DAG);
30408
30409 // vXi8 shifts - shift as v8i16 + mask result.
30410 if (((VT == MVT::v16i8 && !Subtarget.canExtendTo512DQ()) ||
30411 (VT == MVT::v32i8 && !Subtarget.canExtendTo512BW()) ||
30412 VT == MVT::v64i8) &&
30413 !Subtarget.hasXOP()) {
30414 unsigned NumElts = VT.getVectorNumElements();
30415 MVT ExtVT = MVT::getVectorVT(MVT::i16, NumElts / 2);
30416 if (supportedVectorShiftWithBaseAmnt(ExtVT, Subtarget, Opcode)) {
30417 unsigned LogicalOp = (Opcode == ISD::SHL ? ISD::SHL : ISD::SRL);
30418 unsigned LogicalX86Op = getTargetVShiftUniformOpcode(LogicalOp, false);
30419
30420 // Create the mask using vXi16 shifts. For shift-rights we need to move
30421 // the upper byte down before splatting the vXi8 mask.
30422 SDValue BitMask = DAG.getAllOnesConstant(dl, ExtVT);
30423 BitMask = getTargetVShiftNode(LogicalX86Op, dl, ExtVT, BitMask,
30424 BaseShAmt, BaseShAmtIdx, Subtarget, DAG);
30425 if (Opcode != ISD::SHL)
30426 BitMask = getTargetVShiftByConstNode(LogicalX86Op, dl, ExtVT, BitMask,
30427 8, DAG);
30428 BitMask = DAG.getBitcast(VT, BitMask);
30429 BitMask = DAG.getVectorShuffle(VT, dl, BitMask, BitMask,
30430 SmallVector<int, 64>(NumElts, 0));
30431
30432 SDValue Res = getTargetVShiftNode(LogicalX86Op, dl, ExtVT,
30433 DAG.getBitcast(ExtVT, R), BaseShAmt,
30434 BaseShAmtIdx, Subtarget, DAG);
30435 Res = DAG.getBitcast(VT, Res);
30436 Res = DAG.getNode(ISD::AND, dl, VT, Res, BitMask);
30437
30438 if (Opcode == ISD::SRA) {
30439 // ashr(R, Amt) === sub(xor(lshr(R, Amt), SignMask), SignMask)
30440 // SignMask = lshr(SignBit, Amt) - safe to do this with PSRLW.
30441 SDValue SignMask = DAG.getConstant(0x8080, dl, ExtVT);
30442 SignMask =
30443 getTargetVShiftNode(LogicalX86Op, dl, ExtVT, SignMask, BaseShAmt,
30444 BaseShAmtIdx, Subtarget, DAG);
30445 SignMask = DAG.getBitcast(VT, SignMask);
30446 Res = DAG.getNode(ISD::XOR, dl, VT, Res, SignMask);
30447 Res = DAG.getNode(ISD::SUB, dl, VT, Res, SignMask);
30448 }
30449 return Res;
30450 }
30451 }
30452 }
30453
30454 return SDValue();
30455}
30456
30457// Convert a shift/rotate left amount to a multiplication scale factor.
30458static SDValue convertShiftLeftToScale(SDValue Amt, const SDLoc &dl,
30459                                       const X86Subtarget &Subtarget,
30460 SelectionDAG &DAG) {
30461 MVT VT = Amt.getSimpleValueType();
30462 if (!(VT == MVT::v8i16 || VT == MVT::v4i32 ||
30463 (Subtarget.hasInt256() && VT == MVT::v16i16) ||
30464 (Subtarget.hasAVX512() && VT == MVT::v32i16) ||
30465 (!Subtarget.hasAVX512() && VT == MVT::v16i8) ||
30466 (Subtarget.hasInt256() && VT == MVT::v32i8) ||
30467 (Subtarget.hasBWI() && VT == MVT::v64i8)))
30468 return SDValue();
30469
30470 MVT SVT = VT.getVectorElementType();
30471 unsigned SVTBits = SVT.getSizeInBits();
30472 unsigned NumElems = VT.getVectorNumElements();
30473
30474 APInt UndefElts;
30475 SmallVector<APInt> EltBits;
30476 if (getTargetConstantBitsFromNode(Amt, SVTBits, UndefElts, EltBits)) {
30477 APInt One(SVTBits, 1);
30478 SmallVector<SDValue> Elts(NumElems, DAG.getUNDEF(SVT));
30479 for (unsigned I = 0; I != NumElems; ++I) {
30480 if (UndefElts[I] || EltBits[I].uge(SVTBits))
30481 continue;
30482 uint64_t ShAmt = EltBits[I].getZExtValue();
30483 Elts[I] = DAG.getConstant(One.shl(ShAmt), dl, SVT);
30484 }
30485 return DAG.getBuildVector(VT, dl, Elts);
30486 }
30487
30488 // If the target doesn't support variable shifts, use either FP conversion
30489 // or integer multiplication to avoid shifting each element individually.
30490 if (VT == MVT::v4i32) {
30491 Amt = DAG.getNode(ISD::SHL, dl, VT, Amt, DAG.getConstant(23, dl, VT));
30492 Amt = DAG.getNode(ISD::ADD, dl, VT, Amt,
30493 DAG.getConstant(0x3f800000U, dl, VT));
30494 Amt = DAG.getBitcast(MVT::v4f32, Amt);
30495 return DAG.getNode(ISD::FP_TO_SINT, dl, VT, Amt);
30496 }
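  // Illustrative example of the FP trick above: adding (Amt << 23) to the bit
  // pattern of 1.0f (0x3f800000) bumps the exponent by Amt, e.g. Amt == 5
  // gives 0x42000000 == 32.0f, and FP_TO_SINT turns that back into the
  // integer scale 1 << 5.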
30497
30498 // AVX2 can more effectively perform this as a zext/trunc to/from v8i32.
30499 if (VT == MVT::v8i16 && !Subtarget.hasAVX2()) {
30500 SDValue Z = DAG.getConstant(0, dl, VT);
30501 SDValue Lo = DAG.getBitcast(MVT::v4i32, getUnpackl(DAG, dl, VT, Amt, Z));
30502 SDValue Hi = DAG.getBitcast(MVT::v4i32, getUnpackh(DAG, dl, VT, Amt, Z));
30503 Lo = convertShiftLeftToScale(Lo, dl, Subtarget, DAG);
30504 Hi = convertShiftLeftToScale(Hi, dl, Subtarget, DAG);
30505 if (Subtarget.hasSSE41())
30506 return DAG.getNode(X86ISD::PACKUS, dl, VT, Lo, Hi);
30507 return getPack(DAG, Subtarget, dl, VT, Lo, Hi);
30508 }
30509
30510 return SDValue();
30511}
30512
30513static SDValue LowerShift(SDValue Op, const X86Subtarget &Subtarget,
30514 SelectionDAG &DAG) {
30515 MVT VT = Op.getSimpleValueType();
30516 SDLoc dl(Op);
30517 SDValue R = Op.getOperand(0);
30518 SDValue Amt = Op.getOperand(1);
30519 unsigned NumElts = VT.getVectorNumElements();
30520 unsigned EltSizeInBits = VT.getScalarSizeInBits();
30521 bool ConstantAmt = ISD::isBuildVectorOfConstantSDNodes(Amt.getNode());
30522
30523 unsigned Opc = Op.getOpcode();
30524 unsigned X86OpcV = getTargetVShiftUniformOpcode(Opc, true);
30525 unsigned X86OpcI = getTargetVShiftUniformOpcode(Opc, false);
30526
30527 assert(VT.isVector() && "Custom lowering only for vector shifts!");
30528 assert(Subtarget.hasSSE2() && "Only custom lower when we have SSE2!");
30529
30530 if (SDValue V = LowerShiftByScalarImmediate(Op, DAG, Subtarget))
30531 return V;
30532
30533 if (SDValue V = LowerShiftByScalarVariable(Op, DAG, Subtarget))
30534 return V;
30535
30536 if (supportedVectorVarShift(VT, Subtarget, Opc))
30537 return Op;
30538
30539 // i64 vector arithmetic shift can be emulated with the transform:
30540 // M = lshr(SIGN_MASK, Amt)
30541 // ashr(R, Amt) === sub(xor(lshr(R, Amt), M), M)
30542 if (((VT == MVT::v2i64 && !Subtarget.hasXOP()) ||
30543 (VT == MVT::v4i64 && Subtarget.hasInt256())) &&
30544 Opc == ISD::SRA) {
30545 SDValue S = DAG.getConstant(APInt::getSignMask(64), dl, VT);
30546 SDValue M = DAG.getNode(ISD::SRL, dl, VT, S, Amt);
30547 R = DAG.getNode(ISD::SRL, dl, VT, R, Amt);
30548 R = DAG.getNode(ISD::XOR, dl, VT, R, M);
30549 R = DAG.getNode(ISD::SUB, dl, VT, R, M);
30550 return R;
30551 }
30552
30553 // XOP has 128-bit variable logical/arithmetic shifts.
30554 // +ve/-ve Amt = shift left/right.
30555 if (Subtarget.hasXOP() && (VT == MVT::v2i64 || VT == MVT::v4i32 ||
30556 VT == MVT::v8i16 || VT == MVT::v16i8)) {
30557 if (Opc == ISD::SRL || Opc == ISD::SRA)
30558 Amt = DAG.getNegative(Amt, dl, VT);
30559 if (Opc == ISD::SHL || Opc == ISD::SRL)
30560 return DAG.getNode(X86ISD::VPSHL, dl, VT, R, Amt);
30561 if (Opc == ISD::SRA)
30562 return DAG.getNode(X86ISD::VPSHA, dl, VT, R, Amt);
30563 }
30564
30565 // 2i64 vector logical shifts can efficiently avoid scalarization - do the
30566 // shifts per-lane and then shuffle the partial results back together.
30567 if (VT == MVT::v2i64 && Opc != ISD::SRA) {
30568 // Splat the shift amounts so the scalar shifts above will catch it.
30569 SDValue Amt0 = DAG.getVectorShuffle(VT, dl, Amt, Amt, {0, 0});
30570 SDValue Amt1 = DAG.getVectorShuffle(VT, dl, Amt, Amt, {1, 1});
30571 SDValue R0 = DAG.getNode(Opc, dl, VT, R, Amt0);
30572 SDValue R1 = DAG.getNode(Opc, dl, VT, R, Amt1);
30573 return DAG.getVectorShuffle(VT, dl, R0, R1, {0, 3});
30574 }
30575
30576  // Build a map of in-range constant amounts with an element mask of where they occur.
30577  SmallDenseMap<unsigned, APInt, 8> UniqueCstAmt;
30578  if (ConstantAmt) {
30579 for (unsigned I = 0; I != NumElts; ++I) {
30580 SDValue A = Amt.getOperand(I);
30581 if (A.isUndef() || A->getAsAPIntVal().uge(EltSizeInBits))
30582 continue;
30583 unsigned CstAmt = A->getAsAPIntVal().getZExtValue();
30584 auto [It, Inserted] = UniqueCstAmt.try_emplace(CstAmt);
30585 if (!Inserted) {
30586 It->second.setBit(I);
30587 continue;
30588 }
30589 It->second = APInt::getOneBitSet(NumElts, I);
30590 }
30591 assert(!UniqueCstAmt.empty() && "Illegal constant shift amounts");
30592 }
30593
30594 // If possible, lower this shift as a sequence of two shifts by
30595 // constant plus a BLENDing shuffle instead of scalarizing it.
30596 // Example:
30597 // (v4i32 (srl A, (build_vector < X, Y, Y, Y>)))
30598 //
30599 // Could be rewritten as:
30600 // (v4i32 (MOVSS (srl A, <Y,Y,Y,Y>), (srl A, <X,X,X,X>)))
30601 //
30602 // The advantage is that the two shifts from the example would be
30603 // lowered as X86ISD::VSRLI nodes in parallel before blending.
30604 if (UniqueCstAmt.size() == 2 &&
30605 (VT == MVT::v8i16 || VT == MVT::v4i32 ||
30606 (VT == MVT::v16i16 && Subtarget.hasInt256()))) {
30607 unsigned AmtA = UniqueCstAmt.begin()->first;
30608 unsigned AmtB = std::next(UniqueCstAmt.begin())->first;
30609 const APInt &MaskA = UniqueCstAmt.begin()->second;
30610 const APInt &MaskB = std::next(UniqueCstAmt.begin())->second;
30611 SmallVector<int, 8> ShuffleMask(NumElts, SM_SentinelUndef);
30612 for (unsigned I = 0; I != NumElts; ++I) {
30613 if (MaskA[I])
30614 ShuffleMask[I] = I;
30615 if (MaskB[I])
30616 ShuffleMask[I] = I + NumElts;
30617 }
30618
30619 // Only perform this blend if we can perform it without loading a mask.
30620 if ((VT != MVT::v16i16 ||
30621 is128BitLaneRepeatedShuffleMask(VT, ShuffleMask)) &&
30622 (VT == MVT::v4i32 || Subtarget.hasSSE41() || Opc != ISD::SHL ||
30623 canWidenShuffleElements(ShuffleMask))) {
30624 SDValue Shift1 =
30625 DAG.getNode(Opc, dl, VT, R, DAG.getConstant(AmtA, dl, VT));
30626 SDValue Shift2 =
30627 DAG.getNode(Opc, dl, VT, R, DAG.getConstant(AmtB, dl, VT));
30628 return DAG.getVectorShuffle(VT, dl, Shift1, Shift2, ShuffleMask);
30629 }
30630 }
30631
30632 // Constant ISD::SRA/SRL/SHL can be performed efficiently on vXiN vectors by
30633 // using vYiM vector operations where X*N == Y*M and M > N.
30634 if (ConstantAmt &&
30635 (VT == MVT::v16i8 || VT == MVT::v32i8 || VT == MVT::v64i8 ||
30636 VT == MVT::v8i16 || VT == MVT::v16i16 || VT == MVT::v32i16) &&
30637 !Subtarget.hasXOP()) {
30638 MVT NarrowScalarVT = VT.getScalarType();
30639 // We can do this extra fast if each pair of narrow elements is shifted by
30640 // the same amount by doing this SWAR style: use a shift to move the valid
30641 // bits to the right position, mask out any bits which crossed from one
30642 // element to the other.
30643 // This optimized lowering is only valid if the elements in a pair can
30644 // be treated identically.
30645 SmallVector<SDValue, 32> AmtWideElts(Amt->ops());
30646 SmallVector<SDValue, 32> TmpAmtWideElts;
30647 int WideEltSizeInBits = EltSizeInBits;
30648 while (WideEltSizeInBits < 32) {
30649 // AVX1 does not have psrlvd, etc. which makes interesting 32-bit shifts
30650 // unprofitable.
30651 if (WideEltSizeInBits >= 16 && !Subtarget.hasAVX2()) {
30652 break;
30653 }
30654 TmpAmtWideElts.resize(AmtWideElts.size() / 2);
30655 bool SameShifts = true;
30656 for (unsigned SrcI = 0, E = AmtWideElts.size(); SrcI != E; SrcI += 2) {
30657 unsigned DstI = SrcI / 2;
30658 // Both elements are undef? Make a note and keep going.
30659 if (AmtWideElts[SrcI].isUndef() && AmtWideElts[SrcI + 1].isUndef()) {
30660 TmpAmtWideElts[DstI] = AmtWideElts[SrcI];
30661 continue;
30662 }
30663 // Even element is undef? We will shift it by the same shift amount as
30664 // the odd element.
30665 if (AmtWideElts[SrcI].isUndef()) {
30666 TmpAmtWideElts[DstI] = AmtWideElts[SrcI + 1];
30667 continue;
30668 }
30669 // Odd element is undef? We will shift it by the same shift amount as
30670 // the even element.
30671 if (AmtWideElts[SrcI + 1].isUndef()) {
30672 TmpAmtWideElts[DstI] = AmtWideElts[SrcI];
30673 continue;
30674 }
30675 // Both elements are equal.
30676 if (AmtWideElts[SrcI].getNode()->getAsAPIntVal() ==
30677 AmtWideElts[SrcI + 1].getNode()->getAsAPIntVal()) {
30678 TmpAmtWideElts[DstI] = AmtWideElts[SrcI];
30679 continue;
30680 }
30681 // One of the provisional wide elements will not have the same shift
30682 // amount. Let's bail.
30683 SameShifts = false;
30684 break;
30685 }
30686 if (!SameShifts) {
30687 break;
30688 }
30689 WideEltSizeInBits *= 2;
30690 std::swap(TmpAmtWideElts, AmtWideElts);
30691 }
30692 APInt APIntShiftAmt;
30693 bool IsConstantSplat = X86::isConstantSplat(Amt, APIntShiftAmt);
30694 bool Profitable = WidenShift;
30695 // AVX512BW brings support for vpsllvw.
30696 if (WideEltSizeInBits * AmtWideElts.size() >= 512 &&
30697 WideEltSizeInBits < 32 && !Subtarget.hasBWI()) {
30698 Profitable = false;
30699 }
30700    // Leave AVX512 uniform arithmetic shifts alone; they can be implemented
30701 // fairly cheaply in other ways.
30702 if (WideEltSizeInBits * AmtWideElts.size() >= 512 && IsConstantSplat) {
30703 Profitable = false;
30704 }
30705 // Leave it up to GFNI if we have it around.
30706 // TODO: gf2p8affine is usually higher latency and more port restricted. It
30707 // is probably a win to use other strategies in some cases.
30708 if (EltSizeInBits == 8 && Subtarget.hasGFNI()) {
30709 Profitable = false;
30710 }
30711
30712 // AVX1 does not have vpand which makes our masking impractical. It does
30713 // have vandps but that is an FP instruction and crossing FP<->int typically
30714 // has some cost.
30715 if (WideEltSizeInBits * AmtWideElts.size() >= 256 &&
30716 (WideEltSizeInBits < 32 || IsConstantSplat) && !Subtarget.hasAVX2()) {
30717 Profitable = false;
30718 }
30719 unsigned WideNumElts = AmtWideElts.size();
30720 // We are only dealing with identical pairs.
30721 if (Profitable && WideNumElts != NumElts) {
30722 MVT WideScalarVT = MVT::getIntegerVT(WideEltSizeInBits);
30723 MVT WideVT = MVT::getVectorVT(WideScalarVT, WideNumElts);
30724 // Cast the operand to vXiM.
30725 SDValue RWide = DAG.getBitcast(WideVT, R);
30726 // Create our new vector of shift amounts.
30727 SDValue AmtWide = DAG.getBuildVector(
30728 MVT::getVectorVT(NarrowScalarVT, WideNumElts), dl, AmtWideElts);
30729 AmtWide = DAG.getZExtOrTrunc(AmtWide, dl, WideVT);
30730 // Perform the actual shift.
30731 unsigned LogicalOpc = Opc == ISD::SRA ? (unsigned)ISD::SRL : Opc;
30732 SDValue ShiftedR = DAG.getNode(LogicalOpc, dl, WideVT, RWide, AmtWide);
30733 // Now we need to construct a mask which will "drop" bits that get
30734 // shifted past the LSB/MSB. For a logical shift left, it will look
30735 // like:
30736 // FullMask = (1 << EltSizeInBits) - 1
30737 // Mask = FullMask << Amt
30738 //
30739 // This masking ensures that bits cannot migrate from one narrow lane to
30740 // another. The construction of this mask will be constant folded.
30741 // The mask for a logical right shift is nearly identical, the only
30742 // difference is that the all ones mask is shifted right instead of left.
30743 SDValue SplatFullMask = DAG.getAllOnesConstant(dl, VT);
30744 SDValue Mask = DAG.getNode(LogicalOpc, dl, VT, SplatFullMask, Amt);
30745 Mask = DAG.getBitcast(WideVT, Mask);
30746 // Finally, we mask the shifted vector with the SWAR mask.
30747 SDValue Masked = DAG.getNode(ISD::AND, dl, WideVT, ShiftedR, Mask);
30748 Masked = DAG.getBitcast(VT, Masked);
30749 if (Opc != ISD::SRA) {
30750 // Logical shifts are complete at this point.
30751 return Masked;
30752 }
30753 // At this point, we have done a *logical* shift right. We now need to
30754 // sign extend the result so that we get behavior equivalent to an
30755 // arithmetic shift right. Post-shifting by AmtWide, our narrow elements
30756 // are `EltSizeInBits-AmtWide` bits wide.
30757 //
30758 // To convert our `EltSizeInBits-AmtWide` bit unsigned numbers to signed
30759 // numbers as wide as `EltSizeInBits`, we need to replicate the bit at
30760 // position `EltSizeInBits-AmtWide` into the MSBs of each narrow lane. We
30761 // can use the following trick to accomplish this:
30762 // SignBitMask = 1 << (EltSizeInBits-AmtWide-1)
30763 // (Masked ^ SignBitMask) - SignBitMask
30764 //
30765 // When the sign bit is already clear, this will compute:
30766 // Masked + SignBitMask - SignBitMask
30767 //
30768 // This is equal to Masked which is what we want: the sign bit was clear
30769 // so sign extending should be a no-op.
30770 //
30771 // When the sign bit is set, this will compute:
30772      //   Masked - SignBitMask - SignBitMask
30773 //
30774 // This is equal to Masked - 2*SignBitMask which will correctly sign
30775 // extend our result.
30776 SDValue SplatHighBit =
30777 DAG.getConstant(APInt::getSignMask(EltSizeInBits), dl, VT);
30778 // This does not induce recursion, all operands are constants.
30779 SDValue SignBitMask = DAG.getNode(LogicalOpc, dl, VT, SplatHighBit, Amt);
30780 SDValue FlippedSignBit =
30781 DAG.getNode(ISD::XOR, dl, VT, Masked, SignBitMask);
30782 SDValue Subtraction =
30783 DAG.getNode(ISD::SUB, dl, VT, FlippedSignBit, SignBitMask);
30784 return Subtraction;
30785 }
30786 }
30787
30788 // If possible, lower this packed shift into a vector multiply instead of
30789 // expanding it into a sequence of scalar shifts.
30790 // For v32i8 cases, it might be quicker to split/extend to vXi16 shifts.
30791 if (Opc == ISD::SHL && !(VT == MVT::v32i8 && (Subtarget.hasXOP() ||
30792 Subtarget.canExtendTo512BW())))
30793 if (SDValue Scale = convertShiftLeftToScale(Amt, dl, Subtarget, DAG))
30794 return DAG.getNode(ISD::MUL, dl, VT, R, Scale);
30795
30796 // Constant ISD::SRL can be performed efficiently on vXi16 vectors as we
30797 // can replace with ISD::MULHU, creating scale factor from (NumEltBits - Amt).
30798 if (Opc == ISD::SRL && ConstantAmt &&
30799 (VT == MVT::v8i16 || (VT == MVT::v16i16 && Subtarget.hasInt256()))) {
30800 SDValue EltBits = DAG.getConstant(EltSizeInBits, dl, VT);
30801 SDValue RAmt = DAG.getNode(ISD::SUB, dl, VT, EltBits, Amt);
30802 if (SDValue Scale = convertShiftLeftToScale(RAmt, dl, Subtarget, DAG)) {
30803 SDValue Zero = DAG.getConstant(0, dl, VT);
30804 SDValue ZAmt = DAG.getSetCC(dl, VT, Amt, Zero, ISD::SETEQ);
30805 SDValue Res = DAG.getNode(ISD::MULHU, dl, VT, R, Scale);
30806 return DAG.getSelect(dl, VT, ZAmt, R, Res);
30807 }
30808 }
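  // Worked example: for v8i16 and Amt == 3 the scale is 1 << 13, and
  // mulhu(0xBEEF, 0x2000) == 0x17DD == 0xBEEF >> 3. Lanes with Amt == 0 are
  // selected back to R because their scale would be 1 << 16, which is not
  // representable in i16.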
30809
30810 // Constant ISD::SRA can be performed efficiently on vXi16 vectors as we
30811 // can replace with ISD::MULHS, creating scale factor from (NumEltBits - Amt).
30812 // TODO: Special case handling for shift by 0/1, really we can afford either
30813 // of these cases in pre-SSE41/XOP/AVX512 but not both.
30814 if (Opc == ISD::SRA && ConstantAmt &&
30815 (VT == MVT::v8i16 || (VT == MVT::v16i16 && Subtarget.hasInt256())) &&
30816 ((Subtarget.hasSSE41() && !Subtarget.hasXOP() &&
30817 !Subtarget.hasAVX512()) ||
30818 DAG.isKnownNeverZero(Amt))) {
30819 SDValue EltBits = DAG.getConstant(EltSizeInBits, dl, VT);
30820 SDValue RAmt = DAG.getNode(ISD::SUB, dl, VT, EltBits, Amt);
30821 if (SDValue Scale = convertShiftLeftToScale(RAmt, dl, Subtarget, DAG)) {
30822 SDValue Amt0 =
30823 DAG.getSetCC(dl, VT, Amt, DAG.getConstant(0, dl, VT), ISD::SETEQ);
30824 SDValue Amt1 =
30825 DAG.getSetCC(dl, VT, Amt, DAG.getConstant(1, dl, VT), ISD::SETEQ);
30826 SDValue Sra1 =
30827 getTargetVShiftByConstNode(X86ISD::VSRAI, dl, VT, R, 1, DAG);
30828 SDValue Res = DAG.getNode(ISD::MULHS, dl, VT, R, Scale);
30829 Res = DAG.getSelect(dl, VT, Amt0, R, Res);
30830 return DAG.getSelect(dl, VT, Amt1, Sra1, Res);
30831 }
30832 }
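  // Note on the two selects above: the scale for amount Amt is
  // 1 << (16 - Amt), so Amt == 0 would need 1 << 16 (unrepresentable in i16)
  // and Amt == 1 would need 0x8000, which MULHS treats as -32768 and so
  // computes with the wrong sign; those lanes are patched with R and
  // VSRAI(R, 1) respectively.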
30833
30834 // v4i32 Non Uniform Shifts.
30835 // If the shift amount is constant we can shift each lane using the SSE2
30836 // immediate shifts, else we need to zero-extend each lane to the lower i64
30837 // and shift using the SSE2 variable shifts.
30838 // The separate results can then be blended together.
30839 if (VT == MVT::v4i32) {
30840 SDValue Amt0, Amt1, Amt2, Amt3;
30841 if (ConstantAmt) {
30842 Amt0 = DAG.getVectorShuffle(VT, dl, Amt, DAG.getUNDEF(VT), {0, 0, 0, 0});
30843 Amt1 = DAG.getVectorShuffle(VT, dl, Amt, DAG.getUNDEF(VT), {1, 1, 1, 1});
30844 Amt2 = DAG.getVectorShuffle(VT, dl, Amt, DAG.getUNDEF(VT), {2, 2, 2, 2});
30845 Amt3 = DAG.getVectorShuffle(VT, dl, Amt, DAG.getUNDEF(VT), {3, 3, 3, 3});
30846 } else {
30847 // The SSE2 shifts use the lower i64 as the same shift amount for
30848 // all lanes and the upper i64 is ignored. On AVX we're better off
30849 // just zero-extending, but for SSE just duplicating the top 16-bits is
30850 // cheaper and has the same effect for out of range values.
30851 if (Subtarget.hasAVX()) {
30852 SDValue Z = DAG.getConstant(0, dl, VT);
30853 Amt0 = DAG.getVectorShuffle(VT, dl, Amt, Z, {0, 4, -1, -1});
30854 Amt1 = DAG.getVectorShuffle(VT, dl, Amt, Z, {1, 5, -1, -1});
30855 Amt2 = DAG.getVectorShuffle(VT, dl, Amt, Z, {2, 6, -1, -1});
30856 Amt3 = DAG.getVectorShuffle(VT, dl, Amt, Z, {3, 7, -1, -1});
30857 } else {
30858 SDValue Amt01 = DAG.getBitcast(MVT::v8i16, Amt);
30859 SDValue Amt23 = DAG.getVectorShuffle(MVT::v8i16, dl, Amt01, Amt01,
30860 {4, 5, 6, 7, -1, -1, -1, -1});
30861 SDValue Msk02 = getV4X86ShuffleImm8ForMask({0, 1, 1, 1}, dl, DAG);
30862 SDValue Msk13 = getV4X86ShuffleImm8ForMask({2, 3, 3, 3}, dl, DAG);
30863 Amt0 = DAG.getNode(X86ISD::PSHUFLW, dl, MVT::v8i16, Amt01, Msk02);
30864 Amt1 = DAG.getNode(X86ISD::PSHUFLW, dl, MVT::v8i16, Amt01, Msk13);
30865 Amt2 = DAG.getNode(X86ISD::PSHUFLW, dl, MVT::v8i16, Amt23, Msk02);
30866 Amt3 = DAG.getNode(X86ISD::PSHUFLW, dl, MVT::v8i16, Amt23, Msk13);
30867 }
30868 }
30869
30870 unsigned ShOpc = ConstantAmt ? Opc : X86OpcV;
30871 SDValue R0 = DAG.getNode(ShOpc, dl, VT, R, DAG.getBitcast(VT, Amt0));
30872 SDValue R1 = DAG.getNode(ShOpc, dl, VT, R, DAG.getBitcast(VT, Amt1));
30873 SDValue R2 = DAG.getNode(ShOpc, dl, VT, R, DAG.getBitcast(VT, Amt2));
30874 SDValue R3 = DAG.getNode(ShOpc, dl, VT, R, DAG.getBitcast(VT, Amt3));
30875
30876 // Merge the shifted lane results optimally with/without PBLENDW.
30877 // TODO - ideally shuffle combining would handle this.
30878 if (Subtarget.hasSSE41()) {
30879 SDValue R02 = DAG.getVectorShuffle(VT, dl, R0, R2, {0, -1, 6, -1});
30880 SDValue R13 = DAG.getVectorShuffle(VT, dl, R1, R3, {-1, 1, -1, 7});
30881 return DAG.getVectorShuffle(VT, dl, R02, R13, {0, 5, 2, 7});
30882 }
30883 SDValue R01 = DAG.getVectorShuffle(VT, dl, R0, R1, {0, -1, -1, 5});
30884 SDValue R23 = DAG.getVectorShuffle(VT, dl, R2, R3, {2, -1, -1, 7});
30885 return DAG.getVectorShuffle(VT, dl, R01, R23, {0, 3, 4, 7});
30886 }
30887
30888 // If we're shifting (per-lane) uniform vXi8 constants, we can use PSHUFB to
30889 // look up the pre-computed shift values.
30890 if ((VT == MVT::v16i8 && Subtarget.hasSSSE3()) ||
30891 (VT == MVT::v32i8 && Subtarget.hasInt256()) ||
30892 (VT == MVT::v64i8 && Subtarget.hasBWI())) {
30893 unsigned NumLanes = VT.getSizeInBits() / 128u;
30894 unsigned NumEltsPerLane = NumElts / NumLanes;
30895    SmallVector<APInt, 64> LUT;
30896    for (unsigned Lane = 0; Lane != NumLanes; ++Lane) {
30897 unsigned LoElt = Lane * NumEltsPerLane;
30898 APInt EltMask = APInt::getBitsSet(NumElts, LoElt, LoElt + NumEltsPerLane);
30899 KnownBits KnownLane = DAG.computeKnownBits(R, EltMask);
30900 if (!KnownLane.isConstant())
30901 break;
30902 const APInt &LaneSplat = KnownLane.getConstant();
30903 for (unsigned I = 0; I != 8; ++I) {
30904 if (Opc == ISD::SHL)
30905 LUT.push_back(LaneSplat.shl(I));
30906 else if (Opc == ISD::SRL)
30907 LUT.push_back(LaneSplat.lshr(I));
30908 else if (Opc == ISD::SRA)
30909 LUT.push_back(LaneSplat.ashr(I));
30910 }
30911 LUT.append(8, APInt::getZero(8));
30912 }
30913 if (LUT.size() == NumElts) {
30914 APInt Undefs = APInt::getSplat(NumElts, APInt(16, 0xFF00));
30915 SDValue Mask = getConstVector(LUT, Undefs, VT, DAG, dl);
30916 return DAG.getNode(X86ISD::PSHUFB, dl, VT, Mask, Amt);
30917 }
30918 }
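  // How the lookup works: within each 128-bit lane, bytes 0..7 of Mask hold
  // the lane's constant value shifted by 0..7 (bytes 8..15 stay undef since
  // in-range vXi8 shift amounts are below 8), and PSHUFB uses each byte of
  // Amt as an index into that table - note the table is the first operand.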
30919
30920 // It's worth extending once and using the vXi16/vXi32 shifts for smaller
30921 // types, but without AVX512 the extra overheads to get from vXi8 to vXi32
30922 // make the existing SSE solution better.
30923  // NOTE: We honor the preferred vector width before promoting to 512-bits.
30924 if ((Subtarget.hasInt256() && VT == MVT::v8i16) ||
30925 (Subtarget.canExtendTo512DQ() && VT == MVT::v16i16) ||
30926 (Subtarget.canExtendTo512DQ() && VT == MVT::v16i8) ||
30927 (Subtarget.canExtendTo512BW() && VT == MVT::v32i8) ||
30928 (Subtarget.hasBWI() && Subtarget.hasVLX() && VT == MVT::v16i8)) {
30929 assert((!Subtarget.hasBWI() || VT == MVT::v32i8 || VT == MVT::v16i8) &&
30930 "Unexpected vector type");
30931 MVT EvtSVT = Subtarget.hasBWI() ? MVT::i16 : MVT::i32;
30932 MVT ExtVT = MVT::getVectorVT(EvtSVT, NumElts);
30933 unsigned ExtOpc = Opc == ISD::SRA ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
30934 R = DAG.getNode(ExtOpc, dl, ExtVT, R);
30935 Amt = DAG.getNode(ISD::ZERO_EXTEND, dl, ExtVT, Amt);
30936 return DAG.getNode(ISD::TRUNCATE, dl, VT,
30937 DAG.getNode(Opc, dl, ExtVT, R, Amt));
30938 }
30939
30940 // Constant ISD::SRA/SRL can be performed efficiently on vXi8 vectors as we
30941 // extend to vXi16 to perform a MUL scale effectively as a MUL_LOHI.
30942 if (ConstantAmt && (Opc == ISD::SRA || Opc == ISD::SRL) &&
30943 (VT == MVT::v16i8 || (VT == MVT::v32i8 && Subtarget.hasInt256()) ||
30944 (VT == MVT::v64i8 && Subtarget.hasBWI())) &&
30945 !Subtarget.hasXOP()) {
30946 MVT VT16 = MVT::getVectorVT(MVT::i16, NumElts / 2);
30947 SDValue Cst8 = DAG.getTargetConstant(8, dl, MVT::i8);
30948
30949 // Extend constant shift amount to vXi16 (it doesn't matter if the type
30950 // isn't legal).
30951 MVT ExVT = MVT::getVectorVT(MVT::i16, NumElts);
30952 Amt = DAG.getZExtOrTrunc(Amt, dl, ExVT);
30953 Amt = DAG.getNode(ISD::SUB, dl, ExVT, DAG.getConstant(8, dl, ExVT), Amt);
30954 Amt = DAG.getNode(ISD::SHL, dl, ExVT, DAG.getConstant(1, dl, ExVT), Amt);
30955    assert(ISD::isBuildVectorOfConstantSDNodes(Amt.getNode()) &&
30956           "Constant build vector expected");
30957
30958 if (VT == MVT::v16i8 && Subtarget.hasInt256()) {
30959 bool IsSigned = Opc == ISD::SRA;
30960 R = DAG.getExtOrTrunc(IsSigned, R, dl, ExVT);
30961 R = DAG.getNode(ISD::MUL, dl, ExVT, R, Amt);
30962 R = DAG.getNode(X86ISD::VSRLI, dl, ExVT, R, Cst8);
30963 return DAG.getZExtOrTrunc(R, dl, VT);
30964 }
30965
30966 SmallVector<SDValue, 16> LoAmt, HiAmt;
30967 for (unsigned i = 0; i != NumElts; i += 16) {
30968 for (int j = 0; j != 8; ++j) {
30969 LoAmt.push_back(Amt.getOperand(i + j));
30970 HiAmt.push_back(Amt.getOperand(i + j + 8));
30971 }
30972 }
30973
30974 SDValue LoA = DAG.getBuildVector(VT16, dl, LoAmt);
30975 SDValue HiA = DAG.getBuildVector(VT16, dl, HiAmt);
30976
30977 SDValue LoR = DAG.getBitcast(VT16, getUnpackl(DAG, dl, VT, R, R));
30978 SDValue HiR = DAG.getBitcast(VT16, getUnpackh(DAG, dl, VT, R, R));
30979 LoR = DAG.getNode(X86OpcI, dl, VT16, LoR, Cst8);
30980 HiR = DAG.getNode(X86OpcI, dl, VT16, HiR, Cst8);
30981 LoR = DAG.getNode(ISD::MUL, dl, VT16, LoR, LoA);
30982 HiR = DAG.getNode(ISD::MUL, dl, VT16, HiR, HiA);
30983 LoR = DAG.getNode(X86ISD::VSRLI, dl, VT16, LoR, Cst8);
30984 HiR = DAG.getNode(X86ISD::VSRLI, dl, VT16, HiR, Cst8);
30985 return DAG.getNode(X86ISD::PACKUS, dl, VT, LoR, HiR);
30986 }
30987
30988 if (VT == MVT::v16i8 ||
30989 (VT == MVT::v32i8 && Subtarget.hasInt256() && !Subtarget.hasXOP()) ||
30990 (VT == MVT::v64i8 && Subtarget.hasBWI())) {
30991 MVT ExtVT = MVT::getVectorVT(MVT::i16, NumElts / 2);
30992
30993 auto SignBitSelect = [&](MVT SelVT, SDValue Sel, SDValue V0, SDValue V1) {
30994 if (VT.is512BitVector()) {
30995 // On AVX512BW targets we make use of the fact that VSELECT lowers
30996 // to a masked blend which selects bytes based just on the sign bit
30997 // extracted to a mask.
30998 MVT MaskVT = MVT::getVectorVT(MVT::i1, NumElts);
30999 V0 = DAG.getBitcast(VT, V0);
31000 V1 = DAG.getBitcast(VT, V1);
31001 Sel = DAG.getBitcast(VT, Sel);
31002 Sel = DAG.getSetCC(dl, MaskVT, DAG.getConstant(0, dl, VT), Sel,
31003 ISD::SETGT);
31004 return DAG.getBitcast(SelVT, DAG.getSelect(dl, VT, Sel, V0, V1));
31005 } else if (Subtarget.hasSSE41()) {
31006 // On SSE41 targets we can use PBLENDVB which selects bytes based just
31007 // on the sign bit.
31008 V0 = DAG.getBitcast(VT, V0);
31009 V1 = DAG.getBitcast(VT, V1);
31010 Sel = DAG.getBitcast(VT, Sel);
31011 return DAG.getBitcast(SelVT,
31012 DAG.getNode(X86ISD::BLENDV, dl, VT, Sel, V0, V1));
31013 }
31014 // On pre-SSE41 targets we test for the sign bit by comparing to
31015 // zero - a negative value will set all bits of the lanes to true
31016 // and VSELECT uses that in its OR(AND(V0,C),AND(V1,~C)) lowering.
31017 SDValue Z = DAG.getConstant(0, dl, SelVT);
31018 SDValue C = DAG.getNode(X86ISD::PCMPGT, dl, SelVT, Z, Sel);
31019 return DAG.getSelect(dl, SelVT, C, V0, V1);
31020 };
31021
31022 // Turn 'a' into a mask suitable for VSELECT: a = a << 5;
31023 // We can safely do this using i16 shifts as we're only interested in
31024 // the 3 lower bits of each byte.
31025 Amt = DAG.getBitcast(ExtVT, Amt);
31026 Amt = getTargetVShiftByConstNode(X86ISD::VSHLI, dl, ExtVT, Amt, 5, DAG);
31027 Amt = DAG.getBitcast(VT, Amt);
31028
31029 if (Opc == ISD::SHL || Opc == ISD::SRL) {
31030 // r = VSELECT(r, shift(r, 4), a);
31031 SDValue M = DAG.getNode(Opc, dl, VT, R, DAG.getConstant(4, dl, VT));
31032 R = SignBitSelect(VT, Amt, M, R);
31033
31034 // a += a
31035 Amt = DAG.getNode(ISD::ADD, dl, VT, Amt, Amt);
31036
31037 // r = VSELECT(r, shift(r, 2), a);
31038 M = DAG.getNode(Opc, dl, VT, R, DAG.getConstant(2, dl, VT));
31039 R = SignBitSelect(VT, Amt, M, R);
31040
31041 // a += a
31042 Amt = DAG.getNode(ISD::ADD, dl, VT, Amt, Amt);
31043
31044 // return VSELECT(r, shift(r, 1), a);
31045 M = DAG.getNode(Opc, dl, VT, R, DAG.getConstant(1, dl, VT));
31046 R = SignBitSelect(VT, Amt, M, R);
31047 return R;
31048 }
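    // Illustration of the ladder above: after Amt << 5 the shift amount's
    // bit 2 sits in each byte's sign bit, so the first blend applies the
    // shift-by-4 only where that bit is set; each 'a += a' then exposes bit 1
    // (shift-by-2) and bit 0 (shift-by-1), composing to 4*b2 + 2*b1 + b0.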
31049
31050 if (Opc == ISD::SRA) {
31051 // For SRA we need to unpack each byte to the higher byte of a i16 vector
31052 // so we can correctly sign extend. We don't care what happens to the
31053 // lower byte.
31054 SDValue ALo = getUnpackl(DAG, dl, VT, DAG.getUNDEF(VT), Amt);
31055 SDValue AHi = getUnpackh(DAG, dl, VT, DAG.getUNDEF(VT), Amt);
31056 SDValue RLo = getUnpackl(DAG, dl, VT, DAG.getUNDEF(VT), R);
31057 SDValue RHi = getUnpackh(DAG, dl, VT, DAG.getUNDEF(VT), R);
31058 ALo = DAG.getBitcast(ExtVT, ALo);
31059 AHi = DAG.getBitcast(ExtVT, AHi);
31060 RLo = DAG.getBitcast(ExtVT, RLo);
31061 RHi = DAG.getBitcast(ExtVT, RHi);
31062
31063 // r = VSELECT(r, shift(r, 4), a);
31064 SDValue MLo = getTargetVShiftByConstNode(X86OpcI, dl, ExtVT, RLo, 4, DAG);
31065 SDValue MHi = getTargetVShiftByConstNode(X86OpcI, dl, ExtVT, RHi, 4, DAG);
31066 RLo = SignBitSelect(ExtVT, ALo, MLo, RLo);
31067 RHi = SignBitSelect(ExtVT, AHi, MHi, RHi);
31068
31069 // a += a
31070 ALo = DAG.getNode(ISD::ADD, dl, ExtVT, ALo, ALo);
31071 AHi = DAG.getNode(ISD::ADD, dl, ExtVT, AHi, AHi);
31072
31073 // r = VSELECT(r, shift(r, 2), a);
31074 MLo = getTargetVShiftByConstNode(X86OpcI, dl, ExtVT, RLo, 2, DAG);
31075 MHi = getTargetVShiftByConstNode(X86OpcI, dl, ExtVT, RHi, 2, DAG);
31076 RLo = SignBitSelect(ExtVT, ALo, MLo, RLo);
31077 RHi = SignBitSelect(ExtVT, AHi, MHi, RHi);
31078
31079 // a += a
31080 ALo = DAG.getNode(ISD::ADD, dl, ExtVT, ALo, ALo);
31081 AHi = DAG.getNode(ISD::ADD, dl, ExtVT, AHi, AHi);
31082
31083 // r = VSELECT(r, shift(r, 1), a);
31084 MLo = getTargetVShiftByConstNode(X86OpcI, dl, ExtVT, RLo, 1, DAG);
31085 MHi = getTargetVShiftByConstNode(X86OpcI, dl, ExtVT, RHi, 1, DAG);
31086 RLo = SignBitSelect(ExtVT, ALo, MLo, RLo);
31087 RHi = SignBitSelect(ExtVT, AHi, MHi, RHi);
31088
31089 // Logical shift the result back to the lower byte, leaving a zero upper
31090 // byte meaning that we can safely pack with PACKUSWB.
31091 RLo = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, ExtVT, RLo, 8, DAG);
31092 RHi = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, ExtVT, RHi, 8, DAG);
31093 return DAG.getNode(X86ISD::PACKUS, dl, VT, RLo, RHi);
31094 }
31095 }
31096
31097 if (Subtarget.hasInt256() && !Subtarget.hasXOP() && VT == MVT::v16i16) {
31098 MVT ExtVT = MVT::v8i32;
31099 SDValue Z = DAG.getConstant(0, dl, VT);
31100 SDValue ALo = getUnpackl(DAG, dl, VT, Amt, Z);
31101 SDValue AHi = getUnpackh(DAG, dl, VT, Amt, Z);
31102 SDValue RLo = getUnpackl(DAG, dl, VT, Z, R);
31103 SDValue RHi = getUnpackh(DAG, dl, VT, Z, R);
31104 ALo = DAG.getBitcast(ExtVT, ALo);
31105 AHi = DAG.getBitcast(ExtVT, AHi);
31106 RLo = DAG.getBitcast(ExtVT, RLo);
31107 RHi = DAG.getBitcast(ExtVT, RHi);
31108 SDValue Lo = DAG.getNode(Opc, dl, ExtVT, RLo, ALo);
31109 SDValue Hi = DAG.getNode(Opc, dl, ExtVT, RHi, AHi);
31110 Lo = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, ExtVT, Lo, 16, DAG);
31111 Hi = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, ExtVT, Hi, 16, DAG);
31112 return DAG.getNode(X86ISD::PACKUS, dl, VT, Lo, Hi);
31113 }
31114
31115 if (VT == MVT::v8i16) {
31116 // If we have a constant shift amount, the non-SSE41 path is best as
31117    // avoiding bitcasts makes it easier to constant fold and reduce to PBLENDW.
31118    bool UseSSE41 = Subtarget.hasSSE41() &&
31119                    !ISD::isBuildVectorOfConstantSDNodes(Amt.getNode());
31120
31121 auto SignBitSelect = [&](SDValue Sel, SDValue V0, SDValue V1) {
31122 // On SSE41 targets we can use PBLENDVB which selects bytes based just on
31123 // the sign bit.
31124 if (UseSSE41) {
31125 MVT ExtVT = MVT::getVectorVT(MVT::i8, NumElts * 2);
31126 V0 = DAG.getBitcast(ExtVT, V0);
31127 V1 = DAG.getBitcast(ExtVT, V1);
31128 Sel = DAG.getBitcast(ExtVT, Sel);
31129 return DAG.getBitcast(
31130 VT, DAG.getNode(X86ISD::BLENDV, dl, ExtVT, Sel, V0, V1));
31131 }
31132 // On pre-SSE41 targets we splat the sign bit - a negative value will
31133 // set all bits of the lanes to true and VSELECT uses that in
31134 // its OR(AND(V0,C),AND(V1,~C)) lowering.
31135 SDValue C =
31136 getTargetVShiftByConstNode(X86ISD::VSRAI, dl, VT, Sel, 15, DAG);
31137 return DAG.getSelect(dl, VT, C, V0, V1);
31138 };
31139
31140 // Turn 'a' into a mask suitable for VSELECT: a = a << 12;
31141 if (UseSSE41) {
31142 // On SSE41 targets we need to replicate the shift mask in both
31143 // bytes for PBLENDVB.
31144 Amt = DAG.getNode(
31145 ISD::OR, dl, VT,
31146 getTargetVShiftByConstNode(X86ISD::VSHLI, dl, VT, Amt, 4, DAG),
31147 getTargetVShiftByConstNode(X86ISD::VSHLI, dl, VT, Amt, 12, DAG));
31148 } else {
31149 Amt = getTargetVShiftByConstNode(X86ISD::VSHLI, dl, VT, Amt, 12, DAG);
31150 }
31151
31152 // r = VSELECT(r, shift(r, 8), a);
31153 SDValue M = getTargetVShiftByConstNode(X86OpcI, dl, VT, R, 8, DAG);
31154 R = SignBitSelect(Amt, M, R);
31155
31156 // a += a
31157 Amt = DAG.getNode(ISD::ADD, dl, VT, Amt, Amt);
31158
31159 // r = VSELECT(r, shift(r, 4), a);
31160 M = getTargetVShiftByConstNode(X86OpcI, dl, VT, R, 4, DAG);
31161 R = SignBitSelect(Amt, M, R);
31162
31163 // a += a
31164 Amt = DAG.getNode(ISD::ADD, dl, VT, Amt, Amt);
31165
31166 // r = VSELECT(r, shift(r, 2), a);
31167 M = getTargetVShiftByConstNode(X86OpcI, dl, VT, R, 2, DAG);
31168 R = SignBitSelect(Amt, M, R);
31169
31170 // a += a
31171 Amt = DAG.getNode(ISD::ADD, dl, VT, Amt, Amt);
31172
31173 // return VSELECT(r, shift(r, 1), a);
31174 M = getTargetVShiftByConstNode(X86OpcI, dl, VT, R, 1, DAG);
31175 R = SignBitSelect(Amt, M, R);
31176 return R;
31177 }
31178
31179 // Decompose 256-bit shifts into 128-bit shifts.
31180 if (VT.is256BitVector())
31181 return splitVectorIntBinary(Op, DAG, dl);
31182
31183 if (VT == MVT::v32i16 || VT == MVT::v64i8)
31184 return splitVectorIntBinary(Op, DAG, dl);
31185
31186 return SDValue();
31187}
31188
31189static SDValue LowerFunnelShift(SDValue Op, const X86Subtarget &Subtarget,
31190                                SelectionDAG &DAG) {
31191 MVT VT = Op.getSimpleValueType();
31192 assert((Op.getOpcode() == ISD::FSHL || Op.getOpcode() == ISD::FSHR) &&
31193 "Unexpected funnel shift opcode!");
31194
31195 SDLoc DL(Op);
31196 SDValue Op0 = Op.getOperand(0);
31197 SDValue Op1 = Op.getOperand(1);
31198 SDValue Amt = Op.getOperand(2);
31199 unsigned EltSizeInBits = VT.getScalarSizeInBits();
31200 bool IsFSHR = Op.getOpcode() == ISD::FSHR;
31201
31202 if (VT.isVector()) {
31203 APInt APIntShiftAmt;
31204 bool IsCstSplat = X86::isConstantSplat(Amt, APIntShiftAmt);
31205 unsigned NumElts = VT.getVectorNumElements();
31206
31207 if (Subtarget.hasVBMI2() && EltSizeInBits > 8) {
31208 if (IsFSHR)
31209 std::swap(Op0, Op1);
31210
31211 if (IsCstSplat) {
31212 uint64_t ShiftAmt = APIntShiftAmt.urem(EltSizeInBits);
31213 SDValue Imm = DAG.getTargetConstant(ShiftAmt, DL, MVT::i8);
31214 return getAVX512Node(IsFSHR ? X86ISD::VSHRD : X86ISD::VSHLD, DL, VT,
31215 {Op0, Op1, Imm}, DAG, Subtarget);
31216 }
31217 return getAVX512Node(IsFSHR ? X86ISD::VSHRDV : X86ISD::VSHLDV, DL, VT,
31218 {Op0, Op1, Amt}, DAG, Subtarget);
31219 }
31220 assert((VT == MVT::v16i8 || VT == MVT::v32i8 || VT == MVT::v64i8 ||
31221 VT == MVT::v8i16 || VT == MVT::v16i16 || VT == MVT::v32i16 ||
31222 VT == MVT::v4i32 || VT == MVT::v8i32 || VT == MVT::v16i32) &&
31223 "Unexpected funnel shift type!");
31224
31225    // fshl(x,y,z) -> (unpack(y,x) << (z & (bw-1))) >> bw.
31226    // fshr(x,y,z) -> unpack(y,x) >> (z & (bw-1)).
31227 if (IsCstSplat) {
31228 // TODO: Can't use generic expansion as UNDEF amt elements can be
31229 // converted to other values when folded to shift amounts, losing the
31230 // splat.
31231 uint64_t ShiftAmt = APIntShiftAmt.urem(EltSizeInBits);
31232 uint64_t ShXAmt = IsFSHR ? (EltSizeInBits - ShiftAmt) : ShiftAmt;
31233 uint64_t ShYAmt = IsFSHR ? ShiftAmt : (EltSizeInBits - ShiftAmt);
31234 assert((ShXAmt + ShYAmt) == EltSizeInBits && "Illegal funnel shift");
31235 MVT WideVT = MVT::getVectorVT(MVT::i16, NumElts / 2);
31236
31237 if (EltSizeInBits == 8 &&
31238 (Subtarget.hasXOP() ||
31239 (useVPTERNLOG(Subtarget, VT) &&
31240 supportedVectorShiftWithImm(WideVT, Subtarget, ISD::SHL)))) {
31241 // For vXi8 cases on Subtargets that can perform VPCMOV/VPTERNLOG
31242 // bit-select - lower using vXi16 shifts and then perform the bitmask at
31243 // the original vector width to handle cases where we split.
31244 APInt MaskX = APInt::getHighBitsSet(8, 8 - ShXAmt);
31245 APInt MaskY = APInt::getLowBitsSet(8, 8 - ShYAmt);
31246 SDValue ShX =
31247 DAG.getNode(ISD::SHL, DL, WideVT, DAG.getBitcast(WideVT, Op0),
31248 DAG.getShiftAmountConstant(ShXAmt, WideVT, DL));
31249 SDValue ShY =
31250 DAG.getNode(ISD::SRL, DL, WideVT, DAG.getBitcast(WideVT, Op1),
31251 DAG.getShiftAmountConstant(ShYAmt, WideVT, DL));
31252 ShX = DAG.getNode(ISD::AND, DL, VT, DAG.getBitcast(VT, ShX),
31253 DAG.getConstant(MaskX, DL, VT));
31254 ShY = DAG.getNode(ISD::AND, DL, VT, DAG.getBitcast(VT, ShY),
31255 DAG.getConstant(MaskY, DL, VT));
31256 return DAG.getNode(ISD::OR, DL, VT, ShX, ShY);
31257 }
31258
31259 SDValue ShX = DAG.getNode(ISD::SHL, DL, VT, Op0,
31260 DAG.getShiftAmountConstant(ShXAmt, VT, DL));
31261 SDValue ShY = DAG.getNode(ISD::SRL, DL, VT, Op1,
31262 DAG.getShiftAmountConstant(ShYAmt, VT, DL));
31263 return DAG.getNode(ISD::OR, DL, VT, ShX, ShY);
31264 }
31265
31266 SDValue AmtMask = DAG.getConstant(EltSizeInBits - 1, DL, VT);
31267 SDValue AmtMod = DAG.getNode(ISD::AND, DL, VT, Amt, AmtMask);
31268 bool IsCst = ISD::isBuildVectorOfConstantSDNodes(AmtMod.getNode());
31269
31270 // Constant vXi16 funnel shifts can be efficiently handled by default.
31271 if (IsCst && EltSizeInBits == 16)
31272 return SDValue();
31273
31274 unsigned ShiftOpc = IsFSHR ? ISD::SRL : ISD::SHL;
31275 MVT ExtSVT = MVT::getIntegerVT(2 * EltSizeInBits);
31276 MVT ExtVT = MVT::getVectorVT(ExtSVT, NumElts / 2);
31277
31278 // Split 256-bit integers on XOP/pre-AVX2 targets.
31279 // Split 512-bit integers on non 512-bit BWI targets.
31280 if ((VT.is256BitVector() && ((Subtarget.hasXOP() && EltSizeInBits < 16) ||
31281 !Subtarget.hasAVX2())) ||
31282 (VT.is512BitVector() && !Subtarget.useBWIRegs() &&
31283 EltSizeInBits < 32)) {
31284 // Pre-mask the amount modulo using the wider vector.
31285 Op = DAG.getNode(Op.getOpcode(), DL, VT, Op0, Op1, AmtMod);
31286 return splitVectorOp(Op, DAG, DL);
31287 }
31288
31289 // Attempt to fold scalar shift as unpack(y,x) << zext(splat(z))
31290 if (supportedVectorShiftWithBaseAmnt(ExtVT, Subtarget, ShiftOpc)) {
31291 int ScalarAmtIdx = -1;
31292 if (SDValue ScalarAmt = DAG.getSplatSourceVector(AmtMod, ScalarAmtIdx)) {
31293 // Uniform vXi16 funnel shifts can be efficiently handled by default.
31294 if (EltSizeInBits == 16)
31295 return SDValue();
31296
31297 SDValue Lo = DAG.getBitcast(ExtVT, getUnpackl(DAG, DL, VT, Op1, Op0));
31298 SDValue Hi = DAG.getBitcast(ExtVT, getUnpackh(DAG, DL, VT, Op1, Op0));
31299 Lo = getTargetVShiftNode(ShiftOpc, DL, ExtVT, Lo, ScalarAmt,
31300 ScalarAmtIdx, Subtarget, DAG);
31301 Hi = getTargetVShiftNode(ShiftOpc, DL, ExtVT, Hi, ScalarAmt,
31302 ScalarAmtIdx, Subtarget, DAG);
31303 return getPack(DAG, Subtarget, DL, VT, Lo, Hi, !IsFSHR);
31304 }
31305 }
31306
31307 MVT WideSVT = MVT::getIntegerVT(
31308 std::min<unsigned>(EltSizeInBits * 2, Subtarget.hasBWI() ? 16 : 32));
31309 MVT WideVT = MVT::getVectorVT(WideSVT, NumElts);
31310
31311 // If per-element shifts are legal, fallback to generic expansion.
31312 if (supportedVectorVarShift(VT, Subtarget, ShiftOpc) || Subtarget.hasXOP())
31313 return SDValue();
31314
31315 // Attempt to fold as:
31316 // fshl(x,y,z) -> (((aext(x) << bw) | zext(y)) << (z & (bw-1))) >> bw.
31317 // fshr(x,y,z) -> (((aext(x) << bw) | zext(y)) >> (z & (bw-1))).
31318 if (supportedVectorVarShift(WideVT, Subtarget, ShiftOpc) &&
31319 supportedVectorShiftWithImm(WideVT, Subtarget, ShiftOpc)) {
31320 Op0 = DAG.getNode(ISD::ANY_EXTEND, DL, WideVT, Op0);
31321 Op1 = DAG.getNode(ISD::ZERO_EXTEND, DL, WideVT, Op1);
31322 AmtMod = DAG.getNode(ISD::ZERO_EXTEND, DL, WideVT, AmtMod);
31323 Op0 = getTargetVShiftByConstNode(X86ISD::VSHLI, DL, WideVT, Op0,
31324 EltSizeInBits, DAG);
31325 SDValue Res = DAG.getNode(ISD::OR, DL, WideVT, Op0, Op1);
31326 Res = DAG.getNode(ShiftOpc, DL, WideVT, Res, AmtMod);
31327 if (!IsFSHR)
31328 Res = getTargetVShiftByConstNode(X86ISD::VSRLI, DL, WideVT, Res,
31329 EltSizeInBits, DAG);
31330 return DAG.getNode(ISD::TRUNCATE, DL, VT, Res);
31331 }
31332
31333 // Attempt to fold per-element (ExtVT) shift as unpack(y,x) << zext(z)
31334 if (((IsCst || !Subtarget.hasAVX512()) && !IsFSHR && EltSizeInBits <= 16) ||
31335 supportedVectorVarShift(ExtVT, Subtarget, ShiftOpc)) {
31336 SDValue Z = DAG.getConstant(0, DL, VT);
31337 SDValue RLo = DAG.getBitcast(ExtVT, getUnpackl(DAG, DL, VT, Op1, Op0));
31338 SDValue RHi = DAG.getBitcast(ExtVT, getUnpackh(DAG, DL, VT, Op1, Op0));
31339 SDValue ALo = DAG.getBitcast(ExtVT, getUnpackl(DAG, DL, VT, AmtMod, Z));
31340 SDValue AHi = DAG.getBitcast(ExtVT, getUnpackh(DAG, DL, VT, AmtMod, Z));
31341 SDValue Lo = DAG.getNode(ShiftOpc, DL, ExtVT, RLo, ALo);
31342 SDValue Hi = DAG.getNode(ShiftOpc, DL, ExtVT, RHi, AHi);
31343 return getPack(DAG, Subtarget, DL, VT, Lo, Hi, !IsFSHR);
31344 }
31345
31346 // Fallback to generic expansion.
31347 return SDValue();
31348 }
31349 assert(
31350 (VT == MVT::i8 || VT == MVT::i16 || VT == MVT::i32 || VT == MVT::i64) &&
31351 "Unexpected funnel shift type!");
31352
31353 // Expand slow SHLD/SHRD cases if we are not optimizing for size.
31354 bool OptForSize = DAG.shouldOptForSize();
31355 bool ExpandFunnel = !OptForSize && Subtarget.isSHLDSlow();
31356
31357 // fshl(x,y,z) -> (((aext(x) << bw) | zext(y)) << (z & (bw-1))) >> bw.
31358 // fshr(x,y,z) -> (((aext(x) << bw) | zext(y)) >> (z & (bw-1))).
31359 if ((VT == MVT::i8 || (ExpandFunnel && VT == MVT::i16)) &&
31360 !isa<ConstantSDNode>(Amt)) {
31361 SDValue Mask = DAG.getConstant(EltSizeInBits - 1, DL, Amt.getValueType());
31362 SDValue HiShift = DAG.getConstant(EltSizeInBits, DL, Amt.getValueType());
31363 Op0 = DAG.getAnyExtOrTrunc(Op0, DL, MVT::i32);
31364 Op1 = DAG.getZExtOrTrunc(Op1, DL, MVT::i32);
31365 Amt = DAG.getNode(ISD::AND, DL, Amt.getValueType(), Amt, Mask);
31366 SDValue Res = DAG.getNode(ISD::SHL, DL, MVT::i32, Op0, HiShift);
31367 Res = DAG.getNode(ISD::OR, DL, MVT::i32, Res, Op1);
31368 if (IsFSHR) {
31369 Res = DAG.getNode(ISD::SRL, DL, MVT::i32, Res, Amt);
31370 } else {
31371 Res = DAG.getNode(ISD::SHL, DL, MVT::i32, Res, Amt);
31372 Res = DAG.getNode(ISD::SRL, DL, MVT::i32, Res, HiShift);
31373 }
31374 return DAG.getZExtOrTrunc(Res, DL, VT);
31375 }
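  // Worked i8 example: fshl(0xAB, 0xCD, 4) concatenates to 0xABCD, shifts
  // left by 4 to 0xABCD0, shifts right by 8 to 0xABC, and truncates to 0xBC,
  // i.e. (x << 4) | (y >> 4); FSHR instead shifts the concatenation right by
  // the amount and truncates directly.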
31376
31377 if (VT == MVT::i8 || ExpandFunnel)
31378 return SDValue();
31379
31380 // i16 needs to modulo the shift amount, but i32/i64 have implicit modulo.
31381 if (VT == MVT::i16) {
31382 Amt = DAG.getNode(ISD::AND, DL, Amt.getValueType(), Amt,
31383 DAG.getConstant(15, DL, Amt.getValueType()));
31384 unsigned FSHOp = (IsFSHR ? X86ISD::FSHR : X86ISD::FSHL);
31385 return DAG.getNode(FSHOp, DL, VT, Op0, Op1, Amt);
31386 }
31387
31388 return Op;
31389}
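// Worked example for the scalar path above (illustrative): on a fast-SHLD
// target, a variable fshl i16 masks the amount with 15 and selects to
// X86ISD::FSHL, while a variable fshl i8 is widened to i32 as
//   trunc((((aext(x) << 8) | zext(y)) << (z & 7)) >> 8)
// matching the fshl expansion comment above.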
31390
31391static SDValue LowerRotate(SDValue Op, const X86Subtarget &Subtarget,
31392 SelectionDAG &DAG) {
31393 MVT VT = Op.getSimpleValueType();
31394 assert(VT.isVector() && "Custom lowering only for vector rotates!");
31395
31396 SDLoc DL(Op);
31397 SDValue R = Op.getOperand(0);
31398 SDValue Amt = Op.getOperand(1);
31399 unsigned Opcode = Op.getOpcode();
31400 unsigned EltSizeInBits = VT.getScalarSizeInBits();
31401 int NumElts = VT.getVectorNumElements();
31402 bool IsROTL = Opcode == ISD::ROTL;
31403
31404 // Check for constant splat rotation amount.
31405 APInt CstSplatValue;
31406 bool IsCstSplat = X86::isConstantSplat(Amt, CstSplatValue);
31407
31408 // Check for splat rotate by zero.
31409 if (IsCstSplat && CstSplatValue.urem(EltSizeInBits) == 0)
31410 return R;
31411
31412 // AVX512 implicitly uses modulo rotation amounts.
31413 if ((Subtarget.hasVLX() || Subtarget.hasAVX512()) && 32 <= EltSizeInBits) {
31414 // Attempt to rotate by immediate.
31415 if (IsCstSplat) {
31416 unsigned RotOpc = IsROTL ? X86ISD::VROTLI : X86ISD::VROTRI;
31417 uint64_t RotAmt = CstSplatValue.urem(EltSizeInBits);
31418 return DAG.getNode(RotOpc, DL, VT, R,
31419 DAG.getTargetConstant(RotAmt, DL, MVT::i8));
31420 }
31421
31422 // Else, fall-back on VPROLV/VPRORV.
31423 return Op;
31424 }
31425
31426 // AVX512 VBMI2 vXi16 - lower to funnel shifts.
31427 if (Subtarget.hasVBMI2() && 16 == EltSizeInBits) {
31428 unsigned FunnelOpc = IsROTL ? ISD::FSHL : ISD::FSHR;
31429 return DAG.getNode(FunnelOpc, DL, VT, R, R, Amt);
31430 }
31431
31432 SDValue Z = DAG.getConstant(0, DL, VT);
31433
31434 if (!IsROTL) {
31435 // If the ISD::ROTR amount is constant, we're always better converting to
31436 // ISD::ROTL.
31437 if (SDValue NegAmt = DAG.FoldConstantArithmetic(ISD::SUB, DL, VT, {Z, Amt}))
31438 return DAG.getNode(ISD::ROTL, DL, VT, R, NegAmt);
31439
31440 // XOP targets always prefer ISD::ROTL.
31441 if (Subtarget.hasXOP())
31442 return DAG.getNode(ISD::ROTL, DL, VT, R,
31443 DAG.getNode(ISD::SUB, DL, VT, Z, Amt));
31444 }
31445
31446 // Attempt to use GFNI gf2p8affine to rotate vXi8 by a uniform constant.
31447 if (IsCstSplat && Subtarget.hasGFNI() && VT.getScalarType() == MVT::i8 &&
31448 DAG.getTargetLoweringInfo().isTypeLegal(VT)) {
31449 uint64_t RotAmt = CstSplatValue.urem(EltSizeInBits);
31450 SDValue Mask = getGFNICtrlMask(Opcode, DAG, DL, VT, RotAmt);
31451 return DAG.getNode(X86ISD::GF2P8AFFINEQB, DL, VT, R, Mask,
31452 DAG.getTargetConstant(0, DL, MVT::i8));
31453 }
31454
31455 // Split 256-bit integers on XOP/pre-AVX2 targets.
31456 if (VT.is256BitVector() && (Subtarget.hasXOP() || !Subtarget.hasAVX2()))
31457 return splitVectorIntBinary(Op, DAG, DL);
31458
31459 // XOP has 128-bit vector variable + immediate rotates.
31460 // +ve/-ve Amt = rotate left/right - just need to handle ISD::ROTL.
31461 // XOP implicitly uses modulo rotation amounts.
31462 if (Subtarget.hasXOP()) {
31463 assert(IsROTL && "Only ROTL expected");
31464 assert(VT.is128BitVector() && "Only rotate 128-bit vectors!");
31465
31466 // Attempt to rotate by immediate.
31467 if (IsCstSplat) {
31468 uint64_t RotAmt = CstSplatValue.urem(EltSizeInBits);
31469 return DAG.getNode(X86ISD::VROTLI, DL, VT, R,
31470 DAG.getTargetConstant(RotAmt, DL, MVT::i8));
31471 }
31472
31473 // Use general rotate by variable (per-element).
31474 return Op;
31475 }
31476
31477 // Rotate by a uniform constant - expand back to shifts.
31478 // TODO: Can't use generic expansion as UNDEF amt elements can be converted
31479 // to other values when folded to shift amounts, losing the splat.
31480 if (IsCstSplat) {
31481 uint64_t RotAmt = CstSplatValue.urem(EltSizeInBits);
31482 uint64_t ShlAmt = IsROTL ? RotAmt : (EltSizeInBits - RotAmt);
31483 uint64_t SrlAmt = IsROTL ? (EltSizeInBits - RotAmt) : RotAmt;
31484 SDValue Shl = DAG.getNode(ISD::SHL, DL, VT, R,
31485 DAG.getShiftAmountConstant(ShlAmt, VT, DL));
31486 SDValue Srl = DAG.getNode(ISD::SRL, DL, VT, R,
31487 DAG.getShiftAmountConstant(SrlAmt, VT, DL));
31488 return DAG.getNode(ISD::OR, DL, VT, Shl, Srl);
31489 }
31490
31491 // Split 512-bit integers on non 512-bit BWI targets.
31492 if (VT.is512BitVector() && !Subtarget.useBWIRegs())
31493 return splitVectorIntBinary(Op, DAG, DL);
31494
31495 assert(
31496 (VT == MVT::v4i32 || VT == MVT::v8i16 || VT == MVT::v16i8 ||
31497 ((VT == MVT::v8i32 || VT == MVT::v16i16 || VT == MVT::v32i8) &&
31498 Subtarget.hasAVX2()) ||
31499 ((VT == MVT::v32i16 || VT == MVT::v64i8) && Subtarget.useBWIRegs())) &&
31500 "Only vXi32/vXi16/vXi8 vector rotates supported");
31501
31502 MVT ExtSVT = MVT::getIntegerVT(2 * EltSizeInBits);
31503 MVT ExtVT = MVT::getVectorVT(ExtSVT, NumElts / 2);
31504
31505 SDValue AmtMask = DAG.getConstant(EltSizeInBits - 1, DL, VT);
31506 SDValue AmtMod = DAG.getNode(ISD::AND, DL, VT, Amt, AmtMask);
31507
31508 // Attempt to fold as unpack(x,x) << zext(splat(y)):
31509 // rotl(x,y) -> (unpack(x,x) << (y & (bw-1))) >> bw.
31510 // rotr(x,y) -> (unpack(x,x) >> (y & (bw-1))).
31511 if (EltSizeInBits == 8 || EltSizeInBits == 16 || EltSizeInBits == 32) {
31512 int BaseRotAmtIdx = -1;
31513 if (SDValue BaseRotAmt = DAG.getSplatSourceVector(AmtMod, BaseRotAmtIdx)) {
31514 if (EltSizeInBits == 16 && Subtarget.hasSSE41()) {
31515 unsigned FunnelOpc = IsROTL ? ISD::FSHL : ISD::FSHR;
31516 return DAG.getNode(FunnelOpc, DL, VT, R, R, Amt);
31517 }
31518 unsigned ShiftX86Opc = IsROTL ? X86ISD::VSHLI : X86ISD::VSRLI;
31519 SDValue Lo = DAG.getBitcast(ExtVT, getUnpackl(DAG, DL, VT, R, R));
31520 SDValue Hi = DAG.getBitcast(ExtVT, getUnpackh(DAG, DL, VT, R, R));
31521 Lo = getTargetVShiftNode(ShiftX86Opc, DL, ExtVT, Lo, BaseRotAmt,
31522 BaseRotAmtIdx, Subtarget, DAG);
31523 Hi = getTargetVShiftNode(ShiftX86Opc, DL, ExtVT, Hi, BaseRotAmt,
31524 BaseRotAmtIdx, Subtarget, DAG);
31525 return getPack(DAG, Subtarget, DL, VT, Lo, Hi, IsROTL);
31526 }
31527 }
31528
31529 bool ConstantAmt = ISD::isBuildVectorOfConstantSDNodes(Amt.getNode());
31530 unsigned ShiftOpc = IsROTL ? ISD::SHL : ISD::SRL;
31531
31532 // Attempt to fold as unpack(x,x) << zext(y):
31533 // rotl(x,y) -> (unpack(x,x) << (y & (bw-1))) >> bw.
31534 // rotr(x,y) -> (unpack(x,x) >> (y & (bw-1))).
31535 // Const vXi16/vXi32 are excluded in favor of MUL-based lowering.
31536 if (!(ConstantAmt && EltSizeInBits != 8) &&
31537 !supportedVectorVarShift(VT, Subtarget, ShiftOpc) &&
31538 (ConstantAmt || supportedVectorVarShift(ExtVT, Subtarget, ShiftOpc))) {
31539 SDValue RLo = DAG.getBitcast(ExtVT, getUnpackl(DAG, DL, VT, R, R));
31540 SDValue RHi = DAG.getBitcast(ExtVT, getUnpackh(DAG, DL, VT, R, R));
31541 SDValue ALo = DAG.getBitcast(ExtVT, getUnpackl(DAG, DL, VT, AmtMod, Z));
31542 SDValue AHi = DAG.getBitcast(ExtVT, getUnpackh(DAG, DL, VT, AmtMod, Z));
31543 SDValue Lo = DAG.getNode(ShiftOpc, DL, ExtVT, RLo, ALo);
31544 SDValue Hi = DAG.getNode(ShiftOpc, DL, ExtVT, RHi, AHi);
31545 return getPack(DAG, Subtarget, DL, VT, Lo, Hi, IsROTL);
31546 }
31547
31548 // v16i8/v32i8/v64i8: Split rotation into rot4/rot2/rot1 stages and select by
31549 // the amount bit.
31550 // TODO: We're doing nothing here that we couldn't do for funnel shifts.
31551 if (EltSizeInBits == 8) {
31552 MVT WideVT =
31553 MVT::getVectorVT(Subtarget.hasBWI() ? MVT::i16 : MVT::i32, NumElts);
31554
31555 // Attempt to fold as:
31556 // rotl(x,y) -> (((aext(x) << bw) | zext(x)) << (y & (bw-1))) >> bw.
31557 // rotr(x,y) -> (((aext(x) << bw) | zext(x)) >> (y & (bw-1))).
31558 if (supportedVectorVarShift(WideVT, Subtarget, ShiftOpc) &&
31559 supportedVectorShiftWithImm(WideVT, Subtarget, ShiftOpc)) {
31560 // If we're rotating by constant, just use default promotion.
31561 if (ConstantAmt)
31562 return SDValue();
31563 // See if we can perform this by widening to vXi16 or vXi32.
31564 R = DAG.getNode(ISD::ZERO_EXTEND, DL, WideVT, R);
31565 R = DAG.getNode(
31566 ISD::OR, DL, WideVT, R,
31567 getTargetVShiftByConstNode(X86ISD::VSHLI, DL, WideVT, R, 8, DAG));
31568 Amt = DAG.getNode(ISD::ZERO_EXTEND, DL, WideVT, AmtMod);
31569 R = DAG.getNode(ShiftOpc, DL, WideVT, R, Amt);
31570 if (IsROTL)
31571 R = getTargetVShiftByConstNode(X86ISD::VSRLI, DL, WideVT, R, 8, DAG);
31572 return DAG.getNode(ISD::TRUNCATE, DL, VT, R);
31573 }
31574
31575 // We don't need ModuloAmt here as we just peek at individual bits.
31576 auto SignBitSelect = [&](MVT SelVT, SDValue Sel, SDValue V0, SDValue V1) {
31577 if (Subtarget.hasSSE41()) {
31578 // On SSE41 targets we can use PBLENDVB which selects bytes based just
31579 // on the sign bit.
31580 V0 = DAG.getBitcast(VT, V0);
31581 V1 = DAG.getBitcast(VT, V1);
31582 Sel = DAG.getBitcast(VT, Sel);
31583 return DAG.getBitcast(SelVT,
31584 DAG.getNode(X86ISD::BLENDV, DL, VT, Sel, V0, V1));
31585 }
31586 // On pre-SSE41 targets we test for the sign bit by comparing to
31587 // zero - a negative value will set all bits of the lanes to true
31588 // and VSELECT uses that in its OR(AND(V0,C),AND(V1,~C)) lowering.
31589 SDValue Z = DAG.getConstant(0, DL, SelVT);
31590 SDValue C = DAG.getNode(X86ISD::PCMPGT, DL, SelVT, Z, Sel);
31591 return DAG.getSelect(DL, SelVT, C, V0, V1);
31592 };
31593
31594 // ISD::ROTR is currently only profitable on AVX512 targets with VPTERNLOG.
31595 if (!IsROTL && !useVPTERNLOG(Subtarget, VT)) {
31596 Amt = DAG.getNode(ISD::SUB, DL, VT, Z, Amt);
31597 IsROTL = true;
31598 }
31599
31600 unsigned ShiftLHS = IsROTL ? ISD::SHL : ISD::SRL;
31601 unsigned ShiftRHS = IsROTL ? ISD::SRL : ISD::SHL;
31602
31603 // Turn 'a' into a mask suitable for VSELECT: a = a << 5;
31604 // We can safely do this using i16 shifts as we're only interested in
31605 // the 3 lower bits of each byte.
31606 Amt = DAG.getBitcast(ExtVT, Amt);
31607 Amt = DAG.getNode(ISD::SHL, DL, ExtVT, Amt, DAG.getConstant(5, DL, ExtVT));
31608 Amt = DAG.getBitcast(VT, Amt);
31609
31610 // r = VSELECT(r, rot(r, 4), a);
31611 SDValue M;
31612 M = DAG.getNode(
31613 ISD::OR, DL, VT,
31614 DAG.getNode(ShiftLHS, DL, VT, R, DAG.getConstant(4, DL, VT)),
31615 DAG.getNode(ShiftRHS, DL, VT, R, DAG.getConstant(4, DL, VT)));
31616 R = SignBitSelect(VT, Amt, M, R);
31617
31618 // a += a
31619 Amt = DAG.getNode(ISD::ADD, DL, VT, Amt, Amt);
31620
31621 // r = VSELECT(r, rot(r, 2), a);
31622 M = DAG.getNode(
31623 ISD::OR, DL, VT,
31624 DAG.getNode(ShiftLHS, DL, VT, R, DAG.getConstant(2, DL, VT)),
31625 DAG.getNode(ShiftRHS, DL, VT, R, DAG.getConstant(6, DL, VT)));
31626 R = SignBitSelect(VT, Amt, M, R);
31627
31628 // a += a
31629 Amt = DAG.getNode(ISD::ADD, DL, VT, Amt, Amt);
31630
31631 // return VSELECT(r, rot(r, 1), a);
31632 M = DAG.getNode(
31633 ISD::OR, DL, VT,
31634 DAG.getNode(ShiftLHS, DL, VT, R, DAG.getConstant(1, DL, VT)),
31635 DAG.getNode(ShiftRHS, DL, VT, R, DAG.getConstant(7, DL, VT)));
31636 return SignBitSelect(VT, Amt, M, R);
31637 }
31638
31639 bool IsSplatAmt = DAG.isSplatValue(Amt);
31640 bool LegalVarShifts = supportedVectorVarShift(VT, Subtarget, ISD::SHL) &&
31641 supportedVectorVarShift(VT, Subtarget, ISD::SRL);
31642
31643 // Fallback for splats + all supported variable shifts.
31644 // Fallback for non-constants AVX2 vXi16 as well.
31645 if (IsSplatAmt || LegalVarShifts || (Subtarget.hasAVX2() && !ConstantAmt)) {
31646 Amt = DAG.getNode(ISD::AND, DL, VT, Amt, AmtMask);
31647 SDValue AmtR = DAG.getConstant(EltSizeInBits, DL, VT);
31648 AmtR = DAG.getNode(ISD::SUB, DL, VT, AmtR, Amt);
31649 SDValue SHL = DAG.getNode(IsROTL ? ISD::SHL : ISD::SRL, DL, VT, R, Amt);
31650 SDValue SRL = DAG.getNode(IsROTL ? ISD::SRL : ISD::SHL, DL, VT, R, AmtR);
31651 return DAG.getNode(ISD::OR, DL, VT, SHL, SRL);
31652 }
31653
31654 // Everything below assumes ISD::ROTL.
31655 if (!IsROTL) {
31656 Amt = DAG.getNode(ISD::SUB, DL, VT, Z, Amt);
31657 IsROTL = true;
31658 }
31659
31660 // ISD::ROT* uses modulo rotate amounts.
31661 Amt = DAG.getNode(ISD::AND, DL, VT, Amt, AmtMask);
31662
31663 assert(IsROTL && "Only ROTL supported");
31664
31665 // As with shifts, attempt to convert the rotation amount to a multiplication
31666 // factor, falling back to general expansion otherwise.
31667 SDValue Scale = convertShiftLeftToScale(Amt, DL, Subtarget, DAG);
31668 if (!Scale)
31669 return SDValue();
31670
31671 // v8i16/v16i16: perform unsigned multiply hi/lo and OR the results.
31672 if (EltSizeInBits == 16) {
31673 SDValue Lo = DAG.getNode(ISD::MUL, DL, VT, R, Scale);
31674 SDValue Hi = DAG.getNode(ISD::MULHU, DL, VT, R, Scale);
31675 return DAG.getNode(ISD::OR, DL, VT, Lo, Hi);
31676 }
31677
31678 // v4i32: make use of the PMULUDQ instruction to multiply 2 lanes of v4i32
31679 // to v2i64 results at a time. The upper 32-bits contain the wrapped bits
31680 // that can then be OR'd with the lower 32-bits.
31681 assert(VT == MVT::v4i32 && "Only v4i32 vector rotate expected");
31682 static const int OddMask[] = {1, 1, 3, 3};
31683 SDValue R13 = DAG.getVectorShuffle(VT, DL, R, R, OddMask);
31684 SDValue Scale13 = DAG.getVectorShuffle(VT, DL, Scale, Scale, OddMask);
31685
31686 SDValue Res02 = DAG.getNode(X86ISD::PMULUDQ, DL, MVT::v2i64,
31687 DAG.getBitcast(MVT::v2i64, R),
31688 DAG.getBitcast(MVT::v2i64, Scale));
31689 SDValue Res13 = DAG.getNode(X86ISD::PMULUDQ, DL, MVT::v2i64,
31690 DAG.getBitcast(MVT::v2i64, R13),
31691 DAG.getBitcast(MVT::v2i64, Scale13));
31692 Res02 = DAG.getBitcast(VT, Res02);
31693 Res13 = DAG.getBitcast(VT, Res13);
31694
31695 return DAG.getNode(ISD::OR, DL, VT,
31696 DAG.getVectorShuffle(VT, DL, Res02, Res13, {0, 4, 2, 6}),
31697 DAG.getVectorShuffle(VT, DL, Res02, Res13, {1, 5, 3, 7}));
31698}
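// Worked example for the v4i32 tail of LowerRotate above (illustrative): for a
// non-uniform constant amount on plain SSE2, each rotate amount y becomes a
// scale (1 << y), the even/odd lanes are multiplied via PMULUDQ into v2i64
// products, and the low and high 32-bit halves of each product are OR'd
// together to form rot(x, y).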
31699
31700/// Returns true if the operand type is exactly twice the native width, and
31701/// the corresponding cmpxchg8b or cmpxchg16b instruction is available.
31702/// Used to know whether to use cmpxchg8/16b when expanding atomic operations
31703/// (otherwise we leave them alone to become __sync_fetch_and_... calls).
31704bool X86TargetLowering::needsCmpXchgNb(Type *MemType) const {
31705 unsigned OpWidth = MemType->getPrimitiveSizeInBits();
31706
31707 if (OpWidth == 64)
31708 return Subtarget.canUseCMPXCHG8B() && !Subtarget.is64Bit();
31709 if (OpWidth == 128)
31710 return Subtarget.canUseCMPXCHG16B();
31711
31712 return false;
31713}
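// For example (illustrative): an i128 atomic on a 64-bit target returns true
// only when CMPXCHG16B is available, and an i64 atomic returns true on a
// 32-bit target with CMPXCHG8B; any other width returns false.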
31714
31715TargetLowering::AtomicExpansionKind
31716X86TargetLowering::shouldExpandAtomicStoreInIR(StoreInst *SI) const {
31717 Type *MemType = SI->getValueOperand()->getType();
31718
31719 if (!SI->getFunction()->hasFnAttribute(Attribute::NoImplicitFloat) &&
31720 !Subtarget.useSoftFloat()) {
31721 if (MemType->getPrimitiveSizeInBits() == 64 && !Subtarget.is64Bit() &&
31722 (Subtarget.hasSSE1() || Subtarget.hasX87()))
31723 return AtomicExpansionKind::None;
31724
31725 if (MemType->getPrimitiveSizeInBits() == 128 && Subtarget.is64Bit() &&
31726 Subtarget.hasAVX())
31727 return AtomicExpansionKind::None;
31728 }
31729
31730 return needsCmpXchgNb(MemType) ? AtomicExpansionKind::Expand
31731 : AtomicExpansionKind::None;
31732}
31733
31734// Note: this turns large loads into lock cmpxchg8b/16b.
31735TargetLowering::AtomicExpansionKind
31736X86TargetLowering::shouldExpandAtomicLoadInIR(LoadInst *LI) const {
31737 Type *MemType = LI->getType();
31738
31739 if (!LI->getFunction()->hasFnAttribute(Attribute::NoImplicitFloat) &&
31740 !Subtarget.useSoftFloat()) {
31741 // If this is a 64-bit atomic load on a 32-bit target and SSE2 is enabled, we
31742 // can use movq to do the load. If we have X87 we can load into an 80-bit
31743 // X87 register and store it to a stack temporary.
31744 if (MemType->getPrimitiveSizeInBits() == 64 && !Subtarget.is64Bit() &&
31745 (Subtarget.hasSSE1() || Subtarget.hasX87()))
31746 return AtomicExpansionKind::None;
31747
31748 // If this is a 128-bit load with AVX, 128-bit SSE loads/stores are atomic.
31749 if (MemType->getPrimitiveSizeInBits() == 128 && Subtarget.is64Bit() &&
31750 Subtarget.hasAVX())
31751 return AtomicExpansionKind::None;
31752 }
31753
31754 return needsCmpXchgNb(MemType) ? AtomicExpansionKind::CmpXChg
31755 : AtomicExpansionKind::None;
31756}
31757
31758enum BitTestKind : unsigned {
31759 UndefBit,
31760 ConstantBit,
31761 NotConstantBit,
31762 ShiftBit,
31763 NotShiftBit
31764};
31765
31766static std::pair<Value *, BitTestKind> FindSingleBitChange(Value *V) {
31767 using namespace llvm::PatternMatch;
31768 BitTestKind BTK = UndefBit;
31769 if (auto *C = dyn_cast<ConstantInt>(V)) {
31770 // Check if V is a power of 2 or NOT power of 2.
31771 if (isPowerOf2_64(C->getZExtValue()))
31772 BTK = ConstantBit;
31773 else if (isPowerOf2_64((~C->getValue()).getZExtValue()))
31774 BTK = NotConstantBit;
31775 return {V, BTK};
31776 }
31777
31778 // Check if V is some power of 2 pattern known to be non-zero
31779 if (auto *I = dyn_cast<Instruction>(V)) {
31780 bool Not = false;
31781 // Check if we have a NOT
31782 Value *PeekI;
31783 if (match(I, m_Not(m_Value(PeekI))) ||
31784 match(I, m_Sub(m_AllOnes(), m_Value(PeekI)))) {
31785 Not = true;
31786 I = dyn_cast<Instruction>(PeekI);
31787
31788 // If I is constant, it will fold and we can evaluate later. If it's an
31789 // argument or something of that nature, we can't analyze.
31790 if (I == nullptr)
31791 return {nullptr, UndefBit};
31792 }
31793 // We can only use 1 << X without more sophisticated analysis. C << X where
31794 // C is a power of 2 but not 1 can result in zero which cannot be translated
31795 // to bittest. Likewise any C >> X (either arith or logical) can be zero.
31796 if (I->getOpcode() == Instruction::Shl) {
31797 // Todo(1): The cmpxchg case is pretty costly so matching `BLSI(X)`, `X &
31798 // -X` and some other provable power of 2 patterns that we can use CTZ on
31799 // may be profitable.
31800 // Todo(2): It may be possible in some cases to prove that Shl(C, X) is
31801 // non-zero even where C != 1. Likewise LShr(C, X) and AShr(C, X) may also
31802 // be provably a non-zero power of 2.
31803 // Todo(3): ROTL and ROTR patterns on a power of 2 C should also be
31804 // transformable to bittest.
31805 auto *ShiftVal = dyn_cast<ConstantInt>(I->getOperand(0));
31806 if (!ShiftVal)
31807 return {nullptr, UndefBit};
31808 if (ShiftVal->equalsInt(1))
31809 BTK = Not ? NotShiftBit : ShiftBit;
31810
31811 if (BTK == UndefBit)
31812 return {nullptr, UndefBit};
31813
31814 Value *BitV = I->getOperand(1);
31815
31816 // Read past a shiftmask instruction to find count
31817 Value *AndOp;
31818 uint64_t ShiftMask = I->getType()->getPrimitiveSizeInBits() - 1;
31819 if (match(BitV, m_c_And(m_Value(AndOp), m_SpecificInt(ShiftMask))))
31820 BitV = AndOp;
31821
31822 return {BitV, BTK};
31823 }
31824 }
31825 return {nullptr, UndefBit};
31826}
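// Examples of what the matcher above returns (illustrative):
//   1 << (n & 31)  -> {n, ShiftBit}          ~(1 << n) -> {n, NotShiftBit}
//   0x10           -> {0x10, ConstantBit}    ~0x10     -> {~0x10, NotConstantBit}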
31827
31828TargetLowering::AtomicExpansionKind
31829X86TargetLowering::shouldExpandLogicAtomicRMWInIR(AtomicRMWInst *AI) const {
31830 using namespace llvm::PatternMatch;
31831 // If the atomicrmw's result isn't actually used, we can just add a "lock"
31832 // prefix to a normal instruction for these operations.
31833 if (AI->use_empty())
31834 return AtomicExpansionKind::None;
31835
31836 if (AI->getOperation() == AtomicRMWInst::Xor) {
31837 // A ^ SignBit -> A + SignBit. This allows us to use `xadd` which is
31838 // preferable to both `cmpxchg` and `btc`.
31839 if (match(AI->getOperand(1), m_SignMask()))
31840 return AtomicExpansionKind::None;
31841 }
31842
31843 // If the atomicrmw's result is used by a single bit AND, we may use
31844 // bts/btr/btc instruction for these operations.
31845 // Note: InstCombinePass can cause a de-optimization here. It replaces the
31846 // SETCC(And(AtomicRMW(P, power_of_2), power_of_2)) with LShr and Xor
31847 // (depending on CC). This pattern can only use bts/btr/btc but we don't
31848 // detect it.
31849 Instruction *I = AI->user_back();
31850 auto BitChange = FindSingleBitChange(AI->getValOperand());
31851 if (BitChange.second == UndefBit || !AI->hasOneUse() ||
31852 I->getOpcode() != Instruction::And ||
31853 AI->getType()->getPrimitiveSizeInBits() == 8 ||
31854 AI->getParent() != I->getParent())
31855 return AtomicExpansionKind::CmpXChg;
31856
31857 unsigned OtherIdx = I->getOperand(0) == AI ? 1 : 0;
31858
31859 // This is a redundant AND, it should get cleaned up elsewhere.
31860 if (AI == I->getOperand(OtherIdx))
31861 return AtomicExpansionKind::CmpXChg;
31862
31863 // The following instruction must be an AND with a single-bit operand.
31864 if (BitChange.second == ConstantBit || BitChange.second == NotConstantBit) {
31865 auto *C1 = cast<ConstantInt>(AI->getValOperand());
31866 auto *C2 = dyn_cast<ConstantInt>(I->getOperand(OtherIdx));
31867 if (!C2 || !isPowerOf2_64(C2->getZExtValue())) {
31868 return AtomicExpansionKind::CmpXChg;
31869 }
31870 if (AI->getOperation() == AtomicRMWInst::And) {
31871 return ~C1->getValue() == C2->getValue()
31872 ? AtomicExpansionKind::BitTestIntrinsic
31873 : AtomicExpansionKind::CmpXChg;
31874 }
31875 return C1 == C2 ? AtomicExpansionKind::BitTestIntrinsic
31876 : AtomicExpansionKind::CmpXChg;
31877 }
31878
31879 assert(BitChange.second == ShiftBit || BitChange.second == NotShiftBit);
31880
31881 auto BitTested = FindSingleBitChange(I->getOperand(OtherIdx));
31882 if (BitTested.second != ShiftBit && BitTested.second != NotShiftBit)
31883 return AtomicExpansionKind::CmpXChg;
31884
31885 assert(BitChange.first != nullptr && BitTested.first != nullptr);
31886
31887 // If shift amounts are not the same we can't use BitTestIntrinsic.
31888 if (BitChange.first != BitTested.first)
31889 return AtomicExpansionKind::CmpXChg;
31890
31891 // For atomic AND, the mask must cover all but one bit, and the test must
31892 // check the single bit that is left unset in the mask.
31893 if (AI->getOperation() == AtomicRMWInst::And)
31894 return (BitChange.second == NotShiftBit && BitTested.second == ShiftBit)
31895 ? AtomicExpansionKind::BitTestIntrinsic
31896 : AtomicExpansionKind::CmpXChg;
31897
31898 // For atomic XOR/OR, the operation must set and test the same bit.
31899 return (BitChange.second == ShiftBit && BitTested.second == ShiftBit)
31900 ? AtomicExpansionKind::BitTestIntrinsic
31901 : AtomicExpansionKind::CmpXChg;
31902}
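// Example of a pattern accepted above (illustrative): for
//   %old = atomicrmw or ptr %p, i32 (1 << %n)
//   %m   = and i32 %old, (1 << %n)
// both sides report the same ShiftBit source, so the RMW is lowered through
// the bit-test intrinsic path below (typically a lock bts) instead of a
// cmpxchg loop.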
31903
31904void X86TargetLowering::emitBitTestAtomicRMWIntrinsic(AtomicRMWInst *AI) const {
31905 IRBuilder<> Builder(AI);
31906 Builder.CollectMetadataToCopy(AI, {LLVMContext::MD_pcsections});
31907 Intrinsic::ID IID_C;
31908 Intrinsic::ID IID_I;
31909 switch (AI->getOperation()) {
31910 default:
31911 llvm_unreachable("Unknown atomic operation");
31912 case AtomicRMWInst::Or:
31913 IID_C = Intrinsic::x86_atomic_bts;
31914 IID_I = Intrinsic::x86_atomic_bts_rm;
31915 break;
31916 case AtomicRMWInst::Xor:
31917 IID_C = Intrinsic::x86_atomic_btc;
31918 IID_I = Intrinsic::x86_atomic_btc_rm;
31919 break;
31920 case AtomicRMWInst::And:
31921 IID_C = Intrinsic::x86_atomic_btr;
31922 IID_I = Intrinsic::x86_atomic_btr_rm;
31923 break;
31924 }
31925 Instruction *I = AI->user_back();
31926 LLVMContext &Ctx = AI->getContext();
31927 Value *Addr = Builder.CreatePointerCast(AI->getPointerOperand(),
31928 PointerType::getUnqual(Ctx));
31929 Value *Result = nullptr;
31930 auto BitTested = FindSingleBitChange(AI->getValOperand());
31931 assert(BitTested.first != nullptr);
31932
31933 if (BitTested.second == ConstantBit || BitTested.second == NotConstantBit) {
31934 auto *C = cast<ConstantInt>(I->getOperand(I->getOperand(0) == AI ? 1 : 0));
31935
31936 unsigned Imm = llvm::countr_zero(C->getZExtValue());
31937 Result = Builder.CreateIntrinsic(IID_C, AI->getType(),
31938 {Addr, Builder.getInt8(Imm)});
31939 } else {
31940 assert(BitTested.second == ShiftBit || BitTested.second == NotShiftBit);
31941
31942 Value *SI = BitTested.first;
31943 assert(SI != nullptr);
31944
31945 // BT{S|R|C} on a memory operand doesn't modulo the bit position, so we need
31946 // to mask it.
31947 unsigned ShiftBits = SI->getType()->getPrimitiveSizeInBits();
31948 Value *BitPos =
31949 Builder.CreateAnd(SI, Builder.getIntN(ShiftBits, ShiftBits - 1));
31950 // Todo(1): In many cases it may be provable that SI is less than
31951 // ShiftBits in which case this mask is unnecessary
31952 // Todo(2): In the fairly idiomatic case of P[X / sizeof_bits(X)] OP 1
31953 // << (X % sizeof_bits(X)) we can drop the shift mask and AGEN in
31954 // favor of just a raw BT{S|R|C}.
31955
31956 Result = Builder.CreateIntrinsic(IID_I, AI->getType(), {Addr, BitPos});
31957 Result = Builder.CreateZExtOrTrunc(Result, AI->getType());
31958
31959 // If the result is only used for zero/non-zero status then we don't need to
31960 // shift the value back. Otherwise do so.
31961 for (auto It = I->user_begin(); It != I->user_end(); ++It) {
31962 if (auto *ICmp = dyn_cast<ICmpInst>(*It)) {
31963 if (ICmp->isEquality()) {
31964 auto *C0 = dyn_cast<ConstantInt>(ICmp->getOperand(0));
31965 auto *C1 = dyn_cast<ConstantInt>(ICmp->getOperand(1));
31966 if (C0 || C1) {
31967 assert(C0 == nullptr || C1 == nullptr);
31968 if ((C0 ? C0 : C1)->isZero())
31969 continue;
31970 }
31971 }
31972 }
31973 Result = Builder.CreateShl(Result, BitPos);
31974 break;
31975 }
31976 }
31977
31978 I->replaceAllUsesWith(Result);
31979 I->eraseFromParent();
31980 AI->eraseFromParent();
31981}
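// Net effect of the rewrite above (illustrative): the atomicrmw and its single
// AND user are replaced by one x86_atomic_bts/btr/btc(_rm) intrinsic call; the
// extracted bit is shifted back into position only if some user needs more
// than a zero/non-zero test.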
31982
31983static bool shouldExpandCmpArithRMWInIR(AtomicRMWInst *AI) {
31984 using namespace llvm::PatternMatch;
31985 if (!AI->hasOneUse())
31986 return false;
31987
31988 Value *Op = AI->getOperand(1);
31989 CmpPredicate Pred;
31990 Instruction *I = AI->user_back();
31991 AtomicRMWInst::BinOp Opc = AI->getOperation();
31992 if (Opc == AtomicRMWInst::Add) {
31993 if (match(I, m_c_ICmp(Pred, m_Sub(m_ZeroInt(), m_Specific(Op)), m_Value())))
31994 return Pred == CmpInst::ICMP_EQ || Pred == CmpInst::ICMP_NE;
31995 if (match(I, m_OneUse(m_c_Add(m_Specific(Op), m_Value())))) {
31996 if (match(I->user_back(),
31997 m_SpecificICmp(CmpInst::ICMP_SLT, m_Value(), m_ZeroInt())))
31998 return true;
31999 if (match(I->user_back(),
32000 m_SpecificICmp(CmpInst::ICMP_SGT, m_Value(), m_AllOnes())))
32001 return true;
32002 }
32003 return false;
32004 }
32005 if (Opc == AtomicRMWInst::Sub) {
32006 if (match(I, m_c_ICmp(Pred, m_Specific(Op), m_Value())))
32007 return Pred == CmpInst::ICMP_EQ || Pred == CmpInst::ICMP_NE;
32008 if (match(I, m_OneUse(m_Sub(m_Value(), m_Specific(Op))))) {
32009 if (match(I->user_back(),
32010 m_SpecificICmp(CmpInst::ICMP_SLT, m_Value(), m_ZeroInt())))
32011 return true;
32012 if (match(I->user_back(),
32013 m_SpecificICmp(CmpInst::ICMP_SGT, m_Value(), m_AllOnes())))
32014 return true;
32015 }
32016 return false;
32017 }
32018 if ((Opc == AtomicRMWInst::Or &&
32019 match(I, m_OneUse(m_c_Or(m_Specific(Op), m_Value())))) ||
32020 (Opc == AtomicRMWInst::And &&
32021 match(I, m_OneUse(m_c_And(m_Specific(Op), m_Value()))))) {
32022 if (match(I->user_back(), m_ICmp(Pred, m_Value(), m_ZeroInt())))
32023 return Pred == CmpInst::ICMP_EQ || Pred == CmpInst::ICMP_NE ||
32024 Pred == CmpInst::ICMP_SLT;
32025 if (match(I->user_back(),
32026 m_SpecificICmp(CmpInst::ICMP_SGT, m_Value(), m_AllOnes())))
32027 return true;
32028 return false;
32029 }
32030 if (Opc == AtomicRMWInst::Xor) {
32031 if (match(I, m_c_ICmp(Pred, m_Specific(Op), m_Value())))
32032 return Pred == CmpInst::ICMP_EQ || Pred == CmpInst::ICMP_NE;
32033 if (match(I, m_OneUse(m_c_Xor(m_Specific(Op), m_Value())))) {
32034 if (match(I->user_back(),
32035 m_SpecificICmp(CmpInst::ICMP_SLT, m_Value(), m_ZeroInt())))
32036 return true;
32037 if (match(I->user_back(),
32038 m_SpecificICmp(CmpInst::ICMP_SGT, m_Value(), m_AllOnes())))
32039 return true;
32040 }
32041 return false;
32042 }
32043
32044 return false;
32045}
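// Example of a pattern accepted above (illustrative):
//   %old = atomicrmw sub ptr %p, i32 %v
//   %cmp = icmp eq i32 %old, %v
// i.e. "did the subtraction reach zero" - this is answered directly by the
// flags of a lock sub, so the RMW is expanded via
// emitCmpArithAtomicRMWIntrinsic below.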
32046
32047void X86TargetLowering::emitCmpArithAtomicRMWIntrinsic(
32048 AtomicRMWInst *AI) const {
32049 IRBuilder<> Builder(AI);
32050 Builder.CollectMetadataToCopy(AI, {LLVMContext::MD_pcsections});
32051 Instruction *TempI = nullptr;
32052 LLVMContext &Ctx = AI->getContext();
32053 ICmpInst *ICI = dyn_cast<ICmpInst>(AI->user_back());
32054 if (!ICI) {
32055 TempI = AI->user_back();
32056 assert(TempI->hasOneUse() && "Must have one use");
32057 ICI = cast<ICmpInst>(TempI->user_back());
32058 }
32059 X86::CondCode CC;
32060 ICmpInst::Predicate Pred = ICI->getPredicate();
32061 switch (Pred) {
32062 default:
32063 llvm_unreachable("Not supported Pred");
32064 case CmpInst::ICMP_EQ:
32065 CC = X86::COND_E;
32066 break;
32067 case CmpInst::ICMP_NE:
32068 CC = X86::COND_NE;
32069 break;
32070 case CmpInst::ICMP_SLT:
32071 CC = X86::COND_S;
32072 break;
32073 case CmpInst::ICMP_SGT:
32074 CC = X86::COND_NS;
32075 break;
32076 }
32077 Intrinsic::ID IID;
32078 switch (AI->getOperation()) {
32079 default:
32080 llvm_unreachable("Unknown atomic operation");
32081 case AtomicRMWInst::Add:
32082 IID = Intrinsic::x86_atomic_add_cc;
32083 break;
32084 case AtomicRMWInst::Sub:
32085 IID = Intrinsic::x86_atomic_sub_cc;
32086 break;
32087 case AtomicRMWInst::Or:
32088 IID = Intrinsic::x86_atomic_or_cc;
32089 break;
32090 case AtomicRMWInst::And:
32091 IID = Intrinsic::x86_atomic_and_cc;
32092 break;
32093 case AtomicRMWInst::Xor:
32094 IID = Intrinsic::x86_atomic_xor_cc;
32095 break;
32096 }
32097 Value *Addr = Builder.CreatePointerCast(AI->getPointerOperand(),
32098 PointerType::getUnqual(Ctx));
32099 Value *Call = Builder.CreateIntrinsic(
32100 IID, AI->getType(),
32101 {Addr, AI->getValOperand(), Builder.getInt32((unsigned)CC)});
32102 Value *Result = Builder.CreateTrunc(Call, Type::getInt1Ty(Ctx));
32103 ICI->replaceAllUsesWith(Result);
32104 ICI->eraseFromParent();
32105 if (TempI)
32106 TempI->eraseFromParent();
32107 AI->eraseFromParent();
32108}
32109
32110TargetLowering::AtomicExpansionKind
32111X86TargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *AI) const {
32112 unsigned NativeWidth = Subtarget.is64Bit() ? 64 : 32;
32113 Type *MemType = AI->getType();
32114
32115 // If the operand is too big, we must see if cmpxchg8/16b is available
32116 // and default to library calls otherwise.
32117 if (MemType->getPrimitiveSizeInBits() > NativeWidth) {
32118 return needsCmpXchgNb(MemType) ? AtomicExpansionKind::CmpXChg
32119 : AtomicExpansionKind::None;
32120 }
32121
32122 AtomicRMWInst::BinOp Op = AI->getOperation();
32123 switch (Op) {
32124 case AtomicRMWInst::Xchg:
32125 return AtomicExpansionKind::None;
32126 case AtomicRMWInst::Add:
32127 case AtomicRMWInst::Sub:
32128 if (shouldExpandCmpArithRMWInIR(AI))
32129 return AtomicExpansionKind::CmpArithIntrinsic;
32130 // It's better to use xadd, xsub or xchg for these in other cases.
32131 return AtomicExpansionKind::None;
32132 case AtomicRMWInst::Or:
32133 case AtomicRMWInst::And:
32134 case AtomicRMWInst::Xor:
32135 if (shouldExpandCmpArithRMWInIR(AI))
32136 return AtomicExpansionKind::CmpArithIntrinsic;
32137 return shouldExpandLogicAtomicRMWInIR(AI);
32138 case AtomicRMWInst::Nand:
32139 case AtomicRMWInst::Max:
32140 case AtomicRMWInst::Min:
32151 default:
32152 // These always require a non-trivial set of data operations on x86. We must
32153 // use a cmpxchg loop.
32154 return AtomicExpansionKind::CmpXChg;
32155 }
32156}
32157
32158LoadInst *
32159X86TargetLowering::lowerIdempotentRMWIntoFencedLoad(AtomicRMWInst *AI) const {
32160 unsigned NativeWidth = Subtarget.is64Bit() ? 64 : 32;
32161 Type *MemType = AI->getType();
32162 // Accesses larger than the native width are turned into cmpxchg/libcalls, so
32163 // there is no benefit in turning such RMWs into loads, and it is actually
32164 // harmful as it introduces a mfence.
32165 if (MemType->getPrimitiveSizeInBits() > NativeWidth)
32166 return nullptr;
32167
32168 // If this is a canonical idempotent atomicrmw w/no uses, we have a better
32169 // lowering available in lowerAtomicArith.
32170 // TODO: push more cases through this path.
32171 if (auto *C = dyn_cast<ConstantInt>(AI->getValOperand()))
32172 if (AI->getOperation() == AtomicRMWInst::Or && C->isZero() &&
32173 AI->use_empty())
32174 return nullptr;
32175
32176 IRBuilder<> Builder(AI);
32177 Builder.CollectMetadataToCopy(AI, {LLVMContext::MD_pcsections});
32178 auto SSID = AI->getSyncScopeID();
32179 // We must restrict the ordering to avoid generating loads with Release or
32180 // ReleaseAcquire orderings.
32181 auto Order = AtomicCmpXchgInst::getStrongestFailureOrdering(AI->getOrdering());
32182
32183 // Before the load we need a fence. Here is an example lifted from
32184 // http://www.hpl.hp.com/techreports/2012/HPL-2012-68.pdf showing why a fence
32185 // is required:
32186 // Thread 0:
32187 // x.store(1, relaxed);
32188 // r1 = y.fetch_add(0, release);
32189 // Thread 1:
32190 // y.fetch_add(42, acquire);
32191 // r2 = x.load(relaxed);
32192 // r1 = r2 = 0 is impossible, but becomes possible if the idempotent rmw is
32193 // lowered to just a load without a fence. A mfence flushes the store buffer,
32194 // making the optimization clearly correct.
32195 // FIXME: it is required if isReleaseOrStronger(Order) but it is not clear
32196 // otherwise, we might be able to be more aggressive on relaxed idempotent
32197 // rmw. In practice, they do not look useful, so we don't try to be
32198 // especially clever.
32199
32200 // Use `fence seq_cst` over `llvm.x86.sse2.mfence` here to get the correct
32201 // lowering for SSID == SyncScope::SingleThread and avoidMFence || !hasMFence.
32202 Builder.CreateFence(AtomicOrdering::SequentiallyConsistent, SSID);
32203
32204 // Finally we can emit the atomic load.
32205 LoadInst *Loaded = Builder.CreateAlignedLoad(
32206 AI->getType(), AI->getPointerOperand(), AI->getAlign());
32207 Loaded->setAtomic(Order, SSID);
32208 AI->replaceAllUsesWith(Loaded);
32209 AI->eraseFromParent();
32210 return Loaded;
32211}
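// Example (illustrative): a used result of
//   %v = atomicrmw or ptr %p, i32 0 seq_cst
// becomes "fence seq_cst" followed by an atomic load of %p, per the reasoning
// in the comment above; RMWs wider than the native width are left untouched.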
32212
32213/// Emit a locked operation on a stack location which does not change any
32214/// memory location, but does involve a lock prefix. Location is chosen to be
32215/// a) very likely accessed only by a single thread to minimize cache traffic,
32216/// and b) definitely dereferenceable. Returns the new Chain result.
32217static SDValue emitLockedStackOp(SelectionDAG &DAG,
32218 const X86Subtarget &Subtarget, SDValue Chain,
32219 const SDLoc &DL) {
32220 // Implementation notes:
32221 // 1) LOCK prefix creates a full read/write reordering barrier for memory
32222 // operations issued by the current processor. As such, the location
32223 // referenced is not relevant for the ordering properties of the instruction.
32224 // See: Intel® 64 and IA-32 Architectures Software Developer’s Manual,
32225 // 8.2.3.9 Loads and Stores Are Not Reordered with Locked Instructions
32226 // 2) Using an immediate operand appears to be the best encoding choice
32227 // here since it doesn't require an extra register.
32228 // 3) OR appears to be very slightly faster than ADD. (Though, the difference
32229 // is small enough it might just be measurement noise.)
32230 // 4) When choosing offsets, there are several contributing factors:
32231 // a) If there's no redzone, we default to TOS. (We could allocate a cache
32232 // line aligned stack object to improve this case.)
32233 // b) To minimize our chances of introducing a false dependence, we prefer
32234 // to offset the stack usage from TOS slightly.
32235 // c) To minimize concerns about cross thread stack usage - in particular,
32236 // the idiomatic MyThreadPool.run([&StackVars]() {...}) pattern which
32237 // captures state in the TOS frame and accesses it from many threads -
32238 // we want to use an offset such that the offset is in a distinct cache
32239 // line from the TOS frame.
32240 //
32241 // For a general discussion of the tradeoffs and benchmark results, see:
32242 // https://shipilev.net/blog/2014/on-the-fence-with-dependencies/
32243
32244 auto &MF = DAG.getMachineFunction();
32245 auto &TFL = *Subtarget.getFrameLowering();
32246 const unsigned SPOffset = TFL.has128ByteRedZone(MF) ? -64 : 0;
32247
32248 if (Subtarget.is64Bit()) {
32249 SDValue Zero = DAG.getTargetConstant(0, DL, MVT::i32);
32250 SDValue Ops[] = {
32251 DAG.getRegister(X86::RSP, MVT::i64), // Base
32252 DAG.getTargetConstant(1, DL, MVT::i8), // Scale
32253 DAG.getRegister(0, MVT::i64), // Index
32254 DAG.getTargetConstant(SPOffset, DL, MVT::i32), // Disp
32255 DAG.getRegister(0, MVT::i16), // Segment.
32256 Zero,
32257 Chain};
32258 SDNode *Res = DAG.getMachineNode(X86::OR32mi8Locked, DL, MVT::i32,
32259 MVT::Other, Ops);
32260 return SDValue(Res, 1);
32261 }
32262
32263 SDValue Zero = DAG.getTargetConstant(0, DL, MVT::i32);
32264 SDValue Ops[] = {
32265 DAG.getRegister(X86::ESP, MVT::i32), // Base
32266 DAG.getTargetConstant(1, DL, MVT::i8), // Scale
32267 DAG.getRegister(0, MVT::i32), // Index
32268 DAG.getTargetConstant(SPOffset, DL, MVT::i32), // Disp
32269 DAG.getRegister(0, MVT::i16), // Segment.
32270 Zero,
32271 Chain
32272 };
32273 SDNode *Res = DAG.getMachineNode(X86::OR32mi8Locked, DL, MVT::i32,
32274 MVT::Other, Ops);
32275 return SDValue(Res, 1);
32276}
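// Example of the node built above (illustrative): on x86-64 with a red zone
// this materializes as "lock orl $0, -64(%rsp)", and on 32-bit as
// "lock orl $0, (%esp)" - a full barrier that never changes the value stored
// at the touched stack slot.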
32277
32278static SDValue LowerATOMIC_FENCE(SDValue Op, const X86Subtarget &Subtarget,
32279 SelectionDAG &DAG) {
32280 SDLoc dl(Op);
32281 AtomicOrdering FenceOrdering =
32282 static_cast<AtomicOrdering>(Op.getConstantOperandVal(1));
32283 SyncScope::ID FenceSSID =
32284 static_cast<SyncScope::ID>(Op.getConstantOperandVal(2));
32285
32286 // The only fence that needs an instruction is a sequentially-consistent
32287 // cross-thread fence.
32288 if (FenceOrdering == AtomicOrdering::SequentiallyConsistent &&
32289 FenceSSID == SyncScope::System) {
32290 if (!Subtarget.avoidMFence() && Subtarget.hasMFence())
32291 return DAG.getNode(X86ISD::MFENCE, dl, MVT::Other, Op.getOperand(0));
32292
32293 SDValue Chain = Op.getOperand(0);
32294 return emitLockedStackOp(DAG, Subtarget, Chain, dl);
32295 }
32296
32297 // MEMBARRIER is a compiler barrier; it codegens to a no-op.
32298 return DAG.getNode(ISD::MEMBARRIER, dl, MVT::Other, Op.getOperand(0));
32299}
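// Example (illustrative): a system-scope "fence seq_cst" becomes MFENCE when
// available and not avoided, otherwise the locked stack OR above; everything
// else lowers to MEMBARRIER, a compiler-only barrier that emits no
// instruction.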
32300
32301static SDValue LowerCMP_SWAP(SDValue Op, const X86Subtarget &Subtarget,
32302 SelectionDAG &DAG) {
32303 MVT T = Op.getSimpleValueType();
32304 SDLoc DL(Op);
32305 unsigned Reg = 0;
32306 unsigned size = 0;
32307 switch(T.SimpleTy) {
32308 default: llvm_unreachable("Invalid value type!");
32309 case MVT::i8: Reg = X86::AL; size = 1; break;
32310 case MVT::i16: Reg = X86::AX; size = 2; break;
32311 case MVT::i32: Reg = X86::EAX; size = 4; break;
32312 case MVT::i64:
32313 assert(Subtarget.is64Bit() && "Node not type legal!");
32314 Reg = X86::RAX; size = 8;
32315 break;
32316 }
32317 SDValue cpIn = DAG.getCopyToReg(Op.getOperand(0), DL, Reg,
32318 Op.getOperand(2), SDValue());
32319 SDValue Ops[] = { cpIn.getValue(0),
32320 Op.getOperand(1),
32321 Op.getOperand(3),
32322 DAG.getTargetConstant(size, DL, MVT::i8),
32323 cpIn.getValue(1) };
32324 SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Glue);
32325 MachineMemOperand *MMO = cast<AtomicSDNode>(Op)->getMemOperand();
32326 SDValue Result = DAG.getMemIntrinsicNode(X86ISD::LCMPXCHG_DAG, DL, Tys,
32327 Ops, T, MMO);
32328
32329 SDValue cpOut =
32330 DAG.getCopyFromReg(Result.getValue(0), DL, Reg, T, Result.getValue(1));
32331 SDValue EFLAGS = DAG.getCopyFromReg(cpOut.getValue(1), DL, X86::EFLAGS,
32332 MVT::i32, cpOut.getValue(2));
32333 SDValue Success = getSETCC(X86::COND_E, EFLAGS, DL, DAG);
32334
32335 return DAG.getNode(ISD::MERGE_VALUES, DL, Op->getVTList(),
32336 cpOut, Success, EFLAGS.getValue(1));
32337}
32338
32339// Create MOVMSKB, taking into account whether we need to split for AVX1.
32340static SDValue getPMOVMSKB(const SDLoc &DL, SDValue V, SelectionDAG &DAG,
32341 const X86Subtarget &Subtarget) {
32342 MVT InVT = V.getSimpleValueType();
32343
32344 if (InVT == MVT::v64i8) {
32345 SDValue Lo, Hi;
32346 std::tie(Lo, Hi) = DAG.SplitVector(V, DL);
32347 Lo = getPMOVMSKB(DL, Lo, DAG, Subtarget);
32348 Hi = getPMOVMSKB(DL, Hi, DAG, Subtarget);
32349 Lo = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i64, Lo);
32350 Hi = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i64, Hi);
32351 Hi = DAG.getNode(ISD::SHL, DL, MVT::i64, Hi,
32352 DAG.getConstant(32, DL, MVT::i8));
32353 return DAG.getNode(ISD::OR, DL, MVT::i64, Lo, Hi);
32354 }
32355 if (InVT == MVT::v32i8 && !Subtarget.hasInt256()) {
32356 SDValue Lo, Hi;
32357 std::tie(Lo, Hi) = DAG.SplitVector(V, DL);
32358 Lo = DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, Lo);
32359 Hi = DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, Hi);
32360 Hi = DAG.getNode(ISD::SHL, DL, MVT::i32, Hi,
32361 DAG.getConstant(16, DL, MVT::i8));
32362 return DAG.getNode(ISD::OR, DL, MVT::i32, Lo, Hi);
32363 }
32364
32365 return DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, V);
32366}
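// Example (illustrative): for v32i8 without AVX2 the two 128-bit halves are
// MOVMSK'd separately and recombined as lo | (hi << 16); v64i8 does the same
// with two 32-bit mask halves combined into an i64.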
32367
32368static SDValue LowerBITCAST(SDValue Op, const X86Subtarget &Subtarget,
32369 SelectionDAG &DAG) {
32370 SDValue Src = Op.getOperand(0);
32371 MVT SrcVT = Src.getSimpleValueType();
32372 MVT DstVT = Op.getSimpleValueType();
32373
32374 // Legalize (v64i1 (bitcast i64 (X))) by splitting the i64, bitcasting each
32375 // half to v32i1 and concatenating the result.
32376 if (SrcVT == MVT::i64 && DstVT == MVT::v64i1) {
32377 assert(!Subtarget.is64Bit() && "Expected 32-bit mode");
32378 assert(Subtarget.hasBWI() && "Expected BWI target");
32379 SDLoc dl(Op);
32380 SDValue Lo, Hi;
32381 std::tie(Lo, Hi) = DAG.SplitScalar(Src, dl, MVT::i32, MVT::i32);
32382 Lo = DAG.getBitcast(MVT::v32i1, Lo);
32383 Hi = DAG.getBitcast(MVT::v32i1, Hi);
32384 return DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v64i1, Lo, Hi);
32385 }
32386
32387 // Use MOVMSK for vector to scalar conversion to prevent scalarization.
32388 if ((SrcVT == MVT::v16i1 || SrcVT == MVT::v32i1) && DstVT.isScalarInteger()) {
32389 assert(!Subtarget.hasAVX512() && "Should use K-registers with AVX512");
32390 MVT SExtVT = SrcVT == MVT::v16i1 ? MVT::v16i8 : MVT::v32i8;
32391 SDLoc DL(Op);
32392 SDValue V = DAG.getSExtOrTrunc(Src, DL, SExtVT);
32393 V = getPMOVMSKB(DL, V, DAG, Subtarget);
32394 return DAG.getZExtOrTrunc(V, DL, DstVT);
32395 }
32396
32397 assert((SrcVT == MVT::v2i32 || SrcVT == MVT::v4i16 || SrcVT == MVT::v8i8 ||
32398 SrcVT == MVT::i64) && "Unexpected VT!");
32399
32400 assert(Subtarget.hasSSE2() && "Requires at least SSE2!");
32401 if (!(DstVT == MVT::f64 && SrcVT == MVT::i64) &&
32402 !(DstVT == MVT::x86mmx && SrcVT.isVector()))
32403 // This conversion needs to be expanded.
32404 return SDValue();
32405
32406 SDLoc dl(Op);
32407 if (SrcVT.isVector()) {
32408 // Widen the input vector in the case of MVT::v2i32.
32409 // Example: from MVT::v2i32 to MVT::v4i32.
32410 MVT NewVT = MVT::getVectorVT(SrcVT.getVectorElementType(),
32411 SrcVT.getVectorNumElements() * 2);
32412 Src = DAG.getNode(ISD::CONCAT_VECTORS, dl, NewVT, Src,
32413 DAG.getUNDEF(SrcVT));
32414 } else {
32415 assert(SrcVT == MVT::i64 && !Subtarget.is64Bit() &&
32416 "Unexpected source type in LowerBITCAST");
32417 Src = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64, Src);
32418 }
32419
32420 MVT V2X64VT = DstVT == MVT::f64 ? MVT::v2f64 : MVT::v2i64;
32421 Src = DAG.getNode(ISD::BITCAST, dl, V2X64VT, Src);
32422
32423 if (DstVT == MVT::x86mmx)
32424 return DAG.getNode(X86ISD::MOVDQ2Q, dl, DstVT, Src);
32425
32426 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, DstVT, Src,
32427 DAG.getVectorIdxConstant(0, dl));
32428}
32429
32430/// Compute the horizontal sum of bytes in V for the elements of VT.
32431///
32432/// Requires V to be a byte vector and VT to be an integer vector type with
32433/// wider elements than V's type. The width of the elements of VT determines
32434/// how many bytes of V are summed horizontally to produce each element of the
32435/// result.
32436static SDValue LowerHorizontalByteSum(SDValue V, MVT VT,
32437 const X86Subtarget &Subtarget,
32438 SelectionDAG &DAG) {
32439 SDLoc DL(V);
32440 MVT ByteVecVT = V.getSimpleValueType();
32441 MVT EltVT = VT.getVectorElementType();
32442 assert(ByteVecVT.getVectorElementType() == MVT::i8 &&
32443 "Expected value to have byte element type.");
32444 assert(EltVT != MVT::i8 &&
32445 "Horizontal byte sum only makes sense for wider elements!");
32446 unsigned VecSize = VT.getSizeInBits();
32447 assert(ByteVecVT.getSizeInBits() == VecSize && "Cannot change vector size!");
32448
32449 // The PSADBW instruction horizontally adds all bytes and leaves the result
32450 // in i64 chunks, thus directly computing the pop count for v2i64 and v4i64.
32451 if (EltVT == MVT::i64) {
32452 SDValue Zeros = DAG.getConstant(0, DL, ByteVecVT);
32453 MVT SadVecVT = MVT::getVectorVT(MVT::i64, VecSize / 64);
32454 V = DAG.getNode(X86ISD::PSADBW, DL, SadVecVT, V, Zeros);
32455 return DAG.getBitcast(VT, V);
32456 }
32457
32458 if (EltVT == MVT::i32) {
32459 // We unpack the low half and high half into i32s interleaved with zeros so
32460 // that we can use PSADBW to horizontally sum them. The most useful part of
32461 // this is that it lines up the results of two PSADBW instructions to be
32462 // two v2i64 vectors which concatenated are the 4 population counts. We can
32463 // then use PACKUSWB to shrink and concatenate them into a v4i32 again.
32464 SDValue Zeros = DAG.getConstant(0, DL, VT);
32465 SDValue V32 = DAG.getBitcast(VT, V);
32466 SDValue Low = getUnpackl(DAG, DL, VT, V32, Zeros);
32467 SDValue High = getUnpackh(DAG, DL, VT, V32, Zeros);
32468
32469 // Do the horizontal sums into two v2i64s.
32470 Zeros = DAG.getConstant(0, DL, ByteVecVT);
32471 MVT SadVecVT = MVT::getVectorVT(MVT::i64, VecSize / 64);
32472 Low = DAG.getNode(X86ISD::PSADBW, DL, SadVecVT,
32473 DAG.getBitcast(ByteVecVT, Low), Zeros);
32474 High = DAG.getNode(X86ISD::PSADBW, DL, SadVecVT,
32475 DAG.getBitcast(ByteVecVT, High), Zeros);
32476
32477 // Merge them together.
32478 MVT ShortVecVT = MVT::getVectorVT(MVT::i16, VecSize / 16);
32479 V = DAG.getNode(X86ISD::PACKUS, DL, ByteVecVT,
32480 DAG.getBitcast(ShortVecVT, Low),
32481 DAG.getBitcast(ShortVecVT, High));
32482
32483 return DAG.getBitcast(VT, V);
32484 }
32485
32486 // The only element type left is i16.
32487 assert(EltVT == MVT::i16 && "Unknown how to handle type");
32488
32489 // To obtain pop count for each i16 element starting from the pop count for
32490 // i8 elements, shift the i16s left by 8, sum as i8s, and then shift as i16s
32491 // right by 8. It is important to shift as i16s as i8 vector shift isn't
32492 // directly supported.
32493 SDValue ShifterV = DAG.getConstant(8, DL, VT);
32494 SDValue Shl = DAG.getNode(ISD::SHL, DL, VT, DAG.getBitcast(VT, V), ShifterV);
32495 V = DAG.getNode(ISD::ADD, DL, ByteVecVT, DAG.getBitcast(ByteVecVT, Shl),
32496 DAG.getBitcast(ByteVecVT, V));
32497 return DAG.getNode(ISD::SRL, DL, VT, DAG.getBitcast(VT, V), ShifterV);
32498}
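// Example (illustrative): with VT == v2i64, a single PSADBW of V against zero
// already yields the two 8-byte sums; the i32 and i16 paths above rebuild the
// narrower sums from the same PSADBW/shift primitives.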
32499
32500static SDValue LowerVectorCTPOPInRegLUT(SDValue Op, const SDLoc &DL,
32501 const X86Subtarget &Subtarget,
32502 SelectionDAG &DAG) {
32503 MVT VT = Op.getSimpleValueType();
32504 MVT EltVT = VT.getVectorElementType();
32505 int NumElts = VT.getVectorNumElements();
32506 (void)EltVT;
32507 assert(EltVT == MVT::i8 && "Only vXi8 vector CTPOP lowering supported.");
32508
32509 // Implement a lookup table in register by using an algorithm based on:
32510 // http://wm.ite.pl/articles/sse-popcount.html
32511 //
32512 // The general idea is that every lower byte nibble in the input vector is an
32513 // index into an in-register pre-computed pop count table. We then split up the
32514 // input vector in two new ones: (1) a vector with only the shifted-right
32515 // higher nibbles for each byte and (2) a vector with the lower nibbles (and
32516 // masked out higher ones) for each byte. PSHUFB is used separately with both
32517 // to index the in-register table. Next, both are added and the result is an
32518 // i8 vector where each element contains the pop count for the input byte.
32519 const int LUT[16] = {/* 0 */ 0, /* 1 */ 1, /* 2 */ 1, /* 3 */ 2,
32520 /* 4 */ 1, /* 5 */ 2, /* 6 */ 2, /* 7 */ 3,
32521 /* 8 */ 1, /* 9 */ 2, /* a */ 2, /* b */ 3,
32522 /* c */ 2, /* d */ 3, /* e */ 3, /* f */ 4};
32523
32524 SmallVector<SDValue, 64> LUTVec;
32525 for (int i = 0; i < NumElts; ++i)
32526 LUTVec.push_back(DAG.getConstant(LUT[i % 16], DL, MVT::i8));
32527 SDValue InRegLUT = DAG.getBuildVector(VT, DL, LUTVec);
32528 SDValue M0F = DAG.getConstant(0x0F, DL, VT);
32529
32530 // High nibbles
32531 SDValue FourV = DAG.getConstant(4, DL, VT);
32532 SDValue HiNibbles = DAG.getNode(ISD::SRL, DL, VT, Op, FourV);
32533
32534 // Low nibbles
32535 SDValue LoNibbles = DAG.getNode(ISD::AND, DL, VT, Op, M0F);
32536
32537 // The input vector is used as the shuffle mask that indexes elements into the
32538 // LUT. After counting low and high nibbles, add the vector to obtain the
32539 // final pop count per i8 element.
32540 SDValue HiPopCnt = DAG.getNode(X86ISD::PSHUFB, DL, VT, InRegLUT, HiNibbles);
32541 SDValue LoPopCnt = DAG.getNode(X86ISD::PSHUFB, DL, VT, InRegLUT, LoNibbles);
32542 return DAG.getNode(ISD::ADD, DL, VT, HiPopCnt, LoPopCnt);
32543}
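// Worked example for one byte (illustrative): input 0xB3 has low nibble 0x3
// and high nibble 0xB, which index LUT entries 2 and 3, so the two PSHUFB
// results add to popcount(0xB3) == 5.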
32544
32545// Please ensure that any codegen change from LowerVectorCTPOP is reflected in
32546// updated cost models in X86TTIImpl::getIntrinsicInstrCost.
32547static SDValue LowerVectorCTPOP(SDValue Op, const SDLoc &DL,
32548 const X86Subtarget &Subtarget,
32549 SelectionDAG &DAG) {
32550 MVT VT = Op.getSimpleValueType();
32551 assert((VT.is512BitVector() || VT.is256BitVector() || VT.is128BitVector()) &&
32552 "Unknown CTPOP type to handle");
32553 SDValue Op0 = Op.getOperand(0);
32554
32555 // TRUNC(CTPOP(ZEXT(X))) to make use of vXi32/vXi64 VPOPCNT instructions.
32556 if (Subtarget.hasVPOPCNTDQ()) {
32557 unsigned NumElems = VT.getVectorNumElements();
32558 assert((VT.getVectorElementType() == MVT::i8 ||
32559 VT.getVectorElementType() == MVT::i16) && "Unexpected type");
32560 if (NumElems < 16 || (NumElems == 16 && Subtarget.canExtendTo512DQ())) {
32561 MVT NewVT = MVT::getVectorVT(MVT::i32, NumElems);
32562 Op = DAG.getNode(ISD::ZERO_EXTEND, DL, NewVT, Op0);
32563 Op = DAG.getNode(ISD::CTPOP, DL, NewVT, Op);
32564 return DAG.getNode(ISD::TRUNCATE, DL, VT, Op);
32565 }
32566 }
32567
32568 // Decompose 256-bit ops into smaller 128-bit ops.
32569 if (VT.is256BitVector() && !Subtarget.hasInt256())
32570 return splitVectorIntUnary(Op, DAG, DL);
32571
32572 // Decompose 512-bit ops into smaller 256-bit ops.
32573 if (VT.is512BitVector() && !Subtarget.hasBWI())
32574 return splitVectorIntUnary(Op, DAG, DL);
32575
32576 // For element types greater than i8, do vXi8 pop counts and a bytesum.
32577 if (VT.getScalarType() != MVT::i8) {
32578 MVT ByteVT = MVT::getVectorVT(MVT::i8, VT.getSizeInBits() / 8);
32579 SDValue ByteOp = DAG.getBitcast(ByteVT, Op0);
32580 SDValue PopCnt8 = DAG.getNode(ISD::CTPOP, DL, ByteVT, ByteOp);
32581 return LowerHorizontalByteSum(PopCnt8, VT, Subtarget, DAG);
32582 }
32583
32584 // We can't use the fast LUT approach, so fall back on LegalizeDAG.
32585 if (!Subtarget.hasSSSE3())
32586 return SDValue();
32587
32588 return LowerVectorCTPOPInRegLUT(Op0, DL, Subtarget, DAG);
32589}
32590
32591static SDValue LowerCTPOP(SDValue N, const X86Subtarget &Subtarget,
32592 SelectionDAG &DAG) {
32593 MVT VT = N.getSimpleValueType();
32594 SDValue Op = N.getOperand(0);
32595 SDLoc DL(N);
32596
32597 if (VT.isScalarInteger()) {
32598 // Compute the lower/upper bounds of the active bits of the value,
32599 // allowing us to shift the active bits down if necessary to fit into the
32600 // special cases below.
32601 KnownBits Known = DAG.computeKnownBits(Op);
32602 if (Known.isConstant())
32603 return DAG.getConstant(Known.getConstant().popcount(), DL, VT);
32604 unsigned LZ = Known.countMinLeadingZeros();
32605 unsigned TZ = Known.countMinTrailingZeros();
32606 assert((LZ + TZ) < Known.getBitWidth() && "Illegal shifted mask");
32607 unsigned ActiveBits = Known.getBitWidth() - LZ;
32608 unsigned ShiftedActiveBits = Known.getBitWidth() - (LZ + TZ);
32609
32610 // i2 CTPOP - "ctpop(x) --> sub(x, (x >> 1))".
32611 if (ShiftedActiveBits <= 2) {
32612 if (ActiveBits > 2)
32613 Op = DAG.getNode(ISD::SRL, DL, VT, Op,
32614 DAG.getShiftAmountConstant(TZ, VT, DL));
32615 Op = DAG.getZExtOrTrunc(Op, DL, MVT::i32);
32616 Op = DAG.getNode(ISD::SUB, DL, MVT::i32, Op,
32617 DAG.getNode(ISD::SRL, DL, MVT::i32, Op,
32618 DAG.getShiftAmountConstant(1, VT, DL)));
32619 return DAG.getZExtOrTrunc(Op, DL, VT);
32620 }
32621
32622 // i3 CTPOP - perform LUT into i32 integer.
32623 if (ShiftedActiveBits <= 3) {
32624 if (ActiveBits > 3)
32625 Op = DAG.getNode(ISD::SRL, DL, VT, Op,
32626 DAG.getShiftAmountConstant(TZ, VT, DL));
32627 Op = DAG.getZExtOrTrunc(Op, DL, MVT::i32);
32628 Op = DAG.getNode(ISD::SHL, DL, MVT::i32, Op,
32629 DAG.getShiftAmountConstant(1, VT, DL));
32630 Op = DAG.getNode(ISD::SRL, DL, MVT::i32,
32631 DAG.getConstant(0b1110100110010100U, DL, MVT::i32), Op);
32632 Op = DAG.getNode(ISD::AND, DL, MVT::i32, Op,
32633 DAG.getConstant(0x3, DL, MVT::i32));
32634 return DAG.getZExtOrTrunc(Op, DL, VT);
32635 }
32636
32637 // i4 CTPOP - perform LUT into i64 integer.
32638 if (ShiftedActiveBits <= 4 &&
32639 DAG.getTargetLoweringInfo().isTypeLegal(MVT::i64)) {
32640 SDValue LUT = DAG.getConstant(0x4332322132212110ULL, DL, MVT::i64);
32641 if (ActiveBits > 4)
32642 Op = DAG.getNode(ISD::SRL, DL, VT, Op,
32643 DAG.getShiftAmountConstant(TZ, VT, DL));
32644 Op = DAG.getZExtOrTrunc(Op, DL, MVT::i32);
32645 Op = DAG.getNode(ISD::MUL, DL, MVT::i32, Op,
32646 DAG.getConstant(4, DL, MVT::i32));
32647 Op = DAG.getNode(ISD::SRL, DL, MVT::i64, LUT,
32648 DAG.getShiftAmountOperand(MVT::i64, Op));
32649 Op = DAG.getNode(ISD::AND, DL, MVT::i64, Op,
32650 DAG.getConstant(0x7, DL, MVT::i64));
32651 return DAG.getZExtOrTrunc(Op, DL, VT);
32652 }
32653
32654 // i8 CTPOP - with efficient i32 MUL, then attempt multiply-mask-multiply.
32655 if (ShiftedActiveBits <= 8) {
32656 SDValue Mask11 = DAG.getConstant(0x11111111U, DL, MVT::i32);
32657 if (ActiveBits > 8)
32658 Op = DAG.getNode(ISD::SRL, DL, VT, Op,
32659 DAG.getShiftAmountConstant(TZ, VT, DL));
32660 Op = DAG.getZExtOrTrunc(Op, DL, MVT::i32);
32661 Op = DAG.getNode(ISD::MUL, DL, MVT::i32, Op,
32662 DAG.getConstant(0x08040201U, DL, MVT::i32));
32663 Op = DAG.getNode(ISD::SRL, DL, MVT::i32, Op,
32664 DAG.getShiftAmountConstant(3, MVT::i32, DL));
32665 Op = DAG.getNode(ISD::AND, DL, MVT::i32, Op, Mask11);
32666 Op = DAG.getNode(ISD::MUL, DL, MVT::i32, Op, Mask11);
32667 Op = DAG.getNode(ISD::SRL, DL, MVT::i32, Op,
32668 DAG.getShiftAmountConstant(28, MVT::i32, DL));
32669 return DAG.getZExtOrTrunc(Op, DL, VT);
32670 }
32671
32672 return SDValue(); // fallback to generic expansion.
32673 }
32674
32675 assert(VT.isVector() &&
32676 "We only do custom lowering for vector population count.");
32677 return LowerVectorCTPOP(N, DL, Subtarget, DAG);
32678}
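// Worked example for the i3 LUT above (illustrative): the constant
// 0b1110100110010100 stores popcount(x) for x = 0..7 in 2-bit fields, so
// shifting it right by 2*x and masking with 0x3 yields e.g. 2 for x == 5.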
32679
32680static SDValue LowerBITREVERSE_XOP(SDValue Op, SelectionDAG &DAG) {
32681 MVT VT = Op.getSimpleValueType();
32682 SDValue In = Op.getOperand(0);
32683 SDLoc DL(Op);
32684
32685 // For scalars, it's still beneficial to transfer to/from the SIMD unit to
32686 // perform the BITREVERSE.
32687 if (!VT.isVector()) {
32688 MVT VecVT = MVT::getVectorVT(VT, 128 / VT.getSizeInBits());
32689 SDValue Res = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecVT, In);
32690 Res = DAG.getNode(ISD::BITREVERSE, DL, VecVT, Res);
32691 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, Res,
32692 DAG.getVectorIdxConstant(0, DL));
32693 }
32694
32695 int NumElts = VT.getVectorNumElements();
32696 int ScalarSizeInBytes = VT.getScalarSizeInBits() / 8;
32697
32698 // Decompose 256-bit ops into smaller 128-bit ops.
32699 if (VT.is256BitVector())
32700 return splitVectorIntUnary(Op, DAG, DL);
32701
32702 assert(VT.is128BitVector() &&
32703 "Only 128-bit vector bitreverse lowering supported.");
32704
32705 // VPPERM reverses the bits of a byte with the permute Op (2 << 5), and we
32706 // perform the BSWAP in the shuffle.
32707 // It's best to shuffle using the second operand as this will implicitly allow
32708 // memory folding for multiple vectors.
32709 SmallVector<SDValue, 16> MaskElts;
32710 for (int i = 0; i != NumElts; ++i) {
32711 for (int j = ScalarSizeInBytes - 1; j >= 0; --j) {
32712 int SourceByte = 16 + (i * ScalarSizeInBytes) + j;
32713 int PermuteByte = SourceByte | (2 << 5);
32714 MaskElts.push_back(DAG.getConstant(PermuteByte, DL, MVT::i8));
32715 }
32716 }
32717
32718 SDValue Mask = DAG.getBuildVector(MVT::v16i8, DL, MaskElts);
32719 SDValue Res = DAG.getBitcast(MVT::v16i8, In);
32720 Res = DAG.getNode(X86ISD::VPPERM, DL, MVT::v16i8, DAG.getUNDEF(MVT::v16i8),
32721 Res, Mask);
32722 return DAG.getBitcast(VT, Res);
32723}
32724
32725static SDValue LowerBITREVERSE(SDValue Op, const X86Subtarget &Subtarget,
32726 SelectionDAG &DAG) {
32727 MVT VT = Op.getSimpleValueType();
32728
32729 if (Subtarget.hasXOP() && !VT.is512BitVector())
32730 return LowerBITREVERSE_XOP(Op, DAG);
32731
32732 assert((Subtarget.hasSSSE3() || Subtarget.hasGFNI()) &&
32733 "SSSE3 or GFNI required for BITREVERSE");
32734
32735 SDValue In = Op.getOperand(0);
32736 SDLoc DL(Op);
32737
32738 // Split 512-bit ops without BWI so that we can still use the PSHUFB lowering.
32739 if (VT.is512BitVector() && !Subtarget.hasBWI())
32740 return splitVectorIntUnary(Op, DAG, DL);
32741
32742 // Decompose 256-bit ops into smaller 128-bit ops on pre-AVX2.
32743 if (VT.is256BitVector() && !Subtarget.hasInt256())
32744 return splitVectorIntUnary(Op, DAG, DL);
32745
32746 // Lower i8/i16/i32/i64 as vXi8 BITREVERSE + BSWAP
32747 if (!VT.isVector()) {
32748 assert(
32749 (VT == MVT::i32 || VT == MVT::i64 || VT == MVT::i16 || VT == MVT::i8) &&
32750 "Only tested for i8/i16/i32/i64");
32751 MVT VecVT = MVT::getVectorVT(VT, 128 / VT.getSizeInBits());
32752 SDValue Res = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecVT, In);
32753 Res = DAG.getNode(ISD::BITREVERSE, DL, MVT::v16i8,
32754 DAG.getBitcast(MVT::v16i8, Res));
32755 Res =
32756 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, DAG.getBitcast(VecVT, Res),
32757 DAG.getVectorIdxConstant(0, DL));
32758 return (VT == MVT::i8) ? Res : DAG.getNode(ISD::BSWAP, DL, VT, Res);
32759 }
32760
32761 assert(VT.isVector() && VT.getSizeInBits() >= 128);
32762
32763 // Lower vXi16/vXi32/vXi64 as BSWAP + vXi8 BITREVERSE.
32764 if (VT.getScalarType() != MVT::i8) {
32765 MVT ByteVT = MVT::getVectorVT(MVT::i8, VT.getSizeInBits() / 8);
32766 SDValue Res = DAG.getNode(ISD::BSWAP, DL, VT, In);
32767 Res = DAG.getBitcast(ByteVT, Res);
32768 Res = DAG.getNode(ISD::BITREVERSE, DL, ByteVT, Res);
32769 return DAG.getBitcast(VT, Res);
32770 }
32771 assert(VT.isVector() && VT.getScalarType() == MVT::i8 &&
32772 "Only byte vector BITREVERSE supported");
32773
32774 unsigned NumElts = VT.getVectorNumElements();
32775
32776 // If we have GFNI, we can use GF2P8AFFINEQB to reverse the bits.
32777 if (Subtarget.hasGFNI()) {
32778 SDValue Matrix = getGFNICtrlMask(ISD::BITREVERSE, DAG, DL, VT);
32779 return DAG.getNode(X86ISD::GF2P8AFFINEQB, DL, VT, In, Matrix,
32780 DAG.getTargetConstant(0, DL, MVT::i8));
32781 }
32782
32783 // Perform BITREVERSE using PSHUFB lookups. Each byte is split into
32784 // two nibbles and a PSHUFB lookup to find the bitreverse of each
32785 // 0-15 value (moved to the other nibble).
32786 SDValue NibbleMask = DAG.getConstant(0xF, DL, VT);
32787 SDValue Lo = DAG.getNode(ISD::AND, DL, VT, In, NibbleMask);
32788 SDValue Hi = DAG.getNode(ISD::SRL, DL, VT, In, DAG.getConstant(4, DL, VT));
32789
32790 const int LoLUT[16] = {
32791 /* 0 */ 0x00, /* 1 */ 0x80, /* 2 */ 0x40, /* 3 */ 0xC0,
32792 /* 4 */ 0x20, /* 5 */ 0xA0, /* 6 */ 0x60, /* 7 */ 0xE0,
32793 /* 8 */ 0x10, /* 9 */ 0x90, /* a */ 0x50, /* b */ 0xD0,
32794 /* c */ 0x30, /* d */ 0xB0, /* e */ 0x70, /* f */ 0xF0};
32795 const int HiLUT[16] = {
32796 /* 0 */ 0x00, /* 1 */ 0x08, /* 2 */ 0x04, /* 3 */ 0x0C,
32797 /* 4 */ 0x02, /* 5 */ 0x0A, /* 6 */ 0x06, /* 7 */ 0x0E,
32798 /* 8 */ 0x01, /* 9 */ 0x09, /* a */ 0x05, /* b */ 0x0D,
32799 /* c */ 0x03, /* d */ 0x0B, /* e */ 0x07, /* f */ 0x0F};
32800
32801 SmallVector<SDValue, 16> LoMaskElts, HiMaskElts;
32802 for (unsigned i = 0; i < NumElts; ++i) {
32803 LoMaskElts.push_back(DAG.getConstant(LoLUT[i % 16], DL, MVT::i8));
32804 HiMaskElts.push_back(DAG.getConstant(HiLUT[i % 16], DL, MVT::i8));
32805 }
32806
32807 SDValue LoMask = DAG.getBuildVector(VT, DL, LoMaskElts);
32808 SDValue HiMask = DAG.getBuildVector(VT, DL, HiMaskElts);
32809 Lo = DAG.getNode(X86ISD::PSHUFB, DL, VT, LoMask, Lo);
32810 Hi = DAG.getNode(X86ISD::PSHUFB, DL, VT, HiMask, Hi);
32811 return DAG.getNode(ISD::OR, DL, VT, Lo, Hi);
32812}
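// Worked example for the nibble LUTs above (illustrative): byte 0x1E has low
// nibble 0xE and high nibble 0x1, so LoLUT[0xE] | HiLUT[0x1] == 0x70 | 0x08
// == 0x78, which is bitreverse(0x1E).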
32813
32814static SDValue LowerPARITY(SDValue Op, const X86Subtarget &Subtarget,
32815 SelectionDAG &DAG) {
32816 SDLoc DL(Op);
32817 SDValue X = Op.getOperand(0);
32818 MVT VT = Op.getSimpleValueType();
32819
32820 // Special case. If the input fits in 8-bits we can use a single 8-bit TEST.
32821 if (VT == MVT::i8 ||
32822 DAG.MaskedValueIsZero(X, APInt::getBitsSetFrom(VT.getSizeInBits(), 8))) {
32823 X = DAG.getNode(ISD::TRUNCATE, DL, MVT::i8, X);
32824 SDValue Flags = DAG.getNode(X86ISD::CMP, DL, MVT::i32, X,
32825 DAG.getConstant(0, DL, MVT::i8));
32826 // Copy the inverse of the parity flag into a register with setcc.
32827 SDValue Setnp = getSETCC(X86::COND_NP, Flags, DL, DAG);
32828 // Extend to the original type.
32829 return DAG.getNode(ISD::ZERO_EXTEND, DL, VT, Setnp);
32830 }
32831
32832 // If we have POPCNT, use the default expansion.
32833 if (Subtarget.hasPOPCNT())
32834 return SDValue();
32835
32836 if (VT == MVT::i64) {
32837 // Xor the high and low 16-bits together using a 32-bit operation.
32838 SDValue Hi = DAG.getNode(ISD::TRUNCATE, DL, MVT::i32,
32839 DAG.getNode(ISD::SRL, DL, MVT::i64, X,
32840 DAG.getConstant(32, DL, MVT::i8)));
32841 SDValue Lo = DAG.getNode(ISD::TRUNCATE, DL, MVT::i32, X);
32842 X = DAG.getNode(ISD::XOR, DL, MVT::i32, Lo, Hi);
32843 }
32844
32845 if (VT != MVT::i16) {
32846 // Xor the high and low 16-bits together using a 32-bit operation.
32847 SDValue Hi16 = DAG.getNode(ISD::SRL, DL, MVT::i32, X,
32848 DAG.getConstant(16, DL, MVT::i8));
32849 X = DAG.getNode(ISD::XOR, DL, MVT::i32, X, Hi16);
32850 } else {
32851 // If the input is 16-bits, we need to extend to use an i32 shift below.
32852 X = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, X);
32853 }
32854
32855 // Finally xor the low 2 bytes together and use an 8-bit flag-setting xor.
32856 // This should allow an h-reg to be used to save a shift.
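// At this point the low 16 bits of X hold the XOR of all 16-bit chunks of the
// original value, so XORing its two low bytes yields a byte whose
// population-count parity equals the parity of the original input. PF is set
// when that byte has an even number of set bits, so SETNP produces 1 exactly
// for odd parity.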
32857 SDValue Hi = DAG.getNode(
32858 ISD::TRUNCATE, DL, MVT::i8,
32859 DAG.getNode(ISD::SRL, DL, MVT::i32, X, DAG.getConstant(8, DL, MVT::i8)));
32860 SDValue Lo = DAG.getNode(ISD::TRUNCATE, DL, MVT::i8, X);
32861 SDVTList VTs = DAG.getVTList(MVT::i8, MVT::i32);
32862 SDValue Flags = DAG.getNode(X86ISD::XOR, DL, VTs, Lo, Hi).getValue(1);
32863
32864 // Copy the inverse of the parity flag into a register with setcc.
32865 SDValue Setnp = getSETCC(X86::COND_NP, Flags, DL, DAG);
32866 // Extend to the original type.
32867 return DAG.getNode(ISD::ZERO_EXTEND, DL, VT, Setnp);
32868}
32869
32870 static SDValue lowerAtomicArithWithLOCK(SDValue N, SelectionDAG &DAG,
32871 const X86Subtarget &Subtarget) {
32872 unsigned NewOpc = 0;
32873 switch (N->getOpcode()) {
32874 case ISD::ATOMIC_LOAD_ADD:
32875 NewOpc = X86ISD::LADD;
32876 break;
32877 case ISD::ATOMIC_LOAD_SUB:
32878 NewOpc = X86ISD::LSUB;
32879 break;
32880 case ISD::ATOMIC_LOAD_OR:
32881 NewOpc = X86ISD::LOR;
32882 break;
32883 case ISD::ATOMIC_LOAD_XOR:
32884 NewOpc = X86ISD::LXOR;
32885 break;
32886 case ISD::ATOMIC_LOAD_AND:
32887 NewOpc = X86ISD::LAND;
32888 break;
32889 default:
32890 llvm_unreachable("Unknown ATOMIC_LOAD_ opcode");
32891 }
32892
32893 MachineMemOperand *MMO = cast<MemSDNode>(N)->getMemOperand();
32894
32895 return DAG.getMemIntrinsicNode(
32896 NewOpc, SDLoc(N), DAG.getVTList(MVT::i32, MVT::Other),
32897 {N->getOperand(0), N->getOperand(1), N->getOperand(2)},
32898 /*MemVT=*/N->getSimpleValueType(0), MMO);
32899}
32900
32901/// Lower atomic_load_ops into LOCK-prefixed operations.
32902 static SDValue lowerAtomicArith(SDValue N, SelectionDAG &DAG,
32903 const X86Subtarget &Subtarget) {
32904 AtomicSDNode *AN = cast<AtomicSDNode>(N.getNode());
32905 SDValue Chain = N->getOperand(0);
32906 SDValue LHS = N->getOperand(1);
32907 SDValue RHS = N->getOperand(2);
32908 unsigned Opc = N->getOpcode();
32909 MVT VT = N->getSimpleValueType(0);
32910 SDLoc DL(N);
32911
32912 // We can lower atomic_load_add into LXADD. However, any other atomicrmw op
32913 // can only be lowered when the result is unused. They should have already
32914 // been transformed into a cmpxchg loop in AtomicExpand.
32915 if (N->hasAnyUseOfValue(0)) {
32916 // Handle (atomic_load_sub p, v) as (atomic_load_add p, -v), to be able to
32917 // select LXADD if LOCK_SUB can't be selected.
32918 // Handle (atomic_load_xor p, SignBit) as (atomic_load_add p, SignBit) so we
32919 // can use LXADD as opposed to cmpxchg.
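// For example, a used (atomic_load_sub p, 5) becomes (atomic_load_add p, -5),
// which can then be selected as LOCK XADD.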
32920 if (Opc == ISD::ATOMIC_LOAD_SUB ||
32921 (Opc == ISD::ATOMIC_LOAD_XOR && isMinSignedConstant(RHS)))
32922 return DAG.getAtomic(ISD::ATOMIC_LOAD_ADD, DL, VT, Chain, LHS,
32923 DAG.getNegative(RHS, DL, VT), AN->getMemOperand());
32924
32925 assert(Opc == ISD::ATOMIC_LOAD_ADD &&
32926 "Used AtomicRMW ops other than Add should have been expanded!");
32927 return N;
32928 }
32929
32930 // Specialized lowering for the canonical form of an idempotent atomicrmw.
32931 // The core idea here is that since the memory location isn't actually
32932 // changing, all we need is a lowering for the *ordering* impacts of the
32933 // atomicrmw. As such, we can choose a different operation and memory
32934 // location to minimize impact on other code.
32935 // The above holds unless the node is marked volatile in which
32936 // case it needs to be preserved according to the langref.
32937 if (Opc == ISD::ATOMIC_LOAD_OR && isNullConstant(RHS) && !AN->isVolatile()) {
32938 // On X86, the only ordering which actually requires an instruction is
32939 // seq_cst that isn't SingleThread; everything else just needs to be
32940 // preserved during codegen and then dropped. Note that we expect (but
32941 // don't assume) that orderings other than seq_cst and acq_rel have been
32942 // canonicalized to a store or load.
32943 if (AN->getSuccessOrdering() == AtomicOrdering::SequentiallyConsistent &&
32944 AN->getSyncScopeID() == SyncScope::System) {
32945 // Prefer a locked operation against a stack location to minimize cache
32946 // traffic. This assumes that stack locations are very likely to be
32947 // accessed only by the owning thread.
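// The emitted sequence is a LOCK-prefixed RMW that leaves memory unchanged,
// e.g. something like "lock or dword ptr [rsp], 0", which provides the
// required seq_cst fence semantics.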
32948 SDValue NewChain = emitLockedStackOp(DAG, Subtarget, Chain, DL);
32949 assert(!N->hasAnyUseOfValue(0));
32950 // NOTE: The getUNDEF is needed to give something for the unused result 0.
32951 return DAG.getNode(ISD::MERGE_VALUES, DL, N->getVTList(),
32952 DAG.getUNDEF(VT), NewChain);
32953 }
32954 // MEMBARRIER is a compiler barrier; it codegens to a no-op.
32955 SDValue NewChain = DAG.getNode(ISD::MEMBARRIER, DL, MVT::Other, Chain);
32956 assert(!N->hasAnyUseOfValue(0));
32957 // NOTE: The getUNDEF is needed to give something for the unused result 0.
32958 return DAG.getNode(ISD::MERGE_VALUES, DL, N->getVTList(),
32959 DAG.getUNDEF(VT), NewChain);
32960 }
32961
32962 SDValue LockOp = lowerAtomicArithWithLOCK(N, DAG, Subtarget);
32963 // RAUW the chain, but don't worry about the result, as it's unused.
32964 assert(!N->hasAnyUseOfValue(0));
32965 // NOTE: The getUNDEF is needed to give something for the unused result 0.
32966 return DAG.getNode(ISD::MERGE_VALUES, DL, N->getVTList(),
32967 DAG.getUNDEF(VT), LockOp.getValue(1));
32968}
32969
32970 static SDValue LowerATOMIC_STORE(SDValue Op, SelectionDAG &DAG,
32971 const X86Subtarget &Subtarget) {
32972 auto *Node = cast<AtomicSDNode>(Op.getNode());
32973 SDLoc dl(Node);
32974 EVT VT = Node->getMemoryVT();
32975
32976 bool IsSeqCst =
32977 Node->getSuccessOrdering() == AtomicOrdering::SequentiallyConsistent;
32978 bool IsTypeLegal = DAG.getTargetLoweringInfo().isTypeLegal(VT);
32979
32980 // If this store is not sequentially consistent and the type is legal
32981 // we can just keep it.
32982 if (!IsSeqCst && IsTypeLegal)
32983 return Op;
32984
32985 if (!IsTypeLegal && !Subtarget.useSoftFloat() &&
32986 !DAG.getMachineFunction().getFunction().hasFnAttribute(
32987 Attribute::NoImplicitFloat)) {
32988 SDValue Chain;
32989 // For illegal i128 atomic_store, when AVX is enabled, we can simply emit a
32990 // vector store.
32991 if (VT == MVT::i128 && Subtarget.is64Bit() && Subtarget.hasAVX()) {
32992 SDValue VecVal = DAG.getBitcast(MVT::v2i64, Node->getVal());
32993 Chain = DAG.getStore(Node->getChain(), dl, VecVal, Node->getBasePtr(),
32994 Node->getMemOperand());
32995 }
32996
32997 // For illegal i64 atomic_stores, we can try to use MOVQ or MOVLPS if SSE
32998 // is enabled.
32999 if (VT == MVT::i64) {
33000 if (Subtarget.hasSSE1()) {
33001 SDValue SclToVec =
33002 DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64, Node->getVal());
33003 MVT StVT = Subtarget.hasSSE2() ? MVT::v2i64 : MVT::v4f32;
33004 SclToVec = DAG.getBitcast(StVT, SclToVec);
33005 SDVTList Tys = DAG.getVTList(MVT::Other);
33006 SDValue Ops[] = {Node->getChain(), SclToVec, Node->getBasePtr()};
33007 Chain = DAG.getMemIntrinsicNode(X86ISD::VEXTRACT_STORE, dl, Tys, Ops,
33008 MVT::i64, Node->getMemOperand());
33009 } else if (Subtarget.hasX87()) {
33010 // First load this into an 80-bit X87 register using a stack temporary.
33011 // This will put the whole integer into the significand.
33012 SDValue StackPtr = DAG.CreateStackTemporary(MVT::i64);
33013 int SPFI = cast<FrameIndexSDNode>(StackPtr.getNode())->getIndex();
33014 MachinePointerInfo MPI =
33015 MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SPFI);
33016 Chain = DAG.getStore(Node->getChain(), dl, Node->getVal(), StackPtr,
33017 MPI, MaybeAlign(), MachineMemOperand::MOStore);
33018 SDVTList Tys = DAG.getVTList(MVT::f80, MVT::Other);
33019 SDValue LdOps[] = {Chain, StackPtr};
33020 SDValue Value = DAG.getMemIntrinsicNode(
33021 X86ISD::FILD, dl, Tys, LdOps, MVT::i64, MPI,
33022 /*Align*/ std::nullopt, MachineMemOperand::MOLoad);
33023 Chain = Value.getValue(1);
33024
33025 // Now use an FIST to do the atomic store.
33026 SDValue StoreOps[] = {Chain, Value, Node->getBasePtr()};
33027 Chain =
33028 DAG.getMemIntrinsicNode(X86ISD::FIST, dl, DAG.getVTList(MVT::Other),
33029 StoreOps, MVT::i64, Node->getMemOperand());
33030 }
33031 }
33032
33033 if (Chain) {
33034 // If this is a sequentially consistent store, also emit an appropriate
33035 // barrier.
33036 if (IsSeqCst)
33037 Chain = emitLockedStackOp(DAG, Subtarget, Chain, dl);
33038
33039 return Chain;
33040 }
33041 }
33042
33043 // Convert seq_cst store -> xchg
33044 // Convert wide store -> swap (-> cmpxchg8b/cmpxchg16b)
33045 // FIXME: 16-byte ATOMIC_SWAP isn't actually hooked up at the moment.
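// For example, a seq_cst "store atomic i32 %v, ptr %p" becomes an XCHG, whose
// implicit LOCK semantics already give the required fence.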
33046 SDValue Swap = DAG.getAtomic(ISD::ATOMIC_SWAP, dl, Node->getMemoryVT(),
33047 Node->getOperand(0), Node->getOperand(2),
33048 Node->getOperand(1), Node->getMemOperand());
33049 return Swap.getValue(1);
33050}
33051
33052 static SDValue LowerADDSUBO_CARRY(SDValue Op, SelectionDAG &DAG) {
33053 SDNode *N = Op.getNode();
33054 MVT VT = N->getSimpleValueType(0);
33055 unsigned Opc = Op.getOpcode();
33056
33057 // Let legalize expand this if it isn't a legal type yet.
33058 if (!DAG.getTargetLoweringInfo().isTypeLegal(VT))
33059 return SDValue();
33060
33061 SDVTList VTs = DAG.getVTList(VT, MVT::i32);
33062 SDLoc DL(N);
33063
33064 // Set the carry flag.
33065 SDValue Carry = Op.getOperand(2);
33066 EVT CarryVT = Carry.getValueType();
33067 Carry = DAG.getNode(X86ISD::ADD, DL, DAG.getVTList(CarryVT, MVT::i32),
33068 Carry, DAG.getAllOnesConstant(DL, CarryVT));
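// Adding all-ones (-1) to the carry operand sets EFLAGS.CF exactly when the
// operand is nonzero, re-materializing the carry for the ADC/SBB below.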
33069
33070 bool IsAdd = Opc == ISD::UADDO_CARRY || Opc == ISD::SADDO_CARRY;
33071 SDValue Sum = DAG.getNode(IsAdd ? X86ISD::ADC : X86ISD::SBB, DL, VTs,
33072 Op.getOperand(0), Op.getOperand(1),
33073 Carry.getValue(1));
33074
33075 bool IsSigned = Opc == ISD::SADDO_CARRY || Opc == ISD::SSUBO_CARRY;
33076 SDValue SetCC = getSETCC(IsSigned ? X86::COND_O : X86::COND_B,
33077 Sum.getValue(1), DL, DAG);
33078 if (N->getValueType(1) == MVT::i1)
33079 SetCC = DAG.getNode(ISD::TRUNCATE, DL, MVT::i1, SetCC);
33080
33081 return DAG.getNode(ISD::MERGE_VALUES, DL, N->getVTList(), Sum, SetCC);
33082}
33083
33084static SDValue LowerFSINCOS(SDValue Op, const X86Subtarget &Subtarget,
33085 SelectionDAG &DAG) {
33086 assert(Subtarget.isTargetDarwin() && Subtarget.is64Bit());
33087
33088 // For MacOSX, we want to call an alternative entry point: __sincos_stret,
33089 // which returns the values as { float, float } (in XMM0) or
33090 // { double, double } (which is returned in XMM0, XMM1).
33091 SDLoc dl(Op);
33092 SDValue Arg = Op.getOperand(0);
33093 EVT ArgVT = Arg.getValueType();
33094 Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext());
33095
33096 TargetLowering::ArgListTy Args;
33097 Args.emplace_back(Arg, ArgTy);
33098
33099 bool isF64 = ArgVT == MVT::f64;
33100 // Only optimize x86_64 for now. i386 is a bit messy. For f32,
33101 // the small struct {f32, f32} is returned in (eax, edx). For f64,
33102 // the results are returned via SRet in memory.
33103 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
33104 RTLIB::Libcall LC = isF64 ? RTLIB::SINCOS_STRET_F64 : RTLIB::SINCOS_STRET_F32;
33105 const char *LibcallName = TLI.getLibcallName(LC);
33106 SDValue Callee =
33107 DAG.getExternalSymbol(LibcallName, TLI.getPointerTy(DAG.getDataLayout()));
33108
33109 Type *RetTy = isF64 ? (Type *)StructType::get(ArgTy, ArgTy)
33110 : (Type *)FixedVectorType::get(ArgTy, 4);
33111
33112 TargetLowering::CallLoweringInfo CLI(DAG);
33113 CLI.setDebugLoc(dl)
33114 .setChain(DAG.getEntryNode())
33115 .setLibCallee(CallingConv::C, RetTy, Callee, std::move(Args));
33116
33117 std::pair<SDValue, SDValue> CallResult = TLI.LowerCallTo(CLI);
33118
33119 if (isF64)
33120 // Returned in xmm0 and xmm1.
33121 return CallResult.first;
33122
33123 // Returned in bits 0:31 and 32:63 of xmm0.
33124 SDValue SinVal =
33125 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, ArgVT, CallResult.first,
33126 DAG.getVectorIdxConstant(0, dl));
33127 SDValue CosVal =
33128 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, ArgVT, CallResult.first,
33129 DAG.getVectorIdxConstant(1, dl));
33130 SDVTList Tys = DAG.getVTList(ArgVT, ArgVT);
33131 return DAG.getNode(ISD::MERGE_VALUES, dl, Tys, SinVal, CosVal);
33132}
33133
33134/// Widen a vector input to a vector of NVT. The
33135/// input vector must have the same element type as NVT.
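/// For example, widening a v2i32 value to v8i32 appends six undef (or zero)
/// elements, either via a wider BUILD_VECTOR for constant inputs or via
/// INSERT_SUBVECTOR otherwise.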
33136 static SDValue ExtendToType(SDValue InOp, MVT NVT, SelectionDAG &DAG,
33137 bool FillWithZeroes = false) {
33138 // Check if InOp already has the right width.
33139 MVT InVT = InOp.getSimpleValueType();
33140 if (InVT == NVT)
33141 return InOp;
33142
33143 if (InOp.isUndef())
33144 return DAG.getUNDEF(NVT);
33145
33146 assert(InVT.getVectorElementType() == NVT.getVectorElementType() &&
33147 "input and widen element type must match");
33148
33149 unsigned InNumElts = InVT.getVectorNumElements();
33150 unsigned WidenNumElts = NVT.getVectorNumElements();
33151 assert(WidenNumElts > InNumElts && WidenNumElts % InNumElts == 0 &&
33152 "Unexpected request for vector widening");
33153
33154 SDLoc dl(InOp);
33155 if (InOp.getOpcode() == ISD::CONCAT_VECTORS && InOp.getNumOperands() == 2) {
33156 SDValue N1 = InOp.getOperand(1);
33157 if ((ISD::isBuildVectorAllZeros(N1.getNode()) && FillWithZeroes) ||
33158 N1.isUndef()) {
33159 InOp = InOp.getOperand(0);
33160 InVT = InOp.getSimpleValueType();
33161 InNumElts = InVT.getVectorNumElements();
33162 }
33163 }
33164 if (ISD::isBuildVectorOfConstantSDNodes(InOp.getNode()) ||
33165 ISD::isBuildVectorOfConstantFPSDNodes(InOp.getNode())) {
33166 EVT EltVT = InOp.getOperand(0).getValueType();
33167 SDValue FillVal =
33168 FillWithZeroes ? DAG.getConstant(0, dl, EltVT) : DAG.getUNDEF(EltVT);
33169 SmallVector<SDValue, 16> Ops(InOp->ops());
33170 Ops.append(WidenNumElts - InNumElts, FillVal);
33171 return DAG.getBuildVector(NVT, dl, Ops);
33172 }
33173 SDValue FillVal =
33174 FillWithZeroes ? DAG.getConstant(0, dl, NVT) : DAG.getUNDEF(NVT);
33175 return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, NVT, FillVal, InOp,
33176 DAG.getVectorIdxConstant(0, dl));
33177}
33178
33179 static SDValue LowerMSCATTER(SDValue Op, const X86Subtarget &Subtarget,
33180 SelectionDAG &DAG) {
33181 assert(Subtarget.hasAVX512() &&
33182 "MGATHER/MSCATTER are supported on AVX-512 arch only");
33183
33184 MaskedScatterSDNode *N = cast<MaskedScatterSDNode>(Op.getNode());
33185 SDValue Src = N->getValue();
33186 MVT VT = Src.getSimpleValueType();
33187 assert(VT.getScalarSizeInBits() >= 32 && "Unsupported scatter op");
33188 SDLoc dl(Op);
33189
33190 SDValue Scale = N->getScale();
33191 SDValue Index = N->getIndex();
33192 SDValue Mask = N->getMask();
33193 SDValue Chain = N->getChain();
33194 SDValue BasePtr = N->getBasePtr();
33195
33196 if (VT == MVT::v2f32 || VT == MVT::v2i32) {
33197 assert(Mask.getValueType() == MVT::v2i1 && "Unexpected mask type");
33198 // If the index is v2i64 and we have VLX we can use xmm for data and index.
33199 if (Index.getValueType() == MVT::v2i64 && Subtarget.hasVLX()) {
33200 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
33201 EVT WideVT = TLI.getTypeToTransformTo(*DAG.getContext(), VT);
33202 Src = DAG.getNode(ISD::CONCAT_VECTORS, dl, WideVT, Src, DAG.getUNDEF(VT));
33203 SDVTList VTs = DAG.getVTList(MVT::Other);
33204 SDValue Ops[] = {Chain, Src, Mask, BasePtr, Index, Scale};
33205 return DAG.getMemIntrinsicNode(X86ISD::MSCATTER, dl, VTs, Ops,
33206 N->getMemoryVT(), N->getMemOperand());
33207 }
33208 return SDValue();
33209 }
33210
33211 MVT IndexVT = Index.getSimpleValueType();
33212
33213 // If the index is v2i32, we're being called by type legalization and we
33214 // should just let the default handling take care of it.
33215 if (IndexVT == MVT::v2i32)
33216 return SDValue();
33217
33218 // If we don't have VLX and neither the passthru nor the index is 512 bits,
33219 // we need to widen until one is.
33220 if (!Subtarget.hasVLX() && !VT.is512BitVector() &&
33221 !Index.getSimpleValueType().is512BitVector()) {
33222 // Determine how much we need to widen by to get a 512-bit type.
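// e.g. for v8f32 data with a v8i32 index: Factor = min(512/256, 512/256) = 2,
// so both are widened to 16 elements (v16f32 / v16i32, with a v16i1 mask).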
33223 unsigned Factor = std::min(512/VT.getSizeInBits(),
33224 512/IndexVT.getSizeInBits());
33225 unsigned NumElts = VT.getVectorNumElements() * Factor;
33226
33227 VT = MVT::getVectorVT(VT.getVectorElementType(), NumElts);
33228 IndexVT = MVT::getVectorVT(IndexVT.getVectorElementType(), NumElts);
33229 MVT MaskVT = MVT::getVectorVT(MVT::i1, NumElts);
33230
33231 Src = ExtendToType(Src, VT, DAG);
33232 Index = ExtendToType(Index, IndexVT, DAG);
33233 Mask = ExtendToType(Mask, MaskVT, DAG, true);
33234 }
33235
33236 SDVTList VTs = DAG.getVTList(MVT::Other);
33237 SDValue Ops[] = {Chain, Src, Mask, BasePtr, Index, Scale};
33238 return DAG.getMemIntrinsicNode(X86ISD::MSCATTER, dl, VTs, Ops,
33239 N->getMemoryVT(), N->getMemOperand());
33240}
33241
33242static SDValue LowerMLOAD(SDValue Op, const X86Subtarget &Subtarget,
33243 SelectionDAG &DAG) {
33244
33245 MaskedLoadSDNode *N = cast<MaskedLoadSDNode>(Op.getNode());
33246 MVT VT = Op.getSimpleValueType();
33247 MVT ScalarVT = VT.getScalarType();
33248 SDValue Mask = N->getMask();
33249 MVT MaskVT = Mask.getSimpleValueType();
33250 SDValue PassThru = N->getPassThru();
33251 SDLoc dl(Op);
33252
33253 // Handle AVX masked loads which don't support passthru other than 0.
33254 if (MaskVT.getVectorElementType() != MVT::i1) {
33255 // We also allow undef in the isel pattern.
33256 if (PassThru.isUndef() || ISD::isBuildVectorAllZeros(PassThru.getNode()))
33257 return Op;
33258
33259 SDValue NewLoad = DAG.getMaskedLoad(
33260 VT, dl, N->getChain(), N->getBasePtr(), N->getOffset(), Mask,
33261 getZeroVector(VT, Subtarget, DAG, dl), N->getMemoryVT(),
33262 N->getMemOperand(), N->getAddressingMode(), N->getExtensionType(),
33263 N->isExpandingLoad());
33264 // Emit a blend.
33265 SDValue Select = DAG.getNode(ISD::VSELECT, dl, VT, Mask, NewLoad, PassThru);
33266 return DAG.getMergeValues({ Select, NewLoad.getValue(1) }, dl);
33267 }
33268
33269 assert((!N->isExpandingLoad() || Subtarget.hasAVX512()) &&
33270 "Expanding masked load is supported on AVX-512 target only!");
33271
33272 assert((!N->isExpandingLoad() || ScalarVT.getSizeInBits() >= 32) &&
33273 "Expanding masked load is supported for 32 and 64-bit types only!");
33274
33275 assert(Subtarget.hasAVX512() && !Subtarget.hasVLX() && !VT.is512BitVector() &&
33276 "Cannot lower masked load op.");
33277
33278 assert((ScalarVT.getSizeInBits() >= 32 ||
33279 (Subtarget.hasBWI() && (ScalarVT == MVT::i8 || ScalarVT == MVT::i16 ||
33280 ScalarVT == MVT::f16))) &&
33281 "Unsupported masked load op.");
33282
33283 // This operation is legal for targets with VLX, but without
33284 // VLX the vector should be widened to 512 bits.
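// e.g. a masked load of v8f32 becomes a masked load of v16f32 with a v16i1
// mask, and the original v8f32 result is extracted from the low half below.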
33285 unsigned NumEltsInWideVec = 512 / VT.getScalarSizeInBits();
33286 MVT WideDataVT = MVT::getVectorVT(ScalarVT, NumEltsInWideVec);
33287 PassThru = ExtendToType(PassThru, WideDataVT, DAG);
33288
33289 // Mask element has to be i1.
33290 assert(Mask.getSimpleValueType().getScalarType() == MVT::i1 &&
33291 "Unexpected mask type");
33292
33293 MVT WideMaskVT = MVT::getVectorVT(MVT::i1, NumEltsInWideVec);
33294
33295 Mask = ExtendToType(Mask, WideMaskVT, DAG, true);
33296 SDValue NewLoad = DAG.getMaskedLoad(
33297 WideDataVT, dl, N->getChain(), N->getBasePtr(), N->getOffset(), Mask,
33298 PassThru, N->getMemoryVT(), N->getMemOperand(), N->getAddressingMode(),
33299 N->getExtensionType(), N->isExpandingLoad());
33300
33301 SDValue Extract =
33302 DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, NewLoad.getValue(0),
33303 DAG.getVectorIdxConstant(0, dl));
33304 SDValue RetOps[] = {Extract, NewLoad.getValue(1)};
33305 return DAG.getMergeValues(RetOps, dl);
33306}
33307
33308static SDValue LowerMSTORE(SDValue Op, const X86Subtarget &Subtarget,
33309 SelectionDAG &DAG) {
33310 MaskedStoreSDNode *N = cast<MaskedStoreSDNode>(Op.getNode());
33311 SDValue DataToStore = N->getValue();
33312 MVT VT = DataToStore.getSimpleValueType();
33313 MVT ScalarVT = VT.getScalarType();
33314 SDValue Mask = N->getMask();
33315 SDLoc dl(Op);
33316
33317 assert((!N->isCompressingStore() || Subtarget.hasAVX512()) &&
33318 "Compressing masked store is supported on AVX-512 target only!");
33319
33320 assert((!N->isCompressingStore() || ScalarVT.getSizeInBits() >= 32) &&
33321 "Compressing masked store is supported for 32 and 64-bit types only!");
33322
33323 assert(Subtarget.hasAVX512() && !Subtarget.hasVLX() && !VT.is512BitVector() &&
33324 "Cannot lower masked store op.");
33325
33326 assert((ScalarVT.getSizeInBits() >= 32 ||
33327 (Subtarget.hasBWI() && (ScalarVT == MVT::i8 || ScalarVT == MVT::i16 ||
33328 ScalarVT == MVT::f16))) &&
33329 "Unsupported masked store op.");
33330
33331 // This operation is legal for targets with VLX, but without
33332 // VLX the vector should be widened to 512 bits.
33333 unsigned NumEltsInWideVec = 512/VT.getScalarSizeInBits();
33334 MVT WideDataVT = MVT::getVectorVT(ScalarVT, NumEltsInWideVec);
33335
33336 // Mask element has to be i1.
33337 assert(Mask.getSimpleValueType().getScalarType() == MVT::i1 &&
33338 "Unexpected mask type");
33339
33340 MVT WideMaskVT = MVT::getVectorVT(MVT::i1, NumEltsInWideVec);
33341
33342 DataToStore = ExtendToType(DataToStore, WideDataVT, DAG);
33343 Mask = ExtendToType(Mask, WideMaskVT, DAG, true);
33344 return DAG.getMaskedStore(N->getChain(), dl, DataToStore, N->getBasePtr(),
33345 N->getOffset(), Mask, N->getMemoryVT(),
33346 N->getMemOperand(), N->getAddressingMode(),
33347 N->isTruncatingStore(), N->isCompressingStore());
33348}
33349
33350static SDValue LowerMGATHER(SDValue Op, const X86Subtarget &Subtarget,
33351 SelectionDAG &DAG) {
33352 assert(Subtarget.hasAVX2() &&
33353 "MGATHER/MSCATTER are supported on AVX-512/AVX-2 arch only");
33354
33355 MaskedGatherSDNode *N = cast<MaskedGatherSDNode>(Op.getNode());
33356 SDLoc dl(Op);
33357 MVT VT = Op.getSimpleValueType();
33358 SDValue Index = N->getIndex();
33359 SDValue Mask = N->getMask();
33360 SDValue PassThru = N->getPassThru();
33361 MVT IndexVT = Index.getSimpleValueType();
33362
33363 assert(VT.getScalarSizeInBits() >= 32 && "Unsupported gather op");
33364
33365 // If the index is v2i32, we're being called by type legalization.
33366 if (IndexVT == MVT::v2i32)
33367 return SDValue();
33368
33369 // If we don't have VLX and neither the passthru nor the index is 512 bits,
33370 // we need to widen until one is.
33371 MVT OrigVT = VT;
33372 if (Subtarget.hasAVX512() && !Subtarget.hasVLX() && !VT.is512BitVector() &&
33373 !IndexVT.is512BitVector()) {
33374 // Determine how much we need to widen by to get a 512-bit type.
33375 unsigned Factor = std::min(512/VT.getSizeInBits(),
33376 512/IndexVT.getSizeInBits());
33377
33378 unsigned NumElts = VT.getVectorNumElements() * Factor;
33379
33380 VT = MVT::getVectorVT(VT.getVectorElementType(), NumElts);
33381 IndexVT = MVT::getVectorVT(IndexVT.getVectorElementType(), NumElts);
33382 MVT MaskVT = MVT::getVectorVT(MVT::i1, NumElts);
33383
33384 PassThru = ExtendToType(PassThru, VT, DAG);
33385 Index = ExtendToType(Index, IndexVT, DAG);
33386 Mask = ExtendToType(Mask, MaskVT, DAG, true);
33387 }
33388
33389 // Break dependency on the data register.
33390 if (PassThru.isUndef())
33391 PassThru = getZeroVector(VT, Subtarget, DAG, dl);
33392
33393 SDValue Ops[] = { N->getChain(), PassThru, Mask, N->getBasePtr(), Index,
33394 N->getScale() };
33395 SDValue NewGather = DAG.getMemIntrinsicNode(
33396 X86ISD::MGATHER, dl, DAG.getVTList(VT, MVT::Other), Ops, N->getMemoryVT(),
33397 N->getMemOperand());
33398 SDValue Extract = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, OrigVT, NewGather,
33399 DAG.getVectorIdxConstant(0, dl));
33400 return DAG.getMergeValues({Extract, NewGather.getValue(1)}, dl);
33401}
33402
33403 static SDValue LowerADDRSPACECAST(SDValue Op, SelectionDAG &DAG) {
33404 SDLoc dl(Op);
33405 SDValue Src = Op.getOperand(0);
33406 MVT DstVT = Op.getSimpleValueType();
33407
33408 AddrSpaceCastSDNode *N = cast<AddrSpaceCastSDNode>(Op.getNode());
33409 unsigned SrcAS = N->getSrcAddressSpace();
33410
33411 assert(SrcAS != N->getDestAddressSpace() &&
33412 "addrspacecast must be between different address spaces");
33413
33414 if (SrcAS == X86AS::PTR32_UPTR && DstVT == MVT::i64) {
33415 Op = DAG.getNode(ISD::ZERO_EXTEND, dl, DstVT, Src);
33416 } else if (DstVT == MVT::i64) {
33417 Op = DAG.getNode(ISD::SIGN_EXTEND, dl, DstVT, Src);
33418 } else if (DstVT == MVT::i32) {
33419 Op = DAG.getNode(ISD::TRUNCATE, dl, DstVT, Src);
33420 } else {
33421 report_fatal_error("Bad address space in addrspacecast");
33422 }
33423 return Op;
33424}
33425
33426SDValue X86TargetLowering::LowerGC_TRANSITION(SDValue Op,
33427 SelectionDAG &DAG) const {
33428 // TODO: Eventually, the lowering of these nodes should be informed by or
33429 // deferred to the GC strategy for the function in which they appear. For
33430 // now, however, they must be lowered to something. Since they are logically
33431 // no-ops in the case of a null GC strategy (or a GC strategy which does not
33432 // require special handling for these nodes), lower them as literal NOOPs for
33433 // the time being.
33434 SmallVector<SDValue, 2> Ops;
33435 Ops.push_back(Op.getOperand(0));
33436 if (Op->getGluedNode())
33437 Ops.push_back(Op->getOperand(Op->getNumOperands() - 1));
33438
33439 SDVTList VTs = DAG.getVTList(MVT::Other, MVT::Glue);
33440 return SDValue(DAG.getMachineNode(X86::NOOP, SDLoc(Op), VTs, Ops), 0);
33441}
33442
33443// Custom split CVTPS2PH with wide types.
33444 static SDValue LowerCVTPS2PH(SDValue Op, SelectionDAG &DAG) {
33445 SDLoc dl(Op);
33446 EVT VT = Op.getValueType();
33447 SDValue Lo, Hi;
33448 std::tie(Lo, Hi) = DAG.SplitVectorOperand(Op.getNode(), 0);
33449 EVT LoVT, HiVT;
33450 std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(VT);
33451 SDValue RC = Op.getOperand(1);
33452 Lo = DAG.getNode(X86ISD::CVTPS2PH, dl, LoVT, Lo, RC);
33453 Hi = DAG.getNode(X86ISD::CVTPS2PH, dl, HiVT, Hi, RC);
33454 return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, Lo, Hi);
33455}
33456
33457 static SDValue LowerPREFETCH(SDValue Op, const X86Subtarget &Subtarget,
33458 SelectionDAG &DAG) {
33459 unsigned IsData = Op.getConstantOperandVal(4);
33460
33461 // We don't support non-data prefetch without PREFETCHI.
33462 // Just preserve the chain.
33463 if (!IsData && !Subtarget.hasPREFETCHI())
33464 return Op.getOperand(0);
33465
33466 return Op;
33467}
33468
33469 static SDValue LowerFCanonicalize(SDValue Op, SelectionDAG &DAG) {
33470 SDNode *N = Op.getNode();
33471 SDValue Operand = N->getOperand(0);
33472 EVT VT = Operand.getValueType();
33473 SDLoc dl(N);
33474
33475 SDValue One = DAG.getConstantFP(1.0, dl, VT);
33476
33477 // TODO: Fix Crash for bf16 when generating strict_fmul as it
33478 // leads to a error : SoftPromoteHalfResult #0: t11: bf16,ch = strict_fmul t0,
33479 // ConstantFP:bf16<APFloat(16256)>, t5 LLVM ERROR: Do not know how to soft
33480 // promote this operator's result!
33481 SDValue Chain = DAG.getEntryNode();
33482 SDValue StrictFmul = DAG.getNode(ISD::STRICT_FMUL, dl, {VT, MVT::Other},
33483 {Chain, Operand, One});
33484 return StrictFmul;
33485}
33486
33487 static StringRef getInstrStrFromOpNo(const SmallVectorImpl<StringRef> &AsmStrs,
33488 unsigned OpNo) {
33489 const APInt Operand(32, OpNo);
33490 std::string OpNoStr = llvm::toString(Operand, 10, false);
33491 std::string Str(" $");
33492
33493 std::string OpNoStr1(Str + OpNoStr); // e.g. " $1" (OpNo=1)
33494 std::string OpNoStr2(Str + "{" + OpNoStr + ":"); // With modifier, e.g. ${1:P}
33495
33496 auto I = StringRef::npos;
33497 for (auto &AsmStr : AsmStrs) {
33498 // Match the OpNo string. We should match exactly to exclude matching a
33499 // sub-string, e.g. "$12" contains "$1".
33500 if (AsmStr.ends_with(OpNoStr1))
33501 I = AsmStr.size() - OpNoStr1.size();
33502
33503 // Get the index of operand in AsmStr.
33504 if (I == StringRef::npos)
33505 I = AsmStr.find(OpNoStr1 + ",");
33506 if (I == StringRef::npos)
33507 I = AsmStr.find(OpNoStr2);
33508
33509 if (I == StringRef::npos)
33510 continue;
33511
33512 assert(I > 0 && "Unexpected inline asm string!");
33513 // Remove the operand string and label (if it exists).
33514 // For example:
33515 // ".L__MSASMLABEL_.${:uid}__l:call dword ptr ${0:P}"
33516 // ==>
33517 // ".L__MSASMLABEL_.${:uid}__l:call dword ptr "
33518 // ==>
33519 // "call dword ptr "
33520 auto TmpStr = AsmStr.substr(0, I);
33521 I = TmpStr.rfind(':');
33522 if (I != StringRef::npos)
33523 TmpStr = TmpStr.substr(I + 1);
33524 return TmpStr.take_while(llvm::isAlpha);
33525 }
33526
33527 return StringRef();
33528}
33529
33530 bool X86TargetLowering::isInlineAsmTargetBranch(
33531 const SmallVectorImpl<StringRef> &AsmStrs, unsigned OpNo) const {
33532 // In a __asm block, __asm inst foo where inst is CALL or JMP should be
33533 // changed from indirect TargetLowering::C_Memory to direct
33534 // TargetLowering::C_Address.
33535 // We don't need to special case LOOP* and Jcc, which cannot target a memory
33536 // location.
33537 StringRef Inst = getInstrStrFromOpNo(AsmStrs, OpNo);
33538 return Inst.equals_insensitive("call") || Inst.equals_insensitive("jmp");
33539}
33540
33541 static SDValue getFlagsOfCmpZeroFori1(SelectionDAG &DAG, const SDLoc &DL,
33542 SDValue Mask) {
33543 EVT Ty = MVT::i8;
33544 auto V = DAG.getBitcast(MVT::i1, Mask);
33545 auto VE = DAG.getZExtOrTrunc(V, DL, Ty);
33546 auto Zero = DAG.getConstant(0, DL, Ty);
33547 SDVTList X86SubVTs = DAG.getVTList(Ty, MVT::i32);
33548 auto CmpZero = DAG.getNode(X86ISD::SUB, DL, X86SubVTs, Zero, VE);
33549 return SDValue(CmpZero.getNode(), 1);
33550}
33551
33552 SDValue X86TargetLowering::visitMaskedLoad(
33553 SelectionDAG &DAG, const SDLoc &DL, SDValue Chain, MachineMemOperand *MMO,
33554 SDValue &NewLoad, SDValue Ptr, SDValue PassThru, SDValue Mask) const {
33555 // @llvm.masked.load.v1*(ptr, alignment, mask, passthru)
33556 // ->
33557 // _, flags = SUB 0, mask
33558 // res, chain = CLOAD inchain, ptr, (bit_cast_to_scalar passthru), cond, flags
33559 // bit_cast_to_vector<res>
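// The SUB 0, mask below sets ZF exactly when the single-element mask is 0, so
// the conditional load (CLOAD) with COND_NE fires only when the mask bit is
// set, leaving the scalar passthru value otherwise.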
33560 EVT VTy = PassThru.getValueType();
33561 EVT Ty = VTy.getVectorElementType();
33562 SDVTList Tys = DAG.getVTList(Ty, MVT::Other);
33563 auto ScalarPassThru = PassThru.isUndef() ? DAG.getConstant(0, DL, Ty)
33564 : DAG.getBitcast(Ty, PassThru);
33565 auto Flags = getFlagsOfCmpZeroFori1(DAG, DL, Mask);
33566 auto COND_NE = DAG.getTargetConstant(X86::COND_NE, DL, MVT::i8);
33567 SDValue Ops[] = {Chain, Ptr, ScalarPassThru, COND_NE, Flags};
33568 NewLoad = DAG.getMemIntrinsicNode(X86ISD::CLOAD, DL, Tys, Ops, Ty, MMO);
33569 return DAG.getBitcast(VTy, NewLoad);
33570}
33571
33572 SDValue X86TargetLowering::visitMaskedStore(SelectionDAG &DAG, const SDLoc &DL,
33573 SDValue Chain,
33574 MachineMemOperand *MMO, SDValue Ptr,
33575 SDValue Val, SDValue Mask) const {
33576 // llvm.masked.store.v1*(Src0, Ptr, alignment, Mask)
33577 // ->
33578 // _, flags = SUB 0, mask
33579 // chain = CSTORE inchain, (bit_cast_to_scalar val), ptr, cond, flags
33580 EVT Ty = Val.getValueType().getVectorElementType();
33581 SDVTList Tys = DAG.getVTList(MVT::Other);
33582 auto ScalarVal = DAG.getBitcast(Ty, Val);
33583 auto Flags = getFlagsOfCmpZeroFori1(DAG, DL, Mask);
33584 auto COND_NE = DAG.getTargetConstant(X86::COND_NE, DL, MVT::i8);
33585 SDValue Ops[] = {Chain, ScalarVal, Ptr, COND_NE, Flags};
33586 return DAG.getMemIntrinsicNode(X86ISD::CSTORE, DL, Tys, Ops, Ty, MMO);
33587}
33588
33589/// Provide custom lowering hooks for some operations.
33590 SDValue X86TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
33591 switch (Op.getOpcode()) {
33592 // clang-format off
33593 default: llvm_unreachable("Should not custom lower this!");
33594 case ISD::ATOMIC_FENCE: return LowerATOMIC_FENCE(Op, Subtarget, DAG);
33595 case ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS:
33596 return LowerCMP_SWAP(Op, Subtarget, DAG);
33597 case ISD::CTPOP: return LowerCTPOP(Op, Subtarget, DAG);
33598 case ISD::ATOMIC_LOAD_ADD:
33599 case ISD::ATOMIC_LOAD_SUB:
33600 case ISD::ATOMIC_LOAD_OR:
33601 case ISD::ATOMIC_LOAD_XOR:
33602 case ISD::ATOMIC_LOAD_AND: return lowerAtomicArith(Op, DAG, Subtarget);
33603 case ISD::ATOMIC_STORE: return LowerATOMIC_STORE(Op, DAG, Subtarget);
33604 case ISD::BITREVERSE: return LowerBITREVERSE(Op, Subtarget, DAG);
33605 case ISD::PARITY: return LowerPARITY(Op, Subtarget, DAG);
33606 case ISD::BUILD_VECTOR: return LowerBUILD_VECTOR(Op, DAG);
33607 case ISD::CONCAT_VECTORS: return LowerCONCAT_VECTORS(Op, Subtarget, DAG);
33608 case ISD::VECTOR_SHUFFLE: return lowerVECTOR_SHUFFLE(Op, Subtarget, DAG);
33609 case ISD::VECTOR_COMPRESS: return lowerVECTOR_COMPRESS(Op, Subtarget, DAG);
33610 case ISD::VSELECT: return LowerVSELECT(Op, DAG);
33611 case ISD::EXTRACT_VECTOR_ELT: return LowerEXTRACT_VECTOR_ELT(Op, DAG);
33612 case ISD::INSERT_VECTOR_ELT: return LowerINSERT_VECTOR_ELT(Op, DAG);
33613 case ISD::INSERT_SUBVECTOR: return LowerINSERT_SUBVECTOR(Op, Subtarget,DAG);
33614 case ISD::EXTRACT_SUBVECTOR: return LowerEXTRACT_SUBVECTOR(Op,Subtarget,DAG);
33615 case ISD::SCALAR_TO_VECTOR: return LowerSCALAR_TO_VECTOR(Op, Subtarget,DAG);
33616 case ISD::ConstantPool: return LowerConstantPool(Op, DAG);
33617 case ISD::GlobalAddress: return LowerGlobalAddress(Op, DAG);
33618 case ISD::GlobalTLSAddress: return LowerGlobalTLSAddress(Op, DAG);
33619 case ISD::ExternalSymbol: return LowerExternalSymbol(Op, DAG);
33620 case ISD::BlockAddress: return LowerBlockAddress(Op, DAG);
33621 case ISD::SHL_PARTS:
33622 case ISD::SRA_PARTS:
33623 case ISD::SRL_PARTS: return LowerShiftParts(Op, DAG);
33624 case ISD::FSHL:
33625 case ISD::FSHR: return LowerFunnelShift(Op, Subtarget, DAG);
33626 case ISD::FCANONICALIZE: return LowerFCanonicalize(Op, DAG);
33627 case ISD::STRICT_SINT_TO_FP:
33628 case ISD::SINT_TO_FP: return LowerSINT_TO_FP(Op, DAG);
33629 case ISD::STRICT_UINT_TO_FP:
33630 case ISD::UINT_TO_FP: return LowerUINT_TO_FP(Op, DAG);
33631 case ISD::TRUNCATE: return LowerTRUNCATE(Op, DAG);
33632 case ISD::ZERO_EXTEND: return LowerZERO_EXTEND(Op, Subtarget, DAG);
33633 case ISD::SIGN_EXTEND: return LowerSIGN_EXTEND(Op, Subtarget, DAG);
33634 case ISD::ANY_EXTEND: return LowerANY_EXTEND(Op, Subtarget, DAG);
33635 case ISD::ZERO_EXTEND_VECTOR_INREG:
33636 case ISD::SIGN_EXTEND_VECTOR_INREG:
33637 return LowerEXTEND_VECTOR_INREG(Op, Subtarget, DAG);
33638 case ISD::FP_TO_SINT:
33639 case ISD::STRICT_FP_TO_SINT:
33640 case ISD::FP_TO_UINT:
33641 case ISD::STRICT_FP_TO_UINT: return LowerFP_TO_INT(Op, DAG);
33642 case ISD::FP_TO_SINT_SAT:
33643 case ISD::FP_TO_UINT_SAT: return LowerFP_TO_INT_SAT(Op, DAG);
33644 case ISD::FP_EXTEND:
33645 case ISD::STRICT_FP_EXTEND: return LowerFP_EXTEND(Op, DAG);
33646 case ISD::FP_ROUND:
33647 case ISD::STRICT_FP_ROUND: return LowerFP_ROUND(Op, DAG);
33648 case ISD::FP16_TO_FP:
33649 case ISD::STRICT_FP16_TO_FP: return LowerFP16_TO_FP(Op, DAG);
33650 case ISD::FP_TO_FP16:
33651 case ISD::STRICT_FP_TO_FP16: return LowerFP_TO_FP16(Op, DAG);
33652 case ISD::FP_TO_BF16: return LowerFP_TO_BF16(Op, DAG);
33653 case ISD::LOAD: return LowerLoad(Op, Subtarget, DAG);
33654 case ISD::STORE: return LowerStore(Op, Subtarget, DAG);
33655 case ISD::FADD:
33656 case ISD::FSUB: return lowerFaddFsub(Op, DAG);
33657 case ISD::FROUND: return LowerFROUND(Op, DAG);
33658 case ISD::FABS:
33659 case ISD::FNEG: return LowerFABSorFNEG(Op, DAG);
33660 case ISD::FCOPYSIGN: return LowerFCOPYSIGN(Op, DAG);
33661 case ISD::FGETSIGN: return LowerFGETSIGN(Op, DAG);
33662 case ISD::LRINT:
33663 case ISD::LLRINT: return LowerLRINT_LLRINT(Op, DAG);
33664 case ISD::SETCC:
33665 case ISD::STRICT_FSETCC:
33666 case ISD::STRICT_FSETCCS: return LowerSETCC(Op, DAG);
33667 case ISD::SETCCCARRY: return LowerSETCCCARRY(Op, DAG);
33668 case ISD::SELECT: return LowerSELECT(Op, DAG);
33669 case ISD::BRCOND: return LowerBRCOND(Op, DAG);
33670 case ISD::JumpTable: return LowerJumpTable(Op, DAG);
33671 case ISD::VASTART: return LowerVASTART(Op, DAG);
33672 case ISD::VAARG: return LowerVAARG(Op, DAG);
33673 case ISD::VACOPY: return LowerVACOPY(Op, Subtarget, DAG);
33674 case ISD::INTRINSIC_WO_CHAIN: return LowerINTRINSIC_WO_CHAIN(Op, DAG);
33675 case ISD::INTRINSIC_VOID:
33676 case ISD::INTRINSIC_W_CHAIN: return LowerINTRINSIC_W_CHAIN(Op, Subtarget, DAG);
33677 case ISD::RETURNADDR: return LowerRETURNADDR(Op, DAG);
33678 case ISD::ADDROFRETURNADDR: return LowerADDROFRETURNADDR(Op, DAG);
33679 case ISD::FRAMEADDR: return LowerFRAMEADDR(Op, DAG);
33680 case ISD::FRAME_TO_ARGS_OFFSET:
33681 return LowerFRAME_TO_ARGS_OFFSET(Op, DAG);
33682 case ISD::DYNAMIC_STACKALLOC: return LowerDYNAMIC_STACKALLOC(Op, DAG);
33683 case ISD::EH_RETURN: return LowerEH_RETURN(Op, DAG);
33684 case ISD::EH_SJLJ_SETJMP: return lowerEH_SJLJ_SETJMP(Op, DAG);
33685 case ISD::EH_SJLJ_LONGJMP: return lowerEH_SJLJ_LONGJMP(Op, DAG);
33686 case ISD::EH_SJLJ_SETUP_DISPATCH:
33687 return lowerEH_SJLJ_SETUP_DISPATCH(Op, DAG);
33688 case ISD::INIT_TRAMPOLINE: return LowerINIT_TRAMPOLINE(Op, DAG);
33689 case ISD::ADJUST_TRAMPOLINE: return LowerADJUST_TRAMPOLINE(Op, DAG);
33690 case ISD::GET_ROUNDING: return LowerGET_ROUNDING(Op, DAG);
33691 case ISD::SET_ROUNDING: return LowerSET_ROUNDING(Op, DAG);
33692 case ISD::GET_FPENV_MEM: return LowerGET_FPENV_MEM(Op, DAG);
33693 case ISD::SET_FPENV_MEM: return LowerSET_FPENV_MEM(Op, DAG);
33694 case ISD::RESET_FPENV: return LowerRESET_FPENV(Op, DAG);
33695 case ISD::CTLZ:
33696 case ISD::CTLZ_ZERO_UNDEF: return LowerCTLZ(Op, Subtarget, DAG);
33697 case ISD::CTTZ:
33698 case ISD::CTTZ_ZERO_UNDEF: return LowerCTTZ(Op, Subtarget, DAG);
33699 case ISD::MUL: return LowerMUL(Op, Subtarget, DAG);
33700 case ISD::MULHS:
33701 case ISD::MULHU: return LowerMULH(Op, Subtarget, DAG);
33702 case ISD::ROTL:
33703 case ISD::ROTR: return LowerRotate(Op, Subtarget, DAG);
33704 case ISD::SRA:
33705 case ISD::SRL:
33706 case ISD::SHL: return LowerShift(Op, Subtarget, DAG);
33707 case ISD::SADDO:
33708 case ISD::UADDO:
33709 case ISD::SSUBO:
33710 case ISD::USUBO: return LowerXALUO(Op, DAG);
33711 case ISD::SMULO:
33712 case ISD::UMULO: return LowerMULO(Op, Subtarget, DAG);
33713 case ISD::READCYCLECOUNTER: return LowerREADCYCLECOUNTER(Op, Subtarget,DAG);
33714 case ISD::BITCAST: return LowerBITCAST(Op, Subtarget, DAG);
33715 case ISD::SADDO_CARRY:
33716 case ISD::SSUBO_CARRY:
33717 case ISD::UADDO_CARRY:
33718 case ISD::USUBO_CARRY: return LowerADDSUBO_CARRY(Op, DAG);
33719 case ISD::ADD:
33720 case ISD::SUB: return lowerAddSub(Op, DAG, Subtarget);
33721 case ISD::UADDSAT:
33722 case ISD::SADDSAT:
33723 case ISD::USUBSAT:
33724 case ISD::SSUBSAT: return LowerADDSAT_SUBSAT(Op, DAG, Subtarget);
33725 case ISD::SMAX:
33726 case ISD::SMIN:
33727 case ISD::UMAX:
33728 case ISD::UMIN: return LowerMINMAX(Op, Subtarget, DAG);
33729 case ISD::FMINIMUM:
33730 case ISD::FMAXIMUM:
33731 case ISD::FMINIMUMNUM:
33732 case ISD::FMAXIMUMNUM:
33733 return LowerFMINIMUM_FMAXIMUM(Op, Subtarget, DAG);
33734 case ISD::ABS: return LowerABS(Op, Subtarget, DAG);
33735 case ISD::ABDS:
33736 case ISD::ABDU: return LowerABD(Op, Subtarget, DAG);
33737 case ISD::AVGCEILU: return LowerAVG(Op, Subtarget, DAG);
33738 case ISD::FSINCOS: return LowerFSINCOS(Op, Subtarget, DAG);
33739 case ISD::MLOAD: return LowerMLOAD(Op, Subtarget, DAG);
33740 case ISD::MSTORE: return LowerMSTORE(Op, Subtarget, DAG);
33741 case ISD::MGATHER: return LowerMGATHER(Op, Subtarget, DAG);
33742 case ISD::MSCATTER: return LowerMSCATTER(Op, Subtarget, DAG);
33743 case ISD::GC_TRANSITION_START:
33744 case ISD::GC_TRANSITION_END: return LowerGC_TRANSITION(Op, DAG);
33745 case ISD::ADDRSPACECAST: return LowerADDRSPACECAST(Op, DAG);
33746 case X86ISD::CVTPS2PH: return LowerCVTPS2PH(Op, DAG);
33747 case ISD::PREFETCH: return LowerPREFETCH(Op, Subtarget, DAG);
33748 // clang-format on
33749 }
33750}
33751
33752/// Replace a node with an illegal result type with a new node built out of
33753/// custom code.
33754 void X86TargetLowering::ReplaceNodeResults(SDNode *N,
33755 SmallVectorImpl<SDValue> &Results,
33756 SelectionDAG &DAG) const {
33757 SDLoc dl(N);
33758 unsigned Opc = N->getOpcode();
33759 switch (Opc) {
33760 default:
33761#ifndef NDEBUG
33762 dbgs() << "ReplaceNodeResults: ";
33763 N->dump(&DAG);
33764#endif
33765 llvm_unreachable("Do not know how to custom type legalize this operation!");
33766 case X86ISD::CVTPH2PS: {
33767 EVT VT = N->getValueType(0);
33768 SDValue Lo, Hi;
33769 std::tie(Lo, Hi) = DAG.SplitVectorOperand(N, 0);
33770 EVT LoVT, HiVT;
33771 std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(VT);
33772 Lo = DAG.getNode(X86ISD::CVTPH2PS, dl, LoVT, Lo);
33773 Hi = DAG.getNode(X86ISD::CVTPH2PS, dl, HiVT, Hi);
33774 SDValue Res = DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, Lo, Hi);
33775 Results.push_back(Res);
33776 return;
33777 }
33778 case X86ISD::STRICT_CVTPH2PS: {
33779 EVT VT = N->getValueType(0);
33780 SDValue Lo, Hi;
33781 std::tie(Lo, Hi) = DAG.SplitVectorOperand(N, 1);
33782 EVT LoVT, HiVT;
33783 std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(VT);
33784 Lo = DAG.getNode(X86ISD::STRICT_CVTPH2PS, dl, {LoVT, MVT::Other},
33785 {N->getOperand(0), Lo});
33786 Hi = DAG.getNode(X86ISD::STRICT_CVTPH2PS, dl, {HiVT, MVT::Other},
33787 {N->getOperand(0), Hi});
33788 SDValue Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
33789 Lo.getValue(1), Hi.getValue(1));
33790 SDValue Res = DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, Lo, Hi);
33791 Results.push_back(Res);
33792 Results.push_back(Chain);
33793 return;
33794 }
33795 case X86ISD::CVTPS2PH:
33796 Results.push_back(LowerCVTPS2PH(SDValue(N, 0), DAG));
33797 return;
33798 case ISD::CTPOP: {
33799 assert(N->getValueType(0) == MVT::i64 && "Unexpected VT!");
33800 // If we have at most 32 active bits, then perform as i32 CTPOP.
33801 // TODO: Perform this in generic legalizer?
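// e.g. if the operand is known to look like 0x00000000XXXX0000, then
// LZ + TZ >= 32, so the value can be shifted right by TZ and counted as i32.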
33802 KnownBits Known = DAG.computeKnownBits(N->getOperand(0));
33803 unsigned LZ = Known.countMinLeadingZeros();
33804 unsigned TZ = Known.countMinTrailingZeros();
33805 if ((LZ + TZ) >= 32) {
33806 SDValue Op = DAG.getNode(ISD::SRL, dl, MVT::i64, N->getOperand(0),
33807 DAG.getShiftAmountConstant(TZ, MVT::i64, dl));
33808 Op = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, Op);
33809 Op = DAG.getNode(ISD::CTPOP, dl, MVT::i32, Op);
33810 Op = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i64, Op);
33811 Results.push_back(Op);
33812 return;
33813 }
33814 // Use a v2i64 if possible.
33815 bool NoImplicitFloatOps =
33816 DAG.getMachineFunction().getFunction().hasFnAttribute(
33817 Attribute::NoImplicitFloat);
33818 if (isTypeLegal(MVT::v2i64) && !NoImplicitFloatOps) {
33819 SDValue Wide =
33820 DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64, N->getOperand(0));
33821 Wide = DAG.getNode(ISD::CTPOP, dl, MVT::v2i64, Wide);
33822 // Bit count should fit in 32-bits, extract it as that and then zero
33823 // extend to i64. Otherwise we end up extracting bits 63:32 separately.
33824 Wide = DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, Wide);
33825 Wide = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32, Wide,
33826 DAG.getVectorIdxConstant(0, dl));
33827 Wide = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i64, Wide);
33828 Results.push_back(Wide);
33829 }
33830 return;
33831 }
33832 case ISD::MUL: {
33833 EVT VT = N->getValueType(0);
33834 assert(getTypeAction(*DAG.getContext(), VT) == TypeWidenVector &&
33835 VT.getVectorElementType() == MVT::i8 && "Unexpected VT!");
33836 // Pre-promote these to vXi16 to avoid op legalization thinking all 16
33837 // elements are needed.
33838 MVT MulVT = MVT::getVectorVT(MVT::i16, VT.getVectorNumElements());
33839 SDValue Op0 = DAG.getNode(ISD::ANY_EXTEND, dl, MulVT, N->getOperand(0));
33840 SDValue Op1 = DAG.getNode(ISD::ANY_EXTEND, dl, MulVT, N->getOperand(1));
33841 SDValue Res = DAG.getNode(ISD::MUL, dl, MulVT, Op0, Op1);
33842 Res = DAG.getNode(ISD::TRUNCATE, dl, VT, Res);
33843 unsigned NumConcats = 16 / VT.getVectorNumElements();
33844 SmallVector<SDValue, 8> ConcatOps(NumConcats, DAG.getUNDEF(VT));
33845 ConcatOps[0] = Res;
33846 Res = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v16i8, ConcatOps);
33847 Results.push_back(Res);
33848 return;
33849 }
33850 case ISD::SMULO:
33851 case ISD::UMULO: {
33852 EVT VT = N->getValueType(0);
33853 assert(getTypeAction(*DAG.getContext(), VT) == TypeWidenVector &&
33854 VT == MVT::v2i32 && "Unexpected VT!");
33855 bool IsSigned = Opc == ISD::SMULO;
33856 unsigned ExtOpc = IsSigned ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
33857 SDValue Op0 = DAG.getNode(ExtOpc, dl, MVT::v2i64, N->getOperand(0));
33858 SDValue Op1 = DAG.getNode(ExtOpc, dl, MVT::v2i64, N->getOperand(1));
33859 SDValue Res = DAG.getNode(ISD::MUL, dl, MVT::v2i64, Op0, Op1);
33860 // Extract the high 32 bits from each result using PSHUFD.
33861 // TODO: Could use SRL+TRUNCATE but that doesn't become a PSHUFD.
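// The {1, 3, -1, -1} shuffle picks the odd v4i32 lanes, i.e. the high 32-bit
// halves of the two 64-bit products.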
33862 SDValue Hi = DAG.getBitcast(MVT::v4i32, Res);
33863 Hi = DAG.getVectorShuffle(MVT::v4i32, dl, Hi, Hi, {1, 3, -1, -1});
33864 Hi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, Hi,
33865 DAG.getVectorIdxConstant(0, dl));
33866
33867 // Truncate the low bits of the result. This will become PSHUFD.
33868 Res = DAG.getNode(ISD::TRUNCATE, dl, VT, Res);
33869
33870 SDValue HiCmp;
33871 if (IsSigned) {
33872 // SMULO overflows if the high bits don't match the sign of the low.
33873 HiCmp = DAG.getNode(ISD::SRA, dl, VT, Res, DAG.getConstant(31, dl, VT));
33874 } else {
33875 // UMULO overflows if the high bits are non-zero.
33876 HiCmp = DAG.getConstant(0, dl, VT);
33877 }
33878 SDValue Ovf = DAG.getSetCC(dl, N->getValueType(1), Hi, HiCmp, ISD::SETNE);
33879
33880 // Widen the result by padding with undef.
33881 Res = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4i32, Res,
33882 DAG.getUNDEF(VT));
33883 Results.push_back(Res);
33884 Results.push_back(Ovf);
33885 return;
33886 }
33887 case X86ISD::VPMADDWD: {
33888 // Legalize types for X86ISD::VPMADDWD by widening.
33889 assert(Subtarget.hasSSE2() && "Requires at least SSE2!");
33890
33891 EVT VT = N->getValueType(0);
33892 EVT InVT = N->getOperand(0).getValueType();
33893 assert(VT.getSizeInBits() < 128 && 128 % VT.getSizeInBits() == 0 &&
33894 "Expected a VT that divides into 128 bits.");
33895 assert(getTypeAction(*DAG.getContext(), InVT) == TypeWidenVector &&
33896 "Unexpected type action!");
33897 unsigned NumConcat = 128 / InVT.getSizeInBits();
33898
33899 EVT InWideVT = EVT::getVectorVT(*DAG.getContext(),
33900 InVT.getVectorElementType(),
33901 NumConcat * InVT.getVectorNumElements());
33902 EVT WideVT = EVT::getVectorVT(*DAG.getContext(),
33903 VT.getVectorElementType(),
33904 NumConcat * VT.getVectorNumElements());
33905
33906 SmallVector<SDValue, 16> Ops(NumConcat, DAG.getUNDEF(InVT));
33907 Ops[0] = N->getOperand(0);
33908 SDValue InVec0 = DAG.getNode(ISD::CONCAT_VECTORS, dl, InWideVT, Ops);
33909 Ops[0] = N->getOperand(1);
33910 SDValue InVec1 = DAG.getNode(ISD::CONCAT_VECTORS, dl, InWideVT, Ops);
33911
33912 SDValue Res = DAG.getNode(Opc, dl, WideVT, InVec0, InVec1);
33913 Results.push_back(Res);
33914 return;
33915 }
33916 // We might have generated v2f32 FMIN/FMAX operations. Widen them to v4f32.
33917 case X86ISD::FMINC:
33918 case X86ISD::FMIN:
33919 case X86ISD::FMAXC:
33920 case X86ISD::FMAX:
33921 case X86ISD::STRICT_FMIN:
33922 case X86ISD::STRICT_FMAX: {
33923 EVT VT = N->getValueType(0);
33924 assert(VT == MVT::v2f32 && "Unexpected type (!= v2f32) on FMIN/FMAX.");
33925 bool IsStrict = Opc == X86ISD::STRICT_FMIN || Opc == X86ISD::STRICT_FMAX;
33926 SDValue UNDEF = DAG.getUNDEF(VT);
33927 SDValue LHS = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f32,
33928 N->getOperand(IsStrict ? 1 : 0), UNDEF);
33929 SDValue RHS = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f32,
33930 N->getOperand(IsStrict ? 2 : 1), UNDEF);
33931 SDValue Res;
33932 if (IsStrict)
33933 Res = DAG.getNode(Opc, dl, {MVT::v4f32, MVT::Other},
33934 {N->getOperand(0), LHS, RHS});
33935 else
33936 Res = DAG.getNode(Opc, dl, MVT::v4f32, LHS, RHS);
33937 Results.push_back(Res);
33938 if (IsStrict)
33939 Results.push_back(Res.getValue(1));
33940 return;
33941 }
33942 case ISD::SDIV:
33943 case ISD::UDIV:
33944 case ISD::SREM:
33945 case ISD::UREM: {
33946 EVT VT = N->getValueType(0);
33947 if (VT.isVector()) {
33948 assert(getTypeAction(*DAG.getContext(), VT) == TypeWidenVector &&
33949 "Unexpected type action!");
33950 // If this RHS is a constant splat vector we can widen this and let
33951 // division/remainder by constant optimize it.
33952 // TODO: Can we do something for non-splat?
33953 APInt SplatVal;
33954 if (ISD::isConstantSplatVector(N->getOperand(1).getNode(), SplatVal)) {
33955 unsigned NumConcats = 128 / VT.getSizeInBits();
33956 SmallVector<SDValue, 8> Ops0(NumConcats, DAG.getUNDEF(VT));
33957 Ops0[0] = N->getOperand(0);
33958 EVT ResVT = getTypeToTransformTo(*DAG.getContext(), VT);
33959 SDValue N0 = DAG.getNode(ISD::CONCAT_VECTORS, dl, ResVT, Ops0);
33960 SDValue N1 = DAG.getConstant(SplatVal, dl, ResVT);
33961 SDValue Res = DAG.getNode(Opc, dl, ResVT, N0, N1);
33962 Results.push_back(Res);
33963 }
33964 return;
33965 }
33966
33967 SDValue V = LowerWin64_i128OP(SDValue(N,0), DAG);
33968 Results.push_back(V);
33969 return;
33970 }
33971 case ISD::TRUNCATE: {
33972 MVT VT = N->getSimpleValueType(0);
33973 if (getTypeAction(*DAG.getContext(), VT) != TypeWidenVector)
33974 return;
33975
33976 // The generic legalizer will try to widen the input type to the same
33977 // number of elements as the widened result type. But this isn't always
33978 // the best thing so do some custom legalization to avoid some cases.
33979 MVT WidenVT = getTypeToTransformTo(*DAG.getContext(), VT).getSimpleVT();
33980 SDValue In = N->getOperand(0);
33981 EVT InVT = In.getValueType();
33982 EVT InEltVT = InVT.getVectorElementType();
33983 EVT EltVT = VT.getVectorElementType();
33984 unsigned MinElts = VT.getVectorNumElements();
33985 unsigned WidenNumElts = WidenVT.getVectorNumElements();
33986 unsigned InBits = InVT.getSizeInBits();
33987
33988 // See if there are sufficient leading bits to perform a PACKUS/PACKSS.
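// e.g. a v8i32 -> v8i16 truncate can be lowered with PACKUSDW if the upper 16
// bits of each element are known zero, or with PACKSSDW if each element has
// at least 17 sign bits.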
33989 unsigned PackOpcode;
33990 if (SDValue Src = matchTruncateWithPACK(PackOpcode, VT, In, dl, DAG,
33991 Subtarget, N->getFlags())) {
33992 if (SDValue Res =
33993 truncateVectorWithPACK(PackOpcode, VT, Src, dl, DAG, Subtarget)) {
33994 Res = widenSubVector(WidenVT, Res, false, Subtarget, DAG, dl);
33995 Results.push_back(Res);
33996 return;
33997 }
33998 }
33999
34000 if ((128 % InBits) == 0 && WidenVT.is128BitVector()) {
34001 // 128-bit and smaller inputs should avoid truncation altogether and
34002 // use a shuffle.
34003 if ((InEltVT.getSizeInBits() % EltVT.getSizeInBits()) == 0) {
34004 int Scale = InEltVT.getSizeInBits() / EltVT.getSizeInBits();
34005 SmallVector<int, 16> TruncMask(WidenNumElts, -1);
34006 for (unsigned I = 0; I < MinElts; ++I)
34007 TruncMask[I] = Scale * I;
34008 SDValue WidenIn = widenSubVector(In, false, Subtarget, DAG, dl, 128);
34009 assert(isTypeLegal(WidenVT) && isTypeLegal(WidenIn.getValueType()) &&
34010 "Illegal vector type in truncation");
34011 WidenIn = DAG.getBitcast(WidenVT, WidenIn);
34012 Results.push_back(
34013 DAG.getVectorShuffle(WidenVT, dl, WidenIn, WidenIn, TruncMask));
34014 return;
34015 }
34016 }
34017
34018 // With AVX512 there are some cases that can use a target specific
34019 // truncate node to go from 256/512 to less than 128 with zeros in the
34020 // upper elements of the 128 bit result.
34021 if (Subtarget.hasAVX512() && isTypeLegal(InVT)) {
34022 // We can use VTRUNC directly if for 256 bits with VLX or for any 512.
34023 if ((InBits == 256 && Subtarget.hasVLX()) || InBits == 512) {
34024 Results.push_back(DAG.getNode(X86ISD::VTRUNC, dl, WidenVT, In));
34025 return;
34026 }
34027 // There's one case we can widen to 512 bits and use VTRUNC.
34028 if (InVT == MVT::v4i64 && VT == MVT::v4i8 && isTypeLegal(MVT::v8i64)) {
34029 In = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v8i64, In,
34030 DAG.getUNDEF(MVT::v4i64));
34031 Results.push_back(DAG.getNode(X86ISD::VTRUNC, dl, WidenVT, In));
34032 return;
34033 }
34034 }
34035 if (Subtarget.hasVLX() && InVT == MVT::v8i64 && VT == MVT::v8i8 &&
34036 getTypeAction(*DAG.getContext(), InVT) == TypeSplitVector &&
34037 isTypeLegal(MVT::v4i64)) {
34038 // Input needs to be split and output needs to widened. Let's use two
34039 // VTRUNCs, and shuffle their results together into the wider type.
34040 SDValue Lo, Hi;
34041 std::tie(Lo, Hi) = DAG.SplitVector(In, dl);
34042
34043 Lo = DAG.getNode(X86ISD::VTRUNC, dl, MVT::v16i8, Lo);
34044 Hi = DAG.getNode(X86ISD::VTRUNC, dl, MVT::v16i8, Hi);
34045 SDValue Res = DAG.getVectorShuffle(MVT::v16i8, dl, Lo, Hi,
34046 { 0, 1, 2, 3, 16, 17, 18, 19,
34047 -1, -1, -1, -1, -1, -1, -1, -1 });
34048 Results.push_back(Res);
34049 return;
34050 }
34051
34052 // Attempt to widen the truncation input vector to let LowerTRUNCATE handle
34053 // this via type legalization.
34054 if ((InEltVT == MVT::i16 || InEltVT == MVT::i32 || InEltVT == MVT::i64) &&
34055 (EltVT == MVT::i8 || EltVT == MVT::i16 || EltVT == MVT::i32) &&
34056 (!Subtarget.hasSSSE3() ||
34057 (!isTypeLegal(InVT) &&
34058 !(MinElts <= 4 && InEltVT == MVT::i64 && EltVT == MVT::i8)))) {
34059 SDValue WidenIn = widenSubVector(In, false, Subtarget, DAG, dl,
34060 InEltVT.getSizeInBits() * WidenNumElts);
34061 Results.push_back(DAG.getNode(ISD::TRUNCATE, dl, WidenVT, WidenIn));
34062 return;
34063 }
34064
34065 return;
34066 }
34067 case ISD::ANY_EXTEND:
34068 // Right now, only MVT::v8i8 has Custom action for an illegal type.
34069 // It's intended to custom handle the input type.
34070 assert(N->getValueType(0) == MVT::v8i8 &&
34071 "Do not know how to legalize this Node");
34072 return;
34073 case ISD::SIGN_EXTEND:
34074 case ISD::ZERO_EXTEND: {
34075 EVT VT = N->getValueType(0);
34076 SDValue In = N->getOperand(0);
34077 EVT InVT = In.getValueType();
34078 if (!Subtarget.hasSSE41() && VT == MVT::v4i64 &&
34079 (InVT == MVT::v4i16 || InVT == MVT::v4i8)){
34080 assert(getTypeAction(*DAG.getContext(), InVT) == TypeWidenVector &&
34081 "Unexpected type action!");
34082 assert(Opc == ISD::SIGN_EXTEND && "Unexpected opcode");
34083 // Custom split this so we can extend i8/i16->i32 invec. This is better
34084 // since sign_extend_inreg i8/i16->i64 requires an extend to i32 using sra,
34085 // and then extending from i32 to i64 uses pcmpgt. By custom splitting we
34086 // allow the sra from the extend to i32 to be shared by the split.
34087 In = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v4i32, In);
34088
34089 // Fill a vector with sign bits for each element.
34090 SDValue Zero = DAG.getConstant(0, dl, MVT::v4i32);
34091 SDValue SignBits = DAG.getSetCC(dl, MVT::v4i32, Zero, In, ISD::SETGT);
34092
34093 // Create an unpackl and unpackh to interleave the sign bits then bitcast
34094 // to v2i64.
34095 SDValue Lo = DAG.getVectorShuffle(MVT::v4i32, dl, In, SignBits,
34096 {0, 4, 1, 5});
34097 Lo = DAG.getNode(ISD::BITCAST, dl, MVT::v2i64, Lo);
34098 SDValue Hi = DAG.getVectorShuffle(MVT::v4i32, dl, In, SignBits,
34099 {2, 6, 3, 7});
34100 Hi = DAG.getNode(ISD::BITCAST, dl, MVT::v2i64, Hi);
34101
34102 SDValue Res = DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, Lo, Hi);
34103 Results.push_back(Res);
34104 return;
34105 }
34106
34107 if (VT == MVT::v16i32 || VT == MVT::v8i64) {
34108 if (!InVT.is128BitVector()) {
34109 // Not a 128 bit vector, but maybe type legalization will promote
34110 // it to 128 bits.
34111 if (getTypeAction(*DAG.getContext(), InVT) != TypePromoteInteger)
34112 return;
34113 InVT = getTypeToTransformTo(*DAG.getContext(), InVT);
34114 if (!InVT.is128BitVector())
34115 return;
34116
34117 // Promote the input to 128 bits. Type legalization will turn this into
34118 // zext_inreg/sext_inreg.
34119 In = DAG.getNode(Opc, dl, InVT, In);
34120 }
34121
34122 // Perform custom splitting instead of the two stage extend we would get
34123 // by default.
34124 EVT LoVT, HiVT;
34125 std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(N->getValueType(0));
34126 assert(isTypeLegal(LoVT) && "Split VT not legal?");
34127
34128 SDValue Lo = getEXTEND_VECTOR_INREG(Opc, dl, LoVT, In, DAG);
34129
34130 // We need to shift the input over by half the number of elements.
34131 unsigned NumElts = InVT.getVectorNumElements();
34132 unsigned HalfNumElts = NumElts / 2;
34133 SmallVector<int, 16> ShufMask(NumElts, SM_SentinelUndef);
34134 for (unsigned i = 0; i != HalfNumElts; ++i)
34135 ShufMask[i] = i + HalfNumElts;
34136
34137 SDValue Hi = DAG.getVectorShuffle(InVT, dl, In, In, ShufMask);
34138 Hi = getEXTEND_VECTOR_INREG(Opc, dl, HiVT, Hi, DAG);
34139
34140 SDValue Res = DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, Lo, Hi);
34141 Results.push_back(Res);
34142 }
34143 return;
34144 }
34145 case ISD::FP_TO_SINT_SAT:
34146 case ISD::FP_TO_UINT_SAT: {
34147 if (!Subtarget.hasAVX10_2())
34148 return;
34149
34150 bool IsSigned = Opc == ISD::FP_TO_SINT_SAT;
34151 EVT VT = N->getValueType(0);
34152 SDValue Op = N->getOperand(0);
34153 EVT OpVT = Op.getValueType();
34154 SDValue Res;
34155
34156 if (VT == MVT::v2i32 && OpVT == MVT::v2f64) {
34157 if (IsSigned)
34158 Res = DAG.getNode(X86ISD::FP_TO_SINT_SAT, dl, MVT::v4i32, Op);
34159 else
34160 Res = DAG.getNode(X86ISD::FP_TO_UINT_SAT, dl, MVT::v4i32, Op);
34161 Results.push_back(Res);
34162 }
34163 return;
34164 }
34165 case ISD::FP_TO_SINT:
34166 case ISD::STRICT_FP_TO_SINT:
34167 case ISD::FP_TO_UINT:
34168 case ISD::STRICT_FP_TO_UINT: {
34169 bool IsStrict = N->isStrictFPOpcode();
34170 bool IsSigned = Opc == ISD::FP_TO_SINT || Opc == ISD::STRICT_FP_TO_SINT;
34171 EVT VT = N->getValueType(0);
34172 SDValue Src = N->getOperand(IsStrict ? 1 : 0);
34173 SDValue Chain = IsStrict ? N->getOperand(0) : SDValue();
34174 EVT SrcVT = Src.getValueType();
34175
34176 SDValue Res;
34177 if (isSoftF16(SrcVT, Subtarget)) {
34178 EVT NVT = VT.isVector() ? VT.changeVectorElementType(MVT::f32) : MVT::f32;
34179 if (IsStrict) {
34180 Res =
34181 DAG.getNode(Opc, dl, {VT, MVT::Other},
34182 {Chain, DAG.getNode(ISD::STRICT_FP_EXTEND, dl,
34183 {NVT, MVT::Other}, {Chain, Src})});
34184 Chain = Res.getValue(1);
34185 } else {
34186 Res =
34187 DAG.getNode(Opc, dl, VT, DAG.getNode(ISD::FP_EXTEND, dl, NVT, Src));
34188 }
34189 Results.push_back(Res);
34190 if (IsStrict)
34191 Results.push_back(Chain);
34192
34193 return;
34194 }
34195
34196 if (VT.isVector() && Subtarget.hasFP16() && Subtarget.hasVLX() &&
34197 SrcVT.getVectorElementType() == MVT::f16) {
34198 EVT EleVT = VT.getVectorElementType();
34199 EVT ResVT = EleVT == MVT::i32 ? MVT::v4i32 : MVT::v8i16;
34200
34201 if (SrcVT != MVT::v8f16) {
34202 SDValue Tmp =
34203 IsStrict ? DAG.getConstantFP(0.0, dl, SrcVT) : DAG.getUNDEF(SrcVT);
34204 SmallVector<SDValue, 4> Ops(SrcVT == MVT::v2f16 ? 4 : 2, Tmp);
34205 Ops[0] = Src;
34206 Src = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v8f16, Ops);
34207 }
34208
34209 if (IsStrict) {
34210 Opc = IsSigned ? X86ISD::STRICT_CVTTP2SI : X86ISD::STRICT_CVTTP2UI;
34211 Res =
34212 DAG.getNode(Opc, dl, {ResVT, MVT::Other}, {N->getOperand(0), Src});
34213 Chain = Res.getValue(1);
34214 } else {
34215 Opc = IsSigned ? X86ISD::CVTTP2SI : X86ISD::CVTTP2UI;
34216 Res = DAG.getNode(Opc, dl, ResVT, Src);
34217 }
34218
34219 // TODO: Need to add exception check code for strict FP.
34220 if (EleVT.getSizeInBits() < 16) {
34221 MVT TmpVT = MVT::getVectorVT(EleVT.getSimpleVT(), 8);
34222 Res = DAG.getNode(ISD::TRUNCATE, dl, TmpVT, Res);
34223
34224 // Now widen to 128 bits.
34225 unsigned NumConcats = 128 / TmpVT.getSizeInBits();
34226 MVT ConcatVT = MVT::getVectorVT(EleVT.getSimpleVT(), 8 * NumConcats);
34227 SmallVector<SDValue, 8> ConcatOps(NumConcats, DAG.getUNDEF(TmpVT));
34228 ConcatOps[0] = Res;
34229 Res = DAG.getNode(ISD::CONCAT_VECTORS, dl, ConcatVT, ConcatOps);
34230 }
34231
34232 Results.push_back(Res);
34233 if (IsStrict)
34234 Results.push_back(Chain);
34235
34236 return;
34237 }
34238
34239 if (VT.isVector() && VT.getScalarSizeInBits() < 32) {
34240 assert(getTypeAction(*DAG.getContext(), VT) == TypeWidenVector &&
34241 "Unexpected type action!");
34242
34243 // Try to create a 128 bit vector, but don't exceed a 32 bit element.
34244 unsigned NewEltWidth = std::min(128 / VT.getVectorNumElements(), 32U);
34245 MVT PromoteVT = MVT::getVectorVT(MVT::getIntegerVT(NewEltWidth),
34246 VT.getVectorNumElements());
34247 SDValue Res;
34248 SDValue Chain;
34249 if (IsStrict) {
34250 Res = DAG.getNode(ISD::STRICT_FP_TO_SINT, dl, {PromoteVT, MVT::Other},
34251 {N->getOperand(0), Src});
34252 Chain = Res.getValue(1);
34253 } else
34254 Res = DAG.getNode(ISD::FP_TO_SINT, dl, PromoteVT, Src);
34255
34256 // Preserve what we know about the size of the original result. If the
34257 // result is v2i32, we have to manually widen the assert.
34258 if (PromoteVT == MVT::v2i32)
34259 Res = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4i32, Res,
34260 DAG.getUNDEF(MVT::v2i32));
34261
34262 Res = DAG.getNode(!IsSigned ? ISD::AssertZext : ISD::AssertSext, dl,
34263 Res.getValueType(), Res,
34264 DAG.getValueType(VT.getVectorElementType()));
34265
34266 if (PromoteVT == MVT::v2i32)
34267 Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v2i32, Res,
34268 DAG.getVectorIdxConstant(0, dl));
34269
34270 // Truncate back to the original width.
34271 Res = DAG.getNode(ISD::TRUNCATE, dl, VT, Res);
34272
34273 // Now widen to 128 bits.
34274 unsigned NumConcats = 128 / VT.getSizeInBits();
34275 MVT ConcatVT = MVT::getVectorVT(VT.getSimpleVT().getVectorElementType(),
34276 VT.getVectorNumElements() * NumConcats);
34277 SmallVector<SDValue, 8> ConcatOps(NumConcats, DAG.getUNDEF(VT));
34278 ConcatOps[0] = Res;
34279 Res = DAG.getNode(ISD::CONCAT_VECTORS, dl, ConcatVT, ConcatOps);
34280 Results.push_back(Res);
34281 if (IsStrict)
34282 Results.push_back(Chain);
34283 return;
34284 }
34285
34286
34287 if (VT == MVT::v2i32) {
34288 assert((!IsStrict || IsSigned || Subtarget.hasAVX512()) &&
34289 "Strict unsigned conversion requires AVX512");
34290 assert(Subtarget.hasSSE2() && "Requires at least SSE2!");
34291 assert(getTypeAction(*DAG.getContext(), VT) == TypeWidenVector &&
34292 "Unexpected type action!");
34293 if (Src.getValueType() == MVT::v2f64) {
34294 if (!IsSigned && !Subtarget.hasAVX512()) {
34295 SDValue Res =
34296 expandFP_TO_UINT_SSE(MVT::v4i32, Src, dl, DAG, Subtarget);
34297 Results.push_back(Res);
34298 return;
34299 }
34300
34301 if (IsStrict)
34302 Opc = IsSigned ? X86ISD::STRICT_CVTTP2SI : X86ISD::STRICT_CVTTP2UI;
34303 else
34304 Opc = IsSigned ? X86ISD::CVTTP2SI : X86ISD::CVTTP2UI;
34305
34306 // If we have VLX we can emit a target specific FP_TO_UINT node,
34307 if (!IsSigned && !Subtarget.hasVLX()) {
34308 // Otherwise we can defer to the generic legalizer which will widen
34309 // the input as well. This will be further widened during op
34310 // legalization to v8i32<-v8f64.
34311 // For strict nodes we'll need to widen ourselves.
34312 // FIXME: Fix the type legalizer to safely widen strict nodes?
34313 if (!IsStrict)
34314 return;
34315 Src = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f64, Src,
34316 DAG.getConstantFP(0.0, dl, MVT::v2f64));
34317 Opc = N->getOpcode();
34318 }
34319 SDValue Res;
34320 SDValue Chain;
34321 if (IsStrict) {
34322 Res = DAG.getNode(Opc, dl, {MVT::v4i32, MVT::Other},
34323 {N->getOperand(0), Src});
34324 Chain = Res.getValue(1);
34325 } else {
34326 Res = DAG.getNode(Opc, dl, MVT::v4i32, Src);
34327 }
34328 Results.push_back(Res);
34329 if (IsStrict)
34330 Results.push_back(Chain);
34331 return;
34332 }
34333
34334 // Custom widen strict v2f32->v2i32 by padding with zeros.
34335 // FIXME: Should generic type legalizer do this?
34336 if (Src.getValueType() == MVT::v2f32 && IsStrict) {
34337 Src = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f32, Src,
34338 DAG.getConstantFP(0.0, dl, MVT::v2f32));
34339 SDValue Res = DAG.getNode(Opc, dl, {MVT::v4i32, MVT::Other},
34340 {N->getOperand(0), Src});
34341 Results.push_back(Res);
34342 Results.push_back(Res.getValue(1));
34343 return;
34344 }
34345
34346 // The FP_TO_INTHelper below only handles f32/f64/f80 scalar inputs,
34347 // so early out here.
34348 return;
34349 }
34350
34351 assert(!VT.isVector() && "Vectors should have been handled above!");
34352
34353 if ((Subtarget.hasDQI() && VT == MVT::i64 &&
34354 (SrcVT == MVT::f32 || SrcVT == MVT::f64)) ||
34355 (Subtarget.hasFP16() && SrcVT == MVT::f16)) {
34356 assert(!Subtarget.is64Bit() && "i64 should be legal");
34357 unsigned NumElts = Subtarget.hasVLX() ? 2 : 8;
34358 // If we use a 128-bit result we might need to use a target specific node.
34359 unsigned SrcElts =
34360 std::max(NumElts, 128U / (unsigned)SrcVT.getSizeInBits());
34361 MVT VecVT = MVT::getVectorVT(MVT::i64, NumElts);
34362 MVT VecInVT = MVT::getVectorVT(SrcVT.getSimpleVT(), SrcElts);
34363 if (NumElts != SrcElts) {
34364 if (IsStrict)
34365 Opc = IsSigned ? X86ISD::STRICT_CVTTP2SI : X86ISD::STRICT_CVTTP2UI;
34366 else
34367 Opc = IsSigned ? X86ISD::CVTTP2SI : X86ISD::CVTTP2UI;
34368 }
34369
34370 SDValue ZeroIdx = DAG.getVectorIdxConstant(0, dl);
34371 SDValue Res = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VecInVT,
34372 DAG.getConstantFP(0.0, dl, VecInVT), Src,
34373 ZeroIdx);
34374 SDValue Chain;
34375 if (IsStrict) {
34376 SDVTList Tys = DAG.getVTList(VecVT, MVT::Other);
34377 Res = DAG.getNode(Opc, SDLoc(N), Tys, N->getOperand(0), Res);
34378 Chain = Res.getValue(1);
34379 } else
34380 Res = DAG.getNode(Opc, SDLoc(N), VecVT, Res);
34381 Res = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, Res, ZeroIdx);
34382 Results.push_back(Res);
34383 if (IsStrict)
34384 Results.push_back(Chain);
34385 return;
34386 }
34387
34388 if (VT == MVT::i128 && Subtarget.isTargetWin64()) {
34389 SDValue Chain;
34390 SDValue V = LowerWin64_FP_TO_INT128(SDValue(N, 0), DAG, Chain);
34391 Results.push_back(V);
34392 if (IsStrict)
34393 Results.push_back(Chain);
34394 return;
34395 }
34396
34397 if (SDValue V = FP_TO_INTHelper(SDValue(N, 0), DAG, IsSigned, Chain)) {
34398 Results.push_back(V);
34399 if (IsStrict)
34400 Results.push_back(Chain);
34401 }
34402 return;
34403 }
34404 case ISD::LRINT:
34405 if (N->getValueType(0) == MVT::v2i32) {
34406 SDValue Src = N->getOperand(0);
34407 if (Subtarget.hasFP16() && Src.getValueType() == MVT::v2f16) {
34408 Src = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f16, Src,
34409 DAG.getUNDEF(MVT::v2f16));
34410 Src = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v8f16, Src,
34411 DAG.getUNDEF(MVT::v4f16));
34412 } else if (Src.getValueType() != MVT::v2f64) {
34413 return;
34414 }
34415 Results.push_back(DAG.getNode(X86ISD::CVTP2SI, dl, MVT::v4i32, Src));
34416 return;
34417 }
34418 [[fallthrough]];
34419 case ISD::LLRINT: {
34420 if (SDValue V = LRINT_LLRINTHelper(N, DAG))
34421 Results.push_back(V);
34422 return;
34423 }
34424
34425 case ISD::SINT_TO_FP:
34426 case ISD::STRICT_SINT_TO_FP:
34427 case ISD::UINT_TO_FP:
34428 case ISD::STRICT_UINT_TO_FP: {
34429 bool IsStrict = N->isStrictFPOpcode();
34430 bool IsSigned = Opc == ISD::SINT_TO_FP || Opc == ISD::STRICT_SINT_TO_FP;
34431 EVT VT = N->getValueType(0);
34432 SDValue Src = N->getOperand(IsStrict ? 1 : 0);
34433 if (VT.getVectorElementType() == MVT::f16 && Subtarget.hasFP16() &&
34434 Subtarget.hasVLX()) {
34435 if (Src.getValueType().getVectorElementType() == MVT::i16)
34436 return;
34437
34438 if (VT == MVT::v2f16 && Src.getValueType() == MVT::v2i32)
34439 Src = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4i32, Src,
34440 IsStrict ? DAG.getConstant(0, dl, MVT::v2i32)
34441 : DAG.getUNDEF(MVT::v2i32));
34442 if (IsStrict) {
34443 unsigned Opc =
34444 IsSigned ? X86ISD::STRICT_CVTSI2P : X86ISD::STRICT_CVTUI2P;
34445 SDValue Res = DAG.getNode(Opc, dl, {MVT::v8f16, MVT::Other},
34446 {N->getOperand(0), Src});
34447 Results.push_back(Res);
34448 Results.push_back(Res.getValue(1));
34449 } else {
34450 unsigned Opc = IsSigned ? X86ISD::CVTSI2P : X86ISD::CVTUI2P;
34451 Results.push_back(DAG.getNode(Opc, dl, MVT::v8f16, Src));
34452 }
34453 return;
34454 }
34455 if (VT != MVT::v2f32)
34456 return;
34457 EVT SrcVT = Src.getValueType();
34458 if (Subtarget.hasDQI() && Subtarget.hasVLX() && SrcVT == MVT::v2i64) {
34459 if (IsStrict) {
34460 unsigned Opc = IsSigned ? X86ISD::STRICT_CVTSI2P
34461 : X86ISD::STRICT_CVTUI2P;
34462 SDValue Res = DAG.getNode(Opc, dl, {MVT::v4f32, MVT::Other},
34463 {N->getOperand(0), Src});
34464 Results.push_back(Res);
34465 Results.push_back(Res.getValue(1));
34466 } else {
34467 unsigned Opc = IsSigned ? X86ISD::CVTSI2P : X86ISD::CVTUI2P;
34468 Results.push_back(DAG.getNode(Opc, dl, MVT::v4f32, Src));
34469 }
34470 return;
34471 }
34472 if (SrcVT == MVT::v2i64 && !IsSigned && Subtarget.is64Bit() &&
34473 Subtarget.hasSSE41() && !Subtarget.hasAVX512()) {
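// The unsigned v2i64 -> v2f32 path below has no native instruction: lanes
// with the sign bit set are halved while preserving the rounding bit
// (Sign = (Src >> 1) | (Src & 1)), converted as signed values, and then
// doubled with an FADD; a final vector select keeps the doubled result
// only for the originally negative lanes.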
34474 SDValue Zero = DAG.getConstant(0, dl, SrcVT);
34475 SDValue One = DAG.getConstant(1, dl, SrcVT);
34476 SDValue Sign = DAG.getNode(ISD::OR, dl, SrcVT,
34477 DAG.getNode(ISD::SRL, dl, SrcVT, Src, One),
34478 DAG.getNode(ISD::AND, dl, SrcVT, Src, One));
34479 SDValue IsNeg = DAG.getSetCC(dl, MVT::v2i64, Src, Zero, ISD::SETLT);
34480 SDValue SignSrc = DAG.getSelect(dl, SrcVT, IsNeg, Sign, Src);
34481 SmallVector<SDValue, 4> SignCvts(4, DAG.getConstantFP(0.0, dl, MVT::f32));
34482 for (int i = 0; i != 2; ++i) {
34483 SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i64,
34484 SignSrc, DAG.getVectorIdxConstant(i, dl));
34485 if (IsStrict)
34486 SignCvts[i] =
34487 DAG.getNode(ISD::STRICT_SINT_TO_FP, dl, {MVT::f32, MVT::Other},
34488 {N->getOperand(0), Elt});
34489 else
34490 SignCvts[i] = DAG.getNode(ISD::SINT_TO_FP, dl, MVT::f32, Elt);
34491 };
34492 SDValue SignCvt = DAG.getBuildVector(MVT::v4f32, dl, SignCvts);
34493 SDValue Slow, Chain;
34494 if (IsStrict) {
34495 Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
34496 SignCvts[0].getValue(1), SignCvts[1].getValue(1));
34497 Slow = DAG.getNode(ISD::STRICT_FADD, dl, {MVT::v4f32, MVT::Other},
34498 {Chain, SignCvt, SignCvt});
34499 Chain = Slow.getValue(1);
34500 } else {
34501 Slow = DAG.getNode(ISD::FADD, dl, MVT::v4f32, SignCvt, SignCvt);
34502 }
34503 IsNeg = DAG.getBitcast(MVT::v4i32, IsNeg);
34504 IsNeg =
34505 DAG.getVectorShuffle(MVT::v4i32, dl, IsNeg, IsNeg, {1, 3, -1, -1});
34506 SDValue Cvt = DAG.getSelect(dl, MVT::v4f32, IsNeg, Slow, SignCvt);
34507 Results.push_back(Cvt);
34508 if (IsStrict)
34509 Results.push_back(Chain);
34510 return;
34511 }
34512
34513 if (SrcVT != MVT::v2i32)
34514 return;
34515
34516 if (IsSigned || Subtarget.hasAVX512()) {
34517 if (!IsStrict)
34518 return;
34519
34520 // Custom widen strict v2i32->v2f32 to avoid scalarization.
34521 // FIXME: Should generic type legalizer do this?
34522 Src = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4i32, Src,
34523 DAG.getConstant(0, dl, MVT::v2i32));
34524 SDValue Res = DAG.getNode(Opc, dl, {MVT::v4f32, MVT::Other},
34525 {N->getOperand(0), Src});
34526 Results.push_back(Res);
34527 Results.push_back(Res.getValue(1));
34528 return;
34529 }
34530
34531 assert(Subtarget.hasSSE2() && "Requires at least SSE2!");
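// 0x4330000000000000 is the IEEE-754 double 2^52. OR-ing the zero-extended
// u32 into the low mantissa bits of that constant yields the double value
// 2^52 + x exactly, so subtracting the bias recovers x with no rounding
// error before the final round to f32.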
34532 SDValue ZExtIn = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::v2i64, Src);
34533 SDValue VBias = DAG.getConstantFP(
34534 llvm::bit_cast<double>(0x4330000000000000ULL), dl, MVT::v2f64);
34535 SDValue Or = DAG.getNode(ISD::OR, dl, MVT::v2i64, ZExtIn,
34536 DAG.getBitcast(MVT::v2i64, VBias));
34537 Or = DAG.getBitcast(MVT::v2f64, Or);
34538 if (IsStrict) {
34539 SDValue Sub = DAG.getNode(ISD::STRICT_FSUB, dl, {MVT::v2f64, MVT::Other},
34540 {N->getOperand(0), Or, VBias});
34541 SDValue Res = DAG.getNode(X86ISD::STRICT_VFPROUND, dl,
34542 {MVT::v4f32, MVT::Other},
34543 {Sub.getValue(1), Sub});
34544 Results.push_back(Res);
34545 Results.push_back(Res.getValue(1));
34546 } else {
34547 // TODO: Are there any fast-math-flags to propagate here?
34548 SDValue Sub = DAG.getNode(ISD::FSUB, dl, MVT::v2f64, Or, VBias);
34549 Results.push_back(DAG.getNode(X86ISD::VFPROUND, dl, MVT::v4f32, Sub));
34550 }
34551 return;
34552 }
34553 case ISD::STRICT_FP_ROUND:
34554 case ISD::FP_ROUND: {
34555 bool IsStrict = N->isStrictFPOpcode();
34556 SDValue Chain = IsStrict ? N->getOperand(0) : SDValue();
34557 SDValue Src = N->getOperand(IsStrict ? 1 : 0);
34558 SDValue Rnd = N->getOperand(IsStrict ? 2 : 1);
34559 EVT SrcVT = Src.getValueType();
34560 EVT VT = N->getValueType(0);
34561 SDValue V;
34562 if (VT == MVT::v2f16 && Src.getValueType() == MVT::v2f32) {
34563 SDValue Ext = IsStrict ? DAG.getConstantFP(0.0, dl, MVT::v2f32)
34564 : DAG.getUNDEF(MVT::v2f32);
34565 Src = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f32, Src, Ext);
34566 }
34567 if (!Subtarget.hasFP16() && VT.getVectorElementType() == MVT::f16) {
34568 assert(Subtarget.hasF16C() && "Cannot widen f16 without F16C");
34569 if (SrcVT.getVectorElementType() != MVT::f32)
34570 return;
34571
34572 if (IsStrict)
34573 V = DAG.getNode(X86ISD::STRICT_CVTPS2PH, dl, {MVT::v8i16, MVT::Other},
34574 {Chain, Src, Rnd});
34575 else
34576 V = DAG.getNode(X86ISD::CVTPS2PH, dl, MVT::v8i16, Src, Rnd);
34577
34578 Results.push_back(DAG.getBitcast(MVT::v8f16, V));
34579 if (IsStrict)
34580 Results.push_back(V.getValue(1));
34581 return;
34582 }
34583 if (!isTypeLegal(Src.getValueType()))
34584 return;
34585 EVT NewVT = VT.getVectorElementType() == MVT::f16 ? MVT::v8f16 : MVT::v4f32;
34586 if (IsStrict)
34587 V = DAG.getNode(X86ISD::STRICT_VFPROUND, dl, {NewVT, MVT::Other},
34588 {Chain, Src});
34589 else
34590 V = DAG.getNode(X86ISD::VFPROUND, dl, NewVT, Src);
34591 Results.push_back(V);
34592 if (IsStrict)
34593 Results.push_back(V.getValue(1));
34594 return;
34595 }
34596 case ISD::FP_EXTEND:
34597 case ISD::STRICT_FP_EXTEND: {
34598 // Right now, only MVT::v2f32 has OperationAction for FP_EXTEND.
34599 // No other ValueType for FP_EXTEND should reach this point.
34600 assert(N->getValueType(0) == MVT::v2f32 &&
34601 "Do not know how to legalize this Node");
34602 if (!Subtarget.hasFP16() || !Subtarget.hasVLX())
34603 return;
34604 bool IsStrict = N->isStrictFPOpcode();
34605 SDValue Src = N->getOperand(IsStrict ? 1 : 0);
34606 if (Src.getValueType().getVectorElementType() != MVT::f16)
34607 return;
34608 SDValue Ext = IsStrict ? DAG.getConstantFP(0.0, dl, MVT::v2f16)
34609 : DAG.getUNDEF(MVT::v2f16);
34610 SDValue V = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f16, Src, Ext);
34611 if (IsStrict)
34612 V = DAG.getNode(ISD::STRICT_FP_EXTEND, dl, {MVT::v4f32, MVT::Other},
34613 {N->getOperand(0), V});
34614 else
34615 V = DAG.getNode(ISD::FP_EXTEND, dl, MVT::v4f32, V);
34616 Results.push_back(V);
34617 if (IsStrict)
34618 Results.push_back(V.getValue(1));
34619 return;
34620 }
34621 case ISD::INTRINSIC_W_CHAIN: {
34622 unsigned IntNo = N->getConstantOperandVal(1);
34623 switch (IntNo) {
34624 default : llvm_unreachable("Do not know how to custom type "
34625 "legalize this intrinsic operation!");
34626 case Intrinsic::x86_rdtsc:
34627 return getReadTimeStampCounter(N, dl, X86::RDTSC, DAG, Subtarget,
34628 Results);
34629 case Intrinsic::x86_rdtscp:
34630 return getReadTimeStampCounter(N, dl, X86::RDTSCP, DAG, Subtarget,
34631 Results);
34632 case Intrinsic::x86_rdpmc:
34633 expandIntrinsicWChainHelper(N, dl, DAG, X86::RDPMC, X86::ECX, Subtarget,
34634 Results);
34635 return;
34636 case Intrinsic::x86_rdpru:
34637 expandIntrinsicWChainHelper(N, dl, DAG, X86::RDPRU, X86::ECX, Subtarget,
34638 Results);
34639 return;
34640 case Intrinsic::x86_xgetbv:
34641 expandIntrinsicWChainHelper(N, dl, DAG, X86::XGETBV, X86::ECX, Subtarget,
34642 Results);
34643 return;
34644 }
34645 }
34646 case ISD::READCYCLECOUNTER: {
34647 return getReadTimeStampCounter(N, dl, X86::RDTSC, DAG, Subtarget, Results);
34648 }
34649 case ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS: {
34650 EVT T = N->getValueType(0);
34651 assert((T == MVT::i64 || T == MVT::i128) && "can only expand cmpxchg pair");
34652 bool Regs64bit = T == MVT::i128;
34653 assert((!Regs64bit || Subtarget.canUseCMPXCHG16B()) &&
34654 "64-bit ATOMIC_CMP_SWAP_WITH_SUCCESS requires CMPXCHG16B");
34655 MVT HalfT = Regs64bit ? MVT::i64 : MVT::i32;
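// CMPXCHG8B/CMPXCHG16B expect the compare value in EDX:EAX (RDX:RAX) and
// the replacement in ECX:EBX (RCX:RBX), return the previous memory value
// in EDX:EAX (RDX:RAX), and report success in ZF; hence the explicit
// copies to and from those physical registers and the COND_E check below.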
34656 SDValue cpInL, cpInH;
34657 std::tie(cpInL, cpInH) =
34658 DAG.SplitScalar(N->getOperand(2), dl, HalfT, HalfT);
34659 cpInL = DAG.getCopyToReg(N->getOperand(0), dl,
34660 Regs64bit ? X86::RAX : X86::EAX, cpInL, SDValue());
34661 cpInH =
34662 DAG.getCopyToReg(cpInL.getValue(0), dl, Regs64bit ? X86::RDX : X86::EDX,
34663 cpInH, cpInL.getValue(1));
34664 SDValue swapInL, swapInH;
34665 std::tie(swapInL, swapInH) =
34666 DAG.SplitScalar(N->getOperand(3), dl, HalfT, HalfT);
34667 swapInH =
34668 DAG.getCopyToReg(cpInH.getValue(0), dl, Regs64bit ? X86::RCX : X86::ECX,
34669 swapInH, cpInH.getValue(1));
34670
34671 // In 64-bit mode we might need the base pointer in RBX, but we can't know
34672 // until later. So we keep the RBX input in a vreg and use a custom
34673 // inserter.
34674 // Since RBX will be a reserved register, the register allocator will not
34675 // make sure its value is properly saved and restored around this
34676 // live-range.
34677 SDValue Result;
34678 SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Glue);
34679 MachineMemOperand *MMO = cast<AtomicSDNode>(N)->getMemOperand();
34680 if (Regs64bit) {
34681 SDValue Ops[] = {swapInH.getValue(0), N->getOperand(1), swapInL,
34682 swapInH.getValue(1)};
34683 Result =
34684 DAG.getMemIntrinsicNode(X86ISD::LCMPXCHG16_DAG, dl, Tys, Ops, T, MMO);
34685 } else {
34686 swapInL = DAG.getCopyToReg(swapInH.getValue(0), dl, X86::EBX, swapInL,
34687 swapInH.getValue(1));
34688 SDValue Ops[] = {swapInL.getValue(0), N->getOperand(1),
34689 swapInL.getValue(1)};
34690 Result =
34691 DAG.getMemIntrinsicNode(X86ISD::LCMPXCHG8_DAG, dl, Tys, Ops, T, MMO);
34692 }
34693
34694 SDValue cpOutL = DAG.getCopyFromReg(Result.getValue(0), dl,
34695 Regs64bit ? X86::RAX : X86::EAX,
34696 HalfT, Result.getValue(1));
34697 SDValue cpOutH = DAG.getCopyFromReg(cpOutL.getValue(1), dl,
34698 Regs64bit ? X86::RDX : X86::EDX,
34699 HalfT, cpOutL.getValue(2));
34700 SDValue OpsF[] = { cpOutL.getValue(0), cpOutH.getValue(0)};
34701
34702 SDValue EFLAGS = DAG.getCopyFromReg(cpOutH.getValue(1), dl, X86::EFLAGS,
34703 MVT::i32, cpOutH.getValue(2));
34704 SDValue Success = getSETCC(X86::COND_E, EFLAGS, dl, DAG);
34705 Success = DAG.getZExtOrTrunc(Success, dl, N->getValueType(1));
34706
34707 Results.push_back(DAG.getNode(ISD::BUILD_PAIR, dl, T, OpsF));
34708 Results.push_back(Success);
34709 Results.push_back(EFLAGS.getValue(1));
34710 return;
34711 }
34712 case ISD::ATOMIC_LOAD: {
34713 assert(
34714 (N->getValueType(0) == MVT::i64 || N->getValueType(0) == MVT::i128) &&
34715 "Unexpected VT!");
34716 bool NoImplicitFloatOps =
34717 DAG.getMachineFunction().getFunction().hasFnAttribute(
34718 Attribute::NoImplicitFloat);
34719 if (!Subtarget.useSoftFloat() && !NoImplicitFloatOps) {
34720 auto *Node = cast<AtomicSDNode>(N);
34721
34722 if (N->getValueType(0) == MVT::i128) {
34723 if (Subtarget.is64Bit() && Subtarget.hasAVX()) {
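// On AVX-capable processors an aligned 16-byte vector load is performed as
// a single atomic access, so the i128 atomic load can be lowered to a plain
// v2i64 load and the two 64-bit halves extracted afterwards.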
34724 SDValue Ld = DAG.getLoad(MVT::v2i64, dl, Node->getChain(),
34725 Node->getBasePtr(), Node->getMemOperand());
34726 SDValue ResL = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i64, Ld,
34727 DAG.getVectorIdxConstant(0, dl));
34728 SDValue ResH = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i64, Ld,
34729 DAG.getVectorIdxConstant(1, dl));
34730 Results.push_back(DAG.getNode(ISD::BUILD_PAIR, dl, N->getValueType(0),
34731 {ResL, ResH}));
34732 Results.push_back(Ld.getValue(1));
34733 return;
34734 }
34735 break;
34736 }
34737 if (Subtarget.hasSSE1()) {
34738 // Use a VZEXT_LOAD which will be selected as MOVQ or XORPS+MOVLPS.
34739 // Then extract the lower 64-bits.
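// A single aligned 8-byte SSE load is atomic on x86, which is what makes
// this lowering valid for 64-bit ATOMIC_LOAD on 32-bit targets.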
34740 MVT LdVT = Subtarget.hasSSE2() ? MVT::v2i64 : MVT::v4f32;
34741 SDVTList Tys = DAG.getVTList(LdVT, MVT::Other);
34742 SDValue Ops[] = { Node->getChain(), Node->getBasePtr() };
34743 SDValue Ld = DAG.getMemIntrinsicNode(X86ISD::VZEXT_LOAD, dl, Tys, Ops,
34744 MVT::i64, Node->getMemOperand());
34745 if (Subtarget.hasSSE2()) {
34746 SDValue Res = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i64, Ld,
34747 DAG.getVectorIdxConstant(0, dl));
34748 Results.push_back(Res);
34749 Results.push_back(Ld.getValue(1));
34750 return;
34751 }
34752 // We use an alternative sequence for SSE1 that extracts as v2f32 and
34753 // then casts to i64. This avoids a 128-bit stack temporary being
34754 // created by type legalization if we were to cast v4f32->v2i64.
34755 SDValue Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v2f32, Ld,
34756 DAG.getVectorIdxConstant(0, dl));
34757 Res = DAG.getBitcast(MVT::i64, Res);
34758 Results.push_back(Res);
34759 Results.push_back(Ld.getValue(1));
34760 return;
34761 }
34762 if (Subtarget.hasX87()) {
34763 // First load this into an 80-bit X87 register. This will put the whole
34764 // integer into the significand.
34765 SDVTList Tys = DAG.getVTList(MVT::f80, MVT::Other);
34766 SDValue Ops[] = { Node->getChain(), Node->getBasePtr() };
34767 SDValue Result = DAG.getMemIntrinsicNode(X86ISD::FILD,
34768 dl, Tys, Ops, MVT::i64,
34769 Node->getMemOperand());
34770 SDValue Chain = Result.getValue(1);
34771
34772 // Now store the X87 register to a stack temporary and convert to i64.
34773 // This store is not atomic and doesn't need to be.
34774 // FIXME: We don't need a stack temporary if the result of the load
34775 // is already being stored. We could just directly store there.
34776 SDValue StackPtr = DAG.CreateStackTemporary(MVT::i64);
34777 int SPFI = cast<FrameIndexSDNode>(StackPtr.getNode())->getIndex();
34778 MachinePointerInfo MPI =
34779 MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SPFI);
34780 SDValue StoreOps[] = { Chain, Result, StackPtr };
34781 Chain = DAG.getMemIntrinsicNode(
34782 X86ISD::FIST, dl, DAG.getVTList(MVT::Other), StoreOps, MVT::i64,
34783 MPI, std::nullopt /*Align*/, MachineMemOperand::MOStore);
34784
34785 // Finally load the value back from the stack temporary and return it.
34786 // This load is not atomic and doesn't need to be.
34787 // This load will be further type legalized.
34788 Result = DAG.getLoad(MVT::i64, dl, Chain, StackPtr, MPI);
34789 Results.push_back(Result);
34790 Results.push_back(Result.getValue(1));
34791 return;
34792 }
34793 }
34794 // TODO: Use MOVLPS when SSE1 is available?
34795 // Delegate to generic TypeLegalization. Situations we can really handle
34796 // should have already been dealt with by AtomicExpandPass.cpp.
34797 break;
34798 }
34799 case ISD::ATOMIC_SWAP:
34800 case ISD::ATOMIC_LOAD_ADD:
34801 case ISD::ATOMIC_LOAD_SUB:
34802 case ISD::ATOMIC_LOAD_AND:
34803 case ISD::ATOMIC_LOAD_OR:
34804 case ISD::ATOMIC_LOAD_XOR:
34805 case ISD::ATOMIC_LOAD_NAND:
34806 case ISD::ATOMIC_LOAD_MIN:
34807 case ISD::ATOMIC_LOAD_MAX:
34808 case ISD::ATOMIC_LOAD_UMIN:
34809 case ISD::ATOMIC_LOAD_UMAX:
34810 // Delegate to generic TypeLegalization. Situations we can really handle
34811 // should have already been dealt with by AtomicExpandPass.cpp.
34812 break;
34813
34814 case ISD::BITCAST: {
34815 assert(Subtarget.hasSSE2() && "Requires at least SSE2!");
34816 EVT DstVT = N->getValueType(0);
34817 EVT SrcVT = N->getOperand(0).getValueType();
34818
34819 // If this is a bitcast from a v64i1 k-register to a i64 on a 32-bit target
34820 // we can split using the k-register rather than memory.
34821 if (SrcVT == MVT::v64i1 && DstVT == MVT::i64 && Subtarget.hasBWI()) {
34822 assert(!Subtarget.is64Bit() && "Expected 32-bit mode");
34823 SDValue Lo, Hi;
34824 std::tie(Lo, Hi) = DAG.SplitVectorOperand(N, 0);
34825 Lo = DAG.getBitcast(MVT::i32, Lo);
34826 Hi = DAG.getBitcast(MVT::i32, Hi);
34827 SDValue Res = DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64, Lo, Hi);
34828 Results.push_back(Res);
34829 return;
34830 }
34831
34832 if (DstVT.isVector() && SrcVT == MVT::x86mmx) {
34833 // FIXME: Use v4f32 for SSE1?
34834 assert(Subtarget.hasSSE2() && "Requires SSE2");
34835 assert(getTypeAction(*DAG.getContext(), DstVT) == TypeWidenVector &&
34836 "Unexpected type action!");
34837 EVT WideVT = getTypeToTransformTo(*DAG.getContext(), DstVT);
34838 SDValue Res = DAG.getNode(X86ISD::MOVQ2DQ, dl, MVT::v2i64,
34839 N->getOperand(0));
34840 Res = DAG.getBitcast(WideVT, Res);
34841 Results.push_back(Res);
34842 return;
34843 }
34844
34845 return;
34846 }
34847 case ISD::MGATHER: {
34848 EVT VT = N->getValueType(0);
34849 if ((VT == MVT::v2f32 || VT == MVT::v2i32) &&
34850 (Subtarget.hasVLX() || !Subtarget.hasAVX512())) {
34851 auto *Gather = cast<MaskedGatherSDNode>(N);
34852 SDValue Index = Gather->getIndex();
34853 if (Index.getValueType() != MVT::v2i64)
34854 return;
34855 assert(getTypeAction(*DAG.getContext(), VT) == TypeWidenVector &&
34856 "Unexpected type action!");
34857 EVT WideVT = getTypeToTransformTo(*DAG.getContext(), VT);
34858 SDValue Mask = Gather->getMask();
34859 assert(Mask.getValueType() == MVT::v2i1 && "Unexpected mask type");
34860 SDValue PassThru = DAG.getNode(ISD::CONCAT_VECTORS, dl, WideVT,
34861 Gather->getPassThru(),
34862 DAG.getUNDEF(VT));
34863 if (!Subtarget.hasVLX()) {
34864 // We need to widen the mask, but the instruction will only use 2
34865 // of its elements. So we can use undef.
34866 Mask = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4i1, Mask,
34867 DAG.getUNDEF(MVT::v2i1));
34868 Mask = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v4i32, Mask);
34869 }
34870 SDValue Ops[] = { Gather->getChain(), PassThru, Mask,
34871 Gather->getBasePtr(), Index, Gather->getScale() };
34872 SDValue Res = DAG.getMemIntrinsicNode(
34873 X86ISD::MGATHER, dl, DAG.getVTList(WideVT, MVT::Other), Ops,
34874 Gather->getMemoryVT(), Gather->getMemOperand());
34875 Results.push_back(Res);
34876 Results.push_back(Res.getValue(1));
34877 return;
34878 }
34879 return;
34880 }
34881 case ISD::LOAD: {
34882 // Use an f64/i64 load and a scalar_to_vector for v2f32/v2i32 loads. This
34883 // avoids scalarizing in 32-bit mode. In 64-bit mode this avoids an int->fp
34884 // cast since type legalization will try to use an i64 load.
34885 MVT VT = N->getSimpleValueType(0);
34886 assert(VT.isVector() && VT.getSizeInBits() == 64 && "Unexpected VT");
34887 assert(getTypeAction(*DAG.getContext(), VT) == TypeWidenVector &&
34888 "Unexpected type action!");
34889 if (!ISD::isNON_EXTLoad(N))
34890 return;
34891 auto *Ld = cast<LoadSDNode>(N);
34892 if (Subtarget.hasSSE2()) {
34893 MVT LdVT = Subtarget.is64Bit() && VT.isInteger() ? MVT::i64 : MVT::f64;
34894 SDValue Res = DAG.getLoad(LdVT, dl, Ld->getChain(), Ld->getBasePtr(),
34895 Ld->getPointerInfo(), Ld->getBaseAlign(),
34896 Ld->getMemOperand()->getFlags());
34897 SDValue Chain = Res.getValue(1);
34898 MVT VecVT = MVT::getVectorVT(LdVT, 2);
34899 Res = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VecVT, Res);
34900 EVT WideVT = getTypeToTransformTo(*DAG.getContext(), VT);
34901 Res = DAG.getBitcast(WideVT, Res);
34902 Results.push_back(Res);
34903 Results.push_back(Chain);
34904 return;
34905 }
34906 assert(Subtarget.hasSSE1() && "Expected SSE");
34907 SDVTList Tys = DAG.getVTList(MVT::v4f32, MVT::Other);
34908 SDValue Ops[] = {Ld->getChain(), Ld->getBasePtr()};
34909 SDValue Res = DAG.getMemIntrinsicNode(X86ISD::VZEXT_LOAD, dl, Tys, Ops,
34910 MVT::i64, Ld->getMemOperand());
34911 Results.push_back(Res);
34912 Results.push_back(Res.getValue(1));
34913 return;
34914 }
34915 case ISD::ADDRSPACECAST: {
34916 SDValue V = LowerADDRSPACECAST(SDValue(N,0), DAG);
34917 Results.push_back(V);
34918 return;
34919 }
34920 case ISD::BITREVERSE: {
34921 assert(N->getValueType(0) == MVT::i64 && "Unexpected VT!");
34922 assert((Subtarget.hasXOP() || Subtarget.hasGFNI()) && "Expected XOP/GFNI");
34923 // We can use VPPERM/GF2P8AFFINEQB by copying to a vector register and back.
34924 // We'll need to move the scalar in two i32 pieces.
34925 Results.push_back(LowerBITREVERSE(SDValue(N, 0), Subtarget, DAG));
34926 return;
34927 }
34928 case ISD::EXTRACT_VECTOR_ELT: {
34929 // f16 = extract vXf16 %vec, i64 %idx
34930 assert(N->getSimpleValueType(0) == MVT::f16 &&
34931 "Unexpected Value type of EXTRACT_VECTOR_ELT!");
34932 assert(Subtarget.hasFP16() && "Expected FP16");
34933 SDValue VecOp = N->getOperand(0);
34934 EVT ExtVT = VecOp.getValueType().changeVectorElementTypeToInteger();
34935 SDValue Split = DAG.getBitcast(ExtVT, N->getOperand(0));
34936 Split = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i16, Split,
34937 N->getOperand(1));
34938 Split = DAG.getBitcast(MVT::f16, Split);
34939 Results.push_back(Split);
34940 return;
34941 }
34942 }
34943}
34944
34945const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const {
34946 switch ((X86ISD::NodeType)Opcode) {
34947 case X86ISD::FIRST_NUMBER: break;
34948#define NODE_NAME_CASE(NODE) case X86ISD::NODE: return "X86ISD::" #NODE;
34949 NODE_NAME_CASE(BSF)
34950 NODE_NAME_CASE(BSR)
34951 NODE_NAME_CASE(FSHL)
34952 NODE_NAME_CASE(FSHR)
34953 NODE_NAME_CASE(FAND)
34954 NODE_NAME_CASE(FANDN)
34955 NODE_NAME_CASE(FOR)
34956 NODE_NAME_CASE(FXOR)
34957 NODE_NAME_CASE(FILD)
34958 NODE_NAME_CASE(FIST)
34959 NODE_NAME_CASE(FP_TO_INT_IN_MEM)
34960 NODE_NAME_CASE(FLD)
34961 NODE_NAME_CASE(FST)
34962 NODE_NAME_CASE(CALL)
34963 NODE_NAME_CASE(CALL_RVMARKER)
34964 NODE_NAME_CASE(IMP_CALL)
34966 NODE_NAME_CASE(CMP)
34967 NODE_NAME_CASE(FCMP)
34968 NODE_NAME_CASE(STRICT_FCMP)
34969 NODE_NAME_CASE(STRICT_FCMPS)
34970 NODE_NAME_CASE(COMI)
34971 NODE_NAME_CASE(UCOMI)
34972 NODE_NAME_CASE(COMX)
34973 NODE_NAME_CASE(UCOMX)
34974 NODE_NAME_CASE(CMPM)
34975 NODE_NAME_CASE(CMPMM)
34976 NODE_NAME_CASE(STRICT_CMPM)
34977 NODE_NAME_CASE(CMPMM_SAE)
34978 NODE_NAME_CASE(SETCC)
34979 NODE_NAME_CASE(SETCC_CARRY)
34980 NODE_NAME_CASE(FSETCC)
34981 NODE_NAME_CASE(FSETCCM)
34982 NODE_NAME_CASE(FSETCCM_SAE)
34983 NODE_NAME_CASE(CMOV)
34984 NODE_NAME_CASE(BRCOND)
34985 NODE_NAME_CASE(RET_GLUE)
34986 NODE_NAME_CASE(IRET)
34987 NODE_NAME_CASE(REP_STOS)
34988 NODE_NAME_CASE(REP_MOVS)
34989 NODE_NAME_CASE(GlobalBaseReg)
34990 NODE_NAME_CASE(Wrapper)
34991 NODE_NAME_CASE(WrapperRIP)
34992 NODE_NAME_CASE(MOVQ2DQ)
34993 NODE_NAME_CASE(MOVDQ2Q)
34994 NODE_NAME_CASE(MMX_MOVD2W)
34995 NODE_NAME_CASE(MMX_MOVW2D)
34996 NODE_NAME_CASE(PEXTRB)
34997 NODE_NAME_CASE(PEXTRW)
34998 NODE_NAME_CASE(INSERTPS)
34999 NODE_NAME_CASE(PINSRB)
35000 NODE_NAME_CASE(PINSRW)
35001 NODE_NAME_CASE(PSHUFB)
35002 NODE_NAME_CASE(ANDNP)
35003 NODE_NAME_CASE(BLENDI)
35005 NODE_NAME_CASE(HADD)
35006 NODE_NAME_CASE(HSUB)
35007 NODE_NAME_CASE(FHADD)
35008 NODE_NAME_CASE(FHSUB)
35009 NODE_NAME_CASE(CONFLICT)
35010 NODE_NAME_CASE(FMAX)
35011 NODE_NAME_CASE(FMAXS)
35012 NODE_NAME_CASE(FMAX_SAE)
35013 NODE_NAME_CASE(FMAXS_SAE)
35014 NODE_NAME_CASE(STRICT_FMAX)
35015 NODE_NAME_CASE(FMIN)
35016 NODE_NAME_CASE(FMINS)
35017 NODE_NAME_CASE(FMIN_SAE)
35018 NODE_NAME_CASE(FMINS_SAE)
35019 NODE_NAME_CASE(STRICT_FMIN)
35020 NODE_NAME_CASE(FMAXC)
35021 NODE_NAME_CASE(FMINC)
35022 NODE_NAME_CASE(FRSQRT)
35023 NODE_NAME_CASE(FRCP)
35024 NODE_NAME_CASE(EXTRQI)
35025 NODE_NAME_CASE(INSERTQI)
35026 NODE_NAME_CASE(TLSADDR)
35027 NODE_NAME_CASE(TLSBASEADDR)
35028 NODE_NAME_CASE(TLSCALL)
35029 NODE_NAME_CASE(TLSDESC)
35030 NODE_NAME_CASE(EH_SJLJ_SETJMP)
35031 NODE_NAME_CASE(EH_SJLJ_LONGJMP)
35032 NODE_NAME_CASE(EH_SJLJ_SETUP_DISPATCH)
35033 NODE_NAME_CASE(EH_RETURN)
35034 NODE_NAME_CASE(TC_RETURN)
35035 NODE_NAME_CASE(FNSTCW16m)
35036 NODE_NAME_CASE(FLDCW16m)
35037 NODE_NAME_CASE(FNSTENVm)
35038 NODE_NAME_CASE(FLDENVm)
35039 NODE_NAME_CASE(LCMPXCHG_DAG)
35040 NODE_NAME_CASE(LCMPXCHG8_DAG)
35041 NODE_NAME_CASE(LCMPXCHG16_DAG)
35042 NODE_NAME_CASE(LCMPXCHG16_SAVE_RBX_DAG)
35043 NODE_NAME_CASE(LADD)
35044 NODE_NAME_CASE(LSUB)
35045 NODE_NAME_CASE(LOR)
35046 NODE_NAME_CASE(LXOR)
35047 NODE_NAME_CASE(LAND)
35048 NODE_NAME_CASE(LBTS)
35049 NODE_NAME_CASE(LBTC)
35050 NODE_NAME_CASE(LBTR)
35051 NODE_NAME_CASE(LBTS_RM)
35052 NODE_NAME_CASE(LBTC_RM)
35053 NODE_NAME_CASE(LBTR_RM)
35054 NODE_NAME_CASE(AADD)
35055 NODE_NAME_CASE(AOR)
35056 NODE_NAME_CASE(AXOR)
35057 NODE_NAME_CASE(AAND)
35058 NODE_NAME_CASE(VZEXT_MOVL)
35059 NODE_NAME_CASE(VZEXT_LOAD)
35060 NODE_NAME_CASE(VEXTRACT_STORE)
35061 NODE_NAME_CASE(VTRUNC)
35062 NODE_NAME_CASE(VTRUNCS)
35063 NODE_NAME_CASE(VTRUNCUS)
35064 NODE_NAME_CASE(VMTRUNC)
35065 NODE_NAME_CASE(VMTRUNCS)
35066 NODE_NAME_CASE(VMTRUNCUS)
35067 NODE_NAME_CASE(VTRUNCSTORES)
35068 NODE_NAME_CASE(VTRUNCSTOREUS)
35069 NODE_NAME_CASE(VMTRUNCSTORES)
35070 NODE_NAME_CASE(VMTRUNCSTOREUS)
35071 NODE_NAME_CASE(VFPEXT)
35072 NODE_NAME_CASE(STRICT_VFPEXT)
35073 NODE_NAME_CASE(VFPEXT_SAE)
35074 NODE_NAME_CASE(VFPEXTS)
35075 NODE_NAME_CASE(VFPEXTS_SAE)
35076 NODE_NAME_CASE(VFPROUND)
35077 NODE_NAME_CASE(VFPROUND2)
35078 NODE_NAME_CASE(VFPROUND2_RND)
35079 NODE_NAME_CASE(STRICT_VFPROUND)
35080 NODE_NAME_CASE(VMFPROUND)
35081 NODE_NAME_CASE(VFPROUND_RND)
35082 NODE_NAME_CASE(VFPROUNDS)
35083 NODE_NAME_CASE(VFPROUNDS_RND)
35084 NODE_NAME_CASE(VSHLDQ)
35085 NODE_NAME_CASE(VSRLDQ)
35086 NODE_NAME_CASE(VSHL)
35087 NODE_NAME_CASE(VSRL)
35088 NODE_NAME_CASE(VSRA)
35089 NODE_NAME_CASE(VSHLI)
35090 NODE_NAME_CASE(VSRLI)
35091 NODE_NAME_CASE(VSRAI)
35092 NODE_NAME_CASE(VSHLV)
35093 NODE_NAME_CASE(VSRLV)
35094 NODE_NAME_CASE(VSRAV)
35095 NODE_NAME_CASE(VROTLI)
35096 NODE_NAME_CASE(VROTRI)
35097 NODE_NAME_CASE(VPPERM)
35098 NODE_NAME_CASE(CMPP)
35099 NODE_NAME_CASE(STRICT_CMPP)
35100 NODE_NAME_CASE(PCMPEQ)
35101 NODE_NAME_CASE(PCMPGT)
35102 NODE_NAME_CASE(PHMINPOS)
35103 NODE_NAME_CASE(ADD)
35104 NODE_NAME_CASE(SUB)
35105 NODE_NAME_CASE(ADC)
35106 NODE_NAME_CASE(SBB)
35107 NODE_NAME_CASE(SMUL)
35108 NODE_NAME_CASE(UMUL)
35109 NODE_NAME_CASE(OR)
35110 NODE_NAME_CASE(XOR)
35111 NODE_NAME_CASE(AND)
35112 NODE_NAME_CASE(BEXTR)
35114 NODE_NAME_CASE(BZHI)
35115 NODE_NAME_CASE(PDEP)
35116 NODE_NAME_CASE(PEXT)
35117 NODE_NAME_CASE(MUL_IMM)
35118 NODE_NAME_CASE(MOVMSK)
35119 NODE_NAME_CASE(PTEST)
35120 NODE_NAME_CASE(TESTP)
35121 NODE_NAME_CASE(KORTEST)
35122 NODE_NAME_CASE(KTEST)
35123 NODE_NAME_CASE(KADD)
35124 NODE_NAME_CASE(KSHIFTL)
35125 NODE_NAME_CASE(KSHIFTR)
35126 NODE_NAME_CASE(PACKSS)
35127 NODE_NAME_CASE(PACKUS)
35128 NODE_NAME_CASE(PALIGNR)
35129 NODE_NAME_CASE(VALIGN)
35130 NODE_NAME_CASE(VSHLD)
35131 NODE_NAME_CASE(VSHRD)
35132 NODE_NAME_CASE(VSHLDV)
35133 NODE_NAME_CASE(VSHRDV)
35134 NODE_NAME_CASE(PSHUFD)
35135 NODE_NAME_CASE(PSHUFHW)
35136 NODE_NAME_CASE(PSHUFLW)
35137 NODE_NAME_CASE(SHUFP)
35138 NODE_NAME_CASE(SHUF128)
35139 NODE_NAME_CASE(MOVLHPS)
35140 NODE_NAME_CASE(MOVHLPS)
35141 NODE_NAME_CASE(MOVDDUP)
35142 NODE_NAME_CASE(MOVSHDUP)
35143 NODE_NAME_CASE(MOVSLDUP)
35144 NODE_NAME_CASE(MOVSD)
35145 NODE_NAME_CASE(MOVSS)
35146 NODE_NAME_CASE(MOVSH)
35147 NODE_NAME_CASE(UNPCKL)
35148 NODE_NAME_CASE(UNPCKH)
35149 NODE_NAME_CASE(VBROADCAST)
35150 NODE_NAME_CASE(VBROADCAST_LOAD)
35151 NODE_NAME_CASE(VBROADCASTM)
35152 NODE_NAME_CASE(SUBV_BROADCAST_LOAD)
35153 NODE_NAME_CASE(VPERMILPV)
35154 NODE_NAME_CASE(VPERMILPI)
35155 NODE_NAME_CASE(VPERM2X128)
35156 NODE_NAME_CASE(VPERMV)
35157 NODE_NAME_CASE(VPERMV3)
35158 NODE_NAME_CASE(VPERMI)
35159 NODE_NAME_CASE(VPTERNLOG)
35160 NODE_NAME_CASE(FP_TO_SINT_SAT)
35161 NODE_NAME_CASE(FP_TO_UINT_SAT)
35162 NODE_NAME_CASE(VFIXUPIMM)
35163 NODE_NAME_CASE(VFIXUPIMM_SAE)
35164 NODE_NAME_CASE(VFIXUPIMMS)
35165 NODE_NAME_CASE(VFIXUPIMMS_SAE)
35166 NODE_NAME_CASE(VRANGE)
35167 NODE_NAME_CASE(VRANGE_SAE)
35168 NODE_NAME_CASE(VRANGES)
35169 NODE_NAME_CASE(VRANGES_SAE)
35170 NODE_NAME_CASE(PMULUDQ)
35171 NODE_NAME_CASE(PMULDQ)
35172 NODE_NAME_CASE(PSADBW)
35173 NODE_NAME_CASE(DBPSADBW)
35174 NODE_NAME_CASE(VASTART_SAVE_XMM_REGS)
35175 NODE_NAME_CASE(VAARG_64)
35176 NODE_NAME_CASE(VAARG_X32)
35177 NODE_NAME_CASE(DYN_ALLOCA)
35178 NODE_NAME_CASE(MFENCE)
35179 NODE_NAME_CASE(SEG_ALLOCA)
35180 NODE_NAME_CASE(PROBED_ALLOCA)
35183 NODE_NAME_CASE(RDPKRU)
35184 NODE_NAME_CASE(WRPKRU)
35185 NODE_NAME_CASE(VPMADDUBSW)
35186 NODE_NAME_CASE(VPMADDWD)
35187 NODE_NAME_CASE(VPSHA)
35188 NODE_NAME_CASE(VPSHL)
35189 NODE_NAME_CASE(VPCOM)
35190 NODE_NAME_CASE(VPCOMU)
35191 NODE_NAME_CASE(VPERMIL2)
35192 NODE_NAME_CASE(FMSUB)
35193 NODE_NAME_CASE(STRICT_FMSUB)
35194 NODE_NAME_CASE(FNMADD)
35195 NODE_NAME_CASE(STRICT_FNMADD)
35196 NODE_NAME_CASE(FNMSUB)
35197 NODE_NAME_CASE(STRICT_FNMSUB)
35198 NODE_NAME_CASE(FMADDSUB)
35199 NODE_NAME_CASE(FMSUBADD)
35200 NODE_NAME_CASE(FMADD_RND)
35201 NODE_NAME_CASE(FNMADD_RND)
35202 NODE_NAME_CASE(FMSUB_RND)
35203 NODE_NAME_CASE(FNMSUB_RND)
35204 NODE_NAME_CASE(FMADDSUB_RND)
35205 NODE_NAME_CASE(FMSUBADD_RND)
35206 NODE_NAME_CASE(VFMADDC)
35207 NODE_NAME_CASE(VFMADDC_RND)
35208 NODE_NAME_CASE(VFCMADDC)
35209 NODE_NAME_CASE(VFCMADDC_RND)
35210 NODE_NAME_CASE(VFMULC)
35211 NODE_NAME_CASE(VFMULC_RND)
35212 NODE_NAME_CASE(VFCMULC)
35213 NODE_NAME_CASE(VFCMULC_RND)
35214 NODE_NAME_CASE(VFMULCSH)
35215 NODE_NAME_CASE(VFMULCSH_RND)
35216 NODE_NAME_CASE(VFCMULCSH)
35217 NODE_NAME_CASE(VFCMULCSH_RND)
35218 NODE_NAME_CASE(VFMADDCSH)
35219 NODE_NAME_CASE(VFMADDCSH_RND)
35220 NODE_NAME_CASE(VFCMADDCSH)
35221 NODE_NAME_CASE(VFCMADDCSH_RND)
35222 NODE_NAME_CASE(VPMADD52H)
35223 NODE_NAME_CASE(VPMADD52L)
35224 NODE_NAME_CASE(VRNDSCALE)
35225 NODE_NAME_CASE(STRICT_VRNDSCALE)
35226 NODE_NAME_CASE(VRNDSCALE_SAE)
35227 NODE_NAME_CASE(VRNDSCALES)
35228 NODE_NAME_CASE(VRNDSCALES_SAE)
35229 NODE_NAME_CASE(VREDUCE)
35230 NODE_NAME_CASE(VREDUCE_SAE)
35231 NODE_NAME_CASE(VREDUCES)
35232 NODE_NAME_CASE(VREDUCES_SAE)
35233 NODE_NAME_CASE(VGETMANT)
35234 NODE_NAME_CASE(VGETMANT_SAE)
35235 NODE_NAME_CASE(VGETMANTS)
35236 NODE_NAME_CASE(VGETMANTS_SAE)
35237 NODE_NAME_CASE(PCMPESTR)
35238 NODE_NAME_CASE(PCMPISTR)
35240 NODE_NAME_CASE(COMPRESS)
35241 NODE_NAME_CASE(EXPAND)
35242 NODE_NAME_CASE(SELECTS)
35243 NODE_NAME_CASE(ADDSUB)
35244 NODE_NAME_CASE(RCP14)
35245 NODE_NAME_CASE(RCP14S)
35246 NODE_NAME_CASE(RSQRT14)
35247 NODE_NAME_CASE(RSQRT14S)
35248 NODE_NAME_CASE(FADD_RND)
35249 NODE_NAME_CASE(FADDS)
35250 NODE_NAME_CASE(FADDS_RND)
35251 NODE_NAME_CASE(FSUB_RND)
35252 NODE_NAME_CASE(FSUBS)
35253 NODE_NAME_CASE(FSUBS_RND)
35254 NODE_NAME_CASE(FMUL_RND)
35255 NODE_NAME_CASE(FMULS)
35256 NODE_NAME_CASE(FMULS_RND)
35257 NODE_NAME_CASE(FDIV_RND)
35258 NODE_NAME_CASE(FDIVS)
35259 NODE_NAME_CASE(FDIVS_RND)
35260 NODE_NAME_CASE(FSQRT_RND)
35261 NODE_NAME_CASE(FSQRTS)
35262 NODE_NAME_CASE(FSQRTS_RND)
35263 NODE_NAME_CASE(FGETEXP)
35264 NODE_NAME_CASE(FGETEXP_SAE)
35265 NODE_NAME_CASE(FGETEXPS)
35266 NODE_NAME_CASE(FGETEXPS_SAE)
35267 NODE_NAME_CASE(SCALEF)
35268 NODE_NAME_CASE(SCALEF_RND)
35269 NODE_NAME_CASE(SCALEFS)
35270 NODE_NAME_CASE(SCALEFS_RND)
35271 NODE_NAME_CASE(MULHRS)
35272 NODE_NAME_CASE(SINT_TO_FP_RND)
35273 NODE_NAME_CASE(UINT_TO_FP_RND)
35274 NODE_NAME_CASE(CVTTP2SI)
35275 NODE_NAME_CASE(CVTTP2UI)
35276 NODE_NAME_CASE(STRICT_CVTTP2SI)
35277 NODE_NAME_CASE(STRICT_CVTTP2UI)
35278 NODE_NAME_CASE(MCVTTP2SI)
35279 NODE_NAME_CASE(MCVTTP2UI)
35280 NODE_NAME_CASE(CVTTP2SI_SAE)
35281 NODE_NAME_CASE(CVTTP2UI_SAE)
35282 NODE_NAME_CASE(CVTTS2SI)
35283 NODE_NAME_CASE(CVTTS2UI)
35284 NODE_NAME_CASE(CVTTS2SI_SAE)
35285 NODE_NAME_CASE(CVTTS2UI_SAE)
35286 NODE_NAME_CASE(CVTSI2P)
35287 NODE_NAME_CASE(CVTUI2P)
35288 NODE_NAME_CASE(STRICT_CVTSI2P)
35289 NODE_NAME_CASE(STRICT_CVTUI2P)
35290 NODE_NAME_CASE(MCVTSI2P)
35291 NODE_NAME_CASE(MCVTUI2P)
35292 NODE_NAME_CASE(VFPCLASS)
35293 NODE_NAME_CASE(VFPCLASSS)
35294 NODE_NAME_CASE(MULTISHIFT)
35295 NODE_NAME_CASE(SCALAR_SINT_TO_FP)
35296 NODE_NAME_CASE(SCALAR_SINT_TO_FP_RND)
35297 NODE_NAME_CASE(SCALAR_UINT_TO_FP)
35298 NODE_NAME_CASE(SCALAR_UINT_TO_FP_RND)
35299 NODE_NAME_CASE(CVTPS2PH)
35300 NODE_NAME_CASE(STRICT_CVTPS2PH)
35301 NODE_NAME_CASE(CVTPS2PH_SAE)
35302 NODE_NAME_CASE(MCVTPS2PH)
35303 NODE_NAME_CASE(MCVTPS2PH_SAE)
35304 NODE_NAME_CASE(CVTPH2PS)
35305 NODE_NAME_CASE(STRICT_CVTPH2PS)
35306 NODE_NAME_CASE(CVTPH2PS_SAE)
35307 NODE_NAME_CASE(CVTP2SI)
35308 NODE_NAME_CASE(CVTP2UI)
35309 NODE_NAME_CASE(MCVTP2SI)
35310 NODE_NAME_CASE(MCVTP2UI)
35311 NODE_NAME_CASE(CVTP2SI_RND)
35312 NODE_NAME_CASE(CVTP2UI_RND)
35313 NODE_NAME_CASE(CVTS2SI)
35314 NODE_NAME_CASE(CVTS2UI)
35315 NODE_NAME_CASE(CVTS2SI_RND)
35316 NODE_NAME_CASE(CVTS2UI_RND)
35317 NODE_NAME_CASE(CVTNEPS2BF16)
35318 NODE_NAME_CASE(MCVTNEPS2BF16)
35319 NODE_NAME_CASE(DPBF16PS)
35320 NODE_NAME_CASE(DPFP16PS)
35321 NODE_NAME_CASE(MPSADBW)
35322 NODE_NAME_CASE(LWPINS)
35323 NODE_NAME_CASE(MGATHER)
35324 NODE_NAME_CASE(MSCATTER)
35325 NODE_NAME_CASE(VPDPBUSD)
35326 NODE_NAME_CASE(VPDPBUSDS)
35327 NODE_NAME_CASE(VPDPWSSD)
35328 NODE_NAME_CASE(VPDPWSSDS)
35329 NODE_NAME_CASE(VPSHUFBITQMB)
35330 NODE_NAME_CASE(GF2P8MULB)
35331 NODE_NAME_CASE(GF2P8AFFINEQB)
35332 NODE_NAME_CASE(GF2P8AFFINEINVQB)
35333 NODE_NAME_CASE(NT_CALL)
35334 NODE_NAME_CASE(NT_BRIND)
35335 NODE_NAME_CASE(UMWAIT)
35336 NODE_NAME_CASE(TPAUSE)
35337 NODE_NAME_CASE(ENQCMD)
35338 NODE_NAME_CASE(ENQCMDS)
35339 NODE_NAME_CASE(VP2INTERSECT)
35340 NODE_NAME_CASE(VPDPBSUD)
35341 NODE_NAME_CASE(VPDPBSUDS)
35342 NODE_NAME_CASE(VPDPBUUD)
35343 NODE_NAME_CASE(VPDPBUUDS)
35344 NODE_NAME_CASE(VPDPBSSD)
35345 NODE_NAME_CASE(VPDPBSSDS)
35346 NODE_NAME_CASE(VPDPWSUD)
35347 NODE_NAME_CASE(VPDPWSUDS)
35348 NODE_NAME_CASE(VPDPWUSD)
35349 NODE_NAME_CASE(VPDPWUSDS)
35350 NODE_NAME_CASE(VPDPWUUD)
35351 NODE_NAME_CASE(VPDPWUUDS)
35352 NODE_NAME_CASE(VMINMAX)
35353 NODE_NAME_CASE(VMINMAX_SAE)
35354 NODE_NAME_CASE(VMINMAXS)
35355 NODE_NAME_CASE(VMINMAXS_SAE)
35356 NODE_NAME_CASE(CVTP2IBS)
35357 NODE_NAME_CASE(CVTP2IUBS)
35358 NODE_NAME_CASE(CVTP2IBS_RND)
35359 NODE_NAME_CASE(CVTP2IUBS_RND)
35360 NODE_NAME_CASE(CVTTP2IBS)
35361 NODE_NAME_CASE(CVTTP2IUBS)
35362 NODE_NAME_CASE(CVTTP2IBS_SAE)
35363 NODE_NAME_CASE(CVTTP2IUBS_SAE)
35364 NODE_NAME_CASE(VCVT2PH2BF8)
35365 NODE_NAME_CASE(VCVT2PH2BF8S)
35366 NODE_NAME_CASE(VCVT2PH2HF8)
35367 NODE_NAME_CASE(VCVT2PH2HF8S)
35368 NODE_NAME_CASE(VCVTBIASPH2BF8)
35369 NODE_NAME_CASE(VCVTBIASPH2BF8S)
35370 NODE_NAME_CASE(VCVTBIASPH2HF8)
35371 NODE_NAME_CASE(VCVTBIASPH2HF8S)
35372 NODE_NAME_CASE(VCVTPH2BF8)
35373 NODE_NAME_CASE(VCVTPH2BF8S)
35374 NODE_NAME_CASE(VCVTPH2HF8)
35375 NODE_NAME_CASE(VCVTPH2HF8S)
35376 NODE_NAME_CASE(VMCVTBIASPH2BF8)
35377 NODE_NAME_CASE(VMCVTBIASPH2BF8S)
35378 NODE_NAME_CASE(VMCVTBIASPH2HF8)
35379 NODE_NAME_CASE(VMCVTBIASPH2HF8S)
35380 NODE_NAME_CASE(VMCVTPH2BF8)
35381 NODE_NAME_CASE(VMCVTPH2BF8S)
35382 NODE_NAME_CASE(VMCVTPH2HF8)
35383 NODE_NAME_CASE(VMCVTPH2HF8S)
35384 NODE_NAME_CASE(VCVTHF82PH)
35385 NODE_NAME_CASE(AESENC128KL)
35386 NODE_NAME_CASE(AESDEC128KL)
35387 NODE_NAME_CASE(AESENC256KL)
35388 NODE_NAME_CASE(AESDEC256KL)
35389 NODE_NAME_CASE(AESENCWIDE128KL)
35390 NODE_NAME_CASE(AESDECWIDE128KL)
35391 NODE_NAME_CASE(AESENCWIDE256KL)
35392 NODE_NAME_CASE(AESDECWIDE256KL)
35393 NODE_NAME_CASE(CMPCCXADD)
35394 NODE_NAME_CASE(TESTUI)
35395 NODE_NAME_CASE(FP80_ADD)
35396 NODE_NAME_CASE(STRICT_FP80_ADD)
35397 NODE_NAME_CASE(CCMP)
35398 NODE_NAME_CASE(CTEST)
35399 NODE_NAME_CASE(CLOAD)
35400 NODE_NAME_CASE(CSTORE)
35401 NODE_NAME_CASE(CVTTS2SIS)
35402 NODE_NAME_CASE(CVTTS2UIS)
35403 NODE_NAME_CASE(CVTTS2SIS_SAE)
35404 NODE_NAME_CASE(CVTTS2UIS_SAE)
35405 NODE_NAME_CASE(CVTTP2SIS)
35406 NODE_NAME_CASE(MCVTTP2SIS)
35407 NODE_NAME_CASE(CVTTP2UIS_SAE)
35408 NODE_NAME_CASE(CVTTP2SIS_SAE)
35409 NODE_NAME_CASE(CVTTP2UIS)
35410 NODE_NAME_CASE(MCVTTP2UIS)
35411 NODE_NAME_CASE(POP_FROM_X87_REG)
35412 }
35413 return nullptr;
35414#undef NODE_NAME_CASE
35415}
35416
35417/// Return true if the addressing mode represented by AM is legal for this
35418/// target, for a load/store of the specified type.
35419 bool X86TargetLowering::isLegalAddressingMode(const DataLayout &DL,
35420 const AddrMode &AM, Type *Ty,
35421 unsigned AS,
35422 Instruction *I) const {
35423 // X86 supports extremely general addressing modes.
35424 CodeModel::Model M = getTargetMachine().getCodeModel();
35425
35426 // X86 allows a sign-extended 32-bit immediate field as a displacement.
35427 if (!X86::isOffsetSuitableForCodeModel(AM.BaseOffs, M, AM.BaseGV != nullptr))
35428 return false;
35429
35430 if (AM.BaseGV) {
35431 unsigned GVFlags = Subtarget.classifyGlobalReference(AM.BaseGV);
35432
35433 // If a reference to this global requires an extra load, we can't fold it.
35434 if (isGlobalStubReference(GVFlags))
35435 return false;
35436
35437 // If BaseGV requires a register for the PIC base, we cannot also have a
35438 // BaseReg specified.
35439 if (AM.HasBaseReg && isGlobalRelativeToPICBase(GVFlags))
35440 return false;
35441
35442 // If lower 4G is not available, then we must use rip-relative addressing.
35443 if ((M != CodeModel::Small || isPositionIndependent()) &&
35444 Subtarget.is64Bit() && (AM.BaseOffs || AM.Scale > 1))
35445 return false;
35446 }
35447
35448 switch (AM.Scale) {
35449 case 0:
35450 case 1:
35451 case 2:
35452 case 4:
35453 case 8:
35454 // These scales always work.
35455 break;
35456 case 3:
35457 case 5:
35458 case 9:
35459 // These scales are formed with basereg+scalereg. Only accept if there is
35460 // no basereg yet.
35461 if (AM.HasBaseReg)
35462 return false;
35463 break;
35464 default: // Other stuff never works.
35465 return false;
35466 }
35467
35468 return true;
35469}
35470
35471bool X86TargetLowering::isBinOp(unsigned Opcode) const {
35472 switch (Opcode) {
35473 // These are non-commutative binops.
35474 // TODO: Add more X86ISD opcodes once we have test coverage.
35475 case X86ISD::ANDNP:
35476 case X86ISD::PCMPGT:
35477 case X86ISD::FMAX:
35478 case X86ISD::FMIN:
35479 case X86ISD::FANDN:
35480 case X86ISD::VPSHA:
35481 case X86ISD::VPSHL:
35482 case X86ISD::VSHLV:
35483 case X86ISD::VSRLV:
35484 case X86ISD::VSRAV:
35485 return true;
35486 }
35487
35488 return TargetLoweringBase::isBinOp(Opcode);
35489}
35490
35491bool X86TargetLowering::isCommutativeBinOp(unsigned Opcode) const {
35492 switch (Opcode) {
35493 // TODO: Add more X86ISD opcodes once we have test coverage.
35494 case X86ISD::PCMPEQ:
35495 case X86ISD::PMULDQ:
35496 case X86ISD::PMULUDQ:
35497 case X86ISD::FMAXC:
35498 case X86ISD::FMINC:
35499 case X86ISD::FAND:
35500 case X86ISD::FOR:
35501 case X86ISD::FXOR:
35502 return true;
35503 }
35504
35506}
35507
35508 bool X86TargetLowering::isTruncateFree(Type *Ty1, Type *Ty2) const {
35509 if (!Ty1->isIntegerTy() || !Ty2->isIntegerTy())
35510 return false;
35511 unsigned NumBits1 = Ty1->getPrimitiveSizeInBits();
35512 unsigned NumBits2 = Ty2->getPrimitiveSizeInBits();
35513 return NumBits1 > NumBits2;
35514}
35515
35516 bool X86TargetLowering::allowTruncateForTailCall(Type *Ty1, Type *Ty2) const {
35517 if (!Ty1->isIntegerTy() || !Ty2->isIntegerTy())
35518 return false;
35519
35520 if (!isTypeLegal(EVT::getEVT(Ty1)))
35521 return false;
35522
35523 assert(Ty1->getPrimitiveSizeInBits() <= 64 && "i128 is probably not a noop");
35524
35525 // Assuming the caller doesn't have a zeroext or signext return parameter,
35526 // truncation all the way down to i1 is valid.
35527 return true;
35528}
35529
35530 bool X86TargetLowering::isLegalICmpImmediate(int64_t Imm) const {
35531 return isInt<32>(Imm);
35532}
35533
35534 bool X86TargetLowering::isLegalAddImmediate(int64_t Imm) const {
35535 // Can also use sub to handle negated immediates.
35536 return isInt<32>(Imm);
35537}
35538
35539 bool X86TargetLowering::isLegalStoreImmediate(int64_t Imm) const {
35540 return isInt<32>(Imm);
35541}
35542
35543 bool X86TargetLowering::isTruncateFree(EVT VT1, EVT VT2) const {
35544 if (!VT1.isScalarInteger() || !VT2.isScalarInteger())
35545 return false;
35546 unsigned NumBits1 = VT1.getSizeInBits();
35547 unsigned NumBits2 = VT2.getSizeInBits();
35548 return NumBits1 > NumBits2;
35549}
35550
35551 bool X86TargetLowering::isZExtFree(Type *Ty1, Type *Ty2) const {
35552 // x86-64 implicitly zero-extends 32-bit results in 64-bit registers.
35553 return Ty1->isIntegerTy(32) && Ty2->isIntegerTy(64) && Subtarget.is64Bit();
35554}
35555
35556 bool X86TargetLowering::isZExtFree(EVT VT1, EVT VT2) const {
35557 // x86-64 implicitly zero-extends 32-bit results in 64-bit registers.
35558 return VT1 == MVT::i32 && VT2 == MVT::i64 && Subtarget.is64Bit();
35559}
35560
35561 bool X86TargetLowering::isZExtFree(SDValue Val, EVT VT2) const {
35562 EVT VT1 = Val.getValueType();
35563 if (isZExtFree(VT1, VT2))
35564 return true;
35565
35566 if (Val.getOpcode() != ISD::LOAD)
35567 return false;
35568
35569 if (!VT1.isSimple() || !VT1.isInteger() ||
35570 !VT2.isSimple() || !VT2.isInteger())
35571 return false;
35572
35573 switch (VT1.getSimpleVT().SimpleTy) {
35574 default: break;
35575 case MVT::i8:
35576 case MVT::i16:
35577 case MVT::i32:
35578 // X86 has 8, 16, and 32-bit zero-extending loads.
35579 return true;
35580 }
35581
35582 return false;
35583}
35584
35585 bool X86TargetLowering::shouldConvertPhiType(Type *From, Type *To) const {
35586 if (!Subtarget.is64Bit())
35587 return false;
35588 return TargetLowering::shouldConvertPhiType(From, To);
35589}
35590
35591 bool X86TargetLowering::isVectorLoadExtDesirable(SDValue ExtVal) const {
35592 if (isa<MaskedLoadSDNode>(ExtVal.getOperand(0)))
35593 return false;
35594
35595 EVT SrcVT = ExtVal.getOperand(0).getValueType();
35596
35597 // There is no extending load for vXi1.
35598 if (SrcVT.getScalarType() == MVT::i1)
35599 return false;
35600
35601 return true;
35602}
35603
35604 bool X86TargetLowering::isFMAFasterThanFMulAndFAdd(const MachineFunction &MF,
35605 EVT VT) const {
35606 if (Subtarget.useSoftFloat())
35607 return false;
35608
35609 if (!Subtarget.hasAnyFMA())
35610 return false;
35611
35612 VT = VT.getScalarType();
35613
35614 if (!VT.isSimple())
35615 return false;
35616
35617 switch (VT.getSimpleVT().SimpleTy) {
35618 case MVT::f16:
35619 return Subtarget.hasFP16();
35620 case MVT::f32:
35621 case MVT::f64:
35622 return true;
35623 default:
35624 break;
35625 }
35626
35627 return false;
35628}
35629
35630 bool X86TargetLowering::isNarrowingProfitable(SDNode *N, EVT SrcVT,
35631 EVT DestVT) const {
35632 // i16 instructions are longer (0x66 prefix) and potentially slower.
35633 return !(SrcVT == MVT::i32 && DestVT == MVT::i16);
35634}
35635
35636 bool X86TargetLowering::shouldFoldSelectWithIdentityConstant(
35637 unsigned BinOpcode, EVT VT, unsigned SelectOpcode, SDValue X,
35638 SDValue Y) const {
35639 if (SelectOpcode == ISD::SELECT) {
35640 if (VT.isVector())
35641 return false;
35642 if (!Subtarget.hasBMI() || (VT != MVT::i32 && VT != MVT::i64))
35643 return false;
35644 using namespace llvm::SDPatternMatch;
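// BMI1 patterns matched below: BLSI isolates the lowest set bit (X & -X),
// BLSR clears it (X & (X - 1)), and BLSMSK produces a mask up to and
// including it (X ^ (X - 1)); each pattern is matched with either operand
// order.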
35645 // BLSI
35646 if (BinOpcode == ISD::AND && (sd_match(Y, m_Neg(m_Specific(X))) ||
35647 sd_match(X, m_Neg(m_Specific(Y)))))
35648 return true;
35649 // BLSR
35650 if (BinOpcode == ISD::AND &&
35651 (sd_match(Y, m_Add(m_Specific(X), m_AllOnes())) ||
35652 sd_match(X, m_Add(m_Specific(Y), m_AllOnes()))))
35653 return true;
35654 // BLSMSK
35655 if (BinOpcode == ISD::XOR &&
35656 (sd_match(Y, m_Add(m_Specific(X), m_AllOnes())) ||
35657 sd_match(X, m_Add(m_Specific(Y), m_AllOnes()))))
35658 return true;
35659
35660 return false;
35661 }
35662 // TODO: This is too general. There are cases where pre-AVX512 codegen would
35663 // benefit. The transform may also be profitable for scalar code.
35664 if (!Subtarget.hasAVX512())
35665 return false;
35666 if (!Subtarget.hasVLX() && !VT.is512BitVector())
35667 return false;
35668 if (!VT.isVector() || VT.getScalarType() == MVT::i1)
35669 return false;
35670
35671 return true;
35672}
35673
35674/// Targets can use this to indicate that they only support *some*
35675/// VECTOR_SHUFFLE operations, those with specific masks.
35676/// By default, if a target supports the VECTOR_SHUFFLE node, all mask values
35677/// are assumed to be legal.
35678 bool X86TargetLowering::isShuffleMaskLegal(ArrayRef<int> Mask, EVT VT) const {
35679 if (!VT.isSimple())
35680 return false;
35681
35682 // Not for i1 vectors
35683 if (VT.getSimpleVT().getScalarType() == MVT::i1)
35684 return false;
35685
35686 // Very little shuffling can be done for 64-bit vectors right now.
35687 if (VT.getSimpleVT().getSizeInBits() == 64)
35688 return false;
35689
35690 // We only care that the types being shuffled are legal. The lowering can
35691 // handle any possible shuffle mask that results.
35692 return isTypeLegal(VT.getSimpleVT());
35693}
35694
35695 bool X86TargetLowering::isVectorClearMaskLegal(ArrayRef<int> Mask,
35696 EVT VT) const {
35697 // Don't convert an 'and' into a shuffle that we don't directly support.
35698 // vpblendw and vpshufb for 256-bit vectors are not available on AVX1.
35699 if (!Subtarget.hasAVX2())
35700 if (VT == MVT::v32i8 || VT == MVT::v16i16)
35701 return false;
35702
35703 // Just delegate to the generic legality, clear masks aren't special.
35704 return isShuffleMaskLegal(Mask, VT);
35705}
35706
35707 bool X86TargetLowering::areJTsAllowed(const Function *Fn) const {
35708 // If the subtarget is using thunks, we must not generate jump tables.
35709 if (Subtarget.useIndirectThunkBranches())
35710 return false;
35711
35712 // Otherwise, fallback on the generic logic.
35714}
35715
35716 MVT X86TargetLowering::getPreferredSwitchConditionType(LLVMContext &Context,
35717 EVT ConditionVT) const {
35718 // Avoid 8 and 16 bit types because they increase the chance for unnecessary
35719 // zero-extensions.
35720 if (ConditionVT.getSizeInBits() < 32)
35721 return MVT::i32;
35722 return TargetLoweringBase::getPreferredSwitchConditionType(Context,
35723 ConditionVT);
35724}
35725
35726//===----------------------------------------------------------------------===//
35727// X86 Scheduler Hooks
35728//===----------------------------------------------------------------------===//
35729
35730/// Utility function to emit xbegin specifying the start of an RTM region.
35731 static MachineBasicBlock *emitXBegin(MachineInstr &MI, MachineBasicBlock *MBB,
35732 const TargetInstrInfo *TII) {
35733 const MIMetadata MIMD(MI);
35734
35735 const BasicBlock *BB = MBB->getBasicBlock();
35736 MachineFunction::iterator I = ++MBB->getIterator();
35737
35738 // For the v = xbegin(), we generate
35739 //
35740 // thisMBB:
35741 // xbegin fallMBB
35742 //
35743 // mainMBB:
35744 // s0 = -1
35745 //
35746 // fallBB:
35747 // eax = # XABORT_DEF
35748 // s1 = eax
35749 //
35750 // sinkMBB:
35751 // v = phi(s0/mainBB, s1/fallBB)
35752
35753 MachineBasicBlock *thisMBB = MBB;
35754 MachineFunction *MF = MBB->getParent();
35755 MachineBasicBlock *mainMBB = MF->CreateMachineBasicBlock(BB);
35756 MachineBasicBlock *fallMBB = MF->CreateMachineBasicBlock(BB);
35757 MachineBasicBlock *sinkMBB = MF->CreateMachineBasicBlock(BB);
35758 MF->insert(I, mainMBB);
35759 MF->insert(I, fallMBB);
35760 MF->insert(I, sinkMBB);
35761
35762 if (isPhysRegUsedAfter(X86::EFLAGS, MI)) {
35763 mainMBB->addLiveIn(X86::EFLAGS);
35764 fallMBB->addLiveIn(X86::EFLAGS);
35765 sinkMBB->addLiveIn(X86::EFLAGS);
35766 }
35767
35768 // Transfer the remainder of BB and its successor edges to sinkMBB.
35769 sinkMBB->splice(sinkMBB->begin(), MBB,
35770 std::next(MachineBasicBlock::iterator(MI)), MBB->end());
35771 sinkMBB->transferSuccessorsAndUpdatePHIs(MBB);
35772
35773 MachineRegisterInfo &MRI = MF->getRegInfo();
35774 Register DstReg = MI.getOperand(0).getReg();
35775 const TargetRegisterClass *RC = MRI.getRegClass(DstReg);
35776 Register mainDstReg = MRI.createVirtualRegister(RC);
35777 Register fallDstReg = MRI.createVirtualRegister(RC);
35778
35779 // thisMBB:
35780 // xbegin fallMBB
35781 // # fallthrough to mainMBB
35782 // # abort to fallMBB
35783 BuildMI(thisMBB, MIMD, TII->get(X86::XBEGIN_4)).addMBB(fallMBB);
35784 thisMBB->addSuccessor(mainMBB);
35785 thisMBB->addSuccessor(fallMBB);
35786
35787 // mainMBB:
35788 // mainDstReg := -1
35789 BuildMI(mainMBB, MIMD, TII->get(X86::MOV32ri), mainDstReg).addImm(-1);
35790 BuildMI(mainMBB, MIMD, TII->get(X86::JMP_1)).addMBB(sinkMBB);
35791 mainMBB->addSuccessor(sinkMBB);
35792
35793 // fallMBB:
35794 // ; pseudo instruction to model hardware's definition from XABORT
35795 // EAX := XABORT_DEF
35796 // fallDstReg := EAX
35797 BuildMI(fallMBB, MIMD, TII->get(X86::XABORT_DEF));
35798 BuildMI(fallMBB, MIMD, TII->get(TargetOpcode::COPY), fallDstReg)
35799 .addReg(X86::EAX);
35800 fallMBB->addSuccessor(sinkMBB);
35801
35802 // sinkMBB:
35803 // DstReg := phi(mainDstReg/mainBB, fallDstReg/fallBB)
35804 BuildMI(*sinkMBB, sinkMBB->begin(), MIMD, TII->get(X86::PHI), DstReg)
35805 .addReg(mainDstReg).addMBB(mainMBB)
35806 .addReg(fallDstReg).addMBB(fallMBB);
35807
35808 MI.eraseFromParent();
35809 return sinkMBB;
35810}
35811
35812 MachineBasicBlock *
35813 X86TargetLowering::EmitVAARGWithCustomInserter(MachineInstr &MI,
35814 MachineBasicBlock *MBB) const {
35815 // Emit va_arg instruction on X86-64.
35816
35817 // Operands to this pseudo-instruction:
35818 // 0 ) Output : destination address (reg)
35819 // 1-5) Input : va_list address (addr, i64mem)
35820 // 6 ) ArgSize : Size (in bytes) of vararg type
35821 // 7 ) ArgMode : 0=overflow only, 1=use gp_offset, 2=use fp_offset
35822 // 8 ) Align : Alignment of type
35823 // 9 ) EFLAGS (implicit-def)
35824
35825 assert(MI.getNumOperands() == 10 && "VAARG should have 10 operands!");
35826 static_assert(X86::AddrNumOperands == 5, "VAARG assumes 5 address operands");
35827
35828 Register DestReg = MI.getOperand(0).getReg();
35829 MachineOperand &Base = MI.getOperand(1);
35830 MachineOperand &Scale = MI.getOperand(2);
35831 MachineOperand &Index = MI.getOperand(3);
35832 MachineOperand &Disp = MI.getOperand(4);
35833 MachineOperand &Segment = MI.getOperand(5);
35834 unsigned ArgSize = MI.getOperand(6).getImm();
35835 unsigned ArgMode = MI.getOperand(7).getImm();
35836 Align Alignment = Align(MI.getOperand(8).getImm());
35837
35838 MachineFunction *MF = MBB->getParent();
35839
35840 // Memory Reference
35841 assert(MI.hasOneMemOperand() && "Expected VAARG to have one memoperand");
35842
35843 MachineMemOperand *OldMMO = MI.memoperands().front();
35844
35845 // Clone the MMO into two separate MMOs for loading and storing
35846 MachineMemOperand *LoadOnlyMMO = MF->getMachineMemOperand(
35847 OldMMO, OldMMO->getFlags() & ~MachineMemOperand::MOStore);
35848 MachineMemOperand *StoreOnlyMMO = MF->getMachineMemOperand(
35849 OldMMO, OldMMO->getFlags() & ~MachineMemOperand::MOLoad);
35850
35851 // Machine Information
35852 const TargetInstrInfo *TII = Subtarget.getInstrInfo();
35853 MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo();
35854 const TargetRegisterClass *AddrRegClass =
35855 getRegClassFor(getPointerTy(MBB->getParent()->getDataLayout()));
35856 const TargetRegisterClass *OffsetRegClass = getRegClassFor(MVT::i32);
35857 const MIMetadata MIMD(MI);
35858
35859 // struct va_list {
35860 // i32 gp_offset
35861 // i32 fp_offset
35862 // i64 overflow_area (address)
35863 // i64 reg_save_area (address)
35864 // }
35865 // sizeof(va_list) = 24
35866 // alignment(va_list) = 8
35867
35868 unsigned TotalNumIntRegs = 6;
35869 unsigned TotalNumXMMRegs = 8;
35870 bool UseGPOffset = (ArgMode == 1);
35871 bool UseFPOffset = (ArgMode == 2);
35872 unsigned MaxOffset = TotalNumIntRegs * 8 +
35873 (UseFPOffset ? TotalNumXMMRegs * 16 : 0);
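// Per the SysV x86-64 ABI, the register save area holds the 6 integer
// argument registers (8 bytes each, gp_offset 0..48) followed by the 8
// vector registers (16 bytes each, fp_offset 48..176), which is where the
// MaxOffset bound above comes from.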
35874
35875 /* Align ArgSize to a multiple of 8 */
35876 unsigned ArgSizeA8 = (ArgSize + 7) & ~7;
35877 bool NeedsAlign = (Alignment > 8);
35878
35879 MachineBasicBlock *thisMBB = MBB;
35880 MachineBasicBlock *overflowMBB;
35881 MachineBasicBlock *offsetMBB;
35882 MachineBasicBlock *endMBB;
35883
35884 Register OffsetDestReg; // Argument address computed by offsetMBB
35885 Register OverflowDestReg; // Argument address computed by overflowMBB
35886 Register OffsetReg;
35887
35888 if (!UseGPOffset && !UseFPOffset) {
35889 // If we only pull from the overflow region, we don't create a branch.
35890 // We don't need to alter control flow.
35891 OffsetDestReg = Register(); // unused
35892 OverflowDestReg = DestReg;
35893
35894 offsetMBB = nullptr;
35895 overflowMBB = thisMBB;
35896 endMBB = thisMBB;
35897 } else {
35898 // First emit code to check if gp_offset (or fp_offset) is below the bound.
35899 // If so, pull the argument from reg_save_area. (branch to offsetMBB)
35900 // If not, pull from overflow_area. (branch to overflowMBB)
35901 //
35902 // thisMBB
35903 // | .
35904 // | .
35905 // offsetMBB overflowMBB
35906 // | .
35907 // | .
35908 // endMBB
35909
35910 // Registers for the PHI in endMBB
35911 OffsetDestReg = MRI.createVirtualRegister(AddrRegClass);
35912 OverflowDestReg = MRI.createVirtualRegister(AddrRegClass);
35913
35914 const BasicBlock *LLVM_BB = MBB->getBasicBlock();
35915 overflowMBB = MF->CreateMachineBasicBlock(LLVM_BB);
35916 offsetMBB = MF->CreateMachineBasicBlock(LLVM_BB);
35917 endMBB = MF->CreateMachineBasicBlock(LLVM_BB);
35918
35919 MachineFunction::iterator MBBIter = ++MBB->getIterator();
35920
35921 // Insert the new basic blocks
35922 MF->insert(MBBIter, offsetMBB);
35923 MF->insert(MBBIter, overflowMBB);
35924 MF->insert(MBBIter, endMBB);
35925
35926 // Transfer the remainder of MBB and its successor edges to endMBB.
35927 endMBB->splice(endMBB->begin(), thisMBB,
35928 std::next(MachineBasicBlock::iterator(MI)), thisMBB->end());
35929 endMBB->transferSuccessorsAndUpdatePHIs(thisMBB);
35930
35931 // Make offsetMBB and overflowMBB successors of thisMBB
35932 thisMBB->addSuccessor(offsetMBB);
35933 thisMBB->addSuccessor(overflowMBB);
35934
35935 // endMBB is a successor of both offsetMBB and overflowMBB
35936 offsetMBB->addSuccessor(endMBB);
35937 overflowMBB->addSuccessor(endMBB);
35938
35939 // Load the offset value into a register
35940 OffsetReg = MRI.createVirtualRegister(OffsetRegClass);
35941 BuildMI(thisMBB, MIMD, TII->get(X86::MOV32rm), OffsetReg)
35942 .add(Base)
35943 .add(Scale)
35944 .add(Index)
35945 .addDisp(Disp, UseFPOffset ? 4 : 0)
35946 .add(Segment)
35947 .setMemRefs(LoadOnlyMMO);
35948
35949 // Check if there is enough room left to pull this argument.
35950 BuildMI(thisMBB, MIMD, TII->get(X86::CMP32ri))
35951 .addReg(OffsetReg)
35952 .addImm(MaxOffset + 8 - ArgSizeA8);
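  // For example (illustrative numbers): an 8-byte integer argument compares
  // gp_offset against 48 + 8 - 8 = 48, so gp_offset values 0..40 still use the
  // reg_save_area and 48 takes the overflow path; a double compares fp_offset
  // against 176 + 8 - 8 = 176.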
35953
35954 // Branch to "overflowMBB" if offset >= max
35955 // Fall through to "offsetMBB" otherwise
35956 BuildMI(thisMBB, MIMD, TII->get(X86::JCC_1))
35957 .addMBB(overflowMBB).addImm(X86::COND_AE);
35958 }
35959
35960 // In offsetMBB, emit code to use the reg_save_area.
35961 if (offsetMBB) {
35962 assert(OffsetReg != 0);
35963
35964 // Read the reg_save_area address.
35965 Register RegSaveReg = MRI.createVirtualRegister(AddrRegClass);
35966 BuildMI(
35967 offsetMBB, MIMD,
35968 TII->get(Subtarget.isTarget64BitLP64() ? X86::MOV64rm : X86::MOV32rm),
35969 RegSaveReg)
35970 .add(Base)
35971 .add(Scale)
35972 .add(Index)
35973 .addDisp(Disp, Subtarget.isTarget64BitLP64() ? 16 : 12)
35974 .add(Segment)
35975 .setMemRefs(LoadOnlyMMO);
35976
35977 if (Subtarget.isTarget64BitLP64()) {
35978 // Zero-extend the offset
35979 Register OffsetReg64 = MRI.createVirtualRegister(AddrRegClass);
35980 BuildMI(offsetMBB, MIMD, TII->get(X86::SUBREG_TO_REG), OffsetReg64)
35981 .addImm(0)
35982 .addReg(OffsetReg)
35983 .addImm(X86::sub_32bit);
35984
35985 // Add the offset to the reg_save_area to get the final address.
35986 BuildMI(offsetMBB, MIMD, TII->get(X86::ADD64rr), OffsetDestReg)
35987 .addReg(OffsetReg64)
35988 .addReg(RegSaveReg);
35989 } else {
35990 // Add the offset to the reg_save_area to get the final address.
35991 BuildMI(offsetMBB, MIMD, TII->get(X86::ADD32rr), OffsetDestReg)
35992 .addReg(OffsetReg)
35993 .addReg(RegSaveReg);
35994 }
35995
35996 // Compute the offset for the next argument
35997 Register NextOffsetReg = MRI.createVirtualRegister(OffsetRegClass);
35998 BuildMI(offsetMBB, MIMD, TII->get(X86::ADD32ri), NextOffsetReg)
35999 .addReg(OffsetReg)
36000 .addImm(UseFPOffset ? 16 : 8);
36001
36002 // Store it back into the va_list.
36003 BuildMI(offsetMBB, MIMD, TII->get(X86::MOV32mr))
36004 .add(Base)
36005 .add(Scale)
36006 .add(Index)
36007 .addDisp(Disp, UseFPOffset ? 4 : 0)
36008 .add(Segment)
36009 .addReg(NextOffsetReg)
36010 .setMemRefs(StoreOnlyMMO);
36011
36012 // Jump to endMBB
36013 BuildMI(offsetMBB, MIMD, TII->get(X86::JMP_1))
36014 .addMBB(endMBB);
36015 }
36016
36017 //
36018 // Emit code to use overflow area
36019 //
36020
36021 // Load the overflow_area address into a register.
36022 Register OverflowAddrReg = MRI.createVirtualRegister(AddrRegClass);
36023 BuildMI(overflowMBB, MIMD,
36024 TII->get(Subtarget.isTarget64BitLP64() ? X86::MOV64rm : X86::MOV32rm),
36025 OverflowAddrReg)
36026 .add(Base)
36027 .add(Scale)
36028 .add(Index)
36029 .addDisp(Disp, 8)
36030 .add(Segment)
36031 .setMemRefs(LoadOnlyMMO);
36032
36033 // If we need to align it, do so. Otherwise, just copy the address
36034 // to OverflowDestReg.
36035 if (NeedsAlign) {
36036 // Align the overflow address
36037 Register TmpReg = MRI.createVirtualRegister(AddrRegClass);
36038
36039 // aligned_addr = (addr + (align-1)) & ~(align-1)
36040 BuildMI(
36041 overflowMBB, MIMD,
36042 TII->get(Subtarget.isTarget64BitLP64() ? X86::ADD64ri32 : X86::ADD32ri),
36043 TmpReg)
36044 .addReg(OverflowAddrReg)
36045 .addImm(Alignment.value() - 1);
36046
36047 BuildMI(
36048 overflowMBB, MIMD,
36049 TII->get(Subtarget.isTarget64BitLP64() ? X86::AND64ri32 : X86::AND32ri),
36050 OverflowDestReg)
36051 .addReg(TmpReg)
36052 .addImm(~(uint64_t)(Alignment.value() - 1));
36053 } else {
36054 BuildMI(overflowMBB, MIMD, TII->get(TargetOpcode::COPY), OverflowDestReg)
36055 .addReg(OverflowAddrReg);
36056 }
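  // Worked example (illustrative): with Alignment = 16 and an overflow address
  // of 0x7fffffffe348, TmpReg becomes 0x7fffffffe357 and masking with ~15
  // yields 0x7fffffffe350, the next 16-byte-aligned address.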
36057
36058 // Compute the next overflow address after this argument.
36059 // (the overflow address should be kept 8-byte aligned)
36060 Register NextAddrReg = MRI.createVirtualRegister(AddrRegClass);
36061 BuildMI(
36062 overflowMBB, MIMD,
36063 TII->get(Subtarget.isTarget64BitLP64() ? X86::ADD64ri32 : X86::ADD32ri),
36064 NextAddrReg)
36065 .addReg(OverflowDestReg)
36066 .addImm(ArgSizeA8);
36067
36068 // Store the new overflow address.
36069 BuildMI(overflowMBB, MIMD,
36070 TII->get(Subtarget.isTarget64BitLP64() ? X86::MOV64mr : X86::MOV32mr))
36071 .add(Base)
36072 .add(Scale)
36073 .add(Index)
36074 .addDisp(Disp, 8)
36075 .add(Segment)
36076 .addReg(NextAddrReg)
36077 .setMemRefs(StoreOnlyMMO);
36078
36079 // If we branched, emit the PHI to the front of endMBB.
36080 if (offsetMBB) {
36081 BuildMI(*endMBB, endMBB->begin(), MIMD,
36082 TII->get(X86::PHI), DestReg)
36083 .addReg(OffsetDestReg).addMBB(offsetMBB)
36084 .addReg(OverflowDestReg).addMBB(overflowMBB);
36085 }
36086
36087 // Erase the pseudo instruction
36088 MI.eraseFromParent();
36089
36090 return endMBB;
36091}
36092
36093// The EFLAGS operand of SelectItr might be missing a kill marker
36094// because there were multiple uses of EFLAGS, and ISel didn't know
36095// which to mark. Figure out whether SelectItr should have had a
36096// kill marker, and set it if it should. Returns the correct kill
36097// marker value.
36098static bool checkAndUpdateEFLAGSKill(MachineBasicBlock::iterator SelectItr,
36099 MachineBasicBlock* BB,
36100 const TargetRegisterInfo* TRI) {
36101 if (isPhysRegUsedAfter(X86::EFLAGS, SelectItr))
36102 return false;
36103
36104 // We found a def, or hit the end of the basic block and EFLAGS wasn't live
36105 // out. SelectMI should have a kill flag on EFLAGS.
36106 SelectItr->addRegisterKilled(X86::EFLAGS, TRI);
36107 return true;
36108}
36109
36110// Return true if it is OK for this CMOV pseudo-opcode to be cascaded
36111// together with other CMOV pseudo-opcodes into a single basic-block with
36112// conditional jump around it.
36113static bool isCMOVPseudo(MachineInstr &MI) {
36114 switch (MI.getOpcode()) {
36115 case X86::CMOV_FR16:
36116 case X86::CMOV_FR16X:
36117 case X86::CMOV_FR32:
36118 case X86::CMOV_FR32X:
36119 case X86::CMOV_FR64:
36120 case X86::CMOV_FR64X:
36121 case X86::CMOV_GR8:
36122 case X86::CMOV_GR16:
36123 case X86::CMOV_GR32:
36124 case X86::CMOV_RFP32:
36125 case X86::CMOV_RFP64:
36126 case X86::CMOV_RFP80:
36127 case X86::CMOV_VR64:
36128 case X86::CMOV_VR128:
36129 case X86::CMOV_VR128X:
36130 case X86::CMOV_VR256:
36131 case X86::CMOV_VR256X:
36132 case X86::CMOV_VR512:
36133 case X86::CMOV_VK1:
36134 case X86::CMOV_VK2:
36135 case X86::CMOV_VK4:
36136 case X86::CMOV_VK8:
36137 case X86::CMOV_VK16:
36138 case X86::CMOV_VK32:
36139 case X86::CMOV_VK64:
36140 return true;
36141
36142 default:
36143 return false;
36144 }
36145}
36146
36147// Helper function, which inserts PHI functions into SinkMBB:
36148// %Result(i) = phi [ %FalseValue(i), FalseMBB ], [ %TrueValue(i), TrueMBB ],
36149// where %FalseValue(i) and %TrueValue(i) are taken from the consequent CMOVs
36150// in [MIItBegin, MIItEnd) range. It returns the last MachineInstrBuilder for
36151// the last PHI function inserted.
36152static MachineInstrBuilder createPHIsForCMOVsInSinkBB(
36153 MachineBasicBlock::iterator MIItBegin, MachineBasicBlock::iterator MIItEnd,
36154 MachineBasicBlock *TrueMBB, MachineBasicBlock *FalseMBB,
36155 MachineBasicBlock *SinkMBB) {
36156 MachineFunction *MF = TrueMBB->getParent();
36157 const TargetInstrInfo *TII = MF->getSubtarget().getInstrInfo();
36158 const MIMetadata MIMD(*MIItBegin);
36159
36160 X86::CondCode CC = X86::CondCode(MIItBegin->getOperand(3).getImm());
36161 X86::CondCode OppCC = X86::GetOppositeBranchCondition(CC);
36162
36163 MachineBasicBlock::iterator SinkInsertionPoint = SinkMBB->begin();
36164
36165 // As we are creating the PHIs, we have to be careful if there is more than
36166 // one. Later CMOVs may reference the results of earlier CMOVs, but later
36167 // PHIs have to reference the individual true/false inputs from earlier PHIs.
36168 // That also means that PHI construction must work forward from earlier to
36169 // later, and that the code must maintain a mapping from earlier PHI's
36170 // destination registers, and the registers that went into the PHI.
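  // A small illustration (mirroring the case-1 example in EmitLoweredSelect
  // below; the register names are made up): for
  //   %t2 = CMOV %t1, %f1, cc
  //   %t3 = CMOV %t2, %f2, cc
  // the table records %t2 -> (%t1, %f1), so the second PHI reads the original
  // inputs instead of the first PHI's result:
  //   %t2 = PHI [%t1, FalseMBB], [%f1, TrueMBB]
  //   %t3 = PHI [%t1, FalseMBB], [%f2, TrueMBB]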
36171 DenseMap<Register, std::pair<Register, Register>> RegRewriteTable;
36172 MachineInstrBuilder MIB;
36173
36174 for (MachineBasicBlock::iterator MIIt = MIItBegin; MIIt != MIItEnd; ++MIIt) {
36175 Register DestReg = MIIt->getOperand(0).getReg();
36176 Register Op1Reg = MIIt->getOperand(1).getReg();
36177 Register Op2Reg = MIIt->getOperand(2).getReg();
36178
36179 // If this CMOV we are generating is the opposite condition from
36180 // the jump we generated, then we have to swap the operands for the
36181 // PHI that is going to be generated.
36182 if (MIIt->getOperand(3).getImm() == OppCC)
36183 std::swap(Op1Reg, Op2Reg);
36184
36185 if (auto It = RegRewriteTable.find(Op1Reg); It != RegRewriteTable.end())
36186 Op1Reg = It->second.first;
36187
36188 if (auto It = RegRewriteTable.find(Op2Reg); It != RegRewriteTable.end())
36189 Op2Reg = It->second.second;
36190
36191 MIB =
36192 BuildMI(*SinkMBB, SinkInsertionPoint, MIMD, TII->get(X86::PHI), DestReg)
36193 .addReg(Op1Reg)
36194 .addMBB(FalseMBB)
36195 .addReg(Op2Reg)
36196 .addMBB(TrueMBB);
36197
36198 // Add this PHI to the rewrite table.
36199 RegRewriteTable[DestReg] = std::make_pair(Op1Reg, Op2Reg);
36200 }
36201
36202 return MIB;
36203}
36204
36205// Lower cascaded selects in the form of (SecondCmov (FirstCMOV F, T, cc1), T, cc2).
36206MachineBasicBlock *
36207X86TargetLowering::EmitLoweredCascadedSelect(MachineInstr &FirstCMOV,
36208 MachineInstr &SecondCascadedCMOV,
36209 MachineBasicBlock *ThisMBB) const {
36210 const TargetInstrInfo *TII = Subtarget.getInstrInfo();
36211 const MIMetadata MIMD(FirstCMOV);
36212
36213 // We lower cascaded CMOVs such as
36214 //
36215 // (SecondCascadedCMOV (FirstCMOV F, T, cc1), T, cc2)
36216 //
36217 // to two successive branches.
36218 //
36219 // Without this, we would add a PHI between the two jumps, which ends up
36220 // creating a few copies all around. For instance, for
36221 //
36222 // (sitofp (zext (fcmp une)))
36223 //
36224 // we would generate:
36225 //
36226 // ucomiss %xmm1, %xmm0
36227 // movss <1.0f>, %xmm0
36228 // movaps %xmm0, %xmm1
36229 // jne .LBB5_2
36230 // xorps %xmm1, %xmm1
36231 // .LBB5_2:
36232 // jp .LBB5_4
36233 // movaps %xmm1, %xmm0
36234 // .LBB5_4:
36235 // retq
36236 //
36237 // because this custom-inserter would have generated:
36238 //
36239 // A
36240 // | \
36241 // | B
36242 // | /
36243 // C
36244 // | \
36245 // | D
36246 // | /
36247 // E
36248 //
36249 // A: X = ...; Y = ...
36250 // B: empty
36251 // C: Z = PHI [X, A], [Y, B]
36252 // D: empty
36253 // E: PHI [X, C], [Z, D]
36254 //
36255 // If we lower both CMOVs in a single step, we can instead generate:
36256 //
36257 // A
36258 // | \
36259 // | C
36260 // | /|
36261 // |/ |
36262 // | |
36263 // | D
36264 // | /
36265 // E
36266 //
36267 // A: X = ...; Y = ...
36268 // D: empty
36269 // E: PHI [X, A], [X, C], [Y, D]
36270 //
36271 // Which, in our sitofp/fcmp example, gives us something like:
36272 //
36273 // ucomiss %xmm1, %xmm0
36274 // movss <1.0f>, %xmm0
36275 // jne .LBB5_4
36276 // jp .LBB5_4
36277 // xorps %xmm0, %xmm0
36278 // .LBB5_4:
36279 // retq
36280 //
36281
36282 // We lower cascaded CMOV into two successive branches to the same block.
36283 // EFLAGS is used by both, so mark it as live in the second.
36284 const BasicBlock *LLVM_BB = ThisMBB->getBasicBlock();
36285 MachineFunction *F = ThisMBB->getParent();
36286 MachineBasicBlock *FirstInsertedMBB = F->CreateMachineBasicBlock(LLVM_BB);
36287 MachineBasicBlock *SecondInsertedMBB = F->CreateMachineBasicBlock(LLVM_BB);
36288 MachineBasicBlock *SinkMBB = F->CreateMachineBasicBlock(LLVM_BB);
36289
36290 MachineFunction::iterator It = ++ThisMBB->getIterator();
36291 F->insert(It, FirstInsertedMBB);
36292 F->insert(It, SecondInsertedMBB);
36293 F->insert(It, SinkMBB);
36294
36295 // For a cascaded CMOV, we lower it to two successive branches to
36296 // the same block (SinkMBB). EFLAGS is used by both, so mark it as live in
36297 // the FirstInsertedMBB.
36298 FirstInsertedMBB->addLiveIn(X86::EFLAGS);
36299
36300 // If the EFLAGS register isn't dead in the terminator, then claim that it's
36301 // live into the sink and copy blocks.
36302 const TargetRegisterInfo *TRI = Subtarget.getRegisterInfo();
36303 if (!SecondCascadedCMOV.killsRegister(X86::EFLAGS, /*TRI=*/nullptr) &&
36304 !checkAndUpdateEFLAGSKill(SecondCascadedCMOV, ThisMBB, TRI)) {
36305 SecondInsertedMBB->addLiveIn(X86::EFLAGS);
36306 SinkMBB->addLiveIn(X86::EFLAGS);
36307 }
36308
36309 // Transfer the remainder of ThisMBB and its successor edges to SinkMBB.
36310 SinkMBB->splice(SinkMBB->begin(), ThisMBB,
36311 std::next(MachineBasicBlock::iterator(FirstCMOV)),
36312 ThisMBB->end());
36313 SinkMBB->transferSuccessorsAndUpdatePHIs(ThisMBB);
36314
36315 // Fallthrough block for ThisMBB.
36316 ThisMBB->addSuccessor(FirstInsertedMBB);
36317 // The true block target of the first branch is always SinkMBB.
36318 ThisMBB->addSuccessor(SinkMBB);
36319 // Fallthrough block for FirstInsertedMBB.
36320 FirstInsertedMBB->addSuccessor(SecondInsertedMBB);
36321 // The true block for the branch of FirstInsertedMBB.
36322 FirstInsertedMBB->addSuccessor(SinkMBB);
36323 // This is fallthrough.
36324 SecondInsertedMBB->addSuccessor(SinkMBB);
36325
36326 // Create the conditional branch instructions.
36327 X86::CondCode FirstCC = X86::CondCode(FirstCMOV.getOperand(3).getImm());
36328 BuildMI(ThisMBB, MIMD, TII->get(X86::JCC_1)).addMBB(SinkMBB).addImm(FirstCC);
36329
36330 X86::CondCode SecondCC =
36331 X86::CondCode(SecondCascadedCMOV.getOperand(3).getImm());
36332 BuildMI(FirstInsertedMBB, MIMD, TII->get(X86::JCC_1))
36333 .addMBB(SinkMBB)
36334 .addImm(SecondCC);
36335
36336 // SinkMBB:
36337 // %Result = phi [ %FalseValue, SecondInsertedMBB ], [ %TrueValue, ThisMBB ]
36338 Register DestReg = SecondCascadedCMOV.getOperand(0).getReg();
36339 Register Op1Reg = FirstCMOV.getOperand(1).getReg();
36340 Register Op2Reg = FirstCMOV.getOperand(2).getReg();
36341 MachineInstrBuilder MIB =
36342 BuildMI(*SinkMBB, SinkMBB->begin(), MIMD, TII->get(X86::PHI), DestReg)
36343 .addReg(Op1Reg)
36344 .addMBB(SecondInsertedMBB)
36345 .addReg(Op2Reg)
36346 .addMBB(ThisMBB);
36347
36348 // The incoming value from FirstInsertedMBB is the same as the one from
36349 // ThisMBB (the True operand of the SELECT_CC/CMOV nodes).
36350 MIB.addReg(FirstCMOV.getOperand(2).getReg()).addMBB(FirstInsertedMBB);
36351
36352 // Now remove the CMOVs.
36353 FirstCMOV.eraseFromParent();
36354 SecondCascadedCMOV.eraseFromParent();
36355
36356 return SinkMBB;
36357}
36358
36359MachineBasicBlock *
36360X86TargetLowering::EmitLoweredSelect(MachineInstr &MI,
36361 MachineBasicBlock *ThisMBB) const {
36362 const TargetInstrInfo *TII = Subtarget.getInstrInfo();
36363 const MIMetadata MIMD(MI);
36364
36365 // To "insert" a SELECT_CC instruction, we actually have to insert the
36366 // diamond control-flow pattern. The incoming instruction knows the
36367 // destination vreg to set, the condition code register to branch on, the
36368 // true/false values to select between and a branch opcode to use.
36369
36370 // ThisMBB:
36371 // ...
36372 // TrueVal = ...
36373 // cmpTY ccX, r1, r2
36374 // bCC copy1MBB
36375 // fallthrough --> FalseMBB
36376
36377 // This code lowers all pseudo-CMOV instructions. Generally it lowers these
36378 // as described above, by inserting a BB, and then making a PHI at the join
36379 // point to select the true and false operands of the CMOV in the PHI.
36380 //
36381 // The code also handles two different cases of multiple CMOV opcodes
36382 // in a row.
36383 //
36384 // Case 1:
36385 // In this case, there are multiple CMOVs in a row, all which are based on
36386 // the same condition setting (or the exact opposite condition setting).
36387 // In this case we can lower all the CMOVs using a single inserted BB, and
36388 // then make a number of PHIs at the join point to model the CMOVs. The only
36389 // trickiness here, is that in a case like:
36390 //
36391 // t2 = CMOV cond1 t1, f1
36392 // t3 = CMOV cond1 t2, f2
36393 //
36394 // when rewriting this into PHIs, we have to perform some renaming on the
36395 // temps since you cannot have a PHI operand refer to a PHI result earlier
36396 // in the same block. The "simple" but wrong lowering would be:
36397 //
36398 // t2 = PHI t1(BB1), f1(BB2)
36399 // t3 = PHI t2(BB1), f2(BB2)
36400 //
36401 // but clearly t2 is not defined in BB1, so that is incorrect. The proper
36402 // renaming is to note that on the path through BB1, t2 is really just a
36403 // copy of t1, and do that renaming, properly generating:
36404 //
36405 // t2 = PHI t1(BB1), f1(BB2)
36406 // t3 = PHI t1(BB1), f2(BB2)
36407 //
36408 // Case 2:
36409 // CMOV ((CMOV F, T, cc1), T, cc2) is checked here and handled by a separate
36410 // function - EmitLoweredCascadedSelect.
36411
36412 X86::CondCode CC = X86::CondCode(MI.getOperand(3).getImm());
36413 X86::CondCode OppCC = X86::GetOppositeBranchCondition(CC);
36414 MachineInstr *LastCMOV = &MI;
36415 MachineBasicBlock::iterator NextMIIt = MachineBasicBlock::iterator(MI);
36416
36417 // Check for case 1, where there are multiple CMOVs with the same condition
36418 // first. Of the two cases of multiple CMOV lowerings, case 1 reduces the
36419 // number of jumps the most.
36420
36421 if (isCMOVPseudo(MI)) {
36422 // See if we have a string of CMOVS with the same condition. Skip over
36423 // intervening debug insts.
36424 while (NextMIIt != ThisMBB->end() && isCMOVPseudo(*NextMIIt) &&
36425 (NextMIIt->getOperand(3).getImm() == CC ||
36426 NextMIIt->getOperand(3).getImm() == OppCC)) {
36427 LastCMOV = &*NextMIIt;
36428 NextMIIt = next_nodbg(NextMIIt, ThisMBB->end());
36429 }
36430 }
36431
36432 // This checks for case 2, but only do this if we didn't already find
36433 // case 1, as indicated by LastCMOV == MI.
36434 if (LastCMOV == &MI && NextMIIt != ThisMBB->end() &&
36435 NextMIIt->getOpcode() == MI.getOpcode() &&
36436 NextMIIt->getOperand(2).getReg() == MI.getOperand(2).getReg() &&
36437 NextMIIt->getOperand(1).getReg() == MI.getOperand(0).getReg() &&
36438 NextMIIt->getOperand(1).isKill()) {
36439 return EmitLoweredCascadedSelect(MI, *NextMIIt, ThisMBB);
36440 }
36441
36442 const BasicBlock *LLVM_BB = ThisMBB->getBasicBlock();
36443 MachineFunction *F = ThisMBB->getParent();
36444 MachineBasicBlock *FalseMBB = F->CreateMachineBasicBlock(LLVM_BB);
36445 MachineBasicBlock *SinkMBB = F->CreateMachineBasicBlock(LLVM_BB);
36446
36447 MachineFunction::iterator It = ++ThisMBB->getIterator();
36448 F->insert(It, FalseMBB);
36449 F->insert(It, SinkMBB);
36450
36451 // Set the call frame size on entry to the new basic blocks.
36452 unsigned CallFrameSize = TII->getCallFrameSizeAt(MI);
36453 FalseMBB->setCallFrameSize(CallFrameSize);
36454 SinkMBB->setCallFrameSize(CallFrameSize);
36455
36456 // If the EFLAGS register isn't dead in the terminator, then claim that it's
36457 // live into the sink and copy blocks.
36458 const TargetRegisterInfo *TRI = Subtarget.getRegisterInfo();
36459 if (!LastCMOV->killsRegister(X86::EFLAGS, /*TRI=*/nullptr) &&
36460 !checkAndUpdateEFLAGSKill(LastCMOV, ThisMBB, TRI)) {
36461 FalseMBB->addLiveIn(X86::EFLAGS);
36462 SinkMBB->addLiveIn(X86::EFLAGS);
36463 }
36464
36465 // Transfer any debug instructions inside the CMOV sequence to the sunk block.
36466 auto DbgRange = llvm::make_range(MachineBasicBlock::iterator(MI),
36467 MachineBasicBlock::iterator(LastCMOV));
36468 for (MachineInstr &MI : llvm::make_early_inc_range(DbgRange))
36469 if (MI.isDebugInstr())
36470 SinkMBB->push_back(MI.removeFromParent());
36471
36472 // Transfer the remainder of ThisMBB and its successor edges to SinkMBB.
36473 SinkMBB->splice(SinkMBB->end(), ThisMBB,
36474 std::next(MachineBasicBlock::iterator(LastCMOV)),
36475 ThisMBB->end());
36476 SinkMBB->transferSuccessorsAndUpdatePHIs(ThisMBB);
36477
36478 // Fallthrough block for ThisMBB.
36479 ThisMBB->addSuccessor(FalseMBB);
36480 // The true block target of the first (or only) branch is always a SinkMBB.
36481 ThisMBB->addSuccessor(SinkMBB);
36482 // Fallthrough block for FalseMBB.
36483 FalseMBB->addSuccessor(SinkMBB);
36484
36485 // Create the conditional branch instruction.
36486 BuildMI(ThisMBB, MIMD, TII->get(X86::JCC_1)).addMBB(SinkMBB).addImm(CC);
36487
36488 // SinkMBB:
36489 // %Result = phi [ %FalseValue, FalseMBB ], [ %TrueValue, ThisMBB ]
36490 // ...
36491 MachineBasicBlock::iterator MIItBegin = MachineBasicBlock::iterator(MI);
36492 MachineBasicBlock::iterator MIItEnd =
36493 std::next(MachineBasicBlock::iterator(LastCMOV));
36494 createPHIsForCMOVsInSinkBB(MIItBegin, MIItEnd, ThisMBB, FalseMBB, SinkMBB);
36495
36496 // Now remove the CMOV(s).
36497 ThisMBB->erase(MIItBegin, MIItEnd);
36498
36499 return SinkMBB;
36500}
36501
36502static unsigned getSUBriOpcode(bool IsLP64) {
36503 if (IsLP64)
36504 return X86::SUB64ri32;
36505 else
36506 return X86::SUB32ri;
36507}
36508
36509MachineBasicBlock *
36510X86TargetLowering::EmitLoweredProbedAlloca(MachineInstr &MI,
36511 MachineBasicBlock *MBB) const {
36512 MachineFunction *MF = MBB->getParent();
36513 const TargetInstrInfo *TII = Subtarget.getInstrInfo();
36514 const X86FrameLowering &TFI = *Subtarget.getFrameLowering();
36515 const MIMetadata MIMD(MI);
36516 const BasicBlock *LLVM_BB = MBB->getBasicBlock();
36517
36518 const unsigned ProbeSize = getStackProbeSize(*MF);
36519
36520 MachineRegisterInfo &MRI = MF->getRegInfo();
36521 MachineBasicBlock *testMBB = MF->CreateMachineBasicBlock(LLVM_BB);
36522 MachineBasicBlock *tailMBB = MF->CreateMachineBasicBlock(LLVM_BB);
36523 MachineBasicBlock *blockMBB = MF->CreateMachineBasicBlock(LLVM_BB);
36524
36525 MachineFunction::iterator MBBIter = ++MBB->getIterator();
36526 MF->insert(MBBIter, testMBB);
36527 MF->insert(MBBIter, blockMBB);
36528 MF->insert(MBBIter, tailMBB);
36529
36530 Register sizeVReg = MI.getOperand(1).getReg();
36531
36532 Register physSPReg = TFI.Uses64BitFramePtr ? X86::RSP : X86::ESP;
36533
36534 Register TmpStackPtr = MRI.createVirtualRegister(
36535 TFI.Uses64BitFramePtr ? &X86::GR64RegClass : &X86::GR32RegClass);
36536 Register FinalStackPtr = MRI.createVirtualRegister(
36537 TFI.Uses64BitFramePtr ? &X86::GR64RegClass : &X86::GR32RegClass);
36538
36539 BuildMI(*MBB, {MI}, MIMD, TII->get(TargetOpcode::COPY), TmpStackPtr)
36540 .addReg(physSPReg);
36541 {
36542 const unsigned Opc = TFI.Uses64BitFramePtr ? X86::SUB64rr : X86::SUB32rr;
36543 BuildMI(*MBB, {MI}, MIMD, TII->get(Opc), FinalStackPtr)
36544 .addReg(TmpStackPtr)
36545 .addReg(sizeVReg);
36546 }
36547
36548 // test rsp size
36549
36550 BuildMI(testMBB, MIMD,
36551 TII->get(TFI.Uses64BitFramePtr ? X86::CMP64rr : X86::CMP32rr))
36552 .addReg(FinalStackPtr)
36553 .addReg(physSPReg);
36554
36555 BuildMI(testMBB, MIMD, TII->get(X86::JCC_1))
36556 .addMBB(tailMBB)
36557 .addImm(X86::COND_GE);
36558 testMBB->addSuccessor(blockMBB);
36559 testMBB->addSuccessor(tailMBB);
36560
36561 // Touch the block, then extend it. This is the opposite order from a static
36562 // probe, where we allocate and then touch; doing it this way avoids having to
36563 // probe the tail of the static alloca. Possible scenarios are:
36564 //
36565 // + ---- <- ------------ <- ------------- <- ------------ +
36566 // | |
36567 // [free probe] -> [page alloc] -> [alloc probe] -> [tail alloc] + -> [dyn probe] -> [page alloc] -> [dyn probe] -> [tail alloc] +
36568 // | |
36569 // + <- ----------- <- ------------ <- ----------- <- ------------ +
36570 //
36571 // The property we want to enforce is to never have more than [page alloc] between two probes.
36572
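  // Roughly, the emitted sequence looks like this (illustrative 64-bit pseudo
  // assembly; %tmp, %size and %result are placeholder vregs and a 4096-byte
  // probe size is assumed for the example):
  //   movq %rsp, %tmp
  //   subq %size, %tmp            # %tmp = final stack pointer
  // test:
  //   cmpq %rsp, %tmp
  //   jge  tail                   # done once %tmp >= %rsp
  // block:
  //   xorq $0, (%rsp)             # touch the current page
  //   subq $4096, %rsp            # then extend by one probe interval
  //   jmp  test
  // tail:
  //   movq %tmp, %result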
36573 const unsigned XORMIOpc =
36574 TFI.Uses64BitFramePtr ? X86::XOR64mi32 : X86::XOR32mi;
36575 addRegOffset(BuildMI(blockMBB, MIMD, TII->get(XORMIOpc)), physSPReg, false, 0)
36576 .addImm(0);
36577
36578 BuildMI(blockMBB, MIMD, TII->get(getSUBriOpcode(TFI.Uses64BitFramePtr)),
36579 physSPReg)
36580 .addReg(physSPReg)
36581 .addImm(ProbeSize);
36582
36583 BuildMI(blockMBB, MIMD, TII->get(X86::JMP_1)).addMBB(testMBB);
36584 blockMBB->addSuccessor(testMBB);
36585
36586 // Replace original instruction by the expected stack ptr
36587 BuildMI(tailMBB, MIMD, TII->get(TargetOpcode::COPY),
36588 MI.getOperand(0).getReg())
36589 .addReg(FinalStackPtr);
36590
36591 tailMBB->splice(tailMBB->end(), MBB,
36592 std::next(MachineBasicBlock::iterator(MI)), MBB->end());
36593 tailMBB->transferSuccessorsAndUpdatePHIs(MBB);
36594 MBB->addSuccessor(testMBB);
36595
36596 // Delete the original pseudo instruction.
36597 MI.eraseFromParent();
36598
36599 // And we're done.
36600 return tailMBB;
36601}
36602
36603MachineBasicBlock *
36604X86TargetLowering::EmitLoweredSegAlloca(MachineInstr &MI,
36605 MachineBasicBlock *BB) const {
36606 MachineFunction *MF = BB->getParent();
36607 const TargetInstrInfo *TII = Subtarget.getInstrInfo();
36608 const MIMetadata MIMD(MI);
36609 const BasicBlock *LLVM_BB = BB->getBasicBlock();
36610
36611 assert(MF->shouldSplitStack());
36612
36613 const bool Is64Bit = Subtarget.is64Bit();
36614 const bool IsLP64 = Subtarget.isTarget64BitLP64();
36615
36616 const unsigned TlsReg = Is64Bit ? X86::FS : X86::GS;
36617 const unsigned TlsOffset = IsLP64 ? 0x70 : Is64Bit ? 0x40 : 0x30;
36618
36619 // BB:
36620 // ... [Till the alloca]
36621 // If stacklet is not large enough, jump to mallocMBB
36622 //
36623 // bumpMBB:
36624 // Allocate by subtracting from RSP
36625 // Jump to continueMBB
36626 //
36627 // mallocMBB:
36628 // Allocate by call to runtime
36629 //
36630 // continueMBB:
36631 // ...
36632 // [rest of original BB]
36633 //
36634
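  // In C-like pseudocode (illustrative only; the stack limit actually lives at
  // a fixed TLS offset and is compared with the CMPmr emitted below):
  //   space = sp - size;
  //   if (stack_limit > space)            // stacklet too small
  //     space = __morestack_allocate_stack_space(size);
  //   else
  //     sp = space;                       // bump allocation
  //   result = space;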
36635 MachineBasicBlock *mallocMBB = MF->CreateMachineBasicBlock(LLVM_BB);
36636 MachineBasicBlock *bumpMBB = MF->CreateMachineBasicBlock(LLVM_BB);
36637 MachineBasicBlock *continueMBB = MF->CreateMachineBasicBlock(LLVM_BB);
36638
36639 MachineRegisterInfo &MRI = MF->getRegInfo();
36640 const TargetRegisterClass *AddrRegClass =
36641 getRegClassFor(getPointerTy(MF->getDataLayout()));
36642
36643 Register mallocPtrVReg = MRI.createVirtualRegister(AddrRegClass),
36644 bumpSPPtrVReg = MRI.createVirtualRegister(AddrRegClass),
36645 tmpSPVReg = MRI.createVirtualRegister(AddrRegClass),
36646 SPLimitVReg = MRI.createVirtualRegister(AddrRegClass),
36647 sizeVReg = MI.getOperand(1).getReg(),
36648 physSPReg = IsLP64 ? X86::RSP : X86::ESP;
36649
36650 MachineFunction::iterator MBBIter = ++BB->getIterator();
36651
36652 MF->insert(MBBIter, bumpMBB);
36653 MF->insert(MBBIter, mallocMBB);
36654 MF->insert(MBBIter, continueMBB);
36655
36656 continueMBB->splice(continueMBB->begin(), BB,
36657 std::next(MachineBasicBlock::iterator(MI)), BB->end());
36658 continueMBB->transferSuccessorsAndUpdatePHIs(BB);
36659
36660 // Add code to the main basic block to check if the stack limit has been hit,
36661 // and if so, jump to mallocMBB otherwise to bumpMBB.
36662 BuildMI(BB, MIMD, TII->get(TargetOpcode::COPY), tmpSPVReg).addReg(physSPReg);
36663 BuildMI(BB, MIMD, TII->get(IsLP64 ? X86::SUB64rr:X86::SUB32rr), SPLimitVReg)
36664 .addReg(tmpSPVReg).addReg(sizeVReg);
36665 BuildMI(BB, MIMD, TII->get(IsLP64 ? X86::CMP64mr:X86::CMP32mr))
36666 .addReg(0).addImm(1).addReg(0).addImm(TlsOffset).addReg(TlsReg)
36667 .addReg(SPLimitVReg);
36668 BuildMI(BB, MIMD, TII->get(X86::JCC_1)).addMBB(mallocMBB).addImm(X86::COND_G);
36669
36670 // bumpMBB simply decreases the stack pointer, since we know the current
36671 // stacklet has enough space.
36672 BuildMI(bumpMBB, MIMD, TII->get(TargetOpcode::COPY), physSPReg)
36673 .addReg(SPLimitVReg);
36674 BuildMI(bumpMBB, MIMD, TII->get(TargetOpcode::COPY), bumpSPPtrVReg)
36675 .addReg(SPLimitVReg);
36676 BuildMI(bumpMBB, MIMD, TII->get(X86::JMP_1)).addMBB(continueMBB);
36677
36678 // Calls into a routine in libgcc to allocate more space from the heap.
36679 const uint32_t *RegMask =
36680 Subtarget.getRegisterInfo()->getCallPreservedMask(*MF, CallingConv::C);
36681 if (IsLP64) {
36682 BuildMI(mallocMBB, MIMD, TII->get(X86::MOV64rr), X86::RDI)
36683 .addReg(sizeVReg);
36684 BuildMI(mallocMBB, MIMD, TII->get(X86::CALL64pcrel32))
36685 .addExternalSymbol("__morestack_allocate_stack_space")
36686 .addRegMask(RegMask)
36687 .addReg(X86::RDI, RegState::Implicit)
36688 .addReg(X86::RAX, RegState::ImplicitDefine);
36689 } else if (Is64Bit) {
36690 BuildMI(mallocMBB, MIMD, TII->get(X86::MOV32rr), X86::EDI)
36691 .addReg(sizeVReg);
36692 BuildMI(mallocMBB, MIMD, TII->get(X86::CALL64pcrel32))
36693 .addExternalSymbol("__morestack_allocate_stack_space")
36694 .addRegMask(RegMask)
36695 .addReg(X86::EDI, RegState::Implicit)
36696 .addReg(X86::EAX, RegState::ImplicitDefine);
36697 } else {
36698 BuildMI(mallocMBB, MIMD, TII->get(X86::SUB32ri), physSPReg).addReg(physSPReg)
36699 .addImm(12);
36700 BuildMI(mallocMBB, MIMD, TII->get(X86::PUSH32r)).addReg(sizeVReg);
36701 BuildMI(mallocMBB, MIMD, TII->get(X86::CALLpcrel32))
36702 .addExternalSymbol("__morestack_allocate_stack_space")
36703 .addRegMask(RegMask)
36704 .addReg(X86::EAX, RegState::ImplicitDefine);
36705 }
36706
36707 if (!Is64Bit)
36708 BuildMI(mallocMBB, MIMD, TII->get(X86::ADD32ri), physSPReg).addReg(physSPReg)
36709 .addImm(16);
36710
36711 BuildMI(mallocMBB, MIMD, TII->get(TargetOpcode::COPY), mallocPtrVReg)
36712 .addReg(IsLP64 ? X86::RAX : X86::EAX);
36713 BuildMI(mallocMBB, MIMD, TII->get(X86::JMP_1)).addMBB(continueMBB);
36714
36715 // Set up the CFG correctly.
36716 BB->addSuccessor(bumpMBB);
36717 BB->addSuccessor(mallocMBB);
36718 mallocMBB->addSuccessor(continueMBB);
36719 bumpMBB->addSuccessor(continueMBB);
36720
36721 // Take care of the PHI nodes.
36722 BuildMI(*continueMBB, continueMBB->begin(), MIMD, TII->get(X86::PHI),
36723 MI.getOperand(0).getReg())
36724 .addReg(mallocPtrVReg)
36725 .addMBB(mallocMBB)
36726 .addReg(bumpSPPtrVReg)
36727 .addMBB(bumpMBB);
36728
36729 // Delete the original pseudo instruction.
36730 MI.eraseFromParent();
36731
36732 // And we're done.
36733 return continueMBB;
36734}
36735
36736MachineBasicBlock *
36737X86TargetLowering::EmitLoweredCatchRet(MachineInstr &MI,
36738 MachineBasicBlock *BB) const {
36739 MachineFunction *MF = BB->getParent();
36740 const TargetInstrInfo &TII = *Subtarget.getInstrInfo();
36741 MachineBasicBlock *TargetMBB = MI.getOperand(0).getMBB();
36742 const MIMetadata MIMD(MI);
36743
36746 "SEH does not use catchret!");
36747
36748 // Only 32-bit EH needs to worry about manually restoring stack pointers.
36749 if (!Subtarget.is32Bit())
36750 return BB;
36751
36752 // C++ EH creates a new target block to hold the restore code, and wires up
36753 // the new block to the return destination with a normal JMP_4.
36754 MachineBasicBlock *RestoreMBB =
36755 MF->CreateMachineBasicBlock(BB->getBasicBlock());
36756 assert(BB->succ_size() == 1);
36757 MF->insert(std::next(BB->getIterator()), RestoreMBB);
36758 RestoreMBB->transferSuccessorsAndUpdatePHIs(BB);
36759 BB->addSuccessor(RestoreMBB);
36760 MI.getOperand(0).setMBB(RestoreMBB);
36761
36762 // Marking this as an EH pad but not a funclet entry block causes PEI to
36763 // restore stack pointers in the block.
36764 RestoreMBB->setIsEHPad(true);
36765
36766 auto RestoreMBBI = RestoreMBB->begin();
36767 BuildMI(*RestoreMBB, RestoreMBBI, MIMD, TII.get(X86::JMP_4)).addMBB(TargetMBB);
36768 return BB;
36769}
36770
36771MachineBasicBlock *
36772X86TargetLowering::EmitLoweredTLSCall(MachineInstr &MI,
36773 MachineBasicBlock *BB) const {
36774 // This is pretty easy. We're taking the value that we received from
36775 // our load from the relocation, sticking it in either RDI (x86-64)
36776 // or EAX and doing an indirect call. The return value will then
36777 // be in the normal return register.
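  // For the 64-bit case this amounts to something like (illustrative AT&T
  // syntax; the symbol name is hypothetical):
  //   movq  _tlsvar@TLVP(%rip), %rdi
  //   callq *(%rdi)            # result comes back in %rax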
36778 MachineFunction *F = BB->getParent();
36779 const X86InstrInfo *TII = Subtarget.getInstrInfo();
36780 const MIMetadata MIMD(MI);
36781
36782 assert(Subtarget.isTargetDarwin() && "Darwin only instr emitted?");
36783 assert(MI.getOperand(3).isGlobal() && "This should be a global");
36784
36785 // Get a register mask for the lowered call.
36786 // FIXME: The 32-bit calls have non-standard calling conventions. Use a
36787 // proper register mask.
36788 const uint32_t *RegMask =
36789 Subtarget.is64Bit() ?
36790 Subtarget.getRegisterInfo()->getDarwinTLSCallPreservedMask() :
36791 Subtarget.getRegisterInfo()->getCallPreservedMask(*F, CallingConv::C);
36792 if (Subtarget.is64Bit()) {
36793 MachineInstrBuilder MIB =
36794 BuildMI(*BB, MI, MIMD, TII->get(X86::MOV64rm), X86::RDI)
36795 .addReg(X86::RIP)
36796 .addImm(0)
36797 .addReg(0)
36798 .addGlobalAddress(MI.getOperand(3).getGlobal(), 0,
36799 MI.getOperand(3).getTargetFlags())
36800 .addReg(0);
36801 MIB = BuildMI(*BB, MI, MIMD, TII->get(X86::CALL64m));
36802 addDirectMem(MIB, X86::RDI);
36803 MIB.addReg(X86::RAX, RegState::ImplicitDefine).addRegMask(RegMask);
36804 } else if (!isPositionIndependent()) {
36805 MachineInstrBuilder MIB =
36806 BuildMI(*BB, MI, MIMD, TII->get(X86::MOV32rm), X86::EAX)
36807 .addReg(0)
36808 .addImm(0)
36809 .addReg(0)
36810 .addGlobalAddress(MI.getOperand(3).getGlobal(), 0,
36811 MI.getOperand(3).getTargetFlags())
36812 .addReg(0);
36813 MIB = BuildMI(*BB, MI, MIMD, TII->get(X86::CALL32m));
36814 addDirectMem(MIB, X86::EAX);
36815 MIB.addReg(X86::EAX, RegState::ImplicitDefine).addRegMask(RegMask);
36816 } else {
36817 MachineInstrBuilder MIB =
36818 BuildMI(*BB, MI, MIMD, TII->get(X86::MOV32rm), X86::EAX)
36819 .addReg(TII->getGlobalBaseReg(F))
36820 .addImm(0)
36821 .addReg(0)
36822 .addGlobalAddress(MI.getOperand(3).getGlobal(), 0,
36823 MI.getOperand(3).getTargetFlags())
36824 .addReg(0);
36825 MIB = BuildMI(*BB, MI, MIMD, TII->get(X86::CALL32m));
36826 addDirectMem(MIB, X86::EAX);
36827 MIB.addReg(X86::EAX, RegState::ImplicitDefine).addRegMask(RegMask);
36828 }
36829
36830 MI.eraseFromParent(); // The pseudo instruction is gone now.
36831 return BB;
36832}
36833
36834static unsigned getOpcodeForIndirectThunk(unsigned RPOpc) {
36835 switch (RPOpc) {
36836 case X86::INDIRECT_THUNK_CALL32:
36837 return X86::CALLpcrel32;
36838 case X86::INDIRECT_THUNK_CALL64:
36839 return X86::CALL64pcrel32;
36840 case X86::INDIRECT_THUNK_TCRETURN32:
36841 return X86::TCRETURNdi;
36842 case X86::INDIRECT_THUNK_TCRETURN64:
36843 return X86::TCRETURNdi64;
36844 }
36845 llvm_unreachable("not indirect thunk opcode");
36846}
36847
36848static const char *getIndirectThunkSymbol(const X86Subtarget &Subtarget,
36849 Register Reg) {
36850 if (Subtarget.useRetpolineExternalThunk()) {
36851 // When using an external thunk for retpolines, we pick names that match the
36852 // names GCC happens to use as well. This helps simplify the implementation
36853 // of the thunks for kernels where they have no easy ability to create
36854 // aliases and are doing non-trivial configuration of the thunk's body. For
36855 // example, the Linux kernel will do boot-time hot patching of the thunk
36856 // bodies and cannot easily export aliases of these to loaded modules.
36857 //
36858 // Note that at any point in the future, we may need to change the semantics
36859 // of how we implement retpolines and at that time will likely change the
36860 // name of the called thunk. Essentially, there is no hard guarantee that
36861 // LLVM will generate calls to specific thunks, we merely make a best-effort
36862 // attempt to help out kernels and other systems where duplicating the
36863 // thunks is costly.
36864 switch (Reg.id()) {
36865 case X86::EAX:
36866 assert(!Subtarget.is64Bit() && "Should not be using a 32-bit thunk!");
36867 return "__x86_indirect_thunk_eax";
36868 case X86::ECX:
36869 assert(!Subtarget.is64Bit() && "Should not be using a 32-bit thunk!");
36870 return "__x86_indirect_thunk_ecx";
36871 case X86::EDX:
36872 assert(!Subtarget.is64Bit() && "Should not be using a 32-bit thunk!");
36873 return "__x86_indirect_thunk_edx";
36874 case X86::EDI:
36875 assert(!Subtarget.is64Bit() && "Should not be using a 32-bit thunk!");
36876 return "__x86_indirect_thunk_edi";
36877 case X86::R11:
36878 assert(Subtarget.is64Bit() && "Should not be using a 64-bit thunk!");
36879 return "__x86_indirect_thunk_r11";
36880 }
36881 llvm_unreachable("unexpected reg for external indirect thunk");
36882 }
36883
36884 if (Subtarget.useRetpolineIndirectCalls() ||
36885 Subtarget.useRetpolineIndirectBranches()) {
36886 // When targeting an internal COMDAT thunk use an LLVM-specific name.
36887 switch (Reg.id()) {
36888 case X86::EAX:
36889 assert(!Subtarget.is64Bit() && "Should not be using a 32-bit thunk!");
36890 return "__llvm_retpoline_eax";
36891 case X86::ECX:
36892 assert(!Subtarget.is64Bit() && "Should not be using a 32-bit thunk!");
36893 return "__llvm_retpoline_ecx";
36894 case X86::EDX:
36895 assert(!Subtarget.is64Bit() && "Should not be using a 32-bit thunk!");
36896 return "__llvm_retpoline_edx";
36897 case X86::EDI:
36898 assert(!Subtarget.is64Bit() && "Should not be using a 32-bit thunk!");
36899 return "__llvm_retpoline_edi";
36900 case X86::R11:
36901 assert(Subtarget.is64Bit() && "Should not be using a 64-bit thunk!");
36902 return "__llvm_retpoline_r11";
36903 }
36904 llvm_unreachable("unexpected reg for retpoline");
36905 }
36906
36907 if (Subtarget.useLVIControlFlowIntegrity()) {
36908 assert(Subtarget.is64Bit() && "Should not be using a 64-bit thunk!");
36909 return "__llvm_lvi_thunk_r11";
36910 }
36911 llvm_unreachable("getIndirectThunkSymbol() invoked without thunk feature");
36912}
36913
36914MachineBasicBlock *
36915X86TargetLowering::EmitLoweredIndirectThunk(MachineInstr &MI,
36916 MachineBasicBlock *BB) const {
36917 // Copy the virtual register into the R11 physical register and
36918 // call the retpoline thunk.
36919 const MIMetadata MIMD(MI);
36920 const X86InstrInfo *TII = Subtarget.getInstrInfo();
36921 Register CalleeVReg = MI.getOperand(0).getReg();
36922 unsigned Opc = getOpcodeForIndirectThunk(MI.getOpcode());
36923
36924 // Find an available scratch register to hold the callee. On 64-bit, we can
36925 // just use R11, but we scan for uses anyway to ensure we don't generate
36926 // incorrect code. On 32-bit, we use one of EAX, ECX, or EDX that isn't
36927 // already a register use operand to the call to hold the callee. If none
36928 // are available, use EDI instead. EDI is chosen because EBX is the PIC base
36929 // register and ESI is the base pointer to realigned stack frames with VLAs.
36930 SmallVector<Register, 3> AvailableRegs;
36931 if (Subtarget.is64Bit())
36932 AvailableRegs.push_back(X86::R11);
36933 else
36934 AvailableRegs.append({X86::EAX, X86::ECX, X86::EDX, X86::EDI});
36935
36936 // Zero out any registers that are already used.
36937 for (const auto &MO : MI.operands()) {
36938 if (MO.isReg() && MO.isUse())
36939 llvm::replace(AvailableRegs, MO.getReg(), Register());
36940 }
36941
36942 // Choose the first remaining non-zero available register.
36943 Register AvailableReg;
36944 for (Register MaybeReg : AvailableRegs) {
36945 if (MaybeReg) {
36946 AvailableReg = MaybeReg;
36947 break;
36948 }
36949 }
36950 if (!AvailableReg)
36951 report_fatal_error("calling convention incompatible with retpoline, no "
36952 "available registers");
36953
36954 const char *Symbol = getIndirectThunkSymbol(Subtarget, AvailableReg);
36955
36956 BuildMI(*BB, MI, MIMD, TII->get(TargetOpcode::COPY), AvailableReg)
36957 .addReg(CalleeVReg);
36958 MI.getOperand(0).ChangeToES(Symbol);
36959 MI.setDesc(TII->get(Opc));
36960 MachineInstrBuilder(*BB->getParent(), &MI)
36961 .addReg(AvailableReg, RegState::Implicit | RegState::Kill);
36962 return BB;
36963}
36964
36965/// SetJmp implies future control flow change upon calling the corresponding
36966/// LongJmp.
36967/// Instead of using the 'return' instruction, the long jump fixes the stack and
36968/// performs an indirect branch. To do so it uses the registers that were stored
36969/// in the jump buffer (when calling SetJmp).
36970/// In case the shadow stack is enabled we need to fix it as well, because some
36971/// return addresses will be skipped.
36972/// The function will save the SSP for future fixing in the function
36973/// emitLongJmpShadowStackFix.
36974/// \sa emitLongJmpShadowStackFix
36975/// \param [in] MI The temporary Machine Instruction for the builtin.
36976/// \param [in] MBB The Machine Basic Block that will be modified.
36977void X86TargetLowering::emitSetJmpShadowStackFix(MachineInstr &MI,
36978 MachineBasicBlock *MBB) const {
36979 const MIMetadata MIMD(MI);
36980 MachineFunction *MF = MBB->getParent();
36981 const TargetInstrInfo *TII = Subtarget.getInstrInfo();
36982 MachineRegisterInfo &MRI = MF->getRegInfo();
36983 MachineInstrBuilder MIB;
36984
36985 // Memory Reference.
36986 SmallVector<MachineMemOperand *, 2> MMOs(MI.memoperands());
36987
36988 // Initialize a register with zero.
36989 MVT PVT = getPointerTy(MF->getDataLayout());
36990 const TargetRegisterClass *PtrRC = getRegClassFor(PVT);
36991 Register ZReg = MRI.createVirtualRegister(PtrRC);
36992 unsigned XorRROpc = (PVT == MVT::i64) ? X86::XOR64rr : X86::XOR32rr;
36993 BuildMI(*MBB, MI, MIMD, TII->get(XorRROpc))
36994 .addDef(ZReg)
36995 .addReg(ZReg, RegState::Undef)
36996 .addReg(ZReg, RegState::Undef);
36997
36998 // Read the current SSP Register value to the zeroed register.
36999 Register SSPCopyReg = MRI.createVirtualRegister(PtrRC);
37000 unsigned RdsspOpc = (PVT == MVT::i64) ? X86::RDSSPQ : X86::RDSSPD;
37001 BuildMI(*MBB, MI, MIMD, TII->get(RdsspOpc), SSPCopyReg).addReg(ZReg);
37002
37003 // Write the SSP register value to slot 3 (3 * sizeof(ptr)) of the input memory buffer.
37004 unsigned PtrStoreOpc = (PVT == MVT::i64) ? X86::MOV64mr : X86::MOV32mr;
37005 MIB = BuildMI(*MBB, MI, MIMD, TII->get(PtrStoreOpc));
37006 const int64_t SSPOffset = 3 * PVT.getStoreSize();
37007 const unsigned MemOpndSlot = 1;
37008 for (unsigned i = 0; i < X86::AddrNumOperands; ++i) {
37009 if (i == X86::AddrDisp)
37010 MIB.addDisp(MI.getOperand(MemOpndSlot + i), SSPOffset);
37011 else
37012 MIB.add(MI.getOperand(MemOpndSlot + i));
37013 }
37014 MIB.addReg(SSPCopyReg);
37015 MIB.setMemRefs(MMOs);
37016}
37017
37018MachineBasicBlock *
37019X86TargetLowering::emitEHSjLjSetJmp(MachineInstr &MI,
37020 MachineBasicBlock *MBB) const {
37021 const MIMetadata MIMD(MI);
37022 MachineFunction *MF = MBB->getParent();
37023 const TargetInstrInfo *TII = Subtarget.getInstrInfo();
37024 const TargetRegisterInfo *TRI = Subtarget.getRegisterInfo();
37025 MachineRegisterInfo &MRI = MF->getRegInfo();
37026
37027 const BasicBlock *BB = MBB->getBasicBlock();
37028 MachineFunction::iterator I = ++MBB->getIterator();
37029
37030 // Memory Reference
37031 SmallVector<MachineMemOperand *, 2> MMOs(MI.memoperands());
37032
37033 unsigned MemOpndSlot = 0;
37034
37035 unsigned CurOp = 0;
37036
37037 Register DstReg = MI.getOperand(CurOp++).getReg();
37038 const TargetRegisterClass *RC = MRI.getRegClass(DstReg);
37039 assert(TRI->isTypeLegalForClass(*RC, MVT::i32) && "Invalid destination!");
37040 (void)TRI;
37041 Register mainDstReg = MRI.createVirtualRegister(RC);
37042 Register restoreDstReg = MRI.createVirtualRegister(RC);
37043
37044 MemOpndSlot = CurOp;
37045
37046 MVT PVT = getPointerTy(MF->getDataLayout());
37047 assert((PVT == MVT::i64 || PVT == MVT::i32) &&
37048 "Invalid Pointer Size!");
37049
37050 // For v = setjmp(buf), we generate
37051 //
37052 // thisMBB:
37053 // buf[LabelOffset] = restoreMBB <-- takes address of restoreMBB
37054 // SjLjSetup restoreMBB
37055 //
37056 // mainMBB:
37057 // v_main = 0
37058 //
37059 // sinkMBB:
37060 // v = phi(main, restore)
37061 //
37062 // restoreMBB:
37063 // if base pointer being used, load it from frame
37064 // v_restore = 1
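  // Illustratively, for C code of the form
  //   if (setjmp(buf) == 0) { /* direct path */ } else { /* longjmp path */ }
  // mainMBB supplies the 0 seen on the direct return and restoreMBB supplies
  // the 1 observed after a longjmp; the PHI in sinkMBB merges the two.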
37065
37066 MachineBasicBlock *thisMBB = MBB;
37067 MachineBasicBlock *mainMBB = MF->CreateMachineBasicBlock(BB);
37068 MachineBasicBlock *sinkMBB = MF->CreateMachineBasicBlock(BB);
37069 MachineBasicBlock *restoreMBB = MF->CreateMachineBasicBlock(BB);
37070 MF->insert(I, mainMBB);
37071 MF->insert(I, sinkMBB);
37072 MF->push_back(restoreMBB);
37073 restoreMBB->setMachineBlockAddressTaken();
37074
37075 MachineInstrBuilder MIB;
37076
37077 // Transfer the remainder of BB and its successor edges to sinkMBB.
37078 sinkMBB->splice(sinkMBB->begin(), MBB,
37079 std::next(MachineBasicBlock::iterator(MI)), MBB->end());
37080 sinkMBB->transferSuccessorsAndUpdatePHIs(MBB);
37081
37082 // thisMBB:
37083 unsigned PtrStoreOpc = 0;
37084 Register LabelReg;
37085 const int64_t LabelOffset = 1 * PVT.getStoreSize();
37086 bool UseImmLabel = (MF->getTarget().getCodeModel() == CodeModel::Small) &&
37087 !isPositionIndependent();
37088
37089 // Prepare IP either in reg or imm.
37090 if (!UseImmLabel) {
37091 PtrStoreOpc = (PVT == MVT::i64) ? X86::MOV64mr : X86::MOV32mr;
37092 const TargetRegisterClass *PtrRC = getRegClassFor(PVT);
37093 LabelReg = MRI.createVirtualRegister(PtrRC);
37094 if (Subtarget.is64Bit()) {
37095 MIB = BuildMI(*thisMBB, MI, MIMD, TII->get(X86::LEA64r), LabelReg)
37096 .addReg(X86::RIP)
37097 .addImm(0)
37098 .addReg(0)
37099 .addMBB(restoreMBB)
37100 .addReg(0);
37101 } else {
37102 const X86InstrInfo *XII = static_cast<const X86InstrInfo*>(TII);
37103 MIB = BuildMI(*thisMBB, MI, MIMD, TII->get(X86::LEA32r), LabelReg)
37104 .addReg(XII->getGlobalBaseReg(MF))
37105 .addImm(0)
37106 .addReg(0)
37107 .addMBB(restoreMBB, Subtarget.classifyBlockAddressReference())
37108 .addReg(0);
37109 }
37110 } else
37111 PtrStoreOpc = (PVT == MVT::i64) ? X86::MOV64mi32 : X86::MOV32mi;
37112 // Store IP
37113 MIB = BuildMI(*thisMBB, MI, MIMD, TII->get(PtrStoreOpc));
37114 for (unsigned i = 0; i < X86::AddrNumOperands; ++i) {
37115 if (i == X86::AddrDisp)
37116 MIB.addDisp(MI.getOperand(MemOpndSlot + i), LabelOffset);
37117 else
37118 MIB.add(MI.getOperand(MemOpndSlot + i));
37119 }
37120 if (!UseImmLabel)
37121 MIB.addReg(LabelReg);
37122 else
37123 MIB.addMBB(restoreMBB);
37124 MIB.setMemRefs(MMOs);
37125
37126 if (MF->getFunction().getParent()->getModuleFlag("cf-protection-return")) {
37127 emitSetJmpShadowStackFix(MI, thisMBB);
37128 }
37129
37130 // Setup
37131 MIB = BuildMI(*thisMBB, MI, MIMD, TII->get(X86::EH_SjLj_Setup))
37132 .addMBB(restoreMBB);
37133
37134 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
37135 MIB.addRegMask(RegInfo->getNoPreservedMask());
37136 thisMBB->addSuccessor(mainMBB);
37137 thisMBB->addSuccessor(restoreMBB);
37138
37139 // mainMBB:
37140 // EAX = 0
37141 BuildMI(mainMBB, MIMD, TII->get(X86::MOV32r0), mainDstReg);
37142 mainMBB->addSuccessor(sinkMBB);
37143
37144 // sinkMBB:
37145 BuildMI(*sinkMBB, sinkMBB->begin(), MIMD, TII->get(X86::PHI), DstReg)
37146 .addReg(mainDstReg)
37147 .addMBB(mainMBB)
37148 .addReg(restoreDstReg)
37149 .addMBB(restoreMBB);
37150
37151 // restoreMBB:
37152 if (RegInfo->hasBasePointer(*MF)) {
37153 const bool Uses64BitFramePtr = Subtarget.isTarget64BitLP64();
37154 X86MachineFunctionInfo *X86FI = MF->getInfo<X86MachineFunctionInfo>();
37155 X86FI->setRestoreBasePointer(MF);
37156 Register FramePtr = RegInfo->getFrameRegister(*MF);
37157 Register BasePtr = RegInfo->getBaseRegister();
37158 unsigned Opm = Uses64BitFramePtr ? X86::MOV64rm : X86::MOV32rm;
37159 addRegOffset(BuildMI(restoreMBB, MIMD, TII->get(Opm), BasePtr),
37160 FramePtr, true, X86FI->getRestoreBasePointerOffset())
37161 .setMIFlag(MachineInstr::FrameSetup);
37162 }
37163 BuildMI(restoreMBB, MIMD, TII->get(X86::MOV32ri), restoreDstReg).addImm(1);
37164 BuildMI(restoreMBB, MIMD, TII->get(X86::JMP_1)).addMBB(sinkMBB);
37165 restoreMBB->addSuccessor(sinkMBB);
37166
37167 MI.eraseFromParent();
37168 return sinkMBB;
37169}
37170
37171/// Fix the shadow stack using the previously saved SSP pointer.
37172/// \sa emitSetJmpShadowStackFix
37173/// \param [in] MI The temporary Machine Instruction for the builtin.
37174/// \param [in] MBB The Machine Basic Block that will be modified.
37175/// \return The sink MBB that will perform the future indirect branch.
37176MachineBasicBlock *
37177X86TargetLowering::emitLongJmpShadowStackFix(MachineInstr &MI,
37178 MachineBasicBlock *MBB) const {
37179 const MIMetadata MIMD(MI);
37180 MachineFunction *MF = MBB->getParent();
37181 const TargetInstrInfo *TII = Subtarget.getInstrInfo();
37182 MachineRegisterInfo &MRI = MF->getRegInfo();
37183
37184 // Memory Reference
37185 SmallVector<MachineMemOperand *, 2> MMOs(MI.memoperands());
37186
37187 MVT PVT = getPointerTy(MF->getDataLayout());
37188 const TargetRegisterClass *PtrRC = getRegClassFor(PVT);
37189
37190 // checkSspMBB:
37191 // xor vreg1, vreg1
37192 // rdssp vreg1
37193 // test vreg1, vreg1
37194 // je sinkMBB # Jump if Shadow Stack is not supported
37195 // fallMBB:
37196 // mov buf+24/12(%rip), vreg2
37197 // sub vreg1, vreg2
37198 // jbe sinkMBB # No need to fix the Shadow Stack
37199 // fixShadowMBB:
37200 // shr 3/2, vreg2
37201 // incssp vreg2 # fix the SSP according to the lower 8 bits
37202 // shr 8, vreg2
37203 // je sinkMBB
37204 // fixShadowLoopPrepareMBB:
37205 // shl vreg2
37206 // mov 128, vreg3
37207 // fixShadowLoopMBB:
37208 // incssp vreg3
37209 // dec vreg2
37210 // jne fixShadowLoopMBB # Iterate until you finish fixing
37211 // # the Shadow Stack
37212 // sinkMBB:
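  // Worked example (illustrative, 64-bit): if the saved SSP is 0x1828 bytes
  // above the current SSP, vreg2 = 0x1828 >> 3 = 773 entries. The first incssp
  // consumes the low 8 bits (5 entries), the remaining 773 >> 8 = 3 is doubled
  // to 6 loop iterations, and 6 * incssp(128) advances the other 768 entries.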
37213
37214 MachineFunction::iterator I = ++MBB->getIterator();
37215 const BasicBlock *BB = MBB->getBasicBlock();
37216
37217 MachineBasicBlock *checkSspMBB = MF->CreateMachineBasicBlock(BB);
37218 MachineBasicBlock *fallMBB = MF->CreateMachineBasicBlock(BB);
37219 MachineBasicBlock *fixShadowMBB = MF->CreateMachineBasicBlock(BB);
37220 MachineBasicBlock *fixShadowLoopPrepareMBB = MF->CreateMachineBasicBlock(BB);
37221 MachineBasicBlock *fixShadowLoopMBB = MF->CreateMachineBasicBlock(BB);
37222 MachineBasicBlock *sinkMBB = MF->CreateMachineBasicBlock(BB);
37223 MF->insert(I, checkSspMBB);
37224 MF->insert(I, fallMBB);
37225 MF->insert(I, fixShadowMBB);
37226 MF->insert(I, fixShadowLoopPrepareMBB);
37227 MF->insert(I, fixShadowLoopMBB);
37228 MF->insert(I, sinkMBB);
37229
37230 // Transfer the remainder of BB and its successor edges to sinkMBB.
37231 sinkMBB->splice(sinkMBB->begin(), MBB, MachineBasicBlock::iterator(MI),
37232 MBB->end());
37233 sinkMBB->transferSuccessorsAndUpdatePHIs(MBB);
37234
37235 MBB->addSuccessor(checkSspMBB);
37236
37237 // Initialize a register with zero.
37238 Register ZReg = MRI.createVirtualRegister(&X86::GR32RegClass);
37239 BuildMI(checkSspMBB, MIMD, TII->get(X86::MOV32r0), ZReg);
37240
37241 if (PVT == MVT::i64) {
37242 Register TmpZReg = MRI.createVirtualRegister(PtrRC);
37243 BuildMI(checkSspMBB, MIMD, TII->get(X86::SUBREG_TO_REG), TmpZReg)
37244 .addImm(0)
37245 .addReg(ZReg)
37246 .addImm(X86::sub_32bit);
37247 ZReg = TmpZReg;
37248 }
37249
37250 // Read the current SSP Register value to the zeroed register.
37251 Register SSPCopyReg = MRI.createVirtualRegister(PtrRC);
37252 unsigned RdsspOpc = (PVT == MVT::i64) ? X86::RDSSPQ : X86::RDSSPD;
37253 BuildMI(checkSspMBB, MIMD, TII->get(RdsspOpc), SSPCopyReg).addReg(ZReg);
37254
37255 // Check whether the result of the SSP register is zero and jump directly
37256 // to the sink.
37257 unsigned TestRROpc = (PVT == MVT::i64) ? X86::TEST64rr : X86::TEST32rr;
37258 BuildMI(checkSspMBB, MIMD, TII->get(TestRROpc))
37259 .addReg(SSPCopyReg)
37260 .addReg(SSPCopyReg);
37261 BuildMI(checkSspMBB, MIMD, TII->get(X86::JCC_1))
37262 .addMBB(sinkMBB)
37263 .addImm(X86::COND_E);
37264 checkSspMBB->addSuccessor(sinkMBB);
37265 checkSspMBB->addSuccessor(fallMBB);
37266
37267 // Reload the previously saved SSP register value.
37268 Register PrevSSPReg = MRI.createVirtualRegister(PtrRC);
37269 unsigned PtrLoadOpc = (PVT == MVT::i64) ? X86::MOV64rm : X86::MOV32rm;
37270 const int64_t SPPOffset = 3 * PVT.getStoreSize();
37271 MachineInstrBuilder MIB =
37272 BuildMI(fallMBB, MIMD, TII->get(PtrLoadOpc), PrevSSPReg);
37273 for (unsigned i = 0; i < X86::AddrNumOperands; ++i) {
37274 const MachineOperand &MO = MI.getOperand(i);
37275 if (i == X86::AddrDisp)
37276 MIB.addDisp(MO, SPPOffset);
37277 else if (MO.isReg()) // Don't add the whole operand, we don't want to
37278 // preserve kill flags.
37279 MIB.addReg(MO.getReg());
37280 else
37281 MIB.add(MO);
37282 }
37283 MIB.setMemRefs(MMOs);
37284
37285 // Subtract the current SSP from the previous SSP.
37286 Register SspSubReg = MRI.createVirtualRegister(PtrRC);
37287 unsigned SubRROpc = (PVT == MVT::i64) ? X86::SUB64rr : X86::SUB32rr;
37288 BuildMI(fallMBB, MIMD, TII->get(SubRROpc), SspSubReg)
37289 .addReg(PrevSSPReg)
37290 .addReg(SSPCopyReg);
37291
37292 // Jump to sink in case PrevSSPReg <= SSPCopyReg.
37293 BuildMI(fallMBB, MIMD, TII->get(X86::JCC_1))
37294 .addMBB(sinkMBB)
37295 .addImm(X86::COND_BE);
37296 fallMBB->addSuccessor(sinkMBB);
37297 fallMBB->addSuccessor(fixShadowMBB);
37298
37299 // Shift right by 2/3 for 32/64 because incssp multiplies the argument by 4/8.
37300 unsigned ShrRIOpc = (PVT == MVT::i64) ? X86::SHR64ri : X86::SHR32ri;
37301 unsigned Offset = (PVT == MVT::i64) ? 3 : 2;
37302 Register SspFirstShrReg = MRI.createVirtualRegister(PtrRC);
37303 BuildMI(fixShadowMBB, MIMD, TII->get(ShrRIOpc), SspFirstShrReg)
37304 .addReg(SspSubReg)
37305 .addImm(Offset);
37306
37307 // Increase SSP when looking only on the lower 8 bits of the delta.
37308 unsigned IncsspOpc = (PVT == MVT::i64) ? X86::INCSSPQ : X86::INCSSPD;
37309 BuildMI(fixShadowMBB, MIMD, TII->get(IncsspOpc)).addReg(SspFirstShrReg);
37310
37311 // Reset the lower 8 bits.
37312 Register SspSecondShrReg = MRI.createVirtualRegister(PtrRC);
37313 BuildMI(fixShadowMBB, MIMD, TII->get(ShrRIOpc), SspSecondShrReg)
37314 .addReg(SspFirstShrReg)
37315 .addImm(8);
37316
37317 // Jump if the result of the shift is zero.
37318 BuildMI(fixShadowMBB, MIMD, TII->get(X86::JCC_1))
37319 .addMBB(sinkMBB)
37320 .addImm(X86::COND_E);
37321 fixShadowMBB->addSuccessor(sinkMBB);
37322 fixShadowMBB->addSuccessor(fixShadowLoopPrepareMBB);
37323
37324 // Do a single shift left.
37325 unsigned ShlR1Opc = (PVT == MVT::i64) ? X86::SHL64ri : X86::SHL32ri;
37326 Register SspAfterShlReg = MRI.createVirtualRegister(PtrRC);
37327 BuildMI(fixShadowLoopPrepareMBB, MIMD, TII->get(ShlR1Opc), SspAfterShlReg)
37328 .addReg(SspSecondShrReg)
37329 .addImm(1);
37330
37331 // Save the value 128 to a register (will be used next with incssp).
37332 Register Value128InReg = MRI.createVirtualRegister(PtrRC);
37333 unsigned MovRIOpc = (PVT == MVT::i64) ? X86::MOV64ri32 : X86::MOV32ri;
37334 BuildMI(fixShadowLoopPrepareMBB, MIMD, TII->get(MovRIOpc), Value128InReg)
37335 .addImm(128);
37336 fixShadowLoopPrepareMBB->addSuccessor(fixShadowLoopMBB);
37337
37338 // Since incssp only looks at the lower 8 bits, we might need to do several
37339 // iterations of incssp until we finish fixing the shadow stack.
37340 Register DecReg = MRI.createVirtualRegister(PtrRC);
37341 Register CounterReg = MRI.createVirtualRegister(PtrRC);
37342 BuildMI(fixShadowLoopMBB, MIMD, TII->get(X86::PHI), CounterReg)
37343 .addReg(SspAfterShlReg)
37344 .addMBB(fixShadowLoopPrepareMBB)
37345 .addReg(DecReg)
37346 .addMBB(fixShadowLoopMBB);
37347
37348 // Every iteration we increase the SSP by 128.
37349 BuildMI(fixShadowLoopMBB, MIMD, TII->get(IncsspOpc)).addReg(Value128InReg);
37350
37351 // Every iteration we decrement the counter by 1.
37352 unsigned DecROpc = (PVT == MVT::i64) ? X86::DEC64r : X86::DEC32r;
37353 BuildMI(fixShadowLoopMBB, MIMD, TII->get(DecROpc), DecReg).addReg(CounterReg);
37354
37355 // Jump if the counter is not zero yet.
37356 BuildMI(fixShadowLoopMBB, MIMD, TII->get(X86::JCC_1))
37357 .addMBB(fixShadowLoopMBB)
37358 .addImm(X86::COND_NE);
37359 fixShadowLoopMBB->addSuccessor(sinkMBB);
37360 fixShadowLoopMBB->addSuccessor(fixShadowLoopMBB);
37361
37362 return sinkMBB;
37363}
37364
37365MachineBasicBlock *
37366X86TargetLowering::emitEHSjLjLongJmp(MachineInstr &MI,
37367 MachineBasicBlock *MBB) const {
37368 const MIMetadata MIMD(MI);
37369 MachineFunction *MF = MBB->getParent();
37370 const TargetInstrInfo *TII = Subtarget.getInstrInfo();
37371 MachineRegisterInfo &MRI = MF->getRegInfo();
37372
37373 // Memory Reference
37374 SmallVector<MachineMemOperand *, 2> MMOs(MI.memoperands());
37375
37376 MVT PVT = getPointerTy(MF->getDataLayout());
37377 assert((PVT == MVT::i64 || PVT == MVT::i32) &&
37378 "Invalid Pointer Size!");
37379
37380 const TargetRegisterClass *RC =
37381 (PVT == MVT::i64) ? &X86::GR64RegClass : &X86::GR32RegClass;
37382 Register Tmp = MRI.createVirtualRegister(RC);
37383 // Since FP is only updated here but NOT referenced, it's treated as GPR.
37384 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
37385 Register FP = (PVT == MVT::i64) ? X86::RBP : X86::EBP;
37386 Register SP = RegInfo->getStackRegister();
37387
37388 MachineInstrBuilder MIB;
37389
37390 const int64_t LabelOffset = 1 * PVT.getStoreSize();
37391 const int64_t SPOffset = 2 * PVT.getStoreSize();
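  // The jump buffer is addressed as pointer-sized slots; the offsets used in
  // this function imply the following layout (summarized here for reference):
  //   slot 0: frame pointer    (reloaded below with displacement 0)
  //   slot 1: resume address   (LabelOffset)
  //   slot 2: stack pointer    (SPOffset)
  //   slot 3: shadow stack ptr (written by emitSetJmpShadowStackFix)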
37392
37393 unsigned PtrLoadOpc = (PVT == MVT::i64) ? X86::MOV64rm : X86::MOV32rm;
37394 unsigned IJmpOpc = (PVT == MVT::i64) ? X86::JMP64r : X86::JMP32r;
37395
37396 MachineBasicBlock *thisMBB = MBB;
37397
37398 // When CET and shadow stack is enabled, we need to fix the Shadow Stack.
37399 if (MF->getFunction().getParent()->getModuleFlag("cf-protection-return")) {
37400 thisMBB = emitLongJmpShadowStackFix(MI, thisMBB);
37401 }
37402
37403 // Reload FP
37404 MIB = BuildMI(*thisMBB, MI, MIMD, TII->get(PtrLoadOpc), FP);
37405 for (unsigned i = 0; i < X86::AddrNumOperands; ++i) {
37406 const MachineOperand &MO = MI.getOperand(i);
37407 if (MO.isReg()) // Don't add the whole operand, we don't want to
37408 // preserve kill flags.
37409 MIB.addReg(MO.getReg());
37410 else
37411 MIB.add(MO);
37412 }
37413 MIB.setMemRefs(MMOs);
37415
37416 // Reload IP
37417 MIB = BuildMI(*thisMBB, MI, MIMD, TII->get(PtrLoadOpc), Tmp);
37418 for (unsigned i = 0; i < X86::AddrNumOperands; ++i) {
37419 const MachineOperand &MO = MI.getOperand(i);
37420 if (i == X86::AddrDisp)
37421 MIB.addDisp(MO, LabelOffset);
37422 else if (MO.isReg()) // Don't add the whole operand, we don't want to
37423 // preserve kill flags.
37424 MIB.addReg(MO.getReg());
37425 else
37426 MIB.add(MO);
37427 }
37428 MIB.setMemRefs(MMOs);
37429
37430 // Reload SP
37431 MIB = BuildMI(*thisMBB, MI, MIMD, TII->get(PtrLoadOpc), SP);
37432 for (unsigned i = 0; i < X86::AddrNumOperands; ++i) {
37433 if (i == X86::AddrDisp)
37434 MIB.addDisp(MI.getOperand(i), SPOffset);
37435 else
37436 MIB.add(MI.getOperand(i)); // We can preserve the kill flags here, it's
37437 // the last instruction of the expansion.
37438 }
37439 MIB.setMemRefs(MMOs);
37441
37442 // Jump
37443 BuildMI(*thisMBB, MI, MIMD, TII->get(IJmpOpc)).addReg(Tmp);
37444
37445 MI.eraseFromParent();
37446 return thisMBB;
37447}
37448
37449void X86TargetLowering::SetupEntryBlockForSjLj(MachineInstr &MI,
37450 MachineBasicBlock *MBB,
37451 MachineBasicBlock *DispatchBB,
37452 int FI) const {
37453 const MIMetadata MIMD(MI);
37454 MachineFunction *MF = MBB->getParent();
37455 MachineRegisterInfo *MRI = &MF->getRegInfo();
37456 const X86InstrInfo *TII = Subtarget.getInstrInfo();
37457
37458 MVT PVT = getPointerTy(MF->getDataLayout());
37459 assert((PVT == MVT::i64 || PVT == MVT::i32) && "Invalid Pointer Size!");
37460
37461 unsigned Op = 0;
37462 Register VR;
37463
37464 bool UseImmLabel = (MF->getTarget().getCodeModel() == CodeModel::Small) &&
37465 !isPositionIndependent();
37466
37467 if (UseImmLabel) {
37468 Op = (PVT == MVT::i64) ? X86::MOV64mi32 : X86::MOV32mi;
37469 } else {
37470 const TargetRegisterClass *TRC =
37471 (PVT == MVT::i64) ? &X86::GR64RegClass : &X86::GR32RegClass;
37472 VR = MRI->createVirtualRegister(TRC);
37473 Op = (PVT == MVT::i64) ? X86::MOV64mr : X86::MOV32mr;
37474
37475 if (Subtarget.is64Bit())
37476 BuildMI(*MBB, MI, MIMD, TII->get(X86::LEA64r), VR)
37477 .addReg(X86::RIP)
37478 .addImm(1)
37479 .addReg(0)
37480 .addMBB(DispatchBB)
37481 .addReg(0);
37482 else
37483 BuildMI(*MBB, MI, MIMD, TII->get(X86::LEA32r), VR)
37484 .addReg(0) /* TII->getGlobalBaseReg(MF) */
37485 .addImm(1)
37486 .addReg(0)
37487 .addMBB(DispatchBB, Subtarget.classifyBlockAddressReference())
37488 .addReg(0);
37489 }
37490
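// Record the dispatch block address in the function context object. The
// frame offset (56 on 64-bit, 36 on 32-bit) selects the slot that the SjLj
// exception handling runtime is expected to read back as the resume address.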
37491 MachineInstrBuilder MIB = BuildMI(*MBB, MI, MIMD, TII->get(Op));
37492 addFrameReference(MIB, FI, Subtarget.is64Bit() ? 56 : 36);
37493 if (UseImmLabel)
37494 MIB.addMBB(DispatchBB);
37495 else
37496 MIB.addReg(VR);
37497}
37498
37499MachineBasicBlock *
37500X86TargetLowering::EmitSjLjDispatchBlock(MachineInstr &MI,
37501 MachineBasicBlock *BB) const {
37502 const MIMetadata MIMD(MI);
37503 MachineFunction *MF = BB->getParent();
37504 MachineRegisterInfo *MRI = &MF->getRegInfo();
37505 const X86InstrInfo *TII = Subtarget.getInstrInfo();
37506 int FI = MF->getFrameInfo().getFunctionContextIndex();
37507
37508 // Get a mapping of the call site numbers to all of the landing pads they're
37509 // associated with.
37510 DenseMap<unsigned, SmallVector<MachineBasicBlock *, 2>> CallSiteNumToLPad;
37511 unsigned MaxCSNum = 0;
37512 for (auto &MBB : *MF) {
37513 if (!MBB.isEHPad())
37514 continue;
37515
37516 MCSymbol *Sym = nullptr;
37517 for (const auto &MI : MBB) {
37518 if (MI.isDebugInstr())
37519 continue;
37520
37521 assert(MI.isEHLabel() && "expected EH_LABEL");
37522 Sym = MI.getOperand(0).getMCSymbol();
37523 break;
37524 }
37525
37526 if (!MF->hasCallSiteLandingPad(Sym))
37527 continue;
37528
37529 for (unsigned CSI : MF->getCallSiteLandingPad(Sym)) {
37530 CallSiteNumToLPad[CSI].push_back(&MBB);
37531 MaxCSNum = std::max(MaxCSNum, CSI);
37532 }
37533 }
37534
37535 // Get an ordered list of the machine basic blocks for the jump table.
37536 std::vector<MachineBasicBlock *> LPadList;
37537 SmallPtrSet<MachineBasicBlock *, 32> InvokeBBs;
37538 LPadList.reserve(CallSiteNumToLPad.size());
37539
37540 for (unsigned CSI = 1; CSI <= MaxCSNum; ++CSI) {
37541 for (auto &LP : CallSiteNumToLPad[CSI]) {
37542 LPadList.push_back(LP);
37543 InvokeBBs.insert_range(LP->predecessors());
37544 }
37545 }
37546
37547 assert(!LPadList.empty() &&
37548 "No landing pad destinations for the dispatch jump table!");
37549
37550 // Create the MBBs for the dispatch code.
37551
37552 // Shove the dispatch's address into the return slot in the function context.
37553 MachineBasicBlock *DispatchBB = MF->CreateMachineBasicBlock();
37554 DispatchBB->setIsEHPad(true);
37555
37556 MachineBasicBlock *TrapBB = MF->CreateMachineBasicBlock();
37557 BuildMI(TrapBB, MIMD, TII->get(X86::TRAP));
37558 DispatchBB->addSuccessor(TrapBB);
37559
37560 MachineBasicBlock *DispContBB = MF->CreateMachineBasicBlock();
37561 DispatchBB->addSuccessor(DispContBB);
37562
37563 // Insert MBBs.
37564 MF->push_back(DispatchBB);
37565 MF->push_back(DispContBB);
37566 MF->push_back(TrapBB);
37567
37568 // Insert code into the entry block that creates and registers the function
37569 // context.
37570 SetupEntryBlockForSjLj(MI, BB, DispatchBB, FI);
37571
37572 // Create the jump table and associated information
37573 unsigned JTE = getJumpTableEncoding();
37574 MachineJumpTableInfo *JTI = MF->getOrCreateJumpTableInfo(JTE);
37575 unsigned MJTI = JTI->createJumpTableIndex(LPadList);
37576
37577 const X86RegisterInfo &RI = TII->getRegisterInfo();
37578 // Add a register mask with no preserved registers. This results in all
37579 // registers being marked as clobbered.
37580 if (RI.hasBasePointer(*MF)) {
37581 const bool FPIs64Bit = Subtarget.isTarget64BitLP64();
37582 X86MachineFunctionInfo *MFI = MF->getInfo<X86MachineFunctionInfo>();
37583 MFI->setRestoreBasePointer(MF);
37584
37585 Register FP = RI.getFrameRegister(*MF);
37586 Register BP = RI.getBaseRegister();
37587 unsigned Op = FPIs64Bit ? X86::MOV64rm : X86::MOV32rm;
37588 addRegOffset(BuildMI(DispatchBB, MIMD, TII->get(Op), BP), FP, true,
37589 MFI->getRestoreBasePointerOffset())
37590 .addRegMask(RI.getNoPreservedMask());
37591 } else {
37592 BuildMI(DispatchBB, MIMD, TII->get(X86::NOOP))
37593 .addRegMask(RI.getNoPreservedMask());
37594 }
37595
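// Load the call site index from the function context, bounds check it
// against the landing pad list, and trap on out-of-range values; in-range
// indices fall through to the indexed jump in DispContBB.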
37596 // IReg is used as an index in a memory operand and therefore can't be SP
37597 Register IReg = MRI->createVirtualRegister(&X86::GR32_NOSPRegClass);
37598 addFrameReference(BuildMI(DispatchBB, MIMD, TII->get(X86::MOV32rm), IReg), FI,
37599 Subtarget.is64Bit() ? 8 : 4);
37600 BuildMI(DispatchBB, MIMD, TII->get(X86::CMP32ri))
37601 .addReg(IReg)
37602 .addImm(LPadList.size());
37603 BuildMI(DispatchBB, MIMD, TII->get(X86::JCC_1))
37604 .addMBB(TrapBB)
37605 .addImm(X86::COND_AE);
37606
37607 if (Subtarget.is64Bit()) {
37608 Register BReg = MRI->createVirtualRegister(&X86::GR64RegClass);
37609 Register IReg64 = MRI->createVirtualRegister(&X86::GR64_NOSPRegClass);
37610
37611 // leaq .LJTI0_0(%rip), BReg
37612 BuildMI(DispContBB, MIMD, TII->get(X86::LEA64r), BReg)
37613 .addReg(X86::RIP)
37614 .addImm(1)
37615 .addReg(0)
37616 .addJumpTableIndex(MJTI)
37617 .addReg(0);
37618 // movzx IReg64, IReg
37619 BuildMI(DispContBB, MIMD, TII->get(TargetOpcode::SUBREG_TO_REG), IReg64)
37620 .addImm(0)
37621 .addReg(IReg)
37622 .addImm(X86::sub_32bit);
37623
37624 switch (JTE) {
37625 case MachineJumpTableInfo::EK_BlockAddress:
37626 // jmpq *(BReg,IReg64,8)
37627 BuildMI(DispContBB, MIMD, TII->get(X86::JMP64m))
37628 .addReg(BReg)
37629 .addImm(8)
37630 .addReg(IReg64)
37631 .addImm(0)
37632 .addReg(0);
37633 break;
37634 case MachineJumpTableInfo::EK_LabelDifference32: {
37635 Register OReg = MRI->createVirtualRegister(&X86::GR32RegClass);
37636 Register OReg64 = MRI->createVirtualRegister(&X86::GR64RegClass);
37637 Register TReg = MRI->createVirtualRegister(&X86::GR64RegClass);
37638
37639 // movl (BReg,IReg64,4), OReg
37640 BuildMI(DispContBB, MIMD, TII->get(X86::MOV32rm), OReg)
37641 .addReg(BReg)
37642 .addImm(4)
37643 .addReg(IReg64)
37644 .addImm(0)
37645 .addReg(0);
37646 // movsx OReg64, OReg
37647 BuildMI(DispContBB, MIMD, TII->get(X86::MOVSX64rr32), OReg64)
37648 .addReg(OReg);
37649 // addq BReg, OReg64, TReg
37650 BuildMI(DispContBB, MIMD, TII->get(X86::ADD64rr), TReg)
37651 .addReg(OReg64)
37652 .addReg(BReg);
37653 // jmpq *TReg
37654 BuildMI(DispContBB, MIMD, TII->get(X86::JMP64r)).addReg(TReg);
37655 break;
37656 }
37657 default:
37658 llvm_unreachable("Unexpected jump table encoding");
37659 }
37660 } else {
37661 // jmpl *.LJTI0_0(,IReg,4)
37662 BuildMI(DispContBB, MIMD, TII->get(X86::JMP32m))
37663 .addReg(0)
37664 .addImm(4)
37665 .addReg(IReg)
37666 .addJumpTableIndex(MJTI)
37667 .addReg(0);
37668 }
37669
37670 // Add the jump table entries as successors to the MBB.
37671 SmallPtrSet<MachineBasicBlock *, 8> SeenMBBs;
37672 for (auto &LP : LPadList)
37673 if (SeenMBBs.insert(LP).second)
37674 DispContBB->addSuccessor(LP);
37675
37676 // N.B. the order the invoke BBs are processed in doesn't matter here.
37677 SmallVector<MachineBasicBlock *, 64> MBBLPads;
37678 const MCPhysReg *SavedRegs = MF->getRegInfo().getCalleeSavedRegs();
37679 for (MachineBasicBlock *MBB : InvokeBBs) {
37680 // Remove the landing pad successor from the invoke block and replace it
37681 // with the new dispatch block.
37682 // Keep a copy of Successors since it's modified inside the loop.
37683 SmallVector<MachineBasicBlock *, 8> Successors(MBB->succ_rbegin(),
37684 MBB->succ_rend());
37685 // FIXME: Avoid quadratic complexity.
37686 for (auto *MBBS : Successors) {
37687 if (MBBS->isEHPad()) {
37688 MBB->removeSuccessor(MBBS);
37689 MBBLPads.push_back(MBBS);
37690 }
37691 }
37692
37693 MBB->addSuccessor(DispatchBB);
37694
37695 // Find the invoke call and mark all of the callee-saved registers as
37696 // 'implicit defined' so that they're spilled. This prevents code from
37697 // moving instructions to before the EH block, where they will never be
37698 // executed.
37699 for (auto &II : reverse(*MBB)) {
37700 if (!II.isCall())
37701 continue;
37702
37703 DenseSet<Register> DefRegs;
37704 for (auto &MOp : II.operands())
37705 if (MOp.isReg())
37706 DefRegs.insert(MOp.getReg());
37707
37708 MachineInstrBuilder MIB(*MF, &II);
37709 for (unsigned RegIdx = 0; SavedRegs[RegIdx]; ++RegIdx) {
37710 Register Reg = SavedRegs[RegIdx];
37711 if (!DefRegs.contains(Reg))
37712 MIB.addReg(Reg, RegState::ImplicitDefine | RegState::Dead);
37713 }
37714
37715 break;
37716 }
37717 }
37718
37719 // Mark all former landing pads as non-landing pads. The dispatch is the only
37720 // landing pad now.
37721 for (auto &LP : MBBLPads)
37722 LP->setIsEHPad(false);
37723
37724 // The instruction is gone now.
37725 MI.eraseFromParent();
37726 return BB;
37727}
37728
37729MachineBasicBlock *
37730X86TargetLowering::emitPatchableEventCall(MachineInstr &MI,
37731 MachineBasicBlock *BB) const {
37732 // Wrap patchable event calls in CALLSEQ_START/CALLSEQ_END, as tracing
37733 // calls may require proper stack alignment.
37734 const TargetInstrInfo &TII = *Subtarget.getInstrInfo();
37735 const MIMetadata MIMD(MI);
37736 MachineFunction &MF = *BB->getParent();
37737
37738 // Emit CALLSEQ_START right before the instruction.
37739 MF.getFrameInfo().setAdjustsStack(true);
37740 unsigned AdjStackDown = TII.getCallFrameSetupOpcode();
37741 MachineInstrBuilder CallseqStart =
37742 BuildMI(MF, MIMD, TII.get(AdjStackDown)).addImm(0).addImm(0).addImm(0);
37743 BB->insert(MachineBasicBlock::iterator(MI), CallseqStart);
37744
37745 // Emit CALLSEQ_END right after the instruction.
37746 unsigned AdjStackUp = TII.getCallFrameDestroyOpcode();
37747 MachineInstrBuilder CallseqEnd =
37748 BuildMI(MF, MIMD, TII.get(AdjStackUp)).addImm(0).addImm(0);
37749 BB->insertAfter(MachineBasicBlock::iterator(MI), CallseqEnd);
37750
37751 return BB;
37752}
37753
37754MachineBasicBlock *
37755X86TargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI,
37756 MachineBasicBlock *BB) const {
37757 MachineFunction *MF = BB->getParent();
37758 const TargetInstrInfo *TII = Subtarget.getInstrInfo();
37759 const MIMetadata MIMD(MI);
37760
37761 auto TMMImmToTMMReg = [](unsigned Imm) {
37762 assert (Imm < 8 && "Illegal tmm index");
37763 return X86::TMM0 + Imm;
37764 };
37765 auto TMMImmToTMMPair = [](unsigned Imm) {
37766 assert(Imm < 8 && "Illegal tmm pair index.");
37767 return X86::TMM0_TMM1 + Imm / 2;
37768 };
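// A tile pair register covers two consecutive TMM registers (TMM0_TMM1,
// TMM2_TMM3, ...), so dividing the tile index by 2 selects the pair that
// contains it.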
37769 switch (MI.getOpcode()) {
37770 default:
37771 llvm_unreachable("Unexpected instr type to insert");
37772 case X86::INDIRECT_THUNK_CALL32:
37773 case X86::INDIRECT_THUNK_CALL64:
37774 case X86::INDIRECT_THUNK_TCRETURN32:
37775 case X86::INDIRECT_THUNK_TCRETURN64:
37776 return EmitLoweredIndirectThunk(MI, BB);
37777 case X86::CATCHRET:
37778 return EmitLoweredCatchRet(MI, BB);
37779 case X86::SEG_ALLOCA_32:
37780 case X86::SEG_ALLOCA_64:
37781 return EmitLoweredSegAlloca(MI, BB);
37782 case X86::PROBED_ALLOCA_32:
37783 case X86::PROBED_ALLOCA_64:
37784 return EmitLoweredProbedAlloca(MI, BB);
37785 case X86::TLSCall_32:
37786 case X86::TLSCall_64:
37787 return EmitLoweredTLSCall(MI, BB);
37788 case X86::CMOV_FR16:
37789 case X86::CMOV_FR16X:
37790 case X86::CMOV_FR32:
37791 case X86::CMOV_FR32X:
37792 case X86::CMOV_FR64:
37793 case X86::CMOV_FR64X:
37794 case X86::CMOV_GR8:
37795 case X86::CMOV_GR16:
37796 case X86::CMOV_GR32:
37797 case X86::CMOV_RFP32:
37798 case X86::CMOV_RFP64:
37799 case X86::CMOV_RFP80:
37800 case X86::CMOV_VR64:
37801 case X86::CMOV_VR128:
37802 case X86::CMOV_VR128X:
37803 case X86::CMOV_VR256:
37804 case X86::CMOV_VR256X:
37805 case X86::CMOV_VR512:
37806 case X86::CMOV_VK1:
37807 case X86::CMOV_VK2:
37808 case X86::CMOV_VK4:
37809 case X86::CMOV_VK8:
37810 case X86::CMOV_VK16:
37811 case X86::CMOV_VK32:
37812 case X86::CMOV_VK64:
37813 return EmitLoweredSelect(MI, BB);
37814
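// The x87 control word keeps precision control in bits 8-9 (0b11 selects
// 64-bit double extended precision) and rounding control in bits 10-11
// (0b11 selects round toward zero); the expansions below temporarily rewrite
// those fields via FNSTCW/FLDCW around the operation.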
37815 case X86::FP80_ADDr:
37816 case X86::FP80_ADDm32: {
37817 // Change the floating point control register to use double extended
37818 // precision when performing the addition.
37819 int OrigCWFrameIdx =
37820 MF->getFrameInfo().CreateStackObject(2, Align(2), false);
37821 addFrameReference(BuildMI(*BB, MI, MIMD, TII->get(X86::FNSTCW16m)),
37822 OrigCWFrameIdx);
37823
37824 // Load the old value of the control word...
37825 Register OldCW = MF->getRegInfo().createVirtualRegister(&X86::GR32RegClass);
37826 addFrameReference(BuildMI(*BB, MI, MIMD, TII->get(X86::MOVZX32rm16), OldCW),
37827 OrigCWFrameIdx);
37828
37829 // OR 0b11 into bits 8 and 9. 0b11 is the encoding for double extended
37830 // precision.
37831 Register NewCW = MF->getRegInfo().createVirtualRegister(&X86::GR32RegClass);
37832 BuildMI(*BB, MI, MIMD, TII->get(X86::OR32ri), NewCW)
37833 .addReg(OldCW, RegState::Kill)
37834 .addImm(0x300);
37835
37836 // Extract to 16 bits.
37837 Register NewCW16 =
37838 MF->getRegInfo().createVirtualRegister(&X86::GR16RegClass);
37839 BuildMI(*BB, MI, MIMD, TII->get(TargetOpcode::COPY), NewCW16)
37840 .addReg(NewCW, RegState::Kill, X86::sub_16bit);
37841
37842 // Prepare memory for FLDCW.
37843 int NewCWFrameIdx =
37844 MF->getFrameInfo().CreateStackObject(2, Align(2), false);
37845 addFrameReference(BuildMI(*BB, MI, MIMD, TII->get(X86::MOV16mr)),
37846 NewCWFrameIdx)
37847 .addReg(NewCW16, RegState::Kill);
37848
37849 // Reload the modified control word now...
37850 addFrameReference(BuildMI(*BB, MI, MIMD, TII->get(X86::FLDCW16m)),
37851 NewCWFrameIdx);
37852
37853 // Do the addition.
37854 if (MI.getOpcode() == X86::FP80_ADDr) {
37855 BuildMI(*BB, MI, MIMD, TII->get(X86::ADD_Fp80))
37856 .add(MI.getOperand(0))
37857 .add(MI.getOperand(1))
37858 .add(MI.getOperand(2));
37859 } else {
37860 BuildMI(*BB, MI, MIMD, TII->get(X86::ADD_Fp80m32))
37861 .add(MI.getOperand(0))
37862 .add(MI.getOperand(1))
37863 .add(MI.getOperand(2))
37864 .add(MI.getOperand(3))
37865 .add(MI.getOperand(4))
37866 .add(MI.getOperand(5))
37867 .add(MI.getOperand(6));
37868 }
37869
37870 // Reload the original control word now.
37871 addFrameReference(BuildMI(*BB, MI, MIMD, TII->get(X86::FLDCW16m)),
37872 OrigCWFrameIdx);
37873
37874 MI.eraseFromParent(); // The pseudo instruction is gone now.
37875 return BB;
37876 }
37877
37878 case X86::FP32_TO_INT16_IN_MEM:
37879 case X86::FP32_TO_INT32_IN_MEM:
37880 case X86::FP32_TO_INT64_IN_MEM:
37881 case X86::FP64_TO_INT16_IN_MEM:
37882 case X86::FP64_TO_INT32_IN_MEM:
37883 case X86::FP64_TO_INT64_IN_MEM:
37884 case X86::FP80_TO_INT16_IN_MEM:
37885 case X86::FP80_TO_INT32_IN_MEM:
37886 case X86::FP80_TO_INT64_IN_MEM: {
37887 // Change the floating point control register to use "round towards zero"
37888 // mode when truncating to an integer value.
37889 int OrigCWFrameIdx =
37890 MF->getFrameInfo().CreateStackObject(2, Align(2), false);
37891 addFrameReference(BuildMI(*BB, MI, MIMD, TII->get(X86::FNSTCW16m)),
37892 OrigCWFrameIdx);
37893
37894 // Load the old value of the control word...
37895 Register OldCW = MF->getRegInfo().createVirtualRegister(&X86::GR32RegClass);
37896 addFrameReference(BuildMI(*BB, MI, MIMD, TII->get(X86::MOVZX32rm16), OldCW),
37897 OrigCWFrameIdx);
37898
37899 // OR 0b11 into bits 10 and 11. 0b11 is the encoding for round toward zero.
37900 Register NewCW = MF->getRegInfo().createVirtualRegister(&X86::GR32RegClass);
37901 BuildMI(*BB, MI, MIMD, TII->get(X86::OR32ri), NewCW)
37902 .addReg(OldCW, RegState::Kill).addImm(0xC00);
37903
37904 // Extract to 16 bits.
37905 Register NewCW16 =
37906 MF->getRegInfo().createVirtualRegister(&X86::GR16RegClass);
37907 BuildMI(*BB, MI, MIMD, TII->get(TargetOpcode::COPY), NewCW16)
37908 .addReg(NewCW, RegState::Kill, X86::sub_16bit);
37909
37910 // Prepare memory for FLDCW.
37911 int NewCWFrameIdx =
37912 MF->getFrameInfo().CreateStackObject(2, Align(2), false);
37913 addFrameReference(BuildMI(*BB, MI, MIMD, TII->get(X86::MOV16mr)),
37914 NewCWFrameIdx)
37915 .addReg(NewCW16, RegState::Kill);
37916
37917 // Reload the modified control word now...
37918 addFrameReference(BuildMI(*BB, MI, MIMD,
37919 TII->get(X86::FLDCW16m)), NewCWFrameIdx);
37920
37921 // Get the X86 opcode to use.
37922 unsigned Opc;
37923 switch (MI.getOpcode()) {
37924 // clang-format off
37925 default: llvm_unreachable("illegal opcode!");
37926 case X86::FP32_TO_INT16_IN_MEM: Opc = X86::IST_Fp16m32; break;
37927 case X86::FP32_TO_INT32_IN_MEM: Opc = X86::IST_Fp32m32; break;
37928 case X86::FP32_TO_INT64_IN_MEM: Opc = X86::IST_Fp64m32; break;
37929 case X86::FP64_TO_INT16_IN_MEM: Opc = X86::IST_Fp16m64; break;
37930 case X86::FP64_TO_INT32_IN_MEM: Opc = X86::IST_Fp32m64; break;
37931 case X86::FP64_TO_INT64_IN_MEM: Opc = X86::IST_Fp64m64; break;
37932 case X86::FP80_TO_INT16_IN_MEM: Opc = X86::IST_Fp16m80; break;
37933 case X86::FP80_TO_INT32_IN_MEM: Opc = X86::IST_Fp32m80; break;
37934 case X86::FP80_TO_INT64_IN_MEM: Opc = X86::IST_Fp64m80; break;
37935 // clang-format on
37936 }
37937
37938 X86AddressMode AM = getAddressFromInstr(&MI, 0);
37939 addFullAddress(BuildMI(*BB, MI, MIMD, TII->get(Opc)), AM)
37940 .addReg(MI.getOperand(X86::AddrNumOperands).getReg());
37941
37942 // Reload the original control word now.
37943 addFrameReference(BuildMI(*BB, MI, MIMD, TII->get(X86::FLDCW16m)),
37944 OrigCWFrameIdx);
37945
37946 MI.eraseFromParent(); // The pseudo instruction is gone now.
37947 return BB;
37948 }
37949
37950 // xbegin
37951 case X86::XBEGIN:
37952 return emitXBegin(MI, BB, Subtarget.getInstrInfo());
37953
37954 case X86::VAARG_64:
37955 case X86::VAARG_X32:
37956 return EmitVAARGWithCustomInserter(MI, BB);
37957
37958 case X86::EH_SjLj_SetJmp32:
37959 case X86::EH_SjLj_SetJmp64:
37960 return emitEHSjLjSetJmp(MI, BB);
37961
37962 case X86::EH_SjLj_LongJmp32:
37963 case X86::EH_SjLj_LongJmp64:
37964 return emitEHSjLjLongJmp(MI, BB);
37965
37966 case X86::Int_eh_sjlj_setup_dispatch:
37967 return EmitSjLjDispatchBlock(MI, BB);
37968
37969 case TargetOpcode::STATEPOINT:
37970 // As an implementation detail, STATEPOINT shares the STACKMAP format at
37971 // this point in the process. We diverge later.
37972 return emitPatchPoint(MI, BB);
37973
37974 case TargetOpcode::STACKMAP:
37975 case TargetOpcode::PATCHPOINT:
37976 return emitPatchPoint(MI, BB);
37977
37978 case TargetOpcode::PATCHABLE_EVENT_CALL:
37979 case TargetOpcode::PATCHABLE_TYPED_EVENT_CALL:
37980 return emitPatchableEventCall(MI, BB);
37981
37982 case X86::LCMPXCHG8B: {
37983 const X86RegisterInfo *TRI = Subtarget.getRegisterInfo();
37984 // In addition to the 4 E[ABCD] registers implied by its encoding, CMPXCHG8B
37985 // requires a memory operand. If the current architecture happens to be i686
37986 // and the current function needs a base pointer - which is ESI on i686 - the
37987 // register allocator would not be able to allocate registers for an address
37988 // of the form X(%reg, %reg, Y): there would never be enough unreserved
37989 // registers during regalloc (without the need for a base pointer, the only
37990 // option would be X(%edi, %esi, Y)).
37991 // We give the register allocator a hand by precomputing the address in a new
37992 // vreg using LEA.
37993
37994 // If it is not i686 or there is no base pointer - nothing to do here.
37995 if (!Subtarget.is32Bit() || !TRI->hasBasePointer(*MF))
37996 return BB;
37997
37998 // Even though this code does not necessarily need the base pointer to
37999 // be ESI, we check for that. The reason: if this assert fails, some
38000 // changes have happened in the compiler's base pointer handling, which
38001 // most probably have to be addressed somehow here.
38002 assert(TRI->getBaseRegister() == X86::ESI &&
38003 "LCMPXCHG8B custom insertion for i686 is written with X86::ESI as a "
38004 "base pointer in mind");
38005
38006 MachineRegisterInfo &MRI = MF->getRegInfo();
38007 MVT SPTy = getPointerTy(MF->getDataLayout());
38008 const TargetRegisterClass *AddrRegClass = getRegClassFor(SPTy);
38009 Register computedAddrVReg = MRI.createVirtualRegister(AddrRegClass);
38010 X86AddressMode AM = getAddressFromInstr(&MI, 0);
38010
38012 // Regalloc does not need any help when the memory operand of CMPXCHG8B
38013 // does not use index register.
38014 if (AM.IndexReg == X86::NoRegister)
38015 return BB;
38016
38017 // After X86TargetLowering::ReplaceNodeResults CMPXCHG8B is glued to its
38018 // four operand definitions that are E[ABCD] registers. We skip them and
38019 // then insert the LEA.
38020 MachineBasicBlock::reverse_iterator RMBBI(MI.getReverseIterator());
38021 while (RMBBI != BB->rend() &&
38022 (RMBBI->definesRegister(X86::EAX, /*TRI=*/nullptr) ||
38023 RMBBI->definesRegister(X86::EBX, /*TRI=*/nullptr) ||
38024 RMBBI->definesRegister(X86::ECX, /*TRI=*/nullptr) ||
38025 RMBBI->definesRegister(X86::EDX, /*TRI=*/nullptr))) {
38026 ++RMBBI;
38027 }
38028 MachineBasicBlock::iterator MBBI(RMBBI.getReverse());
38029 addFullAddress(
38030 BuildMI(*BB, *MBBI, MIMD, TII->get(X86::LEA32r), computedAddrVReg), AM);
38031
38032 setDirectAddressInInstr(&MI, 0, computedAddrVReg);
38033
38034 return BB;
38035 }
38036 case X86::LCMPXCHG16B_NO_RBX: {
38037 const X86RegisterInfo *TRI = Subtarget.getRegisterInfo();
38038 Register BasePtr = TRI->getBaseRegister();
38039 if (TRI->hasBasePointer(*MF) &&
38040 (BasePtr == X86::RBX || BasePtr == X86::EBX)) {
38041 if (!BB->isLiveIn(BasePtr))
38042 BB->addLiveIn(BasePtr);
38043 // Save RBX into a virtual register.
38044 Register SaveRBX =
38045 MF->getRegInfo().createVirtualRegister(&X86::GR64RegClass);
38046 BuildMI(*BB, MI, MIMD, TII->get(TargetOpcode::COPY), SaveRBX)
38047 .addReg(X86::RBX);
38048 Register Dst = MF->getRegInfo().createVirtualRegister(&X86::GR64RegClass);
38049 MachineInstrBuilder MIB =
38050 BuildMI(*BB, MI, MIMD, TII->get(X86::LCMPXCHG16B_SAVE_RBX), Dst);
38051 for (unsigned Idx = 0; Idx < X86::AddrNumOperands; ++Idx)
38052 MIB.add(MI.getOperand(Idx));
38053 MIB.add(MI.getOperand(X86::AddrNumOperands));
38054 MIB.addReg(SaveRBX);
38055 } else {
38056 // Simple case, just copy the virtual register to RBX.
38057 BuildMI(*BB, MI, MIMD, TII->get(TargetOpcode::COPY), X86::RBX)
38058 .add(MI.getOperand(X86::AddrNumOperands));
38059 MachineInstrBuilder MIB =
38060 BuildMI(*BB, MI, MIMD, TII->get(X86::LCMPXCHG16B));
38061 for (unsigned Idx = 0; Idx < X86::AddrNumOperands; ++Idx)
38062 MIB.add(MI.getOperand(Idx));
38063 }
38064 MI.eraseFromParent();
38065 return BB;
38066 }
38067 case X86::MWAITX: {
38068 const X86RegisterInfo *TRI = Subtarget.getRegisterInfo();
38069 Register BasePtr = TRI->getBaseRegister();
38070 bool IsRBX = (BasePtr == X86::RBX || BasePtr == X86::EBX);
38071 // If there is no need to save the base pointer, we generate MWAITXrrr;
38072 // otherwise we generate the pseudo MWAITX_SAVE_RBX.
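// MWAITX implicitly reads ECX, EAX and EBX, so the three pseudo operands are
// copied into those physical registers before the real instruction is
// emitted; the SAVE_RBX form additionally preserves RBX for the case where it
// doubles as the base pointer.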
38073 if (!IsRBX || !TRI->hasBasePointer(*MF)) {
38074 BuildMI(*BB, MI, MIMD, TII->get(TargetOpcode::COPY), X86::ECX)
38075 .addReg(MI.getOperand(0).getReg());
38076 BuildMI(*BB, MI, MIMD, TII->get(TargetOpcode::COPY), X86::EAX)
38077 .addReg(MI.getOperand(1).getReg());
38078 BuildMI(*BB, MI, MIMD, TII->get(TargetOpcode::COPY), X86::EBX)
38079 .addReg(MI.getOperand(2).getReg());
38080 BuildMI(*BB, MI, MIMD, TII->get(X86::MWAITXrrr));
38081 MI.eraseFromParent();
38082 } else {
38083 if (!BB->isLiveIn(BasePtr)) {
38084 BB->addLiveIn(BasePtr);
38085 }
38086 // Parameters can be copied into ECX and EAX but not EBX yet.
38087 BuildMI(*BB, MI, MIMD, TII->get(TargetOpcode::COPY), X86::ECX)
38088 .addReg(MI.getOperand(0).getReg());
38089 BuildMI(*BB, MI, MIMD, TII->get(TargetOpcode::COPY), X86::EAX)
38090 .addReg(MI.getOperand(1).getReg());
38091 assert(Subtarget.is64Bit() && "Expected 64-bit mode!");
38092 // Save RBX into a virtual register.
38093 Register SaveRBX =
38094 MF->getRegInfo().createVirtualRegister(&X86::GR64RegClass);
38095 BuildMI(*BB, MI, MIMD, TII->get(TargetOpcode::COPY), SaveRBX)
38096 .addReg(X86::RBX);
38097 // Generate mwaitx pseudo.
38098 Register Dst = MF->getRegInfo().createVirtualRegister(&X86::GR64RegClass);
38099 BuildMI(*BB, MI, MIMD, TII->get(X86::MWAITX_SAVE_RBX))
38100 .addDef(Dst) // Destination tied in with SaveRBX.
38101 .addReg(MI.getOperand(2).getReg()) // input value of EBX.
38102 .addUse(SaveRBX); // Save of base pointer.
38103 MI.eraseFromParent();
38104 }
38105 return BB;
38106 }
38107 case TargetOpcode::PREALLOCATED_SETUP: {
38108 assert(Subtarget.is32Bit() && "preallocated only used in 32-bit");
38109 auto *MFI = MF->getInfo<X86MachineFunctionInfo>();
38110 MFI->setHasPreallocatedCall(true);
38111 int64_t PreallocatedId = MI.getOperand(0).getImm();
38112 size_t StackAdjustment = MFI->getPreallocatedStackSize(PreallocatedId);
38113 assert(StackAdjustment != 0 && "0 stack adjustment");
38114 LLVM_DEBUG(dbgs() << "PREALLOCATED_SETUP stack adjustment "
38115 << StackAdjustment << "\n");
38116 BuildMI(*BB, MI, MIMD, TII->get(X86::SUB32ri), X86::ESP)
38117 .addReg(X86::ESP)
38118 .addImm(StackAdjustment);
38119 MI.eraseFromParent();
38120 return BB;
38121 }
38122 case TargetOpcode::PREALLOCATED_ARG: {
38123 assert(Subtarget.is32Bit() && "preallocated calls only used in 32-bit");
38124 int64_t PreallocatedId = MI.getOperand(1).getImm();
38125 int64_t ArgIdx = MI.getOperand(2).getImm();
38126 auto *MFI = MF->getInfo<X86MachineFunctionInfo>();
38127 size_t ArgOffset = MFI->getPreallocatedArgOffsets(PreallocatedId)[ArgIdx];
38128 LLVM_DEBUG(dbgs() << "PREALLOCATED_ARG arg index " << ArgIdx
38129 << ", arg offset " << ArgOffset << "\n");
38130 // stack pointer + offset
38131 addRegOffset(BuildMI(*BB, MI, MIMD, TII->get(X86::LEA32r),
38132 MI.getOperand(0).getReg()),
38133 X86::ESP, false, ArgOffset);
38134 MI.eraseFromParent();
38135 return BB;
38136 }
38137 case X86::PTDPBSSD:
38138 case X86::PTDPBSUD:
38139 case X86::PTDPBUSD:
38140 case X86::PTDPBUUD:
38141 case X86::PTDPBF16PS:
38142 case X86::PTDPFP16PS:
38143 case X86::PTCMMIMFP16PS:
38144 case X86::PTCMMRLFP16PS:
38145 case X86::PTDPBF8PS:
38146 case X86::PTDPBHF8PS:
38147 case X86::PTDPHBF8PS:
38148 case X86::PTDPHF8PS:
38149 case X86::PTTDPBF16PS:
38150 case X86::PTTDPFP16PS:
38151 case X86::PTTCMMIMFP16PS:
38152 case X86::PTTCMMRLFP16PS:
38153 case X86::PTCONJTCMMIMFP16PS:
38154 case X86::PTMMULTF32PS:
38155 case X86::PTTMMULTF32PS: {
38156 unsigned Opc;
38157 switch (MI.getOpcode()) {
38158 default: llvm_unreachable("illegal opcode!");
38159 case X86::PTDPBSSD: Opc = X86::TDPBSSD; break;
38160 case X86::PTDPBSUD: Opc = X86::TDPBSUD; break;
38161 case X86::PTDPBUSD: Opc = X86::TDPBUSD; break;
38162 case X86::PTDPBUUD: Opc = X86::TDPBUUD; break;
38163 case X86::PTDPBF16PS: Opc = X86::TDPBF16PS; break;
38164 case X86::PTDPFP16PS: Opc = X86::TDPFP16PS; break;
38165 case X86::PTCMMIMFP16PS:
38166 Opc = X86::TCMMIMFP16PS;
38167 break;
38168 case X86::PTCMMRLFP16PS:
38169 Opc = X86::TCMMRLFP16PS;
38170 break;
38171 case X86::PTDPBF8PS: Opc = X86::TDPBF8PS; break;
38172 case X86::PTDPBHF8PS: Opc = X86::TDPBHF8PS; break;
38173 case X86::PTDPHBF8PS: Opc = X86::TDPHBF8PS; break;
38174 case X86::PTDPHF8PS: Opc = X86::TDPHF8PS; break;
38175 case X86::PTTDPBF16PS:
38176 Opc = X86::TTDPBF16PS;
38177 break;
38178 case X86::PTTDPFP16PS:
38179 Opc = X86::TTDPFP16PS;
38180 break;
38181 case X86::PTTCMMIMFP16PS:
38182 Opc = X86::TTCMMIMFP16PS;
38183 break;
38184 case X86::PTTCMMRLFP16PS:
38185 Opc = X86::TTCMMRLFP16PS;
38186 break;
38187 case X86::PTCONJTCMMIMFP16PS:
38188 Opc = X86::TCONJTCMMIMFP16PS;
38189 break;
38190 case X86::PTMMULTF32PS:
38191 Opc = X86::TMMULTF32PS;
38192 break;
38193 case X86::PTTMMULTF32PS:
38194 Opc = X86::TTMMULTF32PS;
38195 break;
38196 }
38197
38198 MachineInstrBuilder MIB = BuildMI(*BB, MI, MIMD, TII->get(Opc));
38199 MIB.addReg(TMMImmToTMMReg(MI.getOperand(0).getImm()), RegState::Define);
38200 MIB.addReg(TMMImmToTMMReg(MI.getOperand(0).getImm()), RegState::Undef);
38201 MIB.addReg(TMMImmToTMMReg(MI.getOperand(1).getImm()), RegState::Undef);
38202 MIB.addReg(TMMImmToTMMReg(MI.getOperand(2).getImm()), RegState::Undef);
38203
38204 MI.eraseFromParent(); // The pseudo is gone now.
38205 return BB;
38206 }
38207 case X86::PTILEZERO: {
38208 unsigned Imm = MI.getOperand(0).getImm();
38209 BuildMI(*BB, MI, MIMD, TII->get(X86::TILEZERO), TMMImmToTMMReg(Imm));
38210 MI.eraseFromParent(); // The pseudo is gone now.
38211 auto *MFI = MF->getInfo<X86MachineFunctionInfo>();
38212 MFI->setAMXProgModel(AMXProgModelEnum::DirectReg);
38213 return BB;
38214 }
38215 case X86::PTILEZEROV: {
38216 auto *MFI = MF->getInfo<X86MachineFunctionInfo>();
38217 MFI->setAMXProgModel(AMXProgModelEnum::ManagedRA);
38218 return BB;
38219 }
38220 case X86::PTILELOADDRS:
38221 case X86::PTILELOADDRST1:
38222 case X86::PTILELOADD:
38223 case X86::PTILELOADDT1:
38224 case X86::PTILESTORED: {
38225 unsigned Opc;
38226 switch (MI.getOpcode()) {
38227 default: llvm_unreachable("illegal opcode!");
38228#define GET_EGPR_IF_ENABLED(OPC) (Subtarget.hasEGPR() ? OPC##_EVEX : OPC)
38229 case X86::PTILELOADD:
38230 Opc = GET_EGPR_IF_ENABLED(X86::TILELOADD);
38231 break;
38232 case X86::PTILELOADDT1:
38233 Opc = GET_EGPR_IF_ENABLED(X86::TILELOADDT1);
38234 break;
38235 case X86::PTILESTORED:
38236 Opc = GET_EGPR_IF_ENABLED(X86::TILESTORED);
38237 break;
38238 case X86::PTILELOADDRS:
38239 Opc = GET_EGPR_IF_ENABLED(X86::TILELOADDRS);
38240 break;
38241 case X86::PTILELOADDRST1:
38242 Opc = GET_EGPR_IF_ENABLED(X86::TILELOADDRST1);
38243 break;
38244 }
38245#undef GET_EGPR_IF_ENABLED
38246
38247 MachineInstrBuilder MIB = BuildMI(*BB, MI, MIMD, TII->get(Opc));
38248 unsigned CurOp = 0;
38249 if (Opc != X86::TILESTORED && Opc != X86::TILESTORED_EVEX)
38250 MIB.addReg(TMMImmToTMMReg(MI.getOperand(CurOp++).getImm()),
38251 RegState::Define);
38252
38253 MIB.add(MI.getOperand(CurOp++)); // base
38254 MIB.add(MI.getOperand(CurOp++)); // scale
38255 MIB.add(MI.getOperand(CurOp++)); // index -- stride
38256 MIB.add(MI.getOperand(CurOp++)); // displacement
38257 MIB.add(MI.getOperand(CurOp++)); // segment
38258
38259 if (Opc == X86::TILESTORED || Opc == X86::TILESTORED_EVEX)
38260 MIB.addReg(TMMImmToTMMReg(MI.getOperand(CurOp++).getImm()),
38261 RegState::Undef);
38262
38263 MI.eraseFromParent(); // The pseudo is gone now.
38264 return BB;
38265 }
38266 case X86::PT2RPNTLVWZ0:
38267 case X86::PT2RPNTLVWZ0T1:
38268 case X86::PT2RPNTLVWZ1:
38269 case X86::PT2RPNTLVWZ1T1:
38270 case X86::PT2RPNTLVWZ0RS:
38271 case X86::PT2RPNTLVWZ0RST1:
38272 case X86::PT2RPNTLVWZ1RS:
38273 case X86::PT2RPNTLVWZ1RST1: {
38274 const DebugLoc &DL = MI.getDebugLoc();
38275 unsigned Opc;
38276#define GET_EGPR_IF_ENABLED(OPC) (Subtarget.hasEGPR() ? OPC##_EVEX : OPC)
38277 switch (MI.getOpcode()) {
38278 default:
38279 llvm_unreachable("Unexpected instruction!");
38280 case X86::PT2RPNTLVWZ0:
38281 Opc = GET_EGPR_IF_ENABLED(X86::T2RPNTLVWZ0);
38282 break;
38283 case X86::PT2RPNTLVWZ0T1:
38284 Opc = GET_EGPR_IF_ENABLED(X86::T2RPNTLVWZ0T1);
38285 break;
38286 case X86::PT2RPNTLVWZ1:
38287 Opc = GET_EGPR_IF_ENABLED(X86::T2RPNTLVWZ1);
38288 break;
38289 case X86::PT2RPNTLVWZ1T1:
38290 Opc = GET_EGPR_IF_ENABLED(X86::T2RPNTLVWZ1T1);
38291 break;
38292 case X86::PT2RPNTLVWZ0RS:
38293 Opc = GET_EGPR_IF_ENABLED(X86::T2RPNTLVWZ0RS);
38294 break;
38295 case X86::PT2RPNTLVWZ0RST1:
38296 Opc = GET_EGPR_IF_ENABLED(X86::T2RPNTLVWZ0RST1);
38297 break;
38298 case X86::PT2RPNTLVWZ1RS:
38299 Opc = GET_EGPR_IF_ENABLED(X86::T2RPNTLVWZ1RS);
38300 break;
38301 case X86::PT2RPNTLVWZ1RST1:
38302 Opc = GET_EGPR_IF_ENABLED(X86::T2RPNTLVWZ1RST1);
38303 break;
38304 }
38305#undef GET_EGPR_IF_ENABLED
38306 MachineInstrBuilder MIB = BuildMI(*BB, MI, DL, TII->get(Opc));
38307 MIB.addReg(TMMImmToTMMPair(MI.getOperand(0).getImm()), RegState::Define);
38308
38309 MIB.add(MI.getOperand(1)); // base
38310 MIB.add(MI.getOperand(2)); // scale
38311 MIB.add(MI.getOperand(3)); // index
38312 MIB.add(MI.getOperand(4)); // displacement
38313 MIB.add(MI.getOperand(5)); // segment
38314 MI.eraseFromParent(); // The pseudo is gone now.
38315 return BB;
38316 }
38317 case X86::PTTRANSPOSED:
38318 case X86::PTCONJTFP16: {
38319 const DebugLoc &DL = MI.getDebugLoc();
38320 unsigned Opc = MI.getOpcode() == X86::PTTRANSPOSED ? X86::TTRANSPOSED
38321 : X86::TCONJTFP16;
38322
38323 MachineInstrBuilder MIB = BuildMI(*BB, MI, DL, TII->get(Opc));
38324 MIB.addReg(TMMImmToTMMReg(MI.getOperand(0).getImm()), RegState::Define);
38325 MIB.addReg(TMMImmToTMMReg(MI.getOperand(1).getImm()), RegState::Undef);
38326
38327 MI.eraseFromParent(); // The pseudo is gone now.
38328 return BB;
38329 }
38330 case X86::PTCVTROWPS2BF16Hrri:
38331 case X86::PTCVTROWPS2BF16Lrri:
38332 case X86::PTCVTROWPS2PHHrri:
38333 case X86::PTCVTROWPS2PHLrri:
38334 case X86::PTCVTROWD2PSrri:
38335 case X86::PTILEMOVROWrri: {
38336 const DebugLoc &DL = MI.getDebugLoc();
38337 unsigned Opc;
38338 switch (MI.getOpcode()) {
38339 default:
38340 llvm_unreachable("Unexpected instruction!");
38341 case X86::PTCVTROWD2PSrri:
38342 Opc = X86::TCVTROWD2PSrri;
38343 break;
38344 case X86::PTCVTROWPS2BF16Hrri:
38345 Opc = X86::TCVTROWPS2BF16Hrri;
38346 break;
38347 case X86::PTCVTROWPS2PHHrri:
38348 Opc = X86::TCVTROWPS2PHHrri;
38349 break;
38350 case X86::PTCVTROWPS2BF16Lrri:
38351 Opc = X86::TCVTROWPS2BF16Lrri;
38352 break;
38353 case X86::PTCVTROWPS2PHLrri:
38354 Opc = X86::TCVTROWPS2PHLrri;
38355 break;
38356 case X86::PTILEMOVROWrri:
38357 Opc = X86::TILEMOVROWrri;
38358 break;
38359 }
38360 MachineInstrBuilder MIB = BuildMI(*BB, MI, DL, TII->get(Opc));
38361 MIB.add(MI.getOperand(0));
38362 MIB.addReg(TMMImmToTMMReg(MI.getOperand(1).getImm()), RegState::Undef);
38363 MIB.addImm(MI.getOperand(2).getImm());
38364
38365 MI.eraseFromParent(); // The pseudo is gone now.
38366 return BB;
38367 }
38368 case X86::PTCVTROWPS2BF16Hrre:
38369 case X86::PTCVTROWPS2BF16Lrre:
38370 case X86::PTCVTROWPS2PHHrre:
38371 case X86::PTCVTROWPS2PHLrre:
38372 case X86::PTCVTROWD2PSrre:
38373 case X86::PTILEMOVROWrre: {
38374 const DebugLoc &DL = MI.getDebugLoc();
38375 unsigned Opc;
38376 switch (MI.getOpcode()) {
38377 default:
38378 llvm_unreachable("Unexpected instruction!");
38379 case X86::PTCVTROWD2PSrre:
38380 Opc = X86::TCVTROWD2PSrre;
38381 break;
38382 case X86::PTCVTROWPS2BF16Hrre:
38383 Opc = X86::TCVTROWPS2BF16Hrre;
38384 break;
38385 case X86::PTCVTROWPS2BF16Lrre:
38386 Opc = X86::TCVTROWPS2BF16Lrre;
38387 break;
38388 case X86::PTCVTROWPS2PHHrre:
38389 Opc = X86::TCVTROWPS2PHHrre;
38390 break;
38391 case X86::PTCVTROWPS2PHLrre:
38392 Opc = X86::TCVTROWPS2PHLrre;
38393 break;
38394 case X86::PTILEMOVROWrre:
38395 Opc = X86::TILEMOVROWrre;
38396 break;
38397 }
38398 MachineInstrBuilder MIB = BuildMI(*BB, MI, DL, TII->get(Opc));
38399 MIB.add(MI.getOperand(0));
38400 MIB.addReg(TMMImmToTMMReg(MI.getOperand(1).getImm()), RegState::Undef);
38401 MIB.add(MI.getOperand(2));
38402
38403 MI.eraseFromParent(); // The pseudo is gone now.
38404 return BB;
38405 }
38406 }
38407}
38408
38409//===----------------------------------------------------------------------===//
38410// X86 Optimization Hooks
38411//===----------------------------------------------------------------------===//
38412
38413bool
38414X86TargetLowering::targetShrinkDemandedConstant(SDValue Op,
38415 const APInt &DemandedBits,
38416 const APInt &DemandedElts,
38417 TargetLoweringOpt &TLO) const {
38418 EVT VT = Op.getValueType();
38419 unsigned Opcode = Op.getOpcode();
38420 unsigned EltSize = VT.getScalarSizeInBits();
38421
38422 if (VT.isVector()) {
38423 // If the constant is all sign bits within the active bits, then we should
38424 // extend it to the entire constant to allow it to act as a boolean constant
38425 // vector.
38426 auto NeedsSignExtension = [&](SDValue V, unsigned ActiveBits) {
38427 if (!ISD::isBuildVectorOfConstantSDNodes(V.getNode()))
38428 return false;
38429 for (unsigned i = 0, e = V.getNumOperands(); i != e; ++i) {
38430 if (!DemandedElts[i] || V.getOperand(i).isUndef())
38431 continue;
38432 const APInt &Val = V.getConstantOperandAPInt(i);
38433 if (Val.getBitWidth() > Val.getNumSignBits() &&
38434 Val.trunc(ActiveBits).getNumSignBits() == ActiveBits)
38435 return true;
38436 }
38437 return false;
38438 };
38439 // For vectors - if we have a constant, then try to sign extend.
38440 // TODO: Handle AND cases.
38441 unsigned ActiveBits = DemandedBits.getActiveBits();
38442 if (EltSize > ActiveBits && EltSize > 1 && isTypeLegal(VT) &&
38443 (Opcode == ISD::OR || Opcode == ISD::XOR || Opcode == X86ISD::ANDNP) &&
38444 NeedsSignExtension(Op.getOperand(1), ActiveBits)) {
38445 EVT ExtSVT = EVT::getIntegerVT(*TLO.DAG.getContext(), ActiveBits);
38446 EVT ExtVT = EVT::getVectorVT(*TLO.DAG.getContext(), ExtSVT,
38447 VT.getVectorNumElements());
38448 SDValue NewC =
38449 TLO.DAG.getNode(ISD::SIGN_EXTEND_INREG, SDLoc(Op), VT,
38450 Op.getOperand(1), TLO.DAG.getValueType(ExtVT));
38451 SDValue NewOp =
38452 TLO.DAG.getNode(Opcode, SDLoc(Op), VT, Op.getOperand(0), NewC);
38453 return TLO.CombineTo(Op, NewOp);
38454 }
38455 return false;
38456 }
38457
38458 // Only optimize Ands to prevent shrinking a constant that could be
38459 // matched by movzx.
38460 if (Opcode != ISD::AND)
38461 return false;
38462
38463 // Make sure the RHS really is a constant.
38464 ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op.getOperand(1));
38465 if (!C)
38466 return false;
38467
38468 const APInt &Mask = C->getAPIntValue();
38469
38470 // Clear all non-demanded bits initially.
38471 APInt ShrunkMask = Mask & DemandedBits;
38472
38473 // Find the width of the shrunk mask.
38474 unsigned Width = ShrunkMask.getActiveBits();
38475
38476 // If the mask is all 0s there's nothing to do here.
38477 if (Width == 0)
38478 return false;
38479
38480 // Find the next power of 2 width, rounding up to a byte.
38481 Width = llvm::bit_ceil(std::max(Width, 8U));
38482 // Truncate the width to size to handle illegal types.
38483 Width = std::min(Width, EltSize);
38484
38485 // Calculate a possible zero extend mask for this constant.
38486 APInt ZeroExtendMask = APInt::getLowBitsSet(EltSize, Width);
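// For example: with Mask = 0x1FF and DemandedBits = 0xFF, ShrunkMask is 0xFF,
// Width rounds up to 8, and ZeroExtendMask becomes 0xFF, so the AND can be
// matched as a byte zero-extension (movz).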
38487
38488 // If we aren't changing the mask, just return true to keep it and prevent
38489 // the caller from optimizing.
38490 if (ZeroExtendMask == Mask)
38491 return true;
38492
38493 // Make sure the new mask can be represented by a combination of mask bits
38494 // and non-demanded bits.
38495 if (!ZeroExtendMask.isSubsetOf(Mask | ~DemandedBits))
38496 return false;
38497
38498 // Replace the constant with the zero extend mask.
38499 SDLoc DL(Op);
38500 SDValue NewC = TLO.DAG.getConstant(ZeroExtendMask, DL, VT);
38501 SDValue NewOp = TLO.DAG.getNode(ISD::AND, DL, VT, Op.getOperand(0), NewC);
38502 return TLO.CombineTo(Op, NewOp);
38503}
38504
38505static void computeKnownBitsForPSADBW(SDValue LHS, SDValue RHS,
38506 KnownBits &Known,
38507 const APInt &DemandedElts,
38508 const SelectionDAG &DAG, unsigned Depth) {
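// PSADBW sums the absolute differences of eight unsigned byte pairs into the
// low 16 bits of each i64 element (at most 8 * 255 = 2040), so the sum is
// modeled as three nuw/nsw doublings of the abdu known bits followed by a
// zero extension to 64 bits.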
38509 KnownBits Known2;
38510 unsigned NumSrcElts = LHS.getValueType().getVectorNumElements();
38511 APInt DemandedSrcElts = APIntOps::ScaleBitMask(DemandedElts, NumSrcElts);
38512 Known = DAG.computeKnownBits(RHS, DemandedSrcElts, Depth + 1);
38513 Known2 = DAG.computeKnownBits(LHS, DemandedSrcElts, Depth + 1);
38514 Known = KnownBits::abdu(Known, Known2).zext(16);
38515 // Known = (((D0 + D1) + (D2 + D3)) + ((D4 + D5) + (D6 + D7)))
38516 Known = KnownBits::add(Known, Known, /*NSW=*/true, /*NUW=*/true);
38517 Known = KnownBits::add(Known, Known, /*NSW=*/true, /*NUW=*/true);
38518 Known = KnownBits::add(Known, Known, /*NSW=*/true, /*NUW=*/true);
38519 Known = Known.zext(64);
38520}
38521
38522static void computeKnownBitsForPMADDWD(SDValue LHS, SDValue RHS,
38523 KnownBits &Known,
38524 const APInt &DemandedElts,
38525 const SelectionDAG &DAG,
38526 unsigned Depth) {
38527 unsigned NumSrcElts = LHS.getValueType().getVectorNumElements();
38528
38529 // Multiply signed i16 elements to create i32 values and add Lo/Hi pairs.
38530 APInt DemandedSrcElts = APIntOps::ScaleBitMask(DemandedElts, NumSrcElts);
38531 APInt DemandedLoElts =
38532 DemandedSrcElts & APInt::getSplat(NumSrcElts, APInt(2, 0b01));
38533 APInt DemandedHiElts =
38534 DemandedSrcElts & APInt::getSplat(NumSrcElts, APInt(2, 0b10));
38535 KnownBits LHSLo = DAG.computeKnownBits(LHS, DemandedLoElts, Depth + 1);
38536 KnownBits LHSHi = DAG.computeKnownBits(LHS, DemandedHiElts, Depth + 1);
38537 KnownBits RHSLo = DAG.computeKnownBits(RHS, DemandedLoElts, Depth + 1);
38538 KnownBits RHSHi = DAG.computeKnownBits(RHS, DemandedHiElts, Depth + 1);
38539 KnownBits Lo = KnownBits::mul(LHSLo.sext(32), RHSLo.sext(32));
38540 KnownBits Hi = KnownBits::mul(LHSHi.sext(32), RHSHi.sext(32));
38541 Known = KnownBits::add(Lo, Hi, /*NSW=*/false, /*NUW=*/false);
38542}
38543
38544static void computeKnownBitsForPMADDUBSW(SDValue LHS, SDValue RHS,
38545 KnownBits &Known,
38546 const APInt &DemandedElts,
38547 const SelectionDAG &DAG,
38548 unsigned Depth) {
38549 unsigned NumSrcElts = LHS.getValueType().getVectorNumElements();
38550
38551 // Multiply unsigned/signed i8 elements to create i16 values and add_sat Lo/Hi
38552 // pairs.
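// VPMADDUBSW saturates the i16 sum of the two products, hence the final
// combine uses sadd_sat rather than a plain add.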
38553 APInt DemandedSrcElts = APIntOps::ScaleBitMask(DemandedElts, NumSrcElts);
38554 APInt DemandedLoElts =
38555 DemandedSrcElts & APInt::getSplat(NumSrcElts, APInt(2, 0b01));
38556 APInt DemandedHiElts =
38557 DemandedSrcElts & APInt::getSplat(NumSrcElts, APInt(2, 0b10));
38558 KnownBits LHSLo = DAG.computeKnownBits(LHS, DemandedLoElts, Depth + 1);
38559 KnownBits LHSHi = DAG.computeKnownBits(LHS, DemandedHiElts, Depth + 1);
38560 KnownBits RHSLo = DAG.computeKnownBits(RHS, DemandedLoElts, Depth + 1);
38561 KnownBits RHSHi = DAG.computeKnownBits(RHS, DemandedHiElts, Depth + 1);
38562 KnownBits Lo = KnownBits::mul(LHSLo.zext(16), RHSLo.sext(16));
38563 KnownBits Hi = KnownBits::mul(LHSHi.zext(16), RHSHi.sext(16));
38564 Known = KnownBits::sadd_sat(Lo, Hi);
38565}
38566
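// Helper for horizontal ops (HADD/HSUB and friends): split the demanded
// elements between the two source operands and combine the known bits of each
// pair of adjacent source elements with the supplied KnownBitsFunc.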
38567static KnownBits computeKnownBitsForHorizontalOperation(
38568 const SDValue Op, const APInt &DemandedElts, unsigned Depth,
38569 const SelectionDAG &DAG,
38570 const function_ref<KnownBits(const KnownBits &, const KnownBits &)>
38571 KnownBitsFunc) {
38572 APInt DemandedEltsLHS, DemandedEltsRHS;
38573 getHorizDemandedEltsForFirstOperand(Op.getValueType().getSizeInBits(),
38574 DemandedElts, DemandedEltsLHS,
38575 DemandedEltsRHS);
38576
38577 const auto ComputeForSingleOpFunc =
38578 [&DAG, Depth, KnownBitsFunc](SDValue Op, APInt &DemandedEltsOp) {
38579 return KnownBitsFunc(
38580 DAG.computeKnownBits(Op, DemandedEltsOp, Depth + 1),
38581 DAG.computeKnownBits(Op, DemandedEltsOp << 1, Depth + 1));
38582 };
38583
38584 if (DemandedEltsRHS.isZero())
38585 return ComputeForSingleOpFunc(Op.getOperand(0), DemandedEltsLHS);
38586 if (DemandedEltsLHS.isZero())
38587 return ComputeForSingleOpFunc(Op.getOperand(1), DemandedEltsRHS);
38588
38589 return ComputeForSingleOpFunc(Op.getOperand(0), DemandedEltsLHS)
38590 .intersectWith(ComputeForSingleOpFunc(Op.getOperand(1), DemandedEltsRHS));
38591}
38592
38593void X86TargetLowering::computeKnownBitsForTargetNode(const SDValue Op,
38594 KnownBits &Known,
38595 const APInt &DemandedElts,
38596 const SelectionDAG &DAG,
38597 unsigned Depth) const {
38598 unsigned BitWidth = Known.getBitWidth();
38599 unsigned NumElts = DemandedElts.getBitWidth();
38600 unsigned Opc = Op.getOpcode();
38601 EVT VT = Op.getValueType();
38606 "Should use MaskedValueIsZero if you don't know whether Op"
38607 " is a target node!");
38608
38609 Known.resetAll();
38610 switch (Opc) {
38611 default: break;
38612 case X86ISD::MUL_IMM: {
38613 KnownBits Known2;
38614 Known = DAG.computeKnownBits(Op.getOperand(1), DemandedElts, Depth + 1);
38615 Known2 = DAG.computeKnownBits(Op.getOperand(0), DemandedElts, Depth + 1);
38616 Known = KnownBits::mul(Known, Known2);
38617 break;
38618 }
38619 case X86ISD::BSF: {
38621
38622 KnownBits Known2;
38623 Known2 = DAG.computeKnownBits(Op.getOperand(1), DemandedElts, Depth + 1);
38624 if (Known2.isNonZero()) {
38625 // If we have a known 1, its position is our upper bound.
38626 unsigned PossibleTZ = Known2.countMaxTrailingZeros();
38627 unsigned LowBits = llvm::bit_width(PossibleTZ);
38628 Known.Zero.setBitsFrom(LowBits);
38629 } else if (!Op.getOperand(0).isUndef()) {
38630 Known2 = DAG.computeKnownBits(Op.getOperand(0), DemandedElts, Depth + 1);
38631 Known = Known.intersectWith(Known2);
38632 }
38633 break;
38634 }
38635 case X86ISD::BSR: {
38636 // TODO: Bound with input known bits?
38638
38639 if (!Op.getOperand(0).isUndef() &&
38640 !DAG.isKnownNeverZero(Op.getOperand(1), Depth + 1)) {
38641 KnownBits Known2;
38642 Known2 = DAG.computeKnownBits(Op.getOperand(0), DemandedElts, Depth + 1);
38643 Known = Known.intersectWith(Known2);
38644 }
38645 break;
38646 }
38647 case X86ISD::SETCC:
38648 Known.Zero.setBitsFrom(1);
38649 break;
38650 case X86ISD::MOVMSK: {
38651 unsigned NumLoBits = Op.getOperand(0).getValueType().getVectorNumElements();
38652 Known.Zero.setBitsFrom(NumLoBits);
38653 break;
38654 }
38655 case X86ISD::PEXTRB:
38656 case X86ISD::PEXTRW: {
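// Extract a single element: only the selected source element matters, and
// any bits above the source scalar width are known zero.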
38657 SDValue Src = Op.getOperand(0);
38658 EVT SrcVT = Src.getValueType();
38659 APInt DemandedElt = APInt::getOneBitSet(SrcVT.getVectorNumElements(),
38660 Op.getConstantOperandVal(1));
38661 Known = DAG.computeKnownBits(Src, DemandedElt, Depth + 1);
38662 Known = Known.anyextOrTrunc(BitWidth);
38663 Known.Zero.setBitsFrom(SrcVT.getScalarSizeInBits());
38664 break;
38665 }
38666 case X86ISD::VSRAI:
38667 case X86ISD::VSHLI:
38668 case X86ISD::VSRLI: {
38669 unsigned ShAmt = Op.getConstantOperandVal(1);
38670 if (ShAmt >= VT.getScalarSizeInBits()) {
38671 // Out of range logical bit shifts are guaranteed to be zero.
38672 // Out of range arithmetic bit shifts splat the sign bit.
38673 if (Opc != X86ISD::VSRAI) {
38674 Known.setAllZero();
38675 break;
38676 }
38677
38678 ShAmt = VT.getScalarSizeInBits() - 1;
38679 }
38680
38681 Known = DAG.computeKnownBits(Op.getOperand(0), DemandedElts, Depth + 1);
38682 if (Opc == X86ISD::VSHLI) {
38683 Known <<= ShAmt;
38684 // Low bits are known zero.
38685 Known.Zero.setLowBits(ShAmt);
38686 } else if (Opc == X86ISD::VSRLI) {
38687 Known >>= ShAmt;
38688 // High bits are known zero.
38689 Known.Zero.setHighBits(ShAmt);
38690 } else {
38691 Known.Zero.ashrInPlace(ShAmt);
38692 Known.One.ashrInPlace(ShAmt);
38693 }
38694 break;
38695 }
38696 case X86ISD::PACKUS: {
38697 // PACKUS is just a truncation if the upper half is zero.
38698 APInt DemandedLHS, DemandedRHS;
38699 getPackDemandedElts(VT, DemandedElts, DemandedLHS, DemandedRHS);
38700
38701 Known.One = APInt::getAllOnes(BitWidth * 2);
38702 Known.Zero = APInt::getAllOnes(BitWidth * 2);
38703
38704 KnownBits Known2;
38705 if (!!DemandedLHS) {
38706 Known2 = DAG.computeKnownBits(Op.getOperand(0), DemandedLHS, Depth + 1);
38707 Known = Known.intersectWith(Known2);
38708 }
38709 if (!!DemandedRHS) {
38710 Known2 = DAG.computeKnownBits(Op.getOperand(1), DemandedRHS, Depth + 1);
38711 Known = Known.intersectWith(Known2);
38712 }
38713
38714 if (Known.countMinLeadingZeros() < BitWidth)
38715 Known.resetAll();
38716 Known = Known.trunc(BitWidth);
38717 break;
38718 }
38719 case X86ISD::PSHUFB: {
38720 SDValue Src = Op.getOperand(0);
38721 SDValue Idx = Op.getOperand(1);
38722
38723 // If the index vector is never negative (MSB is zero), then all elements
38724 // come from the source vector. This is useful for cases where
38725 // PSHUFB is being used as a LUT (ctpop etc.) - the target shuffle handling
38726 // below will handle the more common constant shuffle mask case.
38727 KnownBits KnownIdx = DAG.computeKnownBits(Idx, DemandedElts, Depth + 1);
38728 if (KnownIdx.isNonNegative())
38729 Known = DAG.computeKnownBits(Src, Depth + 1);
38730 break;
38731 }
38732 case X86ISD::VBROADCAST: {
38733 SDValue Src = Op.getOperand(0);
38734 if (!Src.getSimpleValueType().isVector()) {
38735 Known = DAG.computeKnownBits(Src, Depth + 1);
38736 return;
38737 }
38738 break;
38739 }
38740 case X86ISD::AND: {
38741 if (Op.getResNo() == 0) {
38742 KnownBits Known2;
38743 Known = DAG.computeKnownBits(Op.getOperand(1), DemandedElts, Depth + 1);
38744 Known2 = DAG.computeKnownBits(Op.getOperand(0), DemandedElts, Depth + 1);
38745 Known &= Known2;
38746 }
38747 break;
38748 }
38749 case X86ISD::ANDNP: {
38750 KnownBits Known2;
38751 Known = DAG.computeKnownBits(Op.getOperand(1), DemandedElts, Depth + 1);
38752 Known2 = DAG.computeKnownBits(Op.getOperand(0), DemandedElts, Depth + 1);
38753
38754 // ANDNP = (~X & Y);
38755 Known.One &= Known2.Zero;
38756 Known.Zero |= Known2.One;
38757 break;
38758 }
38759 case X86ISD::FOR: {
38760 KnownBits Known2;
38761 Known = DAG.computeKnownBits(Op.getOperand(1), DemandedElts, Depth + 1);
38762 Known2 = DAG.computeKnownBits(Op.getOperand(0), DemandedElts, Depth + 1);
38763
38764 Known |= Known2;
38765 break;
38766 }
38767 case X86ISD::PSADBW: {
38768 SDValue LHS = Op.getOperand(0);
38769 SDValue RHS = Op.getOperand(1);
38770 assert(VT.getScalarType() == MVT::i64 &&
38771 LHS.getValueType() == RHS.getValueType() &&
38772 LHS.getValueType().getScalarType() == MVT::i8 &&
38773 "Unexpected PSADBW types");
38774 computeKnownBitsForPSADBW(LHS, RHS, Known, DemandedElts, DAG, Depth);
38775 break;
38776 }
38777 case X86ISD::PCMPGT:
38778 case X86ISD::PCMPEQ: {
38779 KnownBits KnownLhs =
38780 DAG.computeKnownBits(Op.getOperand(0), DemandedElts, Depth + 1);
38781 KnownBits KnownRhs =
38782 DAG.computeKnownBits(Op.getOperand(1), DemandedElts, Depth + 1);
38783 std::optional<bool> Res = Opc == X86ISD::PCMPEQ
38784 ? KnownBits::eq(KnownLhs, KnownRhs)
38785 : KnownBits::sgt(KnownLhs, KnownRhs);
38786 if (Res) {
38787 if (*Res)
38788 Known.setAllOnes();
38789 else
38790 Known.setAllZero();
38791 }
38792 break;
38793 }
38794 case X86ISD::VPMADDWD: {
38795 SDValue LHS = Op.getOperand(0);
38796 SDValue RHS = Op.getOperand(1);
38797 assert(VT.getVectorElementType() == MVT::i32 &&
38798 LHS.getValueType() == RHS.getValueType() &&
38799 LHS.getValueType().getVectorElementType() == MVT::i16 &&
38800 "Unexpected PMADDWD types");
38801 computeKnownBitsForPMADDWD(LHS, RHS, Known, DemandedElts, DAG, Depth);
38802 break;
38803 }
38804 case X86ISD::VPMADDUBSW: {
38805 SDValue LHS = Op.getOperand(0);
38806 SDValue RHS = Op.getOperand(1);
38807 assert(VT.getVectorElementType() == MVT::i16 &&
38808 LHS.getValueType() == RHS.getValueType() &&
38809 LHS.getValueType().getVectorElementType() == MVT::i8 &&
38810 "Unexpected PMADDUBSW types");
38811 computeKnownBitsForPMADDUBSW(LHS, RHS, Known, DemandedElts, DAG, Depth);
38812 break;
38813 }
38814 case X86ISD::PMULUDQ: {
38815 KnownBits Known2;
38816 Known = DAG.computeKnownBits(Op.getOperand(1), DemandedElts, Depth + 1);
38817 Known2 = DAG.computeKnownBits(Op.getOperand(0), DemandedElts, Depth + 1);
38818
38819 Known = Known.trunc(BitWidth / 2).zext(BitWidth);
38820 Known2 = Known2.trunc(BitWidth / 2).zext(BitWidth);
38821 Known = KnownBits::mul(Known, Known2);
38822 break;
38823 }
38824 case X86ISD::CMOV: {
38825 Known = DAG.computeKnownBits(Op.getOperand(1), Depth + 1);
38826 // If we don't know any bits, early out.
38827 if (Known.isUnknown())
38828 break;
38829 KnownBits Known2 = DAG.computeKnownBits(Op.getOperand(0), Depth + 1);
38830
38831 // Only known if known in both the LHS and RHS.
38832 Known = Known.intersectWith(Known2);
38833 break;
38834 }
38835 case X86ISD::BEXTR:
38836 case X86ISD::BEXTRI: {
38837 SDValue Op0 = Op.getOperand(0);
38838 SDValue Op1 = Op.getOperand(1);
38839
38840 if (auto* Cst1 = dyn_cast<ConstantSDNode>(Op1)) {
38841 unsigned Shift = Cst1->getAPIntValue().extractBitsAsZExtValue(8, 0);
38842 unsigned Length = Cst1->getAPIntValue().extractBitsAsZExtValue(8, 8);
38843
38844 // If the length is 0, the result is 0.
38845 if (Length == 0) {
38846 Known.setAllZero();
38847 break;
38848 }
38849
38850 if ((Shift + Length) <= BitWidth) {
38851 Known = DAG.computeKnownBits(Op0, Depth + 1);
38852 Known = Known.extractBits(Length, Shift);
38853 Known = Known.zextOrTrunc(BitWidth);
38854 }
38855 }
38856 break;
38857 }
38858 case X86ISD::PDEP: {
38859 KnownBits Known2;
38860 Known = DAG.computeKnownBits(Op.getOperand(1), DemandedElts, Depth + 1);
38861 Known2 = DAG.computeKnownBits(Op.getOperand(0), DemandedElts, Depth + 1);
38862 // Zeros are retained from the mask operand. But not ones.
38863 Known.One.clearAllBits();
38864 // The result will have at least as many trailing zeros as the non-mask
38865 // operand since bits can only map to the same or higher bit position.
38866 Known.Zero.setLowBits(Known2.countMinTrailingZeros());
38867 break;
38868 }
38869 case X86ISD::PEXT: {
38870 Known = DAG.computeKnownBits(Op.getOperand(1), DemandedElts, Depth + 1);
38871 // The result has as many leading zeros as the number of zeroes in the mask.
38872 unsigned Count = Known.Zero.popcount();
38873 Known.Zero = APInt::getHighBitsSet(BitWidth, Count);
38874 Known.One.clearAllBits();
38875 break;
38876 }
38877 case X86ISD::VTRUNC:
38878 case X86ISD::VTRUNCS:
38879 case X86ISD::VTRUNCUS:
38880 case X86ISD::CVTSI2P:
38881 case X86ISD::CVTUI2P:
38882 case X86ISD::CVTP2SI:
38883 case X86ISD::CVTP2UI:
38884 case X86ISD::MCVTP2SI:
38885 case X86ISD::MCVTP2UI:
38886 case X86ISD::CVTTP2SI:
38887 case X86ISD::CVTTP2UI:
38888 case X86ISD::MCVTTP2SI:
38889 case X86ISD::MCVTTP2UI:
38890 case X86ISD::MCVTSI2P:
38891 case X86ISD::MCVTUI2P:
38892 case X86ISD::VFPROUND:
38893 case X86ISD::VMFPROUND:
38894 case X86ISD::CVTPS2PH:
38895 case X86ISD::MCVTPS2PH:
38896 case X86ISD::MCVTTP2SIS:
38897 case X86ISD::MCVTTP2UIS: {
38898 // Truncations/Conversions - upper elements are known zero.
38899 EVT SrcVT = Op.getOperand(0).getValueType();
38900 if (SrcVT.isVector()) {
38901 unsigned NumSrcElts = SrcVT.getVectorNumElements();
38902 if (NumElts > NumSrcElts && DemandedElts.countr_zero() >= NumSrcElts)
38903 Known.setAllZero();
38904 }
38905 break;
38906 }
38907 case X86ISD::STRICT_CVTTP2SI:
38908 case X86ISD::STRICT_CVTTP2UI:
38909 case X86ISD::STRICT_CVTSI2P:
38910 case X86ISD::STRICT_CVTUI2P:
38911 case X86ISD::STRICT_VFPROUND:
38912 case X86ISD::STRICT_CVTPS2PH: {
38913 // Strict Conversions - upper elements are known zero.
38914 EVT SrcVT = Op.getOperand(1).getValueType();
38915 if (SrcVT.isVector()) {
38916 unsigned NumSrcElts = SrcVT.getVectorNumElements();
38917 if (NumElts > NumSrcElts && DemandedElts.countr_zero() >= NumSrcElts)
38918 Known.setAllZero();
38919 }
38920 break;
38921 }
38922 case X86ISD::MOVQ2DQ: {
38923 // Move from MMX to XMM. Upper half of XMM should be 0.
38924 if (DemandedElts.countr_zero() >= (NumElts / 2))
38925 Known.setAllZero();
38926 break;
38927 }
38929 APInt UndefElts;
38930 SmallVector<APInt, 16> EltBits;
38931 if (getTargetConstantBitsFromNode(Op, BitWidth, UndefElts, EltBits,
38932 /*AllowWholeUndefs*/ false,
38933 /*AllowPartialUndefs*/ false)) {
38934 Known.Zero.setAllBits();
38935 Known.One.setAllBits();
38936 for (unsigned I = 0; I != NumElts; ++I) {
38937 if (!DemandedElts[I])
38938 continue;
38939 if (UndefElts[I]) {
38940 Known.resetAll();
38941 break;
38942 }
38943 KnownBits Known2 = KnownBits::makeConstant(EltBits[I]);
38944 Known = Known.intersectWith(Known2);
38945 }
38946 return;
38947 }
38948 break;
38949 }
38950 case X86ISD::HADD:
38951 case X86ISD::HSUB: {
38952 Known = computeKnownBitsForHorizontalOperation(
38953 Op, DemandedElts, Depth, DAG,
38954 [Opc](const KnownBits &KnownLHS, const KnownBits &KnownRHS) {
38955 return KnownBits::computeForAddSub(
38956 /*Add=*/Opc == X86ISD::HADD, /*NSW=*/false, /*NUW=*/false,
38957 KnownLHS, KnownRHS);
38958 });
38959 break;
38960 }
38961 case ISD::INTRINSIC_WO_CHAIN: {
38962 switch (Op->getConstantOperandVal(0)) {
38963 case Intrinsic::x86_sse2_pmadd_wd:
38964 case Intrinsic::x86_avx2_pmadd_wd:
38965 case Intrinsic::x86_avx512_pmaddw_d_512: {
38966 SDValue LHS = Op.getOperand(1);
38967 SDValue RHS = Op.getOperand(2);
38968 assert(VT.getScalarType() == MVT::i32 &&
38969 LHS.getValueType() == RHS.getValueType() &&
38970 LHS.getValueType().getScalarType() == MVT::i16 &&
38971 "Unexpected PMADDWD types");
38972 computeKnownBitsForPMADDWD(LHS, RHS, Known, DemandedElts, DAG, Depth);
38973 break;
38974 }
38975 case Intrinsic::x86_ssse3_pmadd_ub_sw_128:
38976 case Intrinsic::x86_avx2_pmadd_ub_sw:
38977 case Intrinsic::x86_avx512_pmaddubs_w_512: {
38978 SDValue LHS = Op.getOperand(1);
38979 SDValue RHS = Op.getOperand(2);
38980 assert(VT.getScalarType() == MVT::i16 &&
38981 LHS.getValueType() == RHS.getValueType() &&
38982 LHS.getValueType().getScalarType() == MVT::i8 &&
38983 "Unexpected PMADDUBSW types");
38984 computeKnownBitsForPMADDUBSW(LHS, RHS, Known, DemandedElts, DAG, Depth);
38985 break;
38986 }
38987 case Intrinsic::x86_sse2_psad_bw:
38988 case Intrinsic::x86_avx2_psad_bw:
38989 case Intrinsic::x86_avx512_psad_bw_512: {
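// PSADBW sums the absolute differences of eight byte pairs into each i64
// element, so each result is at most 8 * 255 = 2040 (only the low 11 bits).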
38990 SDValue LHS = Op.getOperand(1);
38991 SDValue RHS = Op.getOperand(2);
38992 assert(VT.getScalarType() == MVT::i64 &&
38993 LHS.getValueType() == RHS.getValueType() &&
38994 LHS.getValueType().getScalarType() == MVT::i8 &&
38995 "Unexpected PSADBW types");
38996 computeKnownBitsForPSADBW(LHS, RHS, Known, DemandedElts, DAG, Depth);
38997 break;
38998 }
38999 }
39000 break;
39001 }
39002 }
39003
39004 // Handle target shuffles.
39005 // TODO - use resolveTargetShuffleInputs once we can limit recursive depth.
39006 if (isTargetShuffle(Opc)) {
39007 SmallVector<int, 64> Mask;
39008 SmallVector<SDValue, 2> Ops;
39009 if (getTargetShuffleMask(Op, true, Ops, Mask)) {
39010 unsigned NumOps = Ops.size();
39011 unsigned NumElts = VT.getVectorNumElements();
39012 if (Mask.size() == NumElts) {
39013 SmallVector<APInt, 2> DemandedOps(NumOps, APInt(NumElts, 0));
39014 Known.Zero.setAllBits(); Known.One.setAllBits();
39015 for (unsigned i = 0; i != NumElts; ++i) {
39016 if (!DemandedElts[i])
39017 continue;
39018 int M = Mask[i];
39019 if (M == SM_SentinelUndef) {
39020 // For UNDEF elements, we don't know anything about the common state
39021 // of the shuffle result.
39022 Known.resetAll();
39023 break;
39024 }
39025 if (M == SM_SentinelZero) {
39026 Known.One.clearAllBits();
39027 continue;
39028 }
39029 assert(0 <= M && (unsigned)M < (NumOps * NumElts) &&
39030 "Shuffle index out of range");
39031
39032 unsigned OpIdx = (unsigned)M / NumElts;
39033 unsigned EltIdx = (unsigned)M % NumElts;
39034 if (Ops[OpIdx].getValueType() != VT) {
39035 // TODO - handle target shuffle ops with different value types.
39036 Known.resetAll();
39037 break;
39038 }
39039 DemandedOps[OpIdx].setBit(EltIdx);
39040 }
39041 // Known bits are the values that are shared by every demanded element.
39042 for (unsigned i = 0; i != NumOps && !Known.isUnknown(); ++i) {
39043 if (!DemandedOps[i])
39044 continue;
39045 KnownBits Known2 =
39046 DAG.computeKnownBits(Ops[i], DemandedOps[i], Depth + 1);
39047 Known = Known.intersectWith(Known2);
39048 }
39049 }
39050 }
39051 }
39052}
39053
39054 unsigned X86TargetLowering::ComputeNumSignBitsForTargetNode(
39055 SDValue Op, const APInt &DemandedElts, const SelectionDAG &DAG,
39056 unsigned Depth) const {
39057 EVT VT = Op.getValueType();
39058 unsigned VTBits = VT.getScalarSizeInBits();
39059 unsigned Opcode = Op.getOpcode();
39060 switch (Opcode) {
39061 case X86ISD::SETCC_CARRY:
39062 // SETCC_CARRY sets the dest to ~0 for true or 0 for false.
39063 return VTBits;
39064
39065 case X86ISD::VTRUNC: {
39066 SDValue Src = Op.getOperand(0);
39067 MVT SrcVT = Src.getSimpleValueType();
39068 unsigned NumSrcBits = SrcVT.getScalarSizeInBits();
39069 assert(VTBits < NumSrcBits && "Illegal truncation input type");
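// E.g. when truncating i32 elements to i16, a source with 20 sign bits keeps
// 20 - (32 - 16) = 4 sign bits in the result.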
39070 APInt DemandedSrc = DemandedElts.zextOrTrunc(SrcVT.getVectorNumElements());
39071 unsigned Tmp = DAG.ComputeNumSignBits(Src, DemandedSrc, Depth + 1);
39072 if (Tmp > (NumSrcBits - VTBits))
39073 return Tmp - (NumSrcBits - VTBits);
39074 return 1;
39075 }
39076
39077 case X86ISD::PACKSS: {
39078 // PACKSS is just a truncation if the sign bits extend to the packed size.
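// E.g. PACKSSDW only saturates a 32-bit value that doesn't fit in i16, so with
// at least 17 known sign bits per source element it acts as a plain truncation.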
39079 APInt DemandedLHS, DemandedRHS;
39080 getPackDemandedElts(Op.getValueType(), DemandedElts, DemandedLHS,
39081 DemandedRHS);
39082
39083 // Helper to detect PACKSSDW(BITCAST(PACKSSDW(X)),BITCAST(PACKSSDW(Y)))
39084 // patterns often used to compact vXi64 all-sign-bits patterns.
39085 auto NumSignBitsPACKSS = [&](SDValue V, const APInt &Elts) -> unsigned {
39086 SDValue BC = peekThroughBitcasts(V);
39087 if (BC.getOpcode() == X86ISD::PACKSS &&
39088 BC.getScalarValueSizeInBits() == 16 &&
39089 V.getScalarValueSizeInBits() == 32) {
39090 SDValue BC0 = peekThroughBitcasts(BC.getOperand(0));
39091 SDValue BC1 = peekThroughBitcasts(BC.getOperand(1));
39092 if (BC0.getScalarValueSizeInBits() == 64 &&
39093 BC1.getScalarValueSizeInBits() == 64 &&
39094 DAG.ComputeNumSignBits(BC0, Depth + 1) == 64 &&
39095 DAG.ComputeNumSignBits(BC1, Depth + 1) == 64)
39096 return 32;
39097 }
39098 return DAG.ComputeNumSignBits(V, Elts, Depth + 1);
39099 };
39100
39101 unsigned SrcBits = Op.getOperand(0).getScalarValueSizeInBits();
39102 unsigned Tmp0 = SrcBits, Tmp1 = SrcBits;
39103 if (!!DemandedLHS)
39104 Tmp0 = NumSignBitsPACKSS(Op.getOperand(0), DemandedLHS);
39105 if (!!DemandedRHS)
39106 Tmp1 = NumSignBitsPACKSS(Op.getOperand(1), DemandedRHS);
39107 unsigned Tmp = std::min(Tmp0, Tmp1);
39108 if (Tmp > (SrcBits - VTBits))
39109 return Tmp - (SrcBits - VTBits);
39110 return 1;
39111 }
39112
39113 case X86ISD::VBROADCAST: {
39114 SDValue Src = Op.getOperand(0);
39115 if (!Src.getSimpleValueType().isVector())
39116 return DAG.ComputeNumSignBits(Src, Depth + 1);
39117 break;
39118 }
39119
39120 case X86ISD::VSHLI: {
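// A constant left shift discards one known sign bit per position shifted.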
39121 SDValue Src = Op.getOperand(0);
39122 const APInt &ShiftVal = Op.getConstantOperandAPInt(1);
39123 if (ShiftVal.uge(VTBits))
39124 return VTBits; // Shifted all bits out --> zero.
39125 unsigned Tmp = DAG.ComputeNumSignBits(Src, DemandedElts, Depth + 1);
39126 if (ShiftVal.uge(Tmp))
39127 return 1; // Shifted all sign bits out --> unknown.
39128 return Tmp - ShiftVal.getZExtValue();
39129 }
39130
39131 case X86ISD::VSRAI: {
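// An arithmetic right shift copies the sign bit into the vacated positions,
// gaining ShiftVal known sign bits, capped at the element width.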
39132 SDValue Src = Op.getOperand(0);
39133 APInt ShiftVal = Op.getConstantOperandAPInt(1);
39134 if (ShiftVal.uge(VTBits - 1))
39135 return VTBits; // Sign splat.
39136 unsigned Tmp = DAG.ComputeNumSignBits(Src, DemandedElts, Depth + 1);
39137 ShiftVal += Tmp;
39138 return ShiftVal.uge(VTBits) ? VTBits : ShiftVal.getZExtValue();
39139 }
39140
39141 case X86ISD::FSETCC:
39142 // cmpss/cmpsd return zero/all-bits result values in the bottom element.
39143 if (VT == MVT::f32 || VT == MVT::f64 ||
39144 ((VT == MVT::v4f32 || VT == MVT::v2f64) && DemandedElts == 1))
39145 return VTBits;
39146 break;
39147
39148 case X86ISD::PCMPGT:
39149 case X86ISD::PCMPEQ:
39150 case X86ISD::CMPP:
39151 case X86ISD::VPCOM:
39152 case X86ISD::VPCOMU:
39153 // Vector compares return zero/all-bits result values.
39154 return VTBits;
39155
39156 case X86ISD::ANDNP: {
39157 unsigned Tmp0 =
39158 DAG.ComputeNumSignBits(Op.getOperand(0), DemandedElts, Depth + 1);
39159 if (Tmp0 == 1) return 1; // Early out.
39160 unsigned Tmp1 =
39161 DAG.ComputeNumSignBits(Op.getOperand(1), DemandedElts, Depth + 1);
39162 return std::min(Tmp0, Tmp1);
39163 }
39164
39165 case X86ISD::CMOV: {
39166 unsigned Tmp0 = DAG.ComputeNumSignBits(Op.getOperand(0), Depth+1);
39167 if (Tmp0 == 1) return 1; // Early out.
39168 unsigned Tmp1 = DAG.ComputeNumSignBits(Op.getOperand(1), Depth+1);
39169 return std::min(Tmp0, Tmp1);
39170 }
39171 }
39172
39173 // Handle target shuffles.
39174 // TODO - use resolveTargetShuffleInputs once we can limit recursive depth.
39175 if (isTargetShuffle(Opcode)) {
39176 SmallVector<int, 64> Mask;
39177 SmallVector<SDValue, 2> Ops;
39178 if (getTargetShuffleMask(Op, true, Ops, Mask)) {
39179 unsigned NumOps = Ops.size();
39180 unsigned NumElts = VT.getVectorNumElements();
39181 if (Mask.size() == NumElts) {
39182 SmallVector<APInt, 2> DemandedOps(NumOps, APInt(NumElts, 0));
39183 for (unsigned i = 0; i != NumElts; ++i) {
39184 if (!DemandedElts[i])
39185 continue;
39186 int M = Mask[i];
39187 if (M == SM_SentinelUndef) {
39188 // For UNDEF elements, we don't know anything about the common state
39189 // of the shuffle result.
39190 return 1;
39191 } else if (M == SM_SentinelZero) {
39192 // Zero = all sign bits.
39193 continue;
39194 }
39195 assert(0 <= M && (unsigned)M < (NumOps * NumElts) &&
39196 "Shuffle index out of range");
39197
39198 unsigned OpIdx = (unsigned)M / NumElts;
39199 unsigned EltIdx = (unsigned)M % NumElts;
39200 if (Ops[OpIdx].getValueType() != VT) {
39201 // TODO - handle target shuffle ops with different value types.
39202 return 1;
39203 }
39204 DemandedOps[OpIdx].setBit(EltIdx);
39205 }
39206 unsigned Tmp0 = VTBits;
39207 for (unsigned i = 0; i != NumOps && Tmp0 > 1; ++i) {
39208 if (!DemandedOps[i])
39209 continue;
39210 unsigned Tmp1 =
39211 DAG.ComputeNumSignBits(Ops[i], DemandedOps[i], Depth + 1);
39212 Tmp0 = std::min(Tmp0, Tmp1);
39213 }
39214 return Tmp0;
39215 }
39216 }
39217 }
39218
39219 // Fallback case.
39220 return 1;
39221}
39222
39223 SDValue X86TargetLowering::unwrapAddress(SDValue N) const {
39224 if (N->getOpcode() == X86ISD::Wrapper || N->getOpcode() == X86ISD::WrapperRIP)
39225 return N->getOperand(0);
39226 return N;
39227}
39228
39229// Helper to look for a normal load that can be narrowed into a vzload with the
39230// specified VT and memory VT. Returns SDValue() on failure.
39231 static SDValue narrowLoadToVZLoad(LoadSDNode *LN, MVT MemVT, MVT VT,
39232 SelectionDAG &DAG) {
39233 // Can't if the load is volatile or atomic.
39234 if (!LN->isSimple())
39235 return SDValue();
39236
39237 SDVTList Tys = DAG.getVTList(VT, MVT::Other);
39238 SDValue Ops[] = {LN->getChain(), LN->getBasePtr()};
39239 return DAG.getMemIntrinsicNode(X86ISD::VZEXT_LOAD, SDLoc(LN), Tys, Ops, MemVT,
39240 LN->getPointerInfo(), LN->getBaseAlign(),
39241 LN->getMemOperand()->getFlags());
39242}
39243
39244// Attempt to match a combined shuffle mask against supported unary shuffle
39245// instructions.
39246// TODO: Investigate sharing more of this with shuffle lowering.
39247static bool matchUnaryShuffle(MVT MaskVT, ArrayRef<int> Mask,
39248 bool AllowFloatDomain, bool AllowIntDomain,
39249 SDValue V1, const SelectionDAG &DAG,
39250 const X86Subtarget &Subtarget, unsigned &Shuffle,
39251 MVT &SrcVT, MVT &DstVT) {
39252 unsigned NumMaskElts = Mask.size();
39253 unsigned MaskEltSize = MaskVT.getScalarSizeInBits();
39254
39255 // Match against a VZEXT_MOVL vXi32 and vXi16 zero-extending instruction.
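// VZEXT_MOVL keeps element 0 of its source and zeroes every other element
// (MOVQ/MOVSS style), so the mask must read element 0 and be undef/zero elsewhere.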
39256 if (Mask[0] == 0 &&
39257 (MaskEltSize == 32 || (MaskEltSize == 16 && Subtarget.hasFP16()))) {
39258 if ((isUndefOrZero(Mask[1]) && isUndefInRange(Mask, 2, NumMaskElts - 2)) ||
39259 (V1.getOpcode() == ISD::SCALAR_TO_VECTOR &&
39260 isUndefOrZeroInRange(Mask, 1, NumMaskElts - 1))) {
39261 Shuffle = X86ISD::VZEXT_MOVL;
39262 if (MaskEltSize == 16)
39263 SrcVT = DstVT = MaskVT.changeVectorElementType(MVT::f16);
39264 else
39265 SrcVT = DstVT = !Subtarget.hasSSE2() ? MVT::v4f32 : MaskVT;
39266 return true;
39267 }
39268 }
39269
39270 // Match against an ANY/SIGN/ZERO_EXTEND_VECTOR_INREG instruction.
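// E.g. with Scale == 2, a mask of the form {0, Z, 1, Z, 2, Z, ...} matches a
// zero-extension of the low source elements to elements twice as wide; undefs
// in the odd positions would instead allow an any-extend.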
39271 if (AllowIntDomain &&
39272 ((MaskVT.is128BitVector() && Subtarget.hasSSE41()) ||
39273 (MaskVT.is256BitVector() && Subtarget.hasInt256()) ||
39274 (MaskVT.is512BitVector() && Subtarget.useAVX512Regs()))) {
39275 unsigned MaxScale = 64 / MaskEltSize;
39276 bool UseSign = V1.getScalarValueSizeInBits() == MaskEltSize &&
39277 DAG.ComputeNumSignBits(V1) == MaskEltSize;
39278 for (unsigned Scale = 2; Scale <= MaxScale; Scale *= 2) {
39279 // Skip 512-bit VPMOV?XBW on non-AVX512BW targets.
39280 if (Scale == 2 && MaskVT == MVT::v64i8 && !Subtarget.useBWIRegs())
39281 continue;
39282 bool MatchAny = true;
39283 bool MatchZero = true;
39284 bool MatchSign = UseSign;
39285 unsigned NumDstElts = NumMaskElts / Scale;
39286 for (unsigned i = 0;
39287 i != NumDstElts && (MatchAny || MatchSign || MatchZero); ++i) {
39288 if (!isUndefOrEqual(Mask[i * Scale], (int)i)) {
39289 MatchAny = MatchSign = MatchZero = false;
39290 break;
39291 }
39292 unsigned Pos = (i * Scale) + 1;
39293 unsigned Len = Scale - 1;
39294 MatchAny &= isUndefInRange(Mask, Pos, Len);
39295 MatchZero &= isUndefOrZeroInRange(Mask, Pos, Len);
39296 MatchSign &= isUndefOrEqualInRange(Mask, (int)i, Pos, Len);
39297 }
39298 if (MatchAny || MatchSign || MatchZero) {
39299 assert((MatchSign || MatchZero) &&
39300 "Failed to match sext/zext but matched aext?");
39301 unsigned SrcSize = std::max(128u, NumDstElts * MaskEltSize);
39302 MVT ScalarTy = MaskVT.isInteger() ? MaskVT.getScalarType()
39303 : MVT::getIntegerVT(MaskEltSize);
39304 SrcVT = MVT::getVectorVT(ScalarTy, SrcSize / MaskEltSize);
39305
39306 Shuffle = unsigned(
39307 MatchAny ? ISD::ANY_EXTEND
39308 : (MatchSign ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND));
39309 if (SrcVT.getVectorNumElements() != NumDstElts)
39310 Shuffle = DAG.getOpcode_EXTEND_VECTOR_INREG(Shuffle);
39311
39312 DstVT = MVT::getIntegerVT(Scale * MaskEltSize);
39313 DstVT = MVT::getVectorVT(DstVT, NumDstElts);
39314 return true;
39315 }
39316 }
39317 }
39318
39319 // Match against a VZEXT_MOVL instruction, SSE1 only supports 32-bits (MOVSS).
39320 if (((MaskEltSize == 32) || (MaskEltSize == 64 && Subtarget.hasSSE2()) ||
39321 (MaskEltSize == 16 && Subtarget.hasFP16())) &&
39322 isUndefOrEqual(Mask[0], 0) &&
39323 isUndefOrZeroInRange(Mask, 1, NumMaskElts - 1)) {
39324 Shuffle = X86ISD::VZEXT_MOVL;
39325 if (MaskEltSize == 16)
39326 SrcVT = DstVT = MaskVT.changeVectorElementType(MVT::f16);
39327 else
39328 SrcVT = DstVT = !Subtarget.hasSSE2() ? MVT::v4f32 : MaskVT;
39329 return true;
39330 }
39331
39332 // Check if we have SSE3 which will let us use MOVDDUP etc. The
39333 // instructions are no slower than UNPCKLPD but have the option to
39334 // fold the input operand into even an unaligned memory load.
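// MOVDDUP duplicates the low f64 of each 128-bit lane ({0,0}), while
// MOVSLDUP/MOVSHDUP duplicate the even/odd f32 elements respectively.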
39335 if (MaskVT.is128BitVector() && Subtarget.hasSSE3() && AllowFloatDomain) {
39336 if (isTargetShuffleEquivalent(MaskVT, Mask, {0, 0}, DAG, V1)) {
39337 Shuffle = X86ISD::MOVDDUP;
39338 SrcVT = DstVT = MVT::v2f64;
39339 return true;
39340 }
39341 if (isTargetShuffleEquivalent(MaskVT, Mask, {0, 0, 2, 2}, DAG, V1)) {
39342 Shuffle = X86ISD::MOVSLDUP;
39343 SrcVT = DstVT = MVT::v4f32;
39344 return true;
39345 }
39346 if (isTargetShuffleEquivalent(MaskVT, Mask, {1, 1, 3, 3}, DAG, V1)) {
39347 Shuffle = X86ISD::MOVSHDUP;
39348 SrcVT = DstVT = MVT::v4f32;
39349 return true;
39350 }
39351 }
39352
39353 if (MaskVT.is256BitVector() && AllowFloatDomain) {
39354 assert(Subtarget.hasAVX() && "AVX required for 256-bit vector shuffles");
39355 if (isTargetShuffleEquivalent(MaskVT, Mask, {0, 0, 2, 2}, DAG, V1)) {
39356 Shuffle = X86ISD::MOVDDUP;
39357 SrcVT = DstVT = MVT::v4f64;
39358 return true;
39359 }
39360 if (isTargetShuffleEquivalent(MaskVT, Mask, {0, 0, 2, 2, 4, 4, 6, 6}, DAG,
39361 V1)) {
39362 Shuffle = X86ISD::MOVSLDUP;
39363 SrcVT = DstVT = MVT::v8f32;
39364 return true;
39365 }
39366 if (isTargetShuffleEquivalent(MaskVT, Mask, {1, 1, 3, 3, 5, 5, 7, 7}, DAG,
39367 V1)) {
39368 Shuffle = X86ISD::MOVSHDUP;
39369 SrcVT = DstVT = MVT::v8f32;
39370 return true;
39371 }
39372 }
39373
39374 if (MaskVT.is512BitVector() && AllowFloatDomain) {
39375 assert(Subtarget.hasAVX512() &&
39376 "AVX512 required for 512-bit vector shuffles");
39377 if (isTargetShuffleEquivalent(MaskVT, Mask, {0, 0, 2, 2, 4, 4, 6, 6}, DAG,
39378 V1)) {
39379 Shuffle = X86ISD::MOVDDUP;
39380 SrcVT = DstVT = MVT::v8f64;
39381 return true;
39382 }
39383 if (isTargetShuffleEquivalent(
39384 MaskVT, Mask,
39385 {0, 0, 2, 2, 4, 4, 6, 6, 8, 8, 10, 10, 12, 12, 14, 14}, DAG, V1)) {
39386 Shuffle = X86ISD::MOVSLDUP;
39387 SrcVT = DstVT = MVT::v16f32;
39388 return true;
39389 }
39390 if (isTargetShuffleEquivalent(
39391 MaskVT, Mask,
39392 {1, 1, 3, 3, 5, 5, 7, 7, 9, 9, 11, 11, 13, 13, 15, 15}, DAG, V1)) {
39393 Shuffle = X86ISD::MOVSHDUP;
39394 SrcVT = DstVT = MVT::v16f32;
39395 return true;
39396 }
39397 }
39398
39399 return false;
39400}
39401
39402// Attempt to match a combined shuffle mask against supported unary immediate
39403// permute instructions.
39404// TODO: Investigate sharing more of this with shuffle lowering.
39405 static bool matchUnaryPermuteShuffle(MVT MaskVT, ArrayRef<int> Mask,
39406 const APInt &Zeroable,
39407 bool AllowFloatDomain, bool AllowIntDomain,
39408 const SelectionDAG &DAG,
39409 const X86Subtarget &Subtarget,
39410 unsigned &Shuffle, MVT &ShuffleVT,
39411 unsigned &PermuteImm) {
39412 unsigned NumMaskElts = Mask.size();
39413 unsigned InputSizeInBits = MaskVT.getSizeInBits();
39414 unsigned MaskScalarSizeInBits = InputSizeInBits / NumMaskElts;
39415 MVT MaskEltVT = MVT::getIntegerVT(MaskScalarSizeInBits);
39416 bool ContainsZeros = isAnyZero(Mask);
39417
39418 // Handle VPERMI/VPERMILPD vXi64/vXf64 patterns.
39419 if (!ContainsZeros && MaskScalarSizeInBits == 64) {
39420 // Check for lane crossing permutes.
39421 if (is128BitLaneCrossingShuffleMask(MaskEltVT, Mask)) {
39422 // PERMPD/PERMQ permutes within a 256-bit vector (AVX2+).
39423 if (Subtarget.hasAVX2() && MaskVT.is256BitVector()) {
39424 Shuffle = X86ISD::VPERMI;
39425 ShuffleVT = (AllowFloatDomain ? MVT::v4f64 : MVT::v4i64);
39426 PermuteImm = getV4X86ShuffleImm(Mask);
39427 return true;
39428 }
39429 if (Subtarget.hasAVX512() && MaskVT.is512BitVector()) {
39430 SmallVector<int, 4> RepeatedMask;
39431 if (is256BitLaneRepeatedShuffleMask(MVT::v8f64, Mask, RepeatedMask)) {
39432 Shuffle = X86ISD::VPERMI;
39433 ShuffleVT = (AllowFloatDomain ? MVT::v8f64 : MVT::v8i64);
39434 PermuteImm = getV4X86ShuffleImm(RepeatedMask);
39435 return true;
39436 }
39437 }
39438 } else if (AllowFloatDomain && Subtarget.hasAVX()) {
39439 // VPERMILPD can permute with a non-repeating shuffle.
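// Bit i of the VPERMILPD immediate selects the high (1) or low (0) f64 within
// the 128-bit lane that result element i belongs to.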
39440 Shuffle = X86ISD::VPERMILPI;
39441 ShuffleVT = MVT::getVectorVT(MVT::f64, Mask.size());
39442 PermuteImm = 0;
39443 for (int i = 0, e = Mask.size(); i != e; ++i) {
39444 int M = Mask[i];
39445 if (M == SM_SentinelUndef)
39446 continue;
39447 assert(((M / 2) == (i / 2)) && "Out of range shuffle mask index");
39448 PermuteImm |= (M & 1) << i;
39449 }
39450 return true;
39451 }
39452 }
39453
39454 // We are checking for shuffle match or shift match. Loop twice so we can
39455 // order which we try to match first depending on target preference.
39456 for (unsigned Order = 0; Order < 2; ++Order) {
39457 if (Subtarget.preferLowerShuffleAsShift() ? (Order == 1) : (Order == 0)) {
39458 // Handle PSHUFD/VPERMILPI vXi32/vXf32 repeated patterns.
39459 // AVX introduced the VPERMILPD/VPERMILPS float permutes, before then we
39460 // had to use 2-input SHUFPD/SHUFPS shuffles (not handled here).
39461 if ((MaskScalarSizeInBits == 64 || MaskScalarSizeInBits == 32) &&
39462 !ContainsZeros && (AllowIntDomain || Subtarget.hasAVX())) {
39463 SmallVector<int, 4> RepeatedMask;
39464 if (is128BitLaneRepeatedShuffleMask(MaskEltVT, Mask, RepeatedMask)) {
39465 // Narrow the repeated mask to create 32-bit element permutes.
39466 SmallVector<int, 4> WordMask = RepeatedMask;
39467 if (MaskScalarSizeInBits == 64)
39468 narrowShuffleMaskElts(2, RepeatedMask, WordMask);
39469
39470 Shuffle = (AllowIntDomain ? X86ISD::PSHUFD : X86ISD::VPERMILPI);
39471 ShuffleVT = (AllowIntDomain ? MVT::i32 : MVT::f32);
39472 ShuffleVT = MVT::getVectorVT(ShuffleVT, InputSizeInBits / 32);
39473 PermuteImm = getV4X86ShuffleImm(WordMask);
39474 return true;
39475 }
39476 }
39477
39478 // Handle PSHUFLW/PSHUFHW vXi16 repeated patterns.
39479 if (!ContainsZeros && AllowIntDomain && MaskScalarSizeInBits == 16 &&
39480 ((MaskVT.is128BitVector() && Subtarget.hasSSE2()) ||
39481 (MaskVT.is256BitVector() && Subtarget.hasAVX2()) ||
39482 (MaskVT.is512BitVector() && Subtarget.hasBWI()))) {
39483 SmallVector<int, 4> RepeatedMask;
39484 if (is128BitLaneRepeatedShuffleMask(MaskEltVT, Mask, RepeatedMask)) {
39485 ArrayRef<int> LoMask(RepeatedMask.data() + 0, 4);
39486 ArrayRef<int> HiMask(RepeatedMask.data() + 4, 4);
39487
39488 // PSHUFLW: permute lower 4 elements only.
39489 if (isUndefOrInRange(LoMask, 0, 4) &&
39490 isSequentialOrUndefInRange(HiMask, 0, 4, 4)) {
39491 Shuffle = X86ISD::PSHUFLW;
39492 ShuffleVT = MVT::getVectorVT(MVT::i16, InputSizeInBits / 16);
39493 PermuteImm = getV4X86ShuffleImm(LoMask);
39494 return true;
39495 }
39496
39497 // PSHUFHW: permute upper 4 elements only.
39498 if (isUndefOrInRange(HiMask, 4, 8) &&
39499 isSequentialOrUndefInRange(LoMask, 0, 4, 0)) {
39500 // Offset the HiMask so that we can create the shuffle immediate.
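// E.g. HiMask {5, 4, 7, 6} becomes {1, 0, 3, 2}, giving the immediate 0xB1.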
39501 int OffsetHiMask[4];
39502 for (int i = 0; i != 4; ++i)
39503 OffsetHiMask[i] = (HiMask[i] < 0 ? HiMask[i] : HiMask[i] - 4);
39504
39505 Shuffle = X86ISD::PSHUFHW;
39506 ShuffleVT = MVT::getVectorVT(MVT::i16, InputSizeInBits / 16);
39507 PermuteImm = getV4X86ShuffleImm(OffsetHiMask);
39508 return true;
39509 }
39510 }
39511 }
39512 } else {
39513 // Attempt to match against bit rotates.
39514 if (!ContainsZeros && AllowIntDomain && MaskScalarSizeInBits < 64 &&
39515 ((MaskVT.is128BitVector() && Subtarget.hasXOP()) ||
39516 Subtarget.hasAVX512())) {
39517 int RotateAmt = matchShuffleAsBitRotate(ShuffleVT, MaskScalarSizeInBits,
39518 Subtarget, Mask);
39519 if (0 < RotateAmt) {
39520 Shuffle = X86ISD::VROTLI;
39521 PermuteImm = (unsigned)RotateAmt;
39522 return true;
39523 }
39524 }
39525 }
39526 // Attempt to match against byte/bit shifts.
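// E.g. a v4i32 mask {Z, 0, 1, 2} is a whole-vector shift left by one element
// (a 4-byte VSHLDQ).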
39527 if (AllowIntDomain &&
39528 ((MaskVT.is128BitVector() && Subtarget.hasSSE2()) ||
39529 (MaskVT.is256BitVector() && Subtarget.hasAVX2()) ||
39530 (MaskVT.is512BitVector() && Subtarget.hasAVX512()))) {
39531 int ShiftAmt =
39532 matchShuffleAsShift(ShuffleVT, Shuffle, MaskScalarSizeInBits, Mask, 0,
39533 Zeroable, Subtarget);
39534 if (0 < ShiftAmt && (!ShuffleVT.is512BitVector() || Subtarget.hasBWI() ||
39535 32 <= ShuffleVT.getScalarSizeInBits())) {
39536 // Byte shifts can be slower so only match them on second attempt.
39537 if (Order == 0 &&
39538 (Shuffle == X86ISD::VSHLDQ || Shuffle == X86ISD::VSRLDQ))
39539 continue;
39540
39541 PermuteImm = (unsigned)ShiftAmt;
39542 return true;
39543 }
39544
39545 }
39546 }
39547
39548 return false;
39549}
39550
39551// Attempt to match a combined unary shuffle mask against supported binary
39552// shuffle instructions.
39553// TODO: Investigate sharing more of this with shuffle lowering.
39554static bool matchBinaryShuffle(MVT MaskVT, ArrayRef<int> Mask,
39555 bool AllowFloatDomain, bool AllowIntDomain,
39556 SDValue &V1, SDValue &V2, const SDLoc &DL,
39557 SelectionDAG &DAG, const X86Subtarget &Subtarget,
39558 unsigned &Shuffle, MVT &SrcVT, MVT &DstVT,
39559 bool IsUnary) {
39560 unsigned NumMaskElts = Mask.size();
39561 unsigned EltSizeInBits = MaskVT.getScalarSizeInBits();
39562 unsigned SizeInBits = MaskVT.getSizeInBits();
39563
39564 if (MaskVT.is128BitVector()) {
39565 if (isTargetShuffleEquivalent(MaskVT, Mask, {0, 0}, DAG) &&
39566 AllowFloatDomain) {
39567 V2 = V1;
39568 V1 = (SM_SentinelUndef == Mask[0] ? DAG.getUNDEF(MVT::v4f32) : V1);
39569 Shuffle = Subtarget.hasSSE2() ? X86ISD::UNPCKL : X86ISD::MOVLHPS;
39570 SrcVT = DstVT = Subtarget.hasSSE2() ? MVT::v2f64 : MVT::v4f32;
39571 return true;
39572 }
39573 if (isTargetShuffleEquivalent(MaskVT, Mask, {1, 1}, DAG) &&
39574 AllowFloatDomain) {
39575 V2 = V1;
39576 Shuffle = Subtarget.hasSSE2() ? X86ISD::UNPCKH : X86ISD::MOVHLPS;
39577 SrcVT = DstVT = Subtarget.hasSSE2() ? MVT::v2f64 : MVT::v4f32;
39578 return true;
39579 }
39580 if (isTargetShuffleEquivalent(MaskVT, Mask, {0, 3}, DAG) &&
39581 Subtarget.hasSSE2() && (AllowFloatDomain || !Subtarget.hasSSE41())) {
39582 std::swap(V1, V2);
39583 Shuffle = X86ISD::MOVSD;
39584 SrcVT = DstVT = MVT::v2f64;
39585 return true;
39586 }
39587 if (isTargetShuffleEquivalent(MaskVT, Mask, {4, 1, 2, 3}, DAG) &&
39588 (AllowFloatDomain || !Subtarget.hasSSE41())) {
39589 Shuffle = X86ISD::MOVSS;
39590 SrcVT = DstVT = MVT::v4f32;
39591 return true;
39592 }
39593 if (isTargetShuffleEquivalent(MaskVT, Mask, {8, 1, 2, 3, 4, 5, 6, 7},
39594 DAG) &&
39595 Subtarget.hasFP16()) {
39596 Shuffle = X86ISD::MOVSH;
39597 SrcVT = DstVT = MVT::v8f16;
39598 return true;
39599 }
39600 }
39601
39602 // Attempt to match against either an unary or binary PACKSS/PACKUS shuffle.
39603 if (((MaskVT == MVT::v8i16 || MaskVT == MVT::v16i8) && Subtarget.hasSSE2()) ||
39604 ((MaskVT == MVT::v16i16 || MaskVT == MVT::v32i8) && Subtarget.hasInt256()) ||
39605 ((MaskVT == MVT::v32i16 || MaskVT == MVT::v64i8) && Subtarget.hasBWI())) {
39606 if (matchShuffleWithPACK(MaskVT, SrcVT, V1, V2, Shuffle, Mask, DAG,
39607 Subtarget)) {
39608 DstVT = MaskVT;
39609 return true;
39610 }
39611 }
39612 // TODO: Can we handle this inside matchShuffleWithPACK?
39613 if (MaskVT == MVT::v4i32 && Subtarget.hasSSE2() &&
39614 isTargetShuffleEquivalent(MaskVT, Mask, {0, 2, 4, 6}, DAG) &&
39615 V1.getScalarValueSizeInBits() == 64 &&
39616 V2.getScalarValueSizeInBits() == 64) {
39617 // Use (SSE41) PACKUSDW if the leading zero bits extend down to the lowest 16 bits.
39618 unsigned MinLZV1 = DAG.computeKnownBits(V1).countMinLeadingZeros();
39619 unsigned MinLZV2 = DAG.computeKnownBits(V2).countMinLeadingZeros();
39620 if (Subtarget.hasSSE41() && MinLZV1 >= 48 && MinLZV2 >= 48) {
39621 SrcVT = MVT::v4i32;
39622 DstVT = MVT::v8i16;
39623 Shuffle = X86ISD::PACKUS;
39624 return true;
39625 }
39626 // Use PACKUSWB if the leading zero bits extend down to the lowest 8 bits.
39627 if (MinLZV1 >= 56 && MinLZV2 >= 56) {
39628 SrcVT = MVT::v8i16;
39629 DstVT = MVT::v16i8;
39630 Shuffle = X86ISD::PACKUS;
39631 return true;
39632 }
39633 // Use PACKSSDW if the sign bits extend down to the lowest 16 bits.
39634 if (DAG.ComputeNumSignBits(V1) > 48 && DAG.ComputeNumSignBits(V2) > 48) {
39635 SrcVT = MVT::v4i32;
39636 DstVT = MVT::v8i16;
39637 Shuffle = X86ISD::PACKSS;
39638 return true;
39639 }
39640 }
39641
39642 // Attempt to match against either a unary or binary UNPCKL/UNPCKH shuffle.
39643 if ((MaskVT == MVT::v4f32 && Subtarget.hasSSE1()) ||
39644 (MaskVT.is128BitVector() && Subtarget.hasSSE2()) ||
39645 (MaskVT.is256BitVector() && 32 <= EltSizeInBits && Subtarget.hasAVX()) ||
39646 (MaskVT.is256BitVector() && Subtarget.hasAVX2()) ||
39647 (MaskVT.is512BitVector() && Subtarget.hasAVX512() &&
39648 (32 <= EltSizeInBits || Subtarget.hasBWI()))) {
39649 if (matchShuffleWithUNPCK(MaskVT, V1, V2, Shuffle, IsUnary, Mask, DL, DAG,
39650 Subtarget)) {
39651 SrcVT = DstVT = MaskVT;
39652 if (MaskVT.is256BitVector() && !Subtarget.hasAVX2())
39653 SrcVT = DstVT = (32 == EltSizeInBits ? MVT::v8f32 : MVT::v4f64);
39654 return true;
39655 }
39656 }
39657
39658 // Attempt to match against an OR if we're performing a blend shuffle and the
39659 // non-blended source element is zero in each case.
39660 // TODO: Handle cases where V1/V2 sizes don't match SizeInBits.
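// Selecting element i from one source is equivalent to OR'ing the two sources
// whenever the other source's element i is known to be zero.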
39661 if (SizeInBits == V1.getValueSizeInBits() &&
39662 SizeInBits == V2.getValueSizeInBits() &&
39663 (EltSizeInBits % V1.getScalarValueSizeInBits()) == 0 &&
39664 (EltSizeInBits % V2.getScalarValueSizeInBits()) == 0) {
39665 bool IsBlend = true;
39666 unsigned NumV1Elts = V1.getValueType().getVectorNumElements();
39667 unsigned NumV2Elts = V2.getValueType().getVectorNumElements();
39668 unsigned Scale1 = NumV1Elts / NumMaskElts;
39669 unsigned Scale2 = NumV2Elts / NumMaskElts;
39670 APInt DemandedZeroV1 = APInt::getZero(NumV1Elts);
39671 APInt DemandedZeroV2 = APInt::getZero(NumV2Elts);
39672 for (unsigned i = 0; i != NumMaskElts; ++i) {
39673 int M = Mask[i];
39674 if (M == SM_SentinelUndef)
39675 continue;
39676 if (M == SM_SentinelZero) {
39677 DemandedZeroV1.setBits(i * Scale1, (i + 1) * Scale1);
39678 DemandedZeroV2.setBits(i * Scale2, (i + 1) * Scale2);
39679 continue;
39680 }
39681 if (M == (int)i) {
39682 DemandedZeroV2.setBits(i * Scale2, (i + 1) * Scale2);
39683 continue;
39684 }
39685 if (M == (int)(i + NumMaskElts)) {
39686 DemandedZeroV1.setBits(i * Scale1, (i + 1) * Scale1);
39687 continue;
39688 }
39689 IsBlend = false;
39690 break;
39691 }
39692 if (IsBlend) {
39693 if (DAG.MaskedVectorIsZero(V1, DemandedZeroV1) &&
39694 DAG.MaskedVectorIsZero(V2, DemandedZeroV2)) {
39695 Shuffle = ISD::OR;
39696 SrcVT = DstVT = MaskVT.changeTypeToInteger();
39697 return true;
39698 }
39699 if (NumV1Elts == NumV2Elts && NumV1Elts == NumMaskElts) {
39700 // FIXME: handle mismatched sizes?
39701 // TODO: investigate if `ISD::OR` handling in
39702 // `TargetLowering::SimplifyDemandedVectorElts` can be improved instead.
39703 auto computeKnownBitsElementWise = [&DAG](SDValue V) {
39704 unsigned NumElts = V.getValueType().getVectorNumElements();
39705 KnownBits Known(NumElts);
39706 for (unsigned EltIdx = 0; EltIdx != NumElts; ++EltIdx) {
39707 APInt Mask = APInt::getOneBitSet(NumElts, EltIdx);
39708 KnownBits PeepholeKnown = DAG.computeKnownBits(V, Mask);
39709 if (PeepholeKnown.isZero())
39710 Known.Zero.setBit(EltIdx);
39711 if (PeepholeKnown.isAllOnes())
39712 Known.One.setBit(EltIdx);
39713 }
39714 return Known;
39715 };
39716
39717 KnownBits V1Known = computeKnownBitsElementWise(V1);
39718 KnownBits V2Known = computeKnownBitsElementWise(V2);
39719
39720 for (unsigned i = 0; i != NumMaskElts && IsBlend; ++i) {
39721 int M = Mask[i];
39722 if (M == SM_SentinelUndef)
39723 continue;
39724 if (M == SM_SentinelZero) {
39725 IsBlend &= V1Known.Zero[i] && V2Known.Zero[i];
39726 continue;
39727 }
39728 if (M == (int)i) {
39729 IsBlend &= V2Known.Zero[i] || V1Known.One[i];
39730 continue;
39731 }
39732 if (M == (int)(i + NumMaskElts)) {
39733 IsBlend &= V1Known.Zero[i] || V2Known.One[i];
39734 continue;
39735 }
39736 llvm_unreachable("will not get here.");
39737 }
39738 if (IsBlend) {
39739 Shuffle = ISD::OR;
39740 SrcVT = DstVT = MaskVT.changeTypeToInteger();
39741 return true;
39742 }
39743 }
39744 }
39745 }
39746
39747 return false;
39748}
39749
39750 static bool matchBinaryPermuteShuffle(
39751 MVT MaskVT, ArrayRef<int> Mask, const APInt &Zeroable,
39752 bool AllowFloatDomain, bool AllowIntDomain, SDValue &V1, SDValue &V2,
39753 const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget,
39754 unsigned &Shuffle, MVT &ShuffleVT, unsigned &PermuteImm) {
39755 unsigned NumMaskElts = Mask.size();
39756 unsigned EltSizeInBits = MaskVT.getScalarSizeInBits();
39757
39758 // Attempt to match against VALIGND/VALIGNQ rotate.
39759 if (AllowIntDomain && (EltSizeInBits == 64 || EltSizeInBits == 32) &&
39760 ((MaskVT.is128BitVector() && Subtarget.hasVLX()) ||
39761 (MaskVT.is256BitVector() && Subtarget.hasVLX()) ||
39762 (MaskVT.is512BitVector() && Subtarget.hasAVX512()))) {
39763 MVT AlignVT = MVT::getVectorVT(MVT::getIntegerVT(EltSizeInBits),
39764 MaskVT.getSizeInBits() / EltSizeInBits);
39765 if (!isAnyZero(Mask)) {
39766 int Rotation = matchShuffleAsElementRotate(V1, V2, Mask);
39767 if (0 < Rotation) {
39768 Shuffle = X86ISD::VALIGN;
39769 ShuffleVT = AlignVT;
39770 PermuteImm = Rotation;
39771 return true;
39772 }
39773 }
39774 // See if we can use VALIGN as a cross-lane version of VSHLDQ/VSRLDQ.
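// E.g. for v8i32 with ZeroLo == 2, the mask {Z, Z, 0, 1, 2, 3, 4, 5} becomes
// VALIGND with a zero second source and a rotation of 8 - 2 = 6 elements.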
39775 unsigned ZeroLo = Zeroable.countr_one();
39776 unsigned ZeroHi = Zeroable.countl_one();
39777 assert((ZeroLo + ZeroHi) < NumMaskElts && "Zeroable shuffle detected");
39778 if (ZeroLo) {
39779 SmallVector<int, 16> ShiftMask(NumMaskElts, SM_SentinelZero);
39780 std::iota(ShiftMask.begin() + ZeroLo, ShiftMask.end(), 0);
39781 if (isTargetShuffleEquivalent(MaskVT, Mask, ShiftMask, DAG, V1)) {
39782 V2 = getZeroVector(AlignVT, Subtarget, DAG, DL);
39783 Shuffle = X86ISD::VALIGN;
39784 ShuffleVT = AlignVT;
39785 PermuteImm = NumMaskElts - ZeroLo;
39786 return true;
39787 }
39788 }
39789 if (ZeroHi) {
39790 SmallVector<int, 16> ShiftMask(NumMaskElts, SM_SentinelZero);
39791 std::iota(ShiftMask.begin(), ShiftMask.begin() + NumMaskElts - ZeroHi,
39792 ZeroHi);
39793 if (isTargetShuffleEquivalent(MaskVT, Mask, ShiftMask, DAG, V1)) {
39794 V2 = V1;
39795 V1 = getZeroVector(AlignVT, Subtarget, DAG, DL);
39796 Shuffle = X86ISD::VALIGN;
39797 ShuffleVT = AlignVT;
39798 PermuteImm = ZeroHi;
39799 return true;
39800 }
39801 }
39802 }
39803
39804 // Attempt to match against PALIGNR byte rotate.
39805 if (AllowIntDomain && ((MaskVT.is128BitVector() && Subtarget.hasSSSE3()) ||
39806 (MaskVT.is256BitVector() && Subtarget.hasAVX2()) ||
39807 (MaskVT.is512BitVector() && Subtarget.hasBWI()))) {
39808 int ByteRotation = matchShuffleAsByteRotate(MaskVT, V1, V2, Mask);
39809 if (0 < ByteRotation) {
39810 Shuffle = X86ISD::PALIGNR;
39811 ShuffleVT = MVT::getVectorVT(MVT::i8, MaskVT.getSizeInBits() / 8);
39812 PermuteImm = ByteRotation;
39813 return true;
39814 }
39815 }
39816
39817 // Attempt to combine to X86ISD::BLENDI.
39818 if ((NumMaskElts <= 8 && ((Subtarget.hasSSE41() && MaskVT.is128BitVector()) ||
39819 (Subtarget.hasAVX() && MaskVT.is256BitVector()))) ||
39820 (MaskVT == MVT::v16i16 && Subtarget.hasAVX2())) {
39821 uint64_t BlendMask = 0;
39822 bool ForceV1Zero = false, ForceV2Zero = false;
39823 SmallVector<int, 8> TargetMask(Mask);
39824 if (matchShuffleAsBlend(MaskVT, V1, V2, TargetMask, Zeroable, ForceV1Zero,
39825 ForceV2Zero, BlendMask)) {
39826 if (MaskVT == MVT::v16i16) {
39827 // We can only use v16i16 PBLENDW if the lanes are repeated.
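// The 8-bit VPBLENDW immediate is applied to each 128-bit lane separately,
// hence the repeated-lane requirement.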
39828 SmallVector<int, 8> RepeatedMask;
39829 if (isRepeatedTargetShuffleMask(128, MaskVT, TargetMask,
39830 RepeatedMask)) {
39831 assert(RepeatedMask.size() == 8 &&
39832 "Repeated mask size doesn't match!");
39833 PermuteImm = 0;
39834 for (int i = 0; i < 8; ++i)
39835 if (RepeatedMask[i] >= 8)
39836 PermuteImm |= 1 << i;
39837 V1 = ForceV1Zero ? getZeroVector(MaskVT, Subtarget, DAG, DL) : V1;
39838 V2 = ForceV2Zero ? getZeroVector(MaskVT, Subtarget, DAG, DL) : V2;
39839 Shuffle = X86ISD::BLENDI;
39840 ShuffleVT = MaskVT;
39841 return true;
39842 }
39843 } else {
39844 V1 = ForceV1Zero ? getZeroVector(MaskVT, Subtarget, DAG, DL) : V1;
39845 V2 = ForceV2Zero ? getZeroVector(MaskVT, Subtarget, DAG, DL) : V2;
39846 PermuteImm = (unsigned)BlendMask;
39847 Shuffle = X86ISD::BLENDI;
39848 ShuffleVT = MaskVT;
39849 return true;
39850 }
39851 }
39852 }
39853
39854 // Attempt to combine to INSERTPS, but only if it has elements that need to
39855 // be set to zero.
39856 if (AllowFloatDomain && EltSizeInBits == 32 && Subtarget.hasSSE41() &&
39857 MaskVT.is128BitVector() && isAnyZero(Mask) &&
39858 matchShuffleAsInsertPS(V1, V2, PermuteImm, Zeroable, Mask, DAG)) {
39859 Shuffle = X86ISD::INSERTPS;
39860 ShuffleVT = MVT::v4f32;
39861 return true;
39862 }
39863
39864 // Attempt to combine to SHUFPD.
39865 if (AllowFloatDomain && EltSizeInBits == 64 &&
39866 ((MaskVT.is128BitVector() && Subtarget.hasSSE2()) ||
39867 (MaskVT.is256BitVector() && Subtarget.hasAVX()) ||
39868 (MaskVT.is512BitVector() && Subtarget.hasAVX512()))) {
39869 bool ForceV1Zero = false, ForceV2Zero = false;
39870 if (matchShuffleWithSHUFPD(MaskVT, V1, V2, ForceV1Zero, ForceV2Zero,
39871 PermuteImm, Mask, Zeroable)) {
39872 V1 = ForceV1Zero ? getZeroVector(MaskVT, Subtarget, DAG, DL) : V1;
39873 V2 = ForceV2Zero ? getZeroVector(MaskVT, Subtarget, DAG, DL) : V2;
39874 Shuffle = X86ISD::SHUFP;
39875 ShuffleVT = MVT::getVectorVT(MVT::f64, MaskVT.getSizeInBits() / 64);
39876 return true;
39877 }
39878 }
39879
39880 // Attempt to combine to SHUFPS.
39881 if (AllowFloatDomain && EltSizeInBits == 32 &&
39882 ((MaskVT.is128BitVector() && Subtarget.hasSSE1()) ||
39883 (MaskVT.is256BitVector() && Subtarget.hasAVX()) ||
39884 (MaskVT.is512BitVector() && Subtarget.hasAVX512()))) {
39885 SmallVector<int, 4> RepeatedMask;
39886 if (isRepeatedTargetShuffleMask(128, MaskVT, Mask, RepeatedMask)) {
39887 // Match each half of the repeated mask to determine if it's just
39888 // referencing one of the vectors, is zeroable, or is entirely undef.
39889 auto MatchHalf = [&](unsigned Offset, int &S0, int &S1) {
39890 int M0 = RepeatedMask[Offset];
39891 int M1 = RepeatedMask[Offset + 1];
39892
39893 if (isUndefInRange(RepeatedMask, Offset, 2)) {
39894 return DAG.getUNDEF(MaskVT);
39895 } else if (isUndefOrZeroInRange(RepeatedMask, Offset, 2)) {
39896 S0 = (SM_SentinelUndef == M0 ? -1 : 0);
39897 S1 = (SM_SentinelUndef == M1 ? -1 : 1);
39898 return getZeroVector(MaskVT, Subtarget, DAG, DL);
39899 } else if (isUndefOrInRange(M0, 0, 4) && isUndefOrInRange(M1, 0, 4)) {
39900 S0 = (SM_SentinelUndef == M0 ? -1 : M0 & 3);
39901 S1 = (SM_SentinelUndef == M1 ? -1 : M1 & 3);
39902 return V1;
39903 } else if (isUndefOrInRange(M0, 4, 8) && isUndefOrInRange(M1, 4, 8)) {
39904 S0 = (SM_SentinelUndef == M0 ? -1 : M0 & 3);
39905 S1 = (SM_SentinelUndef == M1 ? -1 : M1 & 3);
39906 return V2;
39907 }
39908
39909 return SDValue();
39910 };
39911
39912 int ShufMask[4] = {-1, -1, -1, -1};
39913 SDValue Lo = MatchHalf(0, ShufMask[0], ShufMask[1]);
39914 SDValue Hi = MatchHalf(2, ShufMask[2], ShufMask[3]);
39915
39916 if (Lo && Hi) {
39917 V1 = Lo;
39918 V2 = Hi;
39919 Shuffle = X86ISD::SHUFP;
39920 ShuffleVT = MVT::getVectorVT(MVT::f32, MaskVT.getSizeInBits() / 32);
39921 PermuteImm = getV4X86ShuffleImm(ShufMask);
39922 return true;
39923 }
39924 }
39925 }
39926
39927 // Attempt to combine to INSERTPS more generally if X86ISD::SHUFP failed.
39928 if (AllowFloatDomain && EltSizeInBits == 32 && Subtarget.hasSSE41() &&
39929 MaskVT.is128BitVector() &&
39930 matchShuffleAsInsertPS(V1, V2, PermuteImm, Zeroable, Mask, DAG)) {
39931 Shuffle = X86ISD::INSERTPS;
39932 ShuffleVT = MVT::v4f32;
39933 return true;
39934 }
39935
39936 return false;
39937}
39938
39939 static SDValue combineX86ShuffleChainWithExtract(
39940 ArrayRef<SDValue> Inputs, unsigned RootOpcode, MVT RootVT,
39941 ArrayRef<int> BaseMask, int Depth, ArrayRef<const SDNode *> SrcNodes,
39942 bool AllowVariableCrossLaneMask, bool AllowVariablePerLaneMask,
39943 bool IsMaskedShuffle, SelectionDAG &DAG, const SDLoc &DL,
39944 const X86Subtarget &Subtarget);
39945
39946/// Combine an arbitrary chain of shuffles into a single instruction if
39947/// possible.
39948///
39949/// This is the leaf of the recursive combine below. When we have found some
39950/// chain of single-use x86 shuffle instructions and accumulated the combined
39951/// shuffle mask represented by them, this will try to pattern match that mask
39952/// into either a single instruction if there is a special purpose instruction
39953/// for this operation, or into a PSHUFB instruction which is a fully general
39954/// instruction but should only be used to replace chains over a certain depth.
39955 static SDValue combineX86ShuffleChain(
39956 ArrayRef<SDValue> Inputs, unsigned RootOpc, MVT RootVT,
39957 ArrayRef<int> BaseMask, int Depth, ArrayRef<const SDNode *> SrcNodes,
39958 bool AllowVariableCrossLaneMask, bool AllowVariablePerLaneMask,
39959 bool IsMaskedShuffle, SelectionDAG &DAG, const SDLoc &DL,
39960 const X86Subtarget &Subtarget) {
39961 assert(!BaseMask.empty() && "Cannot combine an empty shuffle mask!");
39962 assert((Inputs.size() == 1 || Inputs.size() == 2) &&
39963 "Unexpected number of shuffle inputs!");
39964 unsigned RootSizeInBits = RootVT.getSizeInBits();
39965 unsigned NumRootElts = RootVT.getVectorNumElements();
39966
39967 // Canonicalize shuffle input op to the requested type.
39968 auto CanonicalizeShuffleInput = [&](MVT VT, SDValue Op) {
39969 if (VT.getSizeInBits() > Op.getValueSizeInBits())
39970 Op = widenSubVector(Op, false, Subtarget, DAG, DL, VT.getSizeInBits());
39971 else if (VT.getSizeInBits() < Op.getValueSizeInBits())
39972 Op = extractSubVector(Op, 0, DAG, DL, VT.getSizeInBits());
39973 return DAG.getBitcast(VT, Op);
39974 };
39975
39976 // Find the inputs that enter the chain. Note that multiple uses are OK
39977 // here, we're not going to remove the operands we find.
39978 bool UnaryShuffle = (Inputs.size() == 1);
39979 SDValue V1 = peekThroughBitcasts(Inputs[0]);
39980 SDValue V2 = (UnaryShuffle ? DAG.getUNDEF(V1.getValueType())
39981 : peekThroughBitcasts(Inputs[1]));
39982
39983 MVT VT1 = V1.getSimpleValueType();
39984 MVT VT2 = V2.getSimpleValueType();
39985 assert((RootSizeInBits % VT1.getSizeInBits()) == 0 &&
39986 (RootSizeInBits % VT2.getSizeInBits()) == 0 && "Vector size mismatch");
39987
39988 SDValue Res;
39989
39990 unsigned NumBaseMaskElts = BaseMask.size();
39991 if (NumBaseMaskElts == 1) {
39992 assert(BaseMask[0] == 0 && "Invalid shuffle index found!");
39993 return CanonicalizeShuffleInput(RootVT, V1);
39994 }
39995
39996 bool OptForSize = DAG.shouldOptForSize();
39997 unsigned BaseMaskEltSizeInBits = RootSizeInBits / NumBaseMaskElts;
39998 bool FloatDomain = VT1.isFloatingPoint() || VT2.isFloatingPoint() ||
39999 (RootVT.isFloatingPoint() && Depth >= 1) ||
40000 (RootVT.is256BitVector() && !Subtarget.hasAVX2());
40001
40002 // If we are shuffling a splat (and not introducing zeros) then we can just
40003 // use it directly. This works for smaller elements as well as they already
40004 // repeat across each mask element.
40005 if (UnaryShuffle && !isAnyZero(BaseMask) &&
40006 V1.getValueSizeInBits() >= RootSizeInBits &&
40007 (BaseMaskEltSizeInBits % V1.getScalarValueSizeInBits()) == 0 &&
40008 DAG.isSplatValue(V1, /*AllowUndefs*/ false)) {
40009 return CanonicalizeShuffleInput(RootVT, V1);
40010 }
40011
40012 SmallVector<int, 64> Mask(BaseMask);
40013
40014 // See if the shuffle is a hidden identity shuffle - repeated args in HOPs
40015 // etc. can be simplified.
40016 if (VT1 == VT2 && VT1.getSizeInBits() == RootSizeInBits && VT1.isVector()) {
40017 SmallVector<int> ScaledMask, IdentityMask;
40018 unsigned NumElts = VT1.getVectorNumElements();
40019 if (Mask.size() <= NumElts &&
40020 scaleShuffleElements(Mask, NumElts, ScaledMask)) {
40021 for (unsigned i = 0; i != NumElts; ++i)
40022 IdentityMask.push_back(i);
40023 if (isTargetShuffleEquivalent(RootVT, ScaledMask, IdentityMask, DAG, V1,
40024 V2))
40025 return CanonicalizeShuffleInput(RootVT, V1);
40026 }
40027 }
40028
40029 // Handle 128/256-bit lane shuffles of 512-bit vectors.
40030 if (RootVT.is512BitVector() &&
40031 (NumBaseMaskElts == 2 || NumBaseMaskElts == 4)) {
40032 // If the upper subvectors are zeroable, then an extract+insert is more
40033 // optimal than using X86ISD::SHUF128. The insertion is free, even if it has
40034 // to zero the upper subvectors.
40035 if (isUndefOrZeroInRange(Mask, 1, NumBaseMaskElts - 1)) {
40036 if (Depth == 0 && RootOpc == ISD::INSERT_SUBVECTOR)
40037 return SDValue(); // Nothing to do!
40038 assert(isInRange(Mask[0], 0, NumBaseMaskElts) &&
40039 "Unexpected lane shuffle");
40040 Res = CanonicalizeShuffleInput(RootVT, V1);
40041 unsigned SubIdx = Mask[0] * (NumRootElts / NumBaseMaskElts);
40042 bool UseZero = isAnyZero(Mask);
40043 Res = extractSubVector(Res, SubIdx, DAG, DL, BaseMaskEltSizeInBits);
40044 return widenSubVector(Res, UseZero, Subtarget, DAG, DL, RootSizeInBits);
40045 }
40046
40047 // Narrow shuffle mask to v4x128.
40048 SmallVector<int, 4> ScaledMask;
40049 assert((BaseMaskEltSizeInBits % 128) == 0 && "Illegal mask size");
40050 narrowShuffleMaskElts(BaseMaskEltSizeInBits / 128, Mask, ScaledMask);
40051
40052 // Try to lower to vshuf64x2/vshuf32x4.
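// SHUF128 selects four 128-bit lanes via 2-bit immediate fields; the two low
// result lanes are taken from one source operand and the two high lanes from
// another (possibly the same) operand, which the Ops[OpIndex] check enforces.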
40053 auto MatchSHUF128 = [&](MVT ShuffleVT, const SDLoc &DL,
40054 ArrayRef<int> ScaledMask, SDValue V1, SDValue V2,
40055 SelectionDAG &DAG) {
40056 int PermMask[4] = {-1, -1, -1, -1};
40057 // Ensure elements came from the same Op.
40058 SDValue Ops[2] = {DAG.getUNDEF(ShuffleVT), DAG.getUNDEF(ShuffleVT)};
40059 for (int i = 0; i < 4; ++i) {
40060 assert(ScaledMask[i] >= -1 && "Illegal shuffle sentinel value");
40061 if (ScaledMask[i] < 0)
40062 continue;
40063
40064 SDValue Op = ScaledMask[i] >= 4 ? V2 : V1;
40065 unsigned OpIndex = i / 2;
40066 if (Ops[OpIndex].isUndef())
40067 Ops[OpIndex] = Op;
40068 else if (Ops[OpIndex] != Op)
40069 return SDValue();
40070
40071 PermMask[i] = ScaledMask[i] % 4;
40072 }
40073
40074 return DAG.getNode(X86ISD::SHUF128, DL, ShuffleVT,
40075 CanonicalizeShuffleInput(ShuffleVT, Ops[0]),
40076 CanonicalizeShuffleInput(ShuffleVT, Ops[1]),
40077 getV4X86ShuffleImm8ForMask(PermMask, DL, DAG));
40078 };
40079
40080 // FIXME: Is there a better way to do this? is256BitLaneRepeatedShuffleMask
40081 // doesn't work because our mask is for 128 bits and we don't have an MVT
40082 // to match that.
40083 bool PreferPERMQ = UnaryShuffle && !isFreeToSplitVector(V1, DAG) &&
40084 isUndefOrInRange(ScaledMask[0], 0, 2) &&
40085 isUndefOrInRange(ScaledMask[1], 0, 2) &&
40086 isUndefOrInRange(ScaledMask[2], 2, 4) &&
40087 isUndefOrInRange(ScaledMask[3], 2, 4) &&
40088 (ScaledMask[0] < 0 || ScaledMask[2] < 0 ||
40089 ScaledMask[0] == (ScaledMask[2] % 2)) &&
40090 (ScaledMask[1] < 0 || ScaledMask[3] < 0 ||
40091 ScaledMask[1] == (ScaledMask[3] % 2));
40092
40093 if (!isAnyZero(ScaledMask) && !PreferPERMQ) {
40094 if (Depth == 0 && RootOpc == X86ISD::SHUF128)
40095 return SDValue(); // Nothing to do!
40096 MVT ShuffleVT = (FloatDomain ? MVT::v8f64 : MVT::v8i64);
40097 if (SDValue V = MatchSHUF128(ShuffleVT, DL, ScaledMask, V1, V2, DAG))
40098 return DAG.getBitcast(RootVT, V);
40099 }
40100 }
40101
40102 // Handle 128-bit lane shuffles of 256-bit vectors.
40103 if (RootVT.is256BitVector() && NumBaseMaskElts == 2) {
40104 // If the upper half is zeroable, then an extract+insert is more optimal
40105 // than using X86ISD::VPERM2X128. The insertion is free, even if it has to
40106 // zero the upper half.
40107 if (isUndefOrZero(Mask[1])) {
40108 if (Depth == 0 && RootOpc == ISD::INSERT_SUBVECTOR)
40109 return SDValue(); // Nothing to do!
40110 assert(isInRange(Mask[0], 0, 2) && "Unexpected lane shuffle");
40111 Res = CanonicalizeShuffleInput(RootVT, V1);
40112 Res = extract128BitVector(Res, Mask[0] * (NumRootElts / 2), DAG, DL);
40113 return widenSubVector(Res, Mask[1] == SM_SentinelZero, Subtarget, DAG, DL,
40114 256);
40115 }
40116
40117 // If we're inserting the low subvector, an insert-subvector 'concat'
40118 // pattern is quicker than VPERM2X128.
40119 if (BaseMask[0] == 0 && (BaseMask[1] == 0 || BaseMask[1] == 2) &&
40120 !Subtarget.hasAVX2()) {
40121 if (Depth == 0 && RootOpc == ISD::INSERT_SUBVECTOR)
40122 return SDValue(); // Nothing to do!
40123 SDValue Lo = CanonicalizeShuffleInput(RootVT, V1);
40124 SDValue Hi = CanonicalizeShuffleInput(RootVT, BaseMask[1] == 0 ? V1 : V2);
40125 Hi = extractSubVector(Hi, 0, DAG, DL, 128);
40126 return insertSubVector(Lo, Hi, NumRootElts / 2, DAG, DL, 128);
40127 }
40128
40129 // Don't lower to VPERM2X128 here if we have AVX2+, prefer to use
40130 // VPERMQ/VPERMPD for unary shuffles unless we need to use the zeroing
40131 // feature.
40132 // Prefer blends for sequential shuffles unless we are optimizing for size.
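// Each nibble of the VPERM2X128 immediate selects a 128-bit half of the
// concatenated sources; setting bit 3 (0x8) of a nibble zeroes that half.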
40133 if (UnaryShuffle &&
40134 !(Subtarget.hasAVX2() && isUndefOrInRange(Mask, 0, 2)) &&
40135 (OptForSize || !isSequentialOrUndefOrZeroInRange(Mask, 0, 2, 0))) {
40136 if (Depth == 0 && RootOpc == X86ISD::VPERM2X128)
40137 return SDValue(); // Nothing to do!
40138 unsigned PermMask = 0;
40139 PermMask |= ((Mask[0] < 0 ? 0x8 : (Mask[0] & 1)) << 0);
40140 PermMask |= ((Mask[1] < 0 ? 0x8 : (Mask[1] & 1)) << 4);
40141 return DAG.getNode(
40142 X86ISD::VPERM2X128, DL, RootVT, CanonicalizeShuffleInput(RootVT, V1),
40143 DAG.getUNDEF(RootVT), DAG.getTargetConstant(PermMask, DL, MVT::i8));
40144 }
40145
40146 if (Depth == 0 && RootOpc == X86ISD::SHUF128)
40147 return SDValue(); // Nothing to do!
40148
40149 // TODO - handle AVX512VL cases with X86ISD::SHUF128.
40150 if (!UnaryShuffle && !IsMaskedShuffle) {
40151 assert(llvm::all_of(Mask, [](int M) { return 0 <= M && M < 4; }) &&
40152 "Unexpected shuffle sentinel value");
40153 // Prefer blends to X86ISD::VPERM2X128.
40154 if (!((Mask[0] == 0 && Mask[1] == 3) || (Mask[0] == 2 && Mask[1] == 1))) {
40155 if (Depth == 0 && RootOpc == X86ISD::VPERM2X128)
40156 return SDValue(); // Nothing to do!
40157 unsigned PermMask = 0;
40158 PermMask |= ((Mask[0] & 3) << 0);
40159 PermMask |= ((Mask[1] & 3) << 4);
40160 SDValue LHS = isInRange(Mask[0], 0, 2) ? V1 : V2;
40161 SDValue RHS = isInRange(Mask[1], 0, 2) ? V1 : V2;
40162 return DAG.getNode(X86ISD::VPERM2X128, DL, RootVT,
40163 CanonicalizeShuffleInput(RootVT, LHS),
40164 CanonicalizeShuffleInput(RootVT, RHS),
40165 DAG.getTargetConstant(PermMask, DL, MVT::i8));
40166 }
40167 }
40168 }
40169
40170 // For masks that have been widened to 128-bit elements or more,
40171 // narrow back down to 64-bit elements.
40172 if (BaseMaskEltSizeInBits > 64) {
40173 assert((BaseMaskEltSizeInBits % 64) == 0 && "Illegal mask size");
40174 int MaskScale = BaseMaskEltSizeInBits / 64;
40175 SmallVector<int, 64> ScaledMask;
40176 narrowShuffleMaskElts(MaskScale, Mask, ScaledMask);
40177 Mask = std::move(ScaledMask);
40178 }
40179
40180 // For masked shuffles, we're trying to match the root width for better
40181 // writemask folding, attempt to scale the mask.
40182 // TODO - variable shuffles might need this to be widened again.
40183 if (IsMaskedShuffle && NumRootElts > Mask.size()) {
40184 assert((NumRootElts % Mask.size()) == 0 && "Illegal mask size");
40185 int MaskScale = NumRootElts / Mask.size();
40186 SmallVector<int, 64> ScaledMask;
40187 narrowShuffleMaskElts(MaskScale, Mask, ScaledMask);
40188 Mask = std::move(ScaledMask);
40189 }
40190
40191 unsigned NumMaskElts = Mask.size();
40192 unsigned MaskEltSizeInBits = RootSizeInBits / NumMaskElts;
40193 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
40194
40195 // Determine the effective mask value type.
40196 FloatDomain &= (32 <= MaskEltSizeInBits);
40197 MVT MaskVT = FloatDomain ? MVT::getFloatingPointVT(MaskEltSizeInBits)
40198 : MVT::getIntegerVT(MaskEltSizeInBits);
40199 MaskVT = MVT::getVectorVT(MaskVT, NumMaskElts);
40200
40201 // Only allow legal mask types.
40202 if (!TLI.isTypeLegal(MaskVT))
40203 return SDValue();
40204
40205 // Attempt to match the mask against known shuffle patterns.
40206 MVT ShuffleSrcVT, ShuffleVT;
40207 unsigned Shuffle, PermuteImm;
40208
40209 // Which shuffle domains are permitted?
40210 // Permit domain crossing at higher combine depths.
40211 // TODO: Should we indicate which domain is preferred if both are allowed?
40212 bool AllowFloatDomain = FloatDomain || (Depth >= 3);
40213 bool AllowIntDomain = (!FloatDomain || (Depth >= 3)) && Subtarget.hasSSE2() &&
40214 (!MaskVT.is256BitVector() || Subtarget.hasAVX2());
40215
40216 // Determine zeroable mask elements.
40217 APInt KnownUndef, KnownZero;
40218 resolveZeroablesFromTargetShuffle(Mask, KnownUndef, KnownZero);
40219 APInt Zeroable = KnownUndef | KnownZero;
40220
40221 if (UnaryShuffle) {
40222 // Attempt to match against broadcast-from-vector.
40223 // Limit AVX1 to cases where we're loading+broadcasting a scalar element.
40224 if ((Subtarget.hasAVX2() ||
40225 (Subtarget.hasAVX() && 32 <= MaskEltSizeInBits)) &&
40226 (!IsMaskedShuffle || NumRootElts == NumMaskElts)) {
40227 if (isUndefOrEqual(Mask, 0)) {
40228 if (V1.getValueType() == MaskVT &&
40229 V1.getOpcode() == ISD::SCALAR_TO_VECTOR &&
40230 X86::mayFoldLoad(V1.getOperand(0), Subtarget)) {
40231 if (Depth == 0 && RootOpc == X86ISD::VBROADCAST)
40232 return SDValue(); // Nothing to do!
40233 Res = V1.getOperand(0);
40234 Res = DAG.getNode(X86ISD::VBROADCAST, DL, MaskVT, Res);
40235 return DAG.getBitcast(RootVT, Res);
40236 }
40237 if (Subtarget.hasAVX2()) {
40238 if (Depth == 0 && RootOpc == X86ISD::VBROADCAST)
40239 return SDValue(); // Nothing to do!
40240 Res = CanonicalizeShuffleInput(MaskVT, V1);
40241 Res = DAG.getNode(X86ISD::VBROADCAST, DL, MaskVT, Res);
40242 return DAG.getBitcast(RootVT, Res);
40243 }
40244 }
40245 }
40246
40247 if (matchUnaryShuffle(MaskVT, Mask, AllowFloatDomain, AllowIntDomain, V1,
40248 DAG, Subtarget, Shuffle, ShuffleSrcVT, ShuffleVT) &&
40249 (!IsMaskedShuffle ||
40250 (NumRootElts == ShuffleVT.getVectorNumElements()))) {
40251 if (Depth == 0 && RootOpc == Shuffle)
40252 return SDValue(); // Nothing to do!
40253 Res = CanonicalizeShuffleInput(ShuffleSrcVT, V1);
40254 Res = DAG.getNode(Shuffle, DL, ShuffleVT, Res);
40255 return DAG.getBitcast(RootVT, Res);
40256 }
40257
40258 if (matchUnaryPermuteShuffle(MaskVT, Mask, Zeroable, AllowFloatDomain,
40259 AllowIntDomain, DAG, Subtarget, Shuffle, ShuffleVT,
40260 PermuteImm) &&
40261 (!IsMaskedShuffle ||
40262 (NumRootElts == ShuffleVT.getVectorNumElements()))) {
40263 if (Depth == 0 && RootOpc == Shuffle)
40264 return SDValue(); // Nothing to do!
40265 Res = CanonicalizeShuffleInput(ShuffleVT, V1);
40266 Res = DAG.getNode(Shuffle, DL, ShuffleVT, Res,
40267 DAG.getTargetConstant(PermuteImm, DL, MVT::i8));
40268 return DAG.getBitcast(RootVT, Res);
40269 }
40270 }
40271
40272 // Attempt to combine to INSERTPS, but only if the inserted element has come
40273 // from a scalar.
40274 // TODO: Handle other insertions here as well?
40275 if (!UnaryShuffle && AllowFloatDomain && RootSizeInBits == 128 &&
40276 Subtarget.hasSSE41() &&
40277 !isTargetShuffleEquivalent(MaskVT, Mask, {4, 1, 2, 3}, DAG)) {
40278 if (MaskEltSizeInBits == 32) {
40279 SDValue SrcV1 = V1, SrcV2 = V2;
40280 if (matchShuffleAsInsertPS(SrcV1, SrcV2, PermuteImm, Zeroable, Mask,
40281 DAG) &&
40282 SrcV2.getOpcode() == ISD::SCALAR_TO_VECTOR) {
40283 if (Depth == 0 && RootOpc == X86ISD::INSERTPS)
40284 return SDValue(); // Nothing to do!
40285 Res = DAG.getNode(X86ISD::INSERTPS, DL, MVT::v4f32,
40286 CanonicalizeShuffleInput(MVT::v4f32, SrcV1),
40287 CanonicalizeShuffleInput(MVT::v4f32, SrcV2),
40288 DAG.getTargetConstant(PermuteImm, DL, MVT::i8));
40289 return DAG.getBitcast(RootVT, Res);
40290 }
40291 }
40292 if (MaskEltSizeInBits == 64 &&
40293 isTargetShuffleEquivalent(MaskVT, Mask, {0, 2}, DAG) &&
40294 V2.getOpcode() == ISD::SCALAR_TO_VECTOR &&
40295 V2.getScalarValueSizeInBits() <= 32) {
40296 if (Depth == 0 && RootOpc == X86ISD::INSERTPS)
40297 return SDValue(); // Nothing to do!
40298 PermuteImm = (/*DstIdx*/ 2 << 4) | (/*SrcIdx*/ 0 << 0);
40299 Res = DAG.getNode(X86ISD::INSERTPS, DL, MVT::v4f32,
40300 CanonicalizeShuffleInput(MVT::v4f32, V1),
40301 CanonicalizeShuffleInput(MVT::v4f32, V2),
40302 DAG.getTargetConstant(PermuteImm, DL, MVT::i8));
40303 return DAG.getBitcast(RootVT, Res);
40304 }
40305 }
40306
40307 SDValue NewV1 = V1; // Save operands in case early exit happens.
40308 SDValue NewV2 = V2;
40309 if (matchBinaryShuffle(MaskVT, Mask, AllowFloatDomain, AllowIntDomain, NewV1,
40310 NewV2, DL, DAG, Subtarget, Shuffle, ShuffleSrcVT,
40311 ShuffleVT, UnaryShuffle) &&
40312 (!IsMaskedShuffle || (NumRootElts == ShuffleVT.getVectorNumElements()))) {
40313 if (Depth == 0 && RootOpc == Shuffle)
40314 return SDValue(); // Nothing to do!
40315 NewV1 = CanonicalizeShuffleInput(ShuffleSrcVT, NewV1);
40316 NewV2 = CanonicalizeShuffleInput(ShuffleSrcVT, NewV2);
40317 Res = DAG.getNode(Shuffle, DL, ShuffleVT, NewV1, NewV2);
40318 return DAG.getBitcast(RootVT, Res);
40319 }
40320
40321 NewV1 = V1; // Save operands in case early exit happens.
40322 NewV2 = V2;
40323 if (matchBinaryPermuteShuffle(MaskVT, Mask, Zeroable, AllowFloatDomain,
40324 AllowIntDomain, NewV1, NewV2, DL, DAG,
40325 Subtarget, Shuffle, ShuffleVT, PermuteImm) &&
40326 (!IsMaskedShuffle || (NumRootElts == ShuffleVT.getVectorNumElements()))) {
40327 if (Depth == 0 && RootOpc == Shuffle)
40328 return SDValue(); // Nothing to do!
40329 NewV1 = CanonicalizeShuffleInput(ShuffleVT, NewV1);
40330 NewV2 = CanonicalizeShuffleInput(ShuffleVT, NewV2);
40331 Res = DAG.getNode(Shuffle, DL, ShuffleVT, NewV1, NewV2,
40332 DAG.getTargetConstant(PermuteImm, DL, MVT::i8));
40333 return DAG.getBitcast(RootVT, Res);
40334 }
40335
40336 // Typically from here on, we need an integer version of MaskVT.
40337 MVT IntMaskVT = MVT::getIntegerVT(MaskEltSizeInBits);
40338 IntMaskVT = MVT::getVectorVT(IntMaskVT, NumMaskElts);
40339
40340 // Annoyingly, SSE4A instructions don't map into the above match helpers.
40341 if (Subtarget.hasSSE4A() && AllowIntDomain && RootSizeInBits == 128) {
40342 uint64_t BitLen, BitIdx;
40343 if (matchShuffleAsEXTRQ(IntMaskVT, V1, V2, Mask, BitLen, BitIdx,
40344 Zeroable)) {
40345 if (Depth == 0 && RootOpc == X86ISD::EXTRQI)
40346 return SDValue(); // Nothing to do!
40347 V1 = CanonicalizeShuffleInput(IntMaskVT, V1);
40348 Res = DAG.getNode(X86ISD::EXTRQI, DL, IntMaskVT, V1,
40349 DAG.getTargetConstant(BitLen, DL, MVT::i8),
40350 DAG.getTargetConstant(BitIdx, DL, MVT::i8));
40351 return DAG.getBitcast(RootVT, Res);
40352 }
40353
40354 if (matchShuffleAsINSERTQ(IntMaskVT, V1, V2, Mask, BitLen, BitIdx)) {
40355 if (Depth == 0 && RootOpc == X86ISD::INSERTQI)
40356 return SDValue(); // Nothing to do!
40357 V1 = CanonicalizeShuffleInput(IntMaskVT, V1);
40358 V2 = CanonicalizeShuffleInput(IntMaskVT, V2);
40359 Res = DAG.getNode(X86ISD::INSERTQI, DL, IntMaskVT, V1, V2,
40360 DAG.getTargetConstant(BitLen, DL, MVT::i8),
40361 DAG.getTargetConstant(BitIdx, DL, MVT::i8));
40362 return DAG.getBitcast(RootVT, Res);
40363 }
40364 }
40365
40366 // Match shuffle against TRUNCATE patterns.
40367 if (AllowIntDomain && MaskEltSizeInBits < 64 && Subtarget.hasAVX512()) {
40368 // Match against a VTRUNC instruction, accounting for src/dst sizes.
40369 if (matchShuffleAsVTRUNC(ShuffleSrcVT, ShuffleVT, IntMaskVT, Mask, Zeroable,
40370 Subtarget)) {
40371 bool IsTRUNCATE = ShuffleVT.getVectorNumElements() ==
40372 ShuffleSrcVT.getVectorNumElements();
40373 unsigned Opc =
40374 IsTRUNCATE ? (unsigned)ISD::TRUNCATE : (unsigned)X86ISD::VTRUNC;
40375 if (Depth == 0 && RootOpc == Opc)
40376 return SDValue(); // Nothing to do!
40377 V1 = CanonicalizeShuffleInput(ShuffleSrcVT, V1);
40378 Res = DAG.getNode(Opc, DL, ShuffleVT, V1);
40379 if (ShuffleVT.getSizeInBits() < RootSizeInBits)
40380 Res = widenSubVector(Res, true, Subtarget, DAG, DL, RootSizeInBits);
40381 return DAG.getBitcast(RootVT, Res);
40382 }
40383
40384 // Do we need a more general binary truncation pattern?
40385 if (RootSizeInBits < 512 &&
40386 ((RootVT.is256BitVector() && Subtarget.useAVX512Regs()) ||
40387 (RootVT.is128BitVector() && Subtarget.hasVLX())) &&
40388 (MaskEltSizeInBits > 8 || Subtarget.hasBWI()) &&
40389 isSequentialOrUndefInRange(Mask, 0, NumMaskElts, 0, 2)) {
40390 // Bail if this was already a truncation or PACK node.
40391 // We sometimes fail to match PACK if we demand known undef elements.
40392 if (Depth == 0 &&
40393 (RootOpc == ISD::TRUNCATE || RootOpc == X86ISD::PACKSS ||
40394 RootOpc == X86ISD::PACKUS))
40395 return SDValue(); // Nothing to do!
40396 ShuffleSrcVT = MVT::getIntegerVT(MaskEltSizeInBits * 2);
40397 ShuffleSrcVT = MVT::getVectorVT(ShuffleSrcVT, NumMaskElts / 2);
40398 V1 = CanonicalizeShuffleInput(ShuffleSrcVT, V1);
40399 V2 = CanonicalizeShuffleInput(ShuffleSrcVT, V2);
40400 ShuffleSrcVT = MVT::getIntegerVT(MaskEltSizeInBits * 2);
40401 ShuffleSrcVT = MVT::getVectorVT(ShuffleSrcVT, NumMaskElts);
40402 Res = DAG.getNode(ISD::CONCAT_VECTORS, DL, ShuffleSrcVT, V1, V2);
40403 Res = DAG.getNode(ISD::TRUNCATE, DL, IntMaskVT, Res);
40404 return DAG.getBitcast(RootVT, Res);
40405 }
40406 }
40407
40408 // Don't try to re-form single instruction chains under any circumstances now
40409 // that we've done encoding canonicalization for them.
40410 if (Depth < 1)
40411 return SDValue();
40412
40413 int NumVariableMasks = llvm::count_if(SrcNodes, [](const SDNode *N) {
40414 return isTargetShuffleVariableMask(N->getOpcode());
40415 });
40416 bool HasSlowVariableMask = llvm::any_of(SrcNodes, [](const SDNode *N) {
40417 return (N->getOpcode() == X86ISD::VPERMV3 ||
40418 N->getOpcode() == X86ISD::VPERMV);
40419 });
40420
40421 // Depth threshold above which we can efficiently use variable mask shuffles.
40422 int VariableCrossLaneShuffleDepth =
40423 Subtarget.hasFastVariableCrossLaneShuffle() ? 1 : 2;
40424 int VariablePerLaneShuffleDepth =
40425 Subtarget.hasFastVariablePerLaneShuffle() ? 1 : 2;
40426 AllowVariableCrossLaneMask &=
40427 (Depth >= VariableCrossLaneShuffleDepth) || NumVariableMasks;
40428 AllowVariablePerLaneMask &=
40429 (Depth >= VariablePerLaneShuffleDepth) || NumVariableMasks;
40430 // VPERMI2W/VPERMI2B are 3 uops on Skylake and Icelake so we require a
40431 // higher depth before combining them.
40432 int BWIVPERMV3ShuffleDepth =
40433 VariableCrossLaneShuffleDepth + 2 - NumVariableMasks;
40434 bool AllowBWIVPERMV3 =
40435 (Depth >= BWIVPERMV3ShuffleDepth || HasSlowVariableMask);
40436
40437 // If root was a VPERMV/VPERMV3 node, always allow a variable shuffle.
40438 if ((UnaryShuffle && RootOpc == X86ISD::VPERMV) || RootOpc == X86ISD::VPERMV3)
40439 AllowVariableCrossLaneMask = AllowVariablePerLaneMask = true;
40440
40441 bool MaskContainsZeros = isAnyZero(Mask);
40442
40443 if (is128BitLaneCrossingShuffleMask(MaskVT, Mask)) {
40444 // If we have a single input lane-crossing shuffle then lower to VPERMV.
40445 if (UnaryShuffle && AllowVariableCrossLaneMask && !MaskContainsZeros) {
40446 if (Subtarget.hasAVX2() &&
40447 (MaskVT == MVT::v8f32 || MaskVT == MVT::v8i32)) {
40448 SDValue VPermMask = getConstVector(Mask, IntMaskVT, DAG, DL, true);
40449 Res = CanonicalizeShuffleInput(MaskVT, V1);
40450 Res = DAG.getNode(X86ISD::VPERMV, DL, MaskVT, VPermMask, Res);
40451 return DAG.getBitcast(RootVT, Res);
40452 }
40453 // AVX512 variants (non-VLX will pad to 512-bit shuffles).
40454 if ((Subtarget.hasAVX512() &&
40455 (MaskVT == MVT::v8f64 || MaskVT == MVT::v8i64 ||
40456 MaskVT == MVT::v16f32 || MaskVT == MVT::v16i32)) ||
40457 (Subtarget.hasBWI() &&
40458 (MaskVT == MVT::v16i16 || MaskVT == MVT::v32i16)) ||
40459 (Subtarget.hasVBMI() &&
40460 (MaskVT == MVT::v32i8 || MaskVT == MVT::v64i8))) {
40461 V1 = CanonicalizeShuffleInput(MaskVT, V1);
40462 V2 = DAG.getUNDEF(MaskVT);
40463 Res = lowerShuffleWithPERMV(DL, MaskVT, Mask, V1, V2, Subtarget, DAG);
40464 return DAG.getBitcast(RootVT, Res);
40465 }
40466 }
40467
40468 // Lower a unary+zero lane-crossing shuffle as VPERMV3 with a zero
40469 // vector as the second source (non-VLX will pad to 512-bit shuffles).
40470 if (UnaryShuffle && AllowVariableCrossLaneMask &&
40471 ((Subtarget.hasAVX512() &&
40472 (MaskVT == MVT::v8f64 || MaskVT == MVT::v8i64 ||
40473 MaskVT == MVT::v4f64 || MaskVT == MVT::v4i64 ||
40474 MaskVT == MVT::v8f32 || MaskVT == MVT::v8i32 ||
40475 MaskVT == MVT::v16f32 || MaskVT == MVT::v16i32)) ||
40476 (Subtarget.hasBWI() && AllowBWIVPERMV3 &&
40477 (MaskVT == MVT::v16i16 || MaskVT == MVT::v32i16)) ||
40478 (Subtarget.hasVBMI() && AllowBWIVPERMV3 &&
40479 (MaskVT == MVT::v32i8 || MaskVT == MVT::v64i8)))) {
40480 // Adjust shuffle mask - replace SM_SentinelZero with second source index.
40481 for (unsigned i = 0; i != NumMaskElts; ++i)
40482 if (Mask[i] == SM_SentinelZero)
40483 Mask[i] = NumMaskElts + i;
40484 V1 = CanonicalizeShuffleInput(MaskVT, V1);
40485 V2 = getZeroVector(MaskVT, Subtarget, DAG, DL);
40486 Res = lowerShuffleWithPERMV(DL, MaskVT, Mask, V1, V2, Subtarget, DAG);
40487 return DAG.getBitcast(RootVT, Res);
40488 }
40489
40490 // If that failed and either input is extracted then try to combine as a
40491 // shuffle with the larger type.
40492 if (SDValue WideShuffle = combineX86ShuffleChainWithExtract(
40493 Inputs, RootOpc, RootVT, BaseMask, Depth, SrcNodes,
40494 AllowVariableCrossLaneMask, AllowVariablePerLaneMask,
40495 IsMaskedShuffle, DAG, DL, Subtarget))
40496 return WideShuffle;
40497
40498 // If we have a dual input lane-crossing shuffle then lower to VPERMV3,
40499 // (non-VLX will pad to 512-bit shuffles).
40500 if (AllowVariableCrossLaneMask && !MaskContainsZeros &&
40501 ((Subtarget.hasAVX512() &&
40502 (MaskVT == MVT::v8f64 || MaskVT == MVT::v8i64 ||
40503 MaskVT == MVT::v4f64 || MaskVT == MVT::v4i64 ||
40504 MaskVT == MVT::v16f32 || MaskVT == MVT::v16i32 ||
40505 MaskVT == MVT::v8f32 || MaskVT == MVT::v8i32)) ||
40506 (Subtarget.hasBWI() && AllowBWIVPERMV3 &&
40507 (MaskVT == MVT::v16i16 || MaskVT == MVT::v32i16)) ||
40508 (Subtarget.hasVBMI() && AllowBWIVPERMV3 &&
40509 (MaskVT == MVT::v32i8 || MaskVT == MVT::v64i8)))) {
40510 V1 = CanonicalizeShuffleInput(MaskVT, V1);
40511 V2 = CanonicalizeShuffleInput(MaskVT, V2);
40512 Res = lowerShuffleWithPERMV(DL, MaskVT, Mask, V1, V2, Subtarget, DAG);
40513 return DAG.getBitcast(RootVT, Res);
40514 }
40515 return SDValue();
40516 }
40517
40518 // See if we can combine a single input shuffle with zeros to a bit-mask,
40519 // which is much simpler than any shuffle.
40520 if (UnaryShuffle && MaskContainsZeros && AllowVariablePerLaneMask &&
40521 isSequentialOrUndefOrZeroInRange(Mask, 0, NumMaskElts, 0) &&
40522 TLI.isTypeLegal(MaskVT)) {
40523 APInt Zero = APInt::getZero(MaskEltSizeInBits);
40524 APInt AllOnes = APInt::getAllOnes(MaskEltSizeInBits);
40525 APInt UndefElts(NumMaskElts, 0);
40526 SmallVector<APInt, 64> EltBits(NumMaskElts, Zero);
40527 for (unsigned i = 0; i != NumMaskElts; ++i) {
40528 int M = Mask[i];
40529 if (M == SM_SentinelUndef) {
40530 UndefElts.setBit(i);
40531 continue;
40532 }
40533 if (M == SM_SentinelZero)
40534 continue;
40535 EltBits[i] = AllOnes;
40536 }
40537 SDValue BitMask = getConstVector(EltBits, UndefElts, MaskVT, DAG, DL);
40538 Res = CanonicalizeShuffleInput(MaskVT, V1);
40539 unsigned AndOpcode =
40540 MaskVT.isFloatingPoint() ? unsigned(X86ISD::FAND) : unsigned(ISD::AND);
40541 Res = DAG.getNode(AndOpcode, DL, MaskVT, Res, BitMask);
40542 return DAG.getBitcast(RootVT, Res);
40543 }
40544
40545 // If we have a single input shuffle with different shuffle patterns in
40546 // the 128-bit lanes, lower to VPERMILPS with a variable mask.
40547 // TODO: Combine other mask types at higher depths.
40548 if (UnaryShuffle && AllowVariablePerLaneMask && !MaskContainsZeros &&
40549 ((MaskVT == MVT::v8f32 && Subtarget.hasAVX()) ||
40550 (MaskVT == MVT::v16f32 && Subtarget.hasAVX512()))) {
40551 SmallVector<SDValue, 16> VPermIdx;
40552 for (int M : Mask) {
40553 SDValue Idx =
40554 M < 0 ? DAG.getUNDEF(MVT::i32) : DAG.getConstant(M % 4, DL, MVT::i32);
40555 VPermIdx.push_back(Idx);
40556 }
40557 SDValue VPermMask = DAG.getBuildVector(IntMaskVT, DL, VPermIdx);
40558 Res = CanonicalizeShuffleInput(MaskVT, V1);
40559 Res = DAG.getNode(X86ISD::VPERMILPV, DL, MaskVT, Res, VPermMask);
40560 return DAG.getBitcast(RootVT, Res);
40561 }
40562
40563 // With XOP, binary shuffles of 128/256-bit floating point vectors can combine
40564 // to VPERMIL2PD/VPERMIL2PS.
40565 if (AllowVariablePerLaneMask && Subtarget.hasXOP() &&
40566 (MaskVT == MVT::v2f64 || MaskVT == MVT::v4f64 || MaskVT == MVT::v4f32 ||
40567 MaskVT == MVT::v8f32)) {
40568 // VPERMIL2 Operation.
40569 // Bits[3] - Match Bit.
40570 // Bits[2:1] - (Per Lane) PD Shuffle Mask.
40571 // Bits[2:0] - (Per Lane) PS Shuffle Mask.
40572 unsigned NumLanes = MaskVT.getSizeInBits() / 128;
40573 unsigned NumEltsPerLane = NumMaskElts / NumLanes;
40574 SmallVector<int, 8> VPerm2Idx;
40575 unsigned M2ZImm = 0;
40576 for (int M : Mask) {
40577 if (M == SM_SentinelUndef) {
40578 VPerm2Idx.push_back(-1);
40579 continue;
40580 }
40581 if (M == SM_SentinelZero) {
40582 M2ZImm = 2;
40583 VPerm2Idx.push_back(8);
40584 continue;
40585 }
40586 int Index = (M % NumEltsPerLane) + ((M / NumMaskElts) * NumEltsPerLane);
40587 Index = (MaskVT.getScalarSizeInBits() == 64 ? Index << 1 : Index);
40588 VPerm2Idx.push_back(Index);
40589 }
40590 V1 = CanonicalizeShuffleInput(MaskVT, V1);
40591 V2 = CanonicalizeShuffleInput(MaskVT, V2);
40592 SDValue VPerm2MaskOp = getConstVector(VPerm2Idx, IntMaskVT, DAG, DL, true);
40593 Res = DAG.getNode(X86ISD::VPERMIL2, DL, MaskVT, V1, V2, VPerm2MaskOp,
40594 DAG.getTargetConstant(M2ZImm, DL, MVT::i8));
40595 return DAG.getBitcast(RootVT, Res);
40596 }
40597
40598 // If we have 3 or more shuffle instructions or a chain involving a variable
40599 // mask, we can replace them with a single PSHUFB instruction profitably.
40600 // Intel's manuals suggest only using PSHUFB if doing so replaces 5
40601 // instructions, but in practice PSHUFB tends to be *very* fast so we're
40602 // more aggressive.
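// (Each PSHUFB mask byte either has its top bit set (the 0x80 below), which
// zeroes the destination byte, or uses its low 4 bits to select a byte from
// the same 16-byte lane of the source.)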
40603 if (UnaryShuffle && AllowVariablePerLaneMask &&
40604 ((RootVT.is128BitVector() && Subtarget.hasSSSE3()) ||
40605 (RootVT.is256BitVector() && Subtarget.hasAVX2()) ||
40606 (RootVT.is512BitVector() && Subtarget.hasBWI()))) {
40607 SmallVector<SDValue, 16> PSHUFBMask;
40608 int NumBytes = RootVT.getSizeInBits() / 8;
40609 int Ratio = NumBytes / NumMaskElts;
40610 for (int i = 0; i < NumBytes; ++i) {
40611 int M = Mask[i / Ratio];
40612 if (M == SM_SentinelUndef) {
40613 PSHUFBMask.push_back(DAG.getUNDEF(MVT::i8));
40614 continue;
40615 }
40616 if (M == SM_SentinelZero) {
40617 PSHUFBMask.push_back(DAG.getConstant(0x80, DL, MVT::i8));
40618 continue;
40619 }
40620 M = Ratio * M + i % Ratio;
40621 assert((M / 16) == (i / 16) && "Lane crossing detected");
40622 PSHUFBMask.push_back(DAG.getConstant(M, DL, MVT::i8));
40623 }
40624 MVT ByteVT = MVT::getVectorVT(MVT::i8, NumBytes);
40625 Res = CanonicalizeShuffleInput(ByteVT, V1);
40626 SDValue PSHUFBMaskOp = DAG.getBuildVector(ByteVT, DL, PSHUFBMask);
40627 Res = DAG.getNode(X86ISD::PSHUFB, DL, ByteVT, Res, PSHUFBMaskOp);
40628 return DAG.getBitcast(RootVT, Res);
40629 }
40630
40631 // With XOP, if we have a 128-bit binary input shuffle we can always combine
40632 // to VPPERM. We match the depth requirement of PSHUFB - VPPERM is never
40633 // slower than PSHUFB on targets that support both.
40634 if (AllowVariablePerLaneMask && RootVT.is128BitVector() &&
40635 Subtarget.hasXOP()) {
40636 // VPPERM Mask Operation
40637 // Bits[4:0] - Byte Index (0 - 31)
40638 // Bits[7:5] - Permute Operation (0 - Source byte, 4 - ZERO)
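// (For example, a mask byte of 0x03 selects byte 3 of V1, 0x13 selects byte 3
// of V2, and 0x80 (permute operation 4) zeroes the output byte.)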
40639 SmallVector<SDValue, 16> VPPERMMask;
40640 int NumBytes = 16;
40641 int Ratio = NumBytes / NumMaskElts;
40642 for (int i = 0; i < NumBytes; ++i) {
40643 int M = Mask[i / Ratio];
40644 if (M == SM_SentinelUndef) {
40645 VPPERMMask.push_back(DAG.getUNDEF(MVT::i8));
40646 continue;
40647 }
40648 if (M == SM_SentinelZero) {
40649 VPPERMMask.push_back(DAG.getConstant(0x80, DL, MVT::i8));
40650 continue;
40651 }
40652 M = Ratio * M + i % Ratio;
40653 VPPERMMask.push_back(DAG.getConstant(M, DL, MVT::i8));
40654 }
40655 MVT ByteVT = MVT::v16i8;
40656 V1 = CanonicalizeShuffleInput(ByteVT, V1);
40657 V2 = CanonicalizeShuffleInput(ByteVT, V2);
40658 SDValue VPPERMMaskOp = DAG.getBuildVector(ByteVT, DL, VPPERMMask);
40659 Res = DAG.getNode(X86ISD::VPPERM, DL, ByteVT, V1, V2, VPPERMMaskOp);
40660 return DAG.getBitcast(RootVT, Res);
40661 }
40662
40663 // If that failed and either input is extracted then try to combine as a
40664 // shuffle with the larger type.
40665 if (SDValue WideShuffle = combineX86ShuffleChainWithExtract(
40666 Inputs, RootOpc, RootVT, BaseMask, Depth, SrcNodes,
40667 AllowVariableCrossLaneMask, AllowVariablePerLaneMask, IsMaskedShuffle,
40668 DAG, DL, Subtarget))
40669 return WideShuffle;
40670
40671 // If we have a dual input shuffle then lower to VPERMV3,
40672 // (non-VLX will pad to 512-bit shuffles)
40673 if (!UnaryShuffle && AllowVariablePerLaneMask && !MaskContainsZeros &&
40674 ((Subtarget.hasAVX512() &&
40675 (MaskVT == MVT::v2f64 || MaskVT == MVT::v4f64 || MaskVT == MVT::v8f64 ||
40676 MaskVT == MVT::v2i64 || MaskVT == MVT::v4i64 || MaskVT == MVT::v8i64 ||
40677 MaskVT == MVT::v4f32 || MaskVT == MVT::v4i32 || MaskVT == MVT::v8f32 ||
40678 MaskVT == MVT::v8i32 || MaskVT == MVT::v16f32 ||
40679 MaskVT == MVT::v16i32)) ||
40680 (Subtarget.hasBWI() && AllowBWIVPERMV3 &&
40681 (MaskVT == MVT::v8i16 || MaskVT == MVT::v16i16 ||
40682 MaskVT == MVT::v32i16)) ||
40683 (Subtarget.hasVBMI() && AllowBWIVPERMV3 &&
40684 (MaskVT == MVT::v16i8 || MaskVT == MVT::v32i8 ||
40685 MaskVT == MVT::v64i8)))) {
40686 V1 = CanonicalizeShuffleInput(MaskVT, V1);
40687 V2 = CanonicalizeShuffleInput(MaskVT, V2);
40688 Res = lowerShuffleWithPERMV(DL, MaskVT, Mask, V1, V2, Subtarget, DAG);
40689 return DAG.getBitcast(RootVT, Res);
40690 }
40691
40692 // Failed to find any combines.
40693 return SDValue();
40694}
40695
40696// Combine an arbitrary chain of shuffles + extract_subvectors into a single
40697// instruction if possible.
40698//
40699// Wrapper for combineX86ShuffleChain that extends the shuffle mask to a larger
40700// type size to attempt to combine:
40701// shuffle(extract_subvector(x,c1),extract_subvector(y,c2),m1)
40702// -->
40703// extract_subvector(shuffle(x,y,m2),0)
40704 static SDValue combineX86ShuffleChainWithExtract(
40705 ArrayRef<SDValue> Inputs, unsigned RootOpcode, MVT RootVT,
40706 ArrayRef<int> BaseMask, int Depth, ArrayRef<const SDNode *> SrcNodes,
40707 bool AllowVariableCrossLaneMask, bool AllowVariablePerLaneMask,
40708 bool IsMaskedShuffle, SelectionDAG &DAG, const SDLoc &DL,
40709 const X86Subtarget &Subtarget) {
40710 unsigned NumMaskElts = BaseMask.size();
40711 unsigned NumInputs = Inputs.size();
40712 if (NumInputs == 0)
40713 return SDValue();
40714
40715 unsigned RootSizeInBits = RootVT.getSizeInBits();
40716 unsigned RootEltSizeInBits = RootSizeInBits / NumMaskElts;
40717 assert((RootSizeInBits % NumMaskElts) == 0 && "Unexpected root shuffle mask");
40718
40719 // Peek through subvectors to find widest legal vector.
40720 // TODO: Handle ISD::TRUNCATE
40721 unsigned WideSizeInBits = RootSizeInBits;
40722 for (SDValue Input : Inputs) {
40723 Input = peekThroughBitcasts(Input);
40724 while (1) {
40725 if (Input.getOpcode() == ISD::EXTRACT_SUBVECTOR) {
40726 Input = peekThroughBitcasts(Input.getOperand(0));
40727 continue;
40728 }
40729 if (Input.getOpcode() == ISD::INSERT_SUBVECTOR &&
40730 Input.getOperand(0).isUndef() &&
40731 isNullConstant(Input.getOperand(2))) {
40732 Input = peekThroughBitcasts(Input.getOperand(1));
40733 continue;
40734 }
40735 break;
40736 }
40737 if (DAG.getTargetLoweringInfo().isTypeLegal(Input.getValueType()) &&
40738 WideSizeInBits < Input.getValueSizeInBits())
40739 WideSizeInBits = Input.getValueSizeInBits();
40740 }
40741
40742 // Bail if we fail to find a source larger than the existing root.
40743 if (WideSizeInBits <= RootSizeInBits ||
40744 (WideSizeInBits % RootSizeInBits) != 0)
40745 return SDValue();
40746
40747 // Create new mask for larger type.
40748 SmallVector<int, 64> WideMask;
40749 growShuffleMask(BaseMask, WideMask, RootSizeInBits, WideSizeInBits);
40750
40751 // Attempt to peek through inputs and adjust mask when we extract from an
40752 // upper subvector.
40753 int AdjustedMasks = 0;
40754 SmallVector<SDValue, 4> WideInputs(Inputs);
40755 for (unsigned I = 0; I != NumInputs; ++I) {
40756 SDValue &Input = WideInputs[I];
40757 Input = peekThroughBitcasts(Input);
40758 while (1) {
40759 if (Input.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
40760 Input.getOperand(0).getValueSizeInBits() <= WideSizeInBits) {
40761 uint64_t Idx = Input.getConstantOperandVal(1);
40762 if (Idx != 0) {
40763 ++AdjustedMasks;
40764 unsigned InputEltSizeInBits = Input.getScalarValueSizeInBits();
40765 Idx = (Idx * InputEltSizeInBits) / RootEltSizeInBits;
40766
40767 int lo = I * WideMask.size();
40768 int hi = (I + 1) * WideMask.size();
40769 for (int &M : WideMask)
40770 if (lo <= M && M < hi)
40771 M += Idx;
40772 }
40773 Input = peekThroughBitcasts(Input.getOperand(0));
40774 continue;
40775 }
40776 // TODO: Handle insertions into upper subvectors.
40777 if (Input.getOpcode() == ISD::INSERT_SUBVECTOR &&
40778 Input.getOperand(0).isUndef() &&
40779 isNullConstant(Input.getOperand(2))) {
40780 Input = peekThroughBitcasts(Input.getOperand(1));
40781 continue;
40782 }
40783 break;
40784 }
40785 }
40786
40787 // Remove unused/repeated shuffle source ops.
40788 resolveTargetShuffleInputsAndMask(WideInputs, WideMask);
40789 assert(!WideInputs.empty() && "Shuffle with no inputs detected");
40790
40791 // Bail if we're always extracting from the lowest subvectors
40792 // (combineX86ShuffleChain should match this for the current width), or if
40793 // the shuffle still references too many inputs.
40794 if (AdjustedMasks == 0 || WideInputs.size() > 2)
40795 return SDValue();
40796
40797 // Minor canonicalization of the accumulated shuffle mask to make it easier
40798 // to match below. All this does is detect masks with sequential pairs of
40799 // elements, and shrink them to the half-width mask. It does this in a loop
40800 // so it will reduce the size of the mask to the minimal width mask which
40801 // performs an equivalent shuffle.
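// For example, the mask {0,1,4,5,2,3,6,7} shrinks to {0,2,1,3} on the first
// iteration and then stops, since {0,2,1,3} has no sequential pairs left.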
40802 while (WideMask.size() > 1) {
40803 SmallVector<int, 64> WidenedMask;
40804 if (!canWidenShuffleElements(WideMask, WidenedMask))
40805 break;
40806 WideMask = std::move(WidenedMask);
40807 }
40808
40809 // Canonicalization of binary shuffle masks to improve pattern matching by
40810 // commuting the inputs.
40811 if (WideInputs.size() == 2 && canonicalizeShuffleMaskWithCommute(WideMask)) {
40812 ShuffleVectorSDNode::commuteMask(WideMask);
40813 std::swap(WideInputs[0], WideInputs[1]);
40814 }
40815
40816 // Increase depth for every upper subvector we've peeked through.
40817 Depth += AdjustedMasks;
40818
40819 // Attempt to combine wider chain.
40820 // TODO: Can we use a better Root?
40821 SDValue WideRoot = WideInputs.front().getValueSizeInBits() >
40822 WideInputs.back().getValueSizeInBits()
40823 ? WideInputs.front()
40824 : WideInputs.back();
40825 assert(WideRoot.getValueSizeInBits() == WideSizeInBits &&
40826 "WideRootSize mismatch");
40827
40828 if (SDValue WideShuffle = combineX86ShuffleChain(
40829 WideInputs, RootOpcode, WideRoot.getSimpleValueType(), WideMask,
40830 Depth, SrcNodes, AllowVariableCrossLaneMask, AllowVariablePerLaneMask,
40831 IsMaskedShuffle, DAG, SDLoc(WideRoot), Subtarget)) {
40832 WideShuffle = extractSubVector(WideShuffle, 0, DAG, DL, RootSizeInBits);
40833 return DAG.getBitcast(RootVT, WideShuffle);
40834 }
40835
40836 return SDValue();
40837}
40838
40839// Canonicalize the combined shuffle mask chain with horizontal ops.
40840// NOTE: This may update the Ops and Mask.
40841 static SDValue canonicalizeShuffleMaskWithHorizOp(
40842 MutableArrayRef<SDValue> Ops, MutableArrayRef<int> Mask,
40843 unsigned RootSizeInBits, const SDLoc &DL, SelectionDAG &DAG,
40844 const X86Subtarget &Subtarget) {
40845 if (Mask.empty() || Ops.empty())
40846 return SDValue();
40847
40848 SmallVector<SDValue> BC;
40849 for (SDValue Op : Ops)
40850 BC.push_back(peekThroughBitcasts(Op));
40851
40852 // All ops must be the same horizop + type.
40853 SDValue BC0 = BC[0];
40854 EVT VT0 = BC0.getValueType();
40855 unsigned Opcode0 = BC0.getOpcode();
40856 if (VT0.getSizeInBits() != RootSizeInBits || llvm::any_of(BC, [&](SDValue V) {
40857 return V.getOpcode() != Opcode0 || V.getValueType() != VT0;
40858 }))
40859 return SDValue();
40860
40861 bool isHoriz = (Opcode0 == X86ISD::FHADD || Opcode0 == X86ISD::HADD ||
40862 Opcode0 == X86ISD::FHSUB || Opcode0 == X86ISD::HSUB);
40863 bool isPack = (Opcode0 == X86ISD::PACKSS || Opcode0 == X86ISD::PACKUS);
40864 if (!isHoriz && !isPack)
40865 return SDValue();
40866
40867 // Do all ops have a single use?
40868 bool OneUseOps = llvm::all_of(Ops, [](SDValue Op) {
40869 return Op.hasOneUse() &&
40870 peekThroughBitcasts(Op) == peekThroughOneUseBitcasts(Op);
40871 });
40872
40873 int NumElts = VT0.getVectorNumElements();
40874 int NumLanes = VT0.getSizeInBits() / 128;
40875 int NumEltsPerLane = NumElts / NumLanes;
40876 int NumHalfEltsPerLane = NumEltsPerLane / 2;
40877 MVT SrcVT = BC0.getOperand(0).getSimpleValueType();
40878 unsigned EltSizeInBits = RootSizeInBits / Mask.size();
40879
40880 if (NumEltsPerLane >= 4 &&
40881 (isPack || shouldUseHorizontalOp(Ops.size() == 1, DAG, Subtarget))) {
40882 SmallVector<int> LaneMask, ScaledMask;
40883 if (isRepeatedTargetShuffleMask(128, EltSizeInBits, Mask, LaneMask) &&
40884 scaleShuffleElements(LaneMask, 4, ScaledMask)) {
40885 // See if we can remove the shuffle by resorting the HOP chain so that
40886 // the HOP args are pre-shuffled.
40887 // TODO: Generalize to any sized/depth chain.
40888 // TODO: Add support for PACKSS/PACKUS.
40889 if (isHoriz) {
40890 // Attempt to find a HOP(HOP(X,Y),HOP(Z,W)) source operand.
40891 auto GetHOpSrc = [&](int M) {
40892 if (M == SM_SentinelUndef)
40893 return DAG.getUNDEF(VT0);
40894 if (M == SM_SentinelZero)
40895 return getZeroVector(VT0.getSimpleVT(), Subtarget, DAG, DL);
40896 SDValue Src0 = BC[M / 4];
40897 SDValue Src1 = Src0.getOperand((M % 4) >= 2);
40898 if (Src1.getOpcode() == Opcode0 && Src0->isOnlyUserOf(Src1.getNode()))
40899 return Src1.getOperand(M % 2);
40900 return SDValue();
40901 };
40902 SDValue M0 = GetHOpSrc(ScaledMask[0]);
40903 SDValue M1 = GetHOpSrc(ScaledMask[1]);
40904 SDValue M2 = GetHOpSrc(ScaledMask[2]);
40905 SDValue M3 = GetHOpSrc(ScaledMask[3]);
40906 if (M0 && M1 && M2 && M3) {
40907 SDValue LHS = DAG.getNode(Opcode0, DL, SrcVT, M0, M1);
40908 SDValue RHS = DAG.getNode(Opcode0, DL, SrcVT, M2, M3);
40909 return DAG.getNode(Opcode0, DL, VT0, LHS, RHS);
40910 }
40911 }
40912 // shuffle(hop(x,y),hop(z,w)) -> permute(hop(x,z)) etc.
40913 if (Ops.size() >= 2) {
40914 SDValue LHS, RHS;
40915 auto GetHOpSrc = [&](int M, int &OutM) {
40916 // TODO: Support SM_SentinelZero
40917 if (M < 0)
40918 return M == SM_SentinelUndef;
40919 SDValue Src = BC[M / 4].getOperand((M % 4) >= 2);
40920 if (!LHS || LHS == Src) {
40921 LHS = Src;
40922 OutM = (M % 2);
40923 return true;
40924 }
40925 if (!RHS || RHS == Src) {
40926 RHS = Src;
40927 OutM = (M % 2) + 2;
40928 return true;
40929 }
40930 return false;
40931 };
40932 int PostMask[4] = {-1, -1, -1, -1};
40933 if (GetHOpSrc(ScaledMask[0], PostMask[0]) &&
40934 GetHOpSrc(ScaledMask[1], PostMask[1]) &&
40935 GetHOpSrc(ScaledMask[2], PostMask[2]) &&
40936 GetHOpSrc(ScaledMask[3], PostMask[3])) {
40937 LHS = DAG.getBitcast(SrcVT, LHS);
40938 RHS = DAG.getBitcast(SrcVT, RHS ? RHS : LHS);
40939 SDValue Res = DAG.getNode(Opcode0, DL, VT0, LHS, RHS);
40940 // Use SHUFPS for the permute so this will work on SSE2 targets,
40941 // shuffle combining and domain handling will simplify this later on.
40942 MVT ShuffleVT = MVT::getVectorVT(MVT::f32, RootSizeInBits / 32);
40943 Res = DAG.getBitcast(ShuffleVT, Res);
40944 return DAG.getNode(X86ISD::SHUFP, DL, ShuffleVT, Res, Res,
40945 getV4X86ShuffleImm8ForMask(PostMask, DL, DAG));
40946 }
40947 }
40948 }
40949 }
40950
40951 if (2 < Ops.size())
40952 return SDValue();
40953
40954 SDValue BC1 = BC[BC.size() - 1];
40955 if (Mask.size() == VT0.getVectorNumElements()) {
40956 // Canonicalize binary shuffles of horizontal ops that use the
40957 // same sources to a unary shuffle.
40958 // TODO: Try to perform this fold even if the shuffle remains.
40959 if (Ops.size() == 2) {
40960 auto ContainsOps = [](SDValue HOp, SDValue Op) {
40961 return Op == HOp.getOperand(0) || Op == HOp.getOperand(1);
40962 };
40963 // Commute if all BC0's ops are contained in BC1.
40964 if (ContainsOps(BC1, BC0.getOperand(0)) &&
40965 ContainsOps(BC1, BC0.getOperand(1))) {
40966 ShuffleVectorSDNode::commuteMask(Mask);
40967 std::swap(Ops[0], Ops[1]);
40968 std::swap(BC0, BC1);
40969 }
40970
40971 // If BC1 can be represented by BC0, then convert to unary shuffle.
40972 if (ContainsOps(BC0, BC1.getOperand(0)) &&
40973 ContainsOps(BC0, BC1.getOperand(1))) {
40974 for (int &M : Mask) {
40975 if (M < NumElts) // BC0 element or UNDEF/Zero sentinel.
40976 continue;
40977 int SubLane = ((M % NumEltsPerLane) >= NumHalfEltsPerLane) ? 1 : 0;
40978 M -= NumElts + (SubLane * NumHalfEltsPerLane);
40979 if (BC1.getOperand(SubLane) != BC0.getOperand(0))
40980 M += NumHalfEltsPerLane;
40981 }
40982 }
40983 }
40984
40985 // Canonicalize unary horizontal ops to only refer to lower halves.
40986 for (int i = 0; i != NumElts; ++i) {
40987 int &M = Mask[i];
40988 if (isUndefOrZero(M))
40989 continue;
40990 if (M < NumElts && BC0.getOperand(0) == BC0.getOperand(1) &&
40991 (M % NumEltsPerLane) >= NumHalfEltsPerLane)
40992 M -= NumHalfEltsPerLane;
40993 if (NumElts <= M && BC1.getOperand(0) == BC1.getOperand(1) &&
40994 (M % NumEltsPerLane) >= NumHalfEltsPerLane)
40995 M -= NumHalfEltsPerLane;
40996 }
40997 }
40998
40999 // Combine a binary shuffle of 2 similar 'Horizontal' instructions into a
41000 // single instruction. Attempt to match a v2X64 repeating shuffle pattern that
41001 // represents the LHS/RHS inputs for the lower/upper halves.
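// For example, shuffle(HADD(a,b),HADD(c,d)) with mask <0,1,4,5> selects the
// 'a' sums and the 'c' sums, so it can be rebuilt as HADD(a,c).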
41002 SmallVector<int, 16> TargetMask128, WideMask128;
41003 if (isRepeatedTargetShuffleMask(128, EltSizeInBits, Mask, TargetMask128) &&
41004 scaleShuffleElements(TargetMask128, 2, WideMask128)) {
41005 assert(isUndefOrZeroOrInRange(WideMask128, 0, 4) && "Illegal shuffle");
41006 bool SingleOp = (Ops.size() == 1);
41007 if (isPack || OneUseOps ||
41008 shouldUseHorizontalOp(SingleOp, DAG, Subtarget)) {
41009 SDValue Lo = isInRange(WideMask128[0], 0, 2) ? BC0 : BC1;
41010 SDValue Hi = isInRange(WideMask128[1], 0, 2) ? BC0 : BC1;
41011 Lo = Lo.getOperand(WideMask128[0] & 1);
41012 Hi = Hi.getOperand(WideMask128[1] & 1);
41013 if (SingleOp) {
41014 SDValue Undef = DAG.getUNDEF(SrcVT);
41015 SDValue Zero = getZeroVector(SrcVT, Subtarget, DAG, DL);
41016 Lo = (WideMask128[0] == SM_SentinelZero ? Zero : Lo);
41017 Hi = (WideMask128[1] == SM_SentinelZero ? Zero : Hi);
41018 Lo = (WideMask128[0] == SM_SentinelUndef ? Undef : Lo);
41019 Hi = (WideMask128[1] == SM_SentinelUndef ? Undef : Hi);
41020 }
41021 return DAG.getNode(Opcode0, DL, VT0, Lo, Hi);
41022 }
41023 }
41024
41025 // If we are post-shuffling a 256-bit hop and not requiring the upper
41026 // elements, then try to narrow to a 128-bit hop directly.
41027 SmallVector<int, 16> WideMask64;
41028 if (Ops.size() == 1 && NumLanes == 2 &&
41029 scaleShuffleElements(Mask, 4, WideMask64) &&
41030 isUndefInRange(WideMask64, 2, 2)) {
41031 int M0 = WideMask64[0];
41032 int M1 = WideMask64[1];
41033 if (isInRange(M0, 0, 4) && isInRange(M1, 0, 4)) {
41034 MVT HalfVT = VT0.getHalfNumVectorElementsVT();
41035 unsigned Idx0 = (M0 & 2) ? (SrcVT.getVectorNumElements() / 2) : 0;
41036 unsigned Idx1 = (M1 & 2) ? (SrcVT.getVectorNumElements() / 2) : 0;
41037 SDValue V0 = extract128BitVector(BC[0].getOperand(M0 & 1), Idx0, DAG, DL);
41038 SDValue V1 = extract128BitVector(BC[0].getOperand(M1 & 1), Idx1, DAG, DL);
41039 SDValue Res = DAG.getNode(Opcode0, DL, HalfVT, V0, V1);
41040 return widenSubVector(Res, false, Subtarget, DAG, DL, 256);
41041 }
41042 }
41043
41044 return SDValue();
41045}
41046
41047// Attempt to constant fold all of the constant source ops.
41048// Returns true if the entire shuffle is folded to a constant.
41049// TODO: Extend this to merge multiple constant Ops and update the mask.
41050 static SDValue combineX86ShufflesConstants(MVT VT, ArrayRef<SDValue> Ops,
41051 ArrayRef<int> Mask,
41052 ArrayRef<const SDNode *> SrcNodes,
41053 SelectionDAG &DAG, const SDLoc &DL,
41054 const X86Subtarget &Subtarget) {
41055 unsigned SizeInBits = VT.getSizeInBits();
41056 unsigned NumMaskElts = Mask.size();
41057 unsigned MaskSizeInBits = SizeInBits / NumMaskElts;
41058 unsigned NumOps = Ops.size();
41059
41060 // Extract constant bits from each source op.
41061 SmallVector<APInt, 16> UndefEltsOps(NumOps);
41062 SmallVector<SmallVector<APInt, 16>, 16> RawBitsOps(NumOps);
41063 for (unsigned I = 0; I != NumOps; ++I)
41064 if (!getTargetConstantBitsFromNode(Ops[I], MaskSizeInBits, UndefEltsOps[I],
41065 RawBitsOps[I],
41066 /*AllowWholeUndefs*/ true,
41067 /*AllowPartialUndefs*/ true))
41068 return SDValue();
41069
41070 // If we're optimizing for size, only fold if at least one of the constants
41071 // is used just once or the combined shuffle has included a variable mask
41072 // shuffle; this is to avoid constant pool bloat.
41073 bool IsOptimizingSize = DAG.shouldOptForSize();
41074 bool HasVariableMask = llvm::any_of(SrcNodes, [](const SDNode *N) {
41075 return isTargetShuffleVariableMask(N->getOpcode());
41076 });
41077 if (IsOptimizingSize && !HasVariableMask &&
41078 llvm::none_of(Ops, [](SDValue SrcOp) { return SrcOp->hasOneUse(); }))
41079 return SDValue();
41080
41081 // Shuffle the constant bits according to the mask.
41082 APInt UndefElts(NumMaskElts, 0);
41083 APInt ZeroElts(NumMaskElts, 0);
41084 APInt ConstantElts(NumMaskElts, 0);
41085 SmallVector<APInt, 8> ConstantBitData(NumMaskElts,
41086 APInt::getZero(MaskSizeInBits));
41087 for (unsigned i = 0; i != NumMaskElts; ++i) {
41088 int M = Mask[i];
41089 if (M == SM_SentinelUndef) {
41090 UndefElts.setBit(i);
41091 continue;
41092 } else if (M == SM_SentinelZero) {
41093 ZeroElts.setBit(i);
41094 continue;
41095 }
41096 assert(0 <= M && M < (int)(NumMaskElts * NumOps));
41097
41098 unsigned SrcOpIdx = (unsigned)M / NumMaskElts;
41099 unsigned SrcMaskIdx = (unsigned)M % NumMaskElts;
41100
41101 auto &SrcUndefElts = UndefEltsOps[SrcOpIdx];
41102 if (SrcUndefElts[SrcMaskIdx]) {
41103 UndefElts.setBit(i);
41104 continue;
41105 }
41106
41107 auto &SrcEltBits = RawBitsOps[SrcOpIdx];
41108 APInt &Bits = SrcEltBits[SrcMaskIdx];
41109 if (!Bits) {
41110 ZeroElts.setBit(i);
41111 continue;
41112 }
41113
41114 ConstantElts.setBit(i);
41115 ConstantBitData[i] = Bits;
41116 }
41117 assert((UndefElts | ZeroElts | ConstantElts).isAllOnes());
41118
41119 // Attempt to create a zero vector.
41120 if ((UndefElts | ZeroElts).isAllOnes())
41121 return getZeroVector(VT, Subtarget, DAG, DL);
41122
41123 // Create the constant data.
41124 MVT MaskSVT;
41125 if (VT.isFloatingPoint() && (MaskSizeInBits == 32 || MaskSizeInBits == 64))
41126 MaskSVT = MVT::getFloatingPointVT(MaskSizeInBits);
41127 else
41128 MaskSVT = MVT::getIntegerVT(MaskSizeInBits);
41129
41130 MVT MaskVT = MVT::getVectorVT(MaskSVT, NumMaskElts);
41131 if (!DAG.getTargetLoweringInfo().isTypeLegal(MaskVT))
41132 return SDValue();
41133
41134 SDValue CstOp = getConstVector(ConstantBitData, UndefElts, MaskVT, DAG, DL);
41135 return DAG.getBitcast(VT, CstOp);
41136}
41137
41138namespace llvm {
41139 namespace X86 {
41140 enum {
41141 MaxShuffleCombineDepth = 8
41142 };
41143 } // namespace X86
41144} // namespace llvm
41145
41146/// Fully generic combining of x86 shuffle instructions.
41147///
41148/// This should be the last combine run over the x86 shuffle instructions. Once
41149/// they have been fully optimized, this will recursively consider all chains
41150/// of single-use shuffle instructions, build a generic model of the cumulative
41151/// shuffle operation, and check for simpler instructions which implement this
41152/// operation. We use this primarily for two purposes:
41153///
41154/// 1) Collapse generic shuffles to specialized single instructions when
41155/// equivalent. In most cases, this is just an encoding size win, but
41156/// sometimes we will collapse multiple generic shuffles into a single
41157/// special-purpose shuffle.
41158/// 2) Look for sequences of shuffle instructions with 3 or more total
41159/// instructions, and replace them with the slightly more expensive SSSE3
41160/// PSHUFB instruction if available. We do this as the last combining step
41161/// to ensure we avoid using PSHUFB if we can implement the shuffle with
41162/// a suitable short sequence of other instructions. The PSHUFB will either
41163/// use a register or have to read from memory and so is slightly (but only
41164/// slightly) more expensive than the other shuffle instructions.
41165///
41166/// Because this is inherently a quadratic operation (for each shuffle in
41167/// a chain, we recurse up the chain), the depth is limited to 8 instructions.
41168/// This should never be an issue in practice as the shuffle lowering doesn't
41169/// produce sequences of more than 8 instructions.
41170///
41171/// FIXME: We will currently miss some cases where the redundant shuffling
41172/// would simplify under the threshold for PSHUFB formation because of
41173/// combine-ordering. To fix this, we should do the redundant instruction
41174/// combining in this recursive walk.
41175 static SDValue combineX86ShufflesRecursively(
41176 ArrayRef<SDValue> SrcOps, int SrcOpIndex, unsigned RootOpc, MVT RootVT,
41177 ArrayRef<int> RootMask, ArrayRef<const SDNode *> SrcNodes, unsigned Depth,
41178 unsigned MaxDepth, bool AllowVariableCrossLaneMask,
41179 bool AllowVariablePerLaneMask, bool IsMaskedShuffle, SelectionDAG &DAG,
41180 const SDLoc &DL, const X86Subtarget &Subtarget) {
41181 assert(!RootMask.empty() &&
41182 (RootMask.size() > 1 || (RootMask[0] == 0 && SrcOpIndex == 0)) &&
41183 "Illegal shuffle root mask");
41184 assert(RootVT.isVector() && "Shuffles operate on vector types!");
41185 unsigned RootSizeInBits = RootVT.getSizeInBits();
41186 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
41187
41188 // Bound the depth of our recursive combine because this is ultimately
41189 // quadratic in nature.
41190 if (Depth >= MaxDepth)
41191 return SDValue();
41192
41193 // Directly rip through bitcasts to find the underlying operand.
41194 SDValue Op = SrcOps[SrcOpIndex];
41195 Op = peekThroughBitcasts(Op);
41196
41197 EVT VT = Op.getValueType();
41198 if (!VT.isVector() || !VT.isSimple())
41199 return SDValue(); // Bail if we hit a non-simple non-vector.
41200
41201 // FIXME: Just bail on f16 for now.
41202 if (VT.getVectorElementType() == MVT::f16)
41203 return SDValue();
41204
41205 assert((RootSizeInBits % VT.getSizeInBits()) == 0 &&
41206 "Can only combine shuffles upto size of the root op.");
41207
41208 // Create a demanded elts mask from the referenced elements of Op.
41209 APInt OpDemandedElts = APInt::getZero(RootMask.size());
41210 for (int M : RootMask) {
41211 int BaseIdx = RootMask.size() * SrcOpIndex;
41212 if (isInRange(M, BaseIdx, BaseIdx + RootMask.size()))
41213 OpDemandedElts.setBit(M - BaseIdx);
41214 }
41215 if (RootSizeInBits != VT.getSizeInBits()) {
41216 // Op is smaller than Root - extract the demanded elts for the subvector.
41217 unsigned Scale = RootSizeInBits / VT.getSizeInBits();
41218 unsigned NumOpMaskElts = RootMask.size() / Scale;
41219 assert((RootMask.size() % Scale) == 0 && "Root mask size mismatch");
41220 assert(OpDemandedElts
41221 .extractBits(RootMask.size() - NumOpMaskElts, NumOpMaskElts)
41222 .isZero() &&
41223 "Out of range elements referenced in root mask");
41224 OpDemandedElts = OpDemandedElts.extractBits(NumOpMaskElts, 0);
41225 }
41226 OpDemandedElts =
41227 APIntOps::ScaleBitMask(OpDemandedElts, VT.getVectorNumElements());
41228
41229 // Extract target shuffle mask and resolve sentinels and inputs.
41230 SmallVector<int, 64> OpMask;
41231 SmallVector<SDValue, 2> OpInputs;
41232 APInt OpUndef, OpZero;
41233 if (getTargetShuffleInputs(Op, OpDemandedElts, OpInputs, OpMask, OpUndef,
41234 OpZero, DAG, Depth, false)) {
41235 // Shuffle inputs must not be larger than the shuffle result.
41236 // TODO: Relax this for single input faux shuffles (e.g. trunc).
41237 if (llvm::any_of(OpInputs, [VT](SDValue OpInput) {
41238 return OpInput.getValueSizeInBits() > VT.getSizeInBits();
41239 }))
41240 return SDValue();
41241 } else if (Op.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
41242 (RootSizeInBits % Op.getOperand(0).getValueSizeInBits()) == 0 &&
41243 !isNullConstant(Op.getOperand(1))) {
41244 SDValue SrcVec = Op.getOperand(0);
41245 int ExtractIdx = Op.getConstantOperandVal(1);
41246 unsigned NumElts = VT.getVectorNumElements();
41247 OpInputs.assign({SrcVec});
41248 OpMask.assign(NumElts, SM_SentinelUndef);
41249 std::iota(OpMask.begin(), OpMask.end(), ExtractIdx);
41250 OpZero = OpUndef = APInt::getZero(NumElts);
41251 } else {
41252 return SDValue();
41253 }
41254
41255 // If the shuffle result was smaller than the root, we need to adjust the
41256 // mask indices and pad the mask with undefs.
41257 if (RootSizeInBits > VT.getSizeInBits()) {
41258 unsigned NumSubVecs = RootSizeInBits / VT.getSizeInBits();
41259 unsigned OpMaskSize = OpMask.size();
41260 if (OpInputs.size() > 1) {
41261 unsigned PaddedMaskSize = NumSubVecs * OpMaskSize;
41262 for (int &M : OpMask) {
41263 if (M < 0)
41264 continue;
41265 int EltIdx = M % OpMaskSize;
41266 int OpIdx = M / OpMaskSize;
41267 M = (PaddedMaskSize * OpIdx) + EltIdx;
41268 }
41269 }
41270 OpZero = OpZero.zext(NumSubVecs * OpMaskSize);
41271 OpUndef = OpUndef.zext(NumSubVecs * OpMaskSize);
41272 OpMask.append((NumSubVecs - 1) * OpMaskSize, SM_SentinelUndef);
41273 }
41274
41275 SmallVector<SDValue, 16> Ops;
41276 SmallVector<int, 64> Mask;
41277
41278 // We don't need to merge masks if the root is empty.
41279 bool EmptyRoot = (Depth == 0) && (RootMask.size() == 1);
41280 if (EmptyRoot) {
41281 // Only resolve zeros if it will remove an input, otherwise we might end
41282 // up in an infinite loop.
41283 bool ResolveKnownZeros = true;
41284 if (!OpZero.isZero()) {
41285 APInt UsedInputs = APInt::getZero(OpInputs.size());
41286 for (int i = 0, e = OpMask.size(); i != e; ++i) {
41287 int M = OpMask[i];
41288 if (OpUndef[i] || OpZero[i] || isUndefOrZero(M))
41289 continue;
41290 UsedInputs.setBit(M / OpMask.size());
41291 if (UsedInputs.isAllOnes()) {
41292 ResolveKnownZeros = false;
41293 break;
41294 }
41295 }
41296 }
41297 resolveTargetShuffleFromZeroables(OpMask, OpUndef, OpZero,
41298 ResolveKnownZeros);
41299
41300 Mask = OpMask;
41301 Ops.append(OpInputs.begin(), OpInputs.end());
41302 } else {
41303 resolveTargetShuffleFromZeroables(OpMask, OpUndef, OpZero);
41304
41305 // Add the inputs to the Ops list, avoiding duplicates.
41306 Ops.append(SrcOps.begin(), SrcOps.end());
41307
41308 auto AddOp = [&Ops](SDValue Input, int InsertionPoint) -> int {
41309 // Attempt to find an existing match.
41310 SDValue InputBC = peekThroughBitcasts(Input);
41311 for (int i = 0, e = Ops.size(); i < e; ++i)
41312 if (InputBC == peekThroughBitcasts(Ops[i]))
41313 return i;
41314 // Match failed - should we replace an existing Op?
41315 if (InsertionPoint >= 0) {
41316 Ops[InsertionPoint] = Input;
41317 return InsertionPoint;
41318 }
41319 // Add to the end of the Ops list.
41320 Ops.push_back(Input);
41321 return Ops.size() - 1;
41322 };
41323
41324 SmallVector<int, 2> OpInputIdx;
41325 for (SDValue OpInput : OpInputs)
41326 OpInputIdx.push_back(
41327 AddOp(OpInput, OpInputIdx.empty() ? SrcOpIndex : -1));
41328
41329 assert(((RootMask.size() > OpMask.size() &&
41330 RootMask.size() % OpMask.size() == 0) ||
41331 (OpMask.size() > RootMask.size() &&
41332 OpMask.size() % RootMask.size() == 0) ||
41333 OpMask.size() == RootMask.size()) &&
41334 "The smaller number of elements must divide the larger.");
41335
41336 // This function can be performance-critical, so we rely on the power-of-2
41337 // knowledge that we have about the mask sizes to replace div/rem ops with
41338 // bit-masks and shifts.
41340 "Non-power-of-2 shuffle mask sizes");
41342 "Non-power-of-2 shuffle mask sizes");
41343 unsigned RootMaskSizeLog2 = llvm::countr_zero(RootMask.size());
41344 unsigned OpMaskSizeLog2 = llvm::countr_zero(OpMask.size());
41345
41346 unsigned MaskWidth = std::max<unsigned>(OpMask.size(), RootMask.size());
41347 unsigned RootRatio =
41348 std::max<unsigned>(1, OpMask.size() >> RootMaskSizeLog2);
41349 unsigned OpRatio = std::max<unsigned>(1, RootMask.size() >> OpMaskSizeLog2);
41350 assert((RootRatio == 1 || OpRatio == 1) &&
41351 "Must not have a ratio for both incoming and op masks!");
41352
41353 assert(isPowerOf2_32(MaskWidth) && "Non-power-of-2 shuffle mask sizes");
41354 assert(isPowerOf2_32(RootRatio) && "Non-power-of-2 shuffle mask sizes");
41355 assert(isPowerOf2_32(OpRatio) && "Non-power-of-2 shuffle mask sizes");
41356 unsigned RootRatioLog2 = llvm::countr_zero(RootRatio);
41357 unsigned OpRatioLog2 = llvm::countr_zero(OpRatio);
41358
41359 Mask.resize(MaskWidth, SM_SentinelUndef);
41360
41361 // Merge this shuffle operation's mask into our accumulated mask. Note that
41362 // this shuffle's mask will be the first applied to the input, followed by
41363 // the root mask to get us all the way to the root value arrangement. The
41364 // reason for this order is that we are recursing up the operation chain.
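// For example, with equal widths and a single input, a root mask {2,3,0,1}
// applied on top of an op mask {1,0,3,2} merges to {3,2,1,0}, i.e.
// Mask[i] = OpMask[RootMask[i]].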
41365 for (unsigned i = 0; i < MaskWidth; ++i) {
41366 unsigned RootIdx = i >> RootRatioLog2;
41367 if (RootMask[RootIdx] < 0) {
41368 // This is a zero or undef lane, we're done.
41369 Mask[i] = RootMask[RootIdx];
41370 continue;
41371 }
41372
41373 unsigned RootMaskedIdx =
41374 RootRatio == 1
41375 ? RootMask[RootIdx]
41376 : (RootMask[RootIdx] << RootRatioLog2) + (i & (RootRatio - 1));
41377
41378 // Just insert the scaled root mask value if it references an input other
41379 // than the SrcOp we're currently inserting.
41380 if ((RootMaskedIdx < (SrcOpIndex * MaskWidth)) ||
41381 (((SrcOpIndex + 1) * MaskWidth) <= RootMaskedIdx)) {
41382 Mask[i] = RootMaskedIdx;
41383 continue;
41384 }
41385
41386 RootMaskedIdx = RootMaskedIdx & (MaskWidth - 1);
41387 unsigned OpIdx = RootMaskedIdx >> OpRatioLog2;
41388 if (OpMask[OpIdx] < 0) {
41389 // The incoming lanes are zero or undef, it doesn't matter which ones we
41390 // are using.
41391 Mask[i] = OpMask[OpIdx];
41392 continue;
41393 }
41394
41395 // Ok, we have non-zero lanes, map them through to one of the Op's inputs.
41396 unsigned OpMaskedIdx = OpRatio == 1 ? OpMask[OpIdx]
41397 : (OpMask[OpIdx] << OpRatioLog2) +
41398 (RootMaskedIdx & (OpRatio - 1));
41399
41400 OpMaskedIdx = OpMaskedIdx & (MaskWidth - 1);
41401 int InputIdx = OpMask[OpIdx] / (int)OpMask.size();
41402 assert(0 <= OpInputIdx[InputIdx] && "Unknown target shuffle input");
41403 OpMaskedIdx += OpInputIdx[InputIdx] * MaskWidth;
41404
41405 Mask[i] = OpMaskedIdx;
41406 }
41407 }
41408
41409 // Peek through any free bitcasts to insert_subvector vector widenings or
41410 // extract_subvector nodes back to root size.
41411 // TODO: Can resolveTargetShuffleInputsAndMask do some of this?
41412 for (auto [I, Op] : enumerate(Ops)) {
41413 SDValue BC = Op;
41414 while (1) {
41415 if (BC.getOpcode() == ISD::BITCAST && BC.hasOneUse()) {
41416 BC = BC.getOperand(0);
41417 continue;
41418 }
41419 if (BC.getOpcode() == ISD::INSERT_SUBVECTOR &&
41420 BC.getOperand(0).isUndef() && isNullConstant(BC.getOperand(2))) {
41421 // Set out of bounds mask indices to undef.
41422 Op = BC = BC.getOperand(1);
41423 unsigned Scale = RootSizeInBits / Op.getValueSizeInBits();
41424 int Lo = I * Mask.size();
41425 int Hi = (I + 1) * Mask.size();
41426 int NewHi = Lo + (Mask.size() / Scale);
41427 for (int &M : Mask) {
41428 if (Lo <= M && NewHi <= M && M < Hi)
41429 M = SM_SentinelUndef;
41430 }
41431 continue;
41432 }
41433 if (BC.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
41434 (RootSizeInBits % BC.getOperand(0).getValueSizeInBits()) == 0 &&
41435 isNullConstant(BC.getOperand(1))) {
41436 Op = BC = BC.getOperand(0);
41437 continue;
41438 }
41439 break;
41440 }
41441 }
41442
41443 // Remove unused/repeated shuffle source ops.
41444 resolveTargetShuffleInputsAndMask(Ops, Mask);
41445
41446 // Handle the all undef/zero/ones cases early.
41447 if (all_of(Mask, [](int Idx) { return Idx == SM_SentinelUndef; }))
41448 return DAG.getUNDEF(RootVT);
41449 if (all_of(Mask, [](int Idx) { return Idx < 0; }))
41450 return getZeroVector(RootVT, Subtarget, DAG, DL);
41451 if (Ops.size() == 1 && ISD::isBuildVectorAllOnes(Ops[0].getNode()) &&
41452 !llvm::is_contained(Mask, SM_SentinelZero))
41453 return getOnesVector(RootVT, DAG, DL);
41454
41455 assert(!Ops.empty() && "Shuffle with no inputs detected");
41456
41457 // Update the list of shuffle nodes that have been combined so far.
41458 SmallVector<const SDNode *, 16> CombinedNodes(SrcNodes);
41459 CombinedNodes.push_back(Op.getNode());
41460
41461 // See if we can recurse into each shuffle source op (if it's a target
41462 // shuffle). The source op should only be generally combined if it either has
41463 // a single use (i.e. the current Op) or all its users have already been
41464 // combined; if not, we can still combine but should prevent generation of
41465 // variable shuffles to avoid constant pool bloat.
41466 // Don't recurse if we already have more source ops than we can combine in
41467 // the remaining recursion depth.
41468 if (Ops.size() < (MaxDepth - Depth)) {
41469 for (int i = 0, e = Ops.size(); i < e; ++i) {
41470 // For empty roots, we need to resolve zeroable elements before combining
41471 // them with other shuffles.
41472 SmallVector<int, 64> ResolvedMask = Mask;
41473 if (EmptyRoot)
41474 resolveTargetShuffleFromZeroables(ResolvedMask, OpUndef, OpZero);
41475 bool AllowCrossLaneVar = false;
41476 bool AllowPerLaneVar = false;
41477 if (Ops[i].getNode()->hasOneUse() ||
41478 SDNode::areOnlyUsersOf(CombinedNodes, Ops[i].getNode())) {
41479 AllowCrossLaneVar = AllowVariableCrossLaneMask;
41480 AllowPerLaneVar = AllowVariablePerLaneMask;
41481 }
41482 if (SDValue Res = combineX86ShufflesRecursively(
41483 Ops, i, RootOpc, RootVT, ResolvedMask, CombinedNodes, Depth + 1,
41484 MaxDepth, AllowCrossLaneVar, AllowPerLaneVar, IsMaskedShuffle,
41485 DAG, DL, Subtarget))
41486 return Res;
41487 }
41488 }
41489
41490 // Attempt to constant fold all of the constant source ops.
41491 if (SDValue Cst = combineX86ShufflesConstants(
41492 RootVT, Ops, Mask, CombinedNodes, DAG, DL, Subtarget))
41493 return Cst;
41494
41495 // If constant fold failed and we only have constants - then we have
41496 // multiple uses by a single non-variable shuffle - just bail.
41497 if (Depth == 0 && llvm::all_of(Ops, [&](SDValue Op) {
41498 APInt UndefElts;
41499 SmallVector<APInt> RawBits;
41500 unsigned EltSizeInBits = RootSizeInBits / Mask.size();
41501 return getTargetConstantBitsFromNode(Op, EltSizeInBits, UndefElts,
41502 RawBits,
41503 /*AllowWholeUndefs*/ true,
41504 /*AllowPartialUndefs*/ true);
41505 })) {
41506 return SDValue();
41507 }
41508
41509 // Canonicalize the combined shuffle mask chain with horizontal ops.
41510 // NOTE: This will update the Ops and Mask.
41511 if (SDValue HOp = canonicalizeShuffleMaskWithHorizOp(
41512 Ops, Mask, RootSizeInBits, DL, DAG, Subtarget))
41513 return DAG.getBitcast(RootVT, HOp);
41514
41515 // Try to refine our inputs given our knowledge of target shuffle mask.
41516 for (auto I : enumerate(Ops)) {
41517 int OpIdx = I.index();
41518 SDValue &Op = I.value();
41519
41520 // What range of shuffle mask element values results in picking from Op?
41521 int Lo = OpIdx * Mask.size();
41522 int Hi = Lo + Mask.size();
41523
41524 // Which elements of Op do we demand, given the mask's granularity?
41525 APInt OpDemandedElts(Mask.size(), 0);
41526 for (int MaskElt : Mask) {
41527 if (isInRange(MaskElt, Lo, Hi)) { // Picks from Op?
41528 int OpEltIdx = MaskElt - Lo;
41529 OpDemandedElts.setBit(OpEltIdx);
41530 }
41531 }
41532
41533 // Is the shuffle result smaller than the root?
41534 if (Op.getValueSizeInBits() < RootSizeInBits) {
41535 // We padded the mask with undefs. But we now need to undo that.
41536 unsigned NumExpectedVectorElts = Mask.size();
41537 unsigned EltSizeInBits = RootSizeInBits / NumExpectedVectorElts;
41538 unsigned NumOpVectorElts = Op.getValueSizeInBits() / EltSizeInBits;
41539 assert(!OpDemandedElts.extractBits(
41540 NumExpectedVectorElts - NumOpVectorElts, NumOpVectorElts) &&
41541 "Demanding the virtual undef widening padding?");
41542 OpDemandedElts = OpDemandedElts.trunc(NumOpVectorElts); // NUW
41543 }
41544
41545 // The Op itself may be of different VT, so we need to scale the mask.
41546 unsigned NumOpElts = Op.getValueType().getVectorNumElements();
41547 APInt OpScaledDemandedElts = APIntOps::ScaleBitMask(OpDemandedElts, NumOpElts);
41548
41549 // Can this operand be simplified any further, given its demanded elements?
41550 if (SDValue NewOp = TLI.SimplifyMultipleUseDemandedVectorElts(
41551 Op, OpScaledDemandedElts, DAG))
41552 Op = NewOp;
41553 }
41554 // FIXME: should we rerun resolveTargetShuffleInputsAndMask() now?
41555
41556 // Widen any subvector shuffle inputs we've collected.
41557 // TODO: Remove this to avoid generating temporary nodes, we should only
41558 // widen once combineX86ShuffleChain has found a match.
41559 if (any_of(Ops, [RootSizeInBits](SDValue Op) {
41560 return Op.getValueSizeInBits() < RootSizeInBits;
41561 })) {
41562 for (SDValue &Op : Ops)
41563 if (Op.getValueSizeInBits() < RootSizeInBits)
41564 Op = widenSubVector(Op, false, Subtarget, DAG, SDLoc(Op),
41565 RootSizeInBits);
41566 // Reresolve - we might have repeated subvector sources.
41567 resolveTargetShuffleInputsAndMask(Ops, Mask);
41568 }
41569
41570 // We can only combine unary and binary shuffle mask cases.
41571 if (Ops.size() <= 2) {
41572 // Minor canonicalization of the accumulated shuffle mask to make it easier
41573 // to match below. All this does is detect masks with sequential pairs of
41574 // elements, and shrink them to the half-width mask. It does this in a loop
41575 // so it will reduce the size of the mask to the minimal width mask which
41576 // performs an equivalent shuffle.
41577 while (Mask.size() > 1) {
41578 SmallVector<int, 64> WidenedMask;
41579 if (!canWidenShuffleElements(Mask, WidenedMask))
41580 break;
41581 Mask = std::move(WidenedMask);
41582 }
41583
41584 // Canonicalization of binary shuffle masks to improve pattern matching by
41585 // commuting the inputs.
41586 if (Ops.size() == 2 && canonicalizeShuffleMaskWithCommute(Mask)) {
41587 ShuffleVectorSDNode::commuteMask(Mask);
41588 std::swap(Ops[0], Ops[1]);
41589 }
41590
41591 // Try to combine into a single shuffle instruction.
41592 if (SDValue Shuffle = combineX86ShuffleChain(
41593 Ops, RootOpc, RootVT, Mask, Depth, CombinedNodes,
41594 AllowVariableCrossLaneMask, AllowVariablePerLaneMask,
41595 IsMaskedShuffle, DAG, DL, Subtarget))
41596 return Shuffle;
41597
41598 // If all the operands come from the same larger vector, fallthrough and try
41599 // to use combineX86ShuffleChainWithExtract.
41600 SDValue LHS = peekThroughBitcasts(Ops.front());
41601 SDValue RHS = peekThroughBitcasts(Ops.back());
41602 if (Ops.size() != 2 || !Subtarget.hasAVX2() || RootSizeInBits != 128 ||
41603 (RootSizeInBits / Mask.size()) != 64 ||
41604 LHS.getOpcode() != ISD::EXTRACT_SUBVECTOR ||
41605 RHS.getOpcode() != ISD::EXTRACT_SUBVECTOR ||
41606 LHS.getOperand(0) != RHS.getOperand(0))
41607 return SDValue();
41608 }
41609
41610 // If that failed and any input is extracted then try to combine as a
41611 // shuffle with the larger type.
41612 return combineX86ShuffleChainWithExtract(
41613 Ops, RootOpc, RootVT, Mask, Depth, CombinedNodes,
41614 AllowVariableCrossLaneMask, AllowVariablePerLaneMask, IsMaskedShuffle,
41615 DAG, DL, Subtarget);
41616}
41617
41618/// Helper entry wrapper to combineX86ShufflesRecursively.
41619 static SDValue combineX86ShufflesRecursively(SDValue Op, SelectionDAG &DAG,
41620 const X86Subtarget &Subtarget) {
41621 return combineX86ShufflesRecursively(
41622 {Op}, 0, Op.getOpcode(), Op.getSimpleValueType(), {0}, {}, /*Depth=*/0,
41623 X86::MaxShuffleCombineDepth, /*AllowVariableCrossLaneMask=*/true,
41624 /*AllowVariablePerLaneMask=*/true, isMaskableNode(Op, Subtarget), DAG,
41625 SDLoc(Op), Subtarget);
41626}
41627
41628/// Get the PSHUF-style mask from PSHUF node.
41629///
41630 /// This is a very minor wrapper around getTargetShuffleMask to ease forming v4
41631/// PSHUF-style masks that can be reused with such instructions.
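///
/// For example, a PSHUFD node with immediate 0x1B yields the mask {3,2,1,0}.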
41632 static SmallVector<int, 4> getPSHUFShuffleMask(SDValue N) {
41633 MVT VT = N.getSimpleValueType();
41634 SmallVector<int, 4> Mask;
41635 SmallVector<SDValue, 2> Ops;
41636 bool HaveMask = getTargetShuffleMask(N, false, Ops, Mask);
41637 (void)HaveMask;
41638 assert(HaveMask);
41639
41640 // If we have more than 128-bits, only the low 128-bits of shuffle mask
41641 // matter. Check that the upper masks are repeats and remove them.
41642 if (VT.getSizeInBits() > 128) {
41643 int LaneElts = 128 / VT.getScalarSizeInBits();
41644#ifndef NDEBUG
41645 for (int i = 1, NumLanes = VT.getSizeInBits() / 128; i < NumLanes; ++i)
41646 for (int j = 0; j < LaneElts; ++j)
41647 assert(Mask[j] == Mask[i * LaneElts + j] - (LaneElts * i) &&
41648 "Mask doesn't repeat in high 128-bit lanes!");
41649#endif
41650 Mask.resize(LaneElts);
41651 }
41652
41653 switch (N.getOpcode()) {
41654 case X86ISD::PSHUFD:
41655 return Mask;
41656 case X86ISD::PSHUFLW:
41657 Mask.resize(4);
41658 return Mask;
41659 case X86ISD::PSHUFHW:
41660 Mask.erase(Mask.begin(), Mask.begin() + 4);
41661 for (int &M : Mask)
41662 M -= 4;
41663 return Mask;
41664 default:
41665 llvm_unreachable("No valid shuffle instruction found!");
41666 }
41667}
41668
41669/// Get the expanded blend mask from a BLENDI node.
41670/// For v16i16 nodes, this will splat the repeated i8 mask.
41671 static APInt getBLENDIBlendMask(SDValue V) {
41672 assert(V.getOpcode() == X86ISD::BLENDI && "Unknown blend shuffle");
41673 unsigned NumElts = V.getSimpleValueType().getVectorNumElements();
41674 APInt Mask = V.getConstantOperandAPInt(2);
41675 if (Mask.getBitWidth() > NumElts)
41676 Mask = Mask.trunc(NumElts);
41677 if (NumElts == 16) {
41678 assert(Mask.getBitWidth() == 8 && "Unexpected v16i16 blend mask width");
41679 Mask = APInt::getSplat(16, Mask);
41680 }
41681 assert(Mask.getBitWidth() == NumElts && "Unexpected blend mask width");
41682 return Mask;
41683}
41684
41685/// Search for a combinable shuffle across a chain ending in pshufd.
41686///
41687/// We walk up the chain and look for a combinable shuffle, skipping over
41688/// shuffles that we could hoist this shuffle's transformation past without
41689/// altering anything.
41690 static SDValue combineRedundantDwordShuffle(SDValue N,
41691 MutableArrayRef<int> Mask,
41692 const SDLoc &DL,
41693 SelectionDAG &DAG) {
41694 assert(N.getOpcode() == X86ISD::PSHUFD &&
41695 "Called with something other than an x86 128-bit half shuffle!");
41696
41697 // Walk up a single-use chain looking for a combinable shuffle. Keep a stack
41698 // of the shuffles in the chain so that we can form a fresh chain to replace
41699 // this one.
41700 SmallVector<SDValue, 8> Chain;
41701 SDValue V = N.getOperand(0);
41702 for (; V.hasOneUse(); V = V.getOperand(0)) {
41703 switch (V.getOpcode()) {
41704 default:
41705 return SDValue(); // Nothing combined!
41706
41707 case ISD::BITCAST:
41708 // Skip bitcasts as we always know the type for the target specific
41709 // instructions.
41710 continue;
41711
41712 case X86ISD::PSHUFD:
41713 // Found another dword shuffle.
41714 break;
41715
41716 case X86ISD::PSHUFLW:
41717 // Check that the low words (being shuffled) are the identity in the
41718 // dword shuffle, and the high words are self-contained.
41719 if (Mask[0] != 0 || Mask[1] != 1 ||
41720 !(Mask[2] >= 2 && Mask[2] < 4 && Mask[3] >= 2 && Mask[3] < 4))
41721 return SDValue();
41722
41723 Chain.push_back(V);
41724 continue;
41725
41726 case X86ISD::PSHUFHW:
41727 // Check that the high words (being shuffled) are the identity in the
41728 // dword shuffle, and the low words are self-contained.
41729 if (Mask[2] != 2 || Mask[3] != 3 ||
41730 !(Mask[0] >= 0 && Mask[0] < 2 && Mask[1] >= 0 && Mask[1] < 2))
41731 return SDValue();
41732
41733 Chain.push_back(V);
41734 continue;
41735
41736 case X86ISD::UNPCKL:
41737 case X86ISD::UNPCKH:
41738 // For either i8 -> i16 or i16 -> i32 unpacks, we can combine a dword
41739 // shuffle into a preceding word shuffle.
41740 if (V.getSimpleValueType().getVectorElementType() != MVT::i8 &&
41741 V.getSimpleValueType().getVectorElementType() != MVT::i16)
41742 return SDValue();
41743
41744 // Search for a half-shuffle which we can combine with.
41745 unsigned CombineOp =
41746 V.getOpcode() == X86ISD::UNPCKL ? X86ISD::PSHUFLW : X86ISD::PSHUFHW;
41747 if (V.getOperand(0) != V.getOperand(1) ||
41748 !V->isOnlyUserOf(V.getOperand(0).getNode()))
41749 return SDValue();
41750 Chain.push_back(V);
41751 V = V.getOperand(0);
41752 do {
41753 switch (V.getOpcode()) {
41754 default:
41755 return SDValue(); // Nothing to combine.
41756
41757 case X86ISD::PSHUFLW:
41758 case X86ISD::PSHUFHW:
41759 if (V.getOpcode() == CombineOp)
41760 break;
41761
41762 Chain.push_back(V);
41763
41764 [[fallthrough]];
41765 case ISD::BITCAST:
41766 V = V.getOperand(0);
41767 continue;
41768 }
41769 break;
41770 } while (V.hasOneUse());
41771 break;
41772 }
41773 // Break out of the loop if we break out of the switch.
41774 break;
41775 }
41776
41777 if (!V.hasOneUse())
41778 // We fell out of the loop without finding a viable combining instruction.
41779 return SDValue();
41780
41781 // Merge this node's mask and our incoming mask.
41782 SmallVector<int, 4> VMask = getPSHUFShuffleMask(V);
41783 for (int &M : Mask)
41784 M = VMask[M];
41785 V = DAG.getNode(V.getOpcode(), DL, V.getValueType(), V.getOperand(0),
41786 getV4X86ShuffleImm8ForMask(Mask, DL, DAG));
41787
41788 // Rebuild the chain around this new shuffle.
41789 while (!Chain.empty()) {
41790 SDValue W = Chain.pop_back_val();
41791
41792 if (V.getValueType() != W.getOperand(0).getValueType())
41793 V = DAG.getBitcast(W.getOperand(0).getValueType(), V);
41794
41795 switch (W.getOpcode()) {
41796 default:
41797 llvm_unreachable("Only PSHUF and UNPCK instructions get here!");
41798
41799 case X86ISD::UNPCKL:
41800 case X86ISD::UNPCKH:
41801 V = DAG.getNode(W.getOpcode(), DL, W.getValueType(), V, V);
41802 break;
41803
41804 case X86ISD::PSHUFD:
41805 case X86ISD::PSHUFLW:
41806 case X86ISD::PSHUFHW:
41807 V = DAG.getNode(W.getOpcode(), DL, W.getValueType(), V, W.getOperand(1));
41808 break;
41809 }
41810 }
41811 if (V.getValueType() != N.getValueType())
41812 V = DAG.getBitcast(N.getValueType(), V);
41813
41814 // Return the new chain to replace N.
41815 return V;
41816}
41817
41818// Attempt to commute shufps LHS loads:
41819// permilps(shufps(load(),x)) --> permilps(shufps(x,load()))
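// (A load can only be folded into the second (memory) operand of SHUFPS, so
// the commute moves the loadable value into that position.)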
41820 static SDValue combineCommutableSHUFP(SDValue N, MVT VT, const SDLoc &DL,
41821 SelectionDAG &DAG) {
41822 // TODO: Add vXf64 support.
41823 if (VT != MVT::v4f32 && VT != MVT::v8f32 && VT != MVT::v16f32)
41824 return SDValue();
41825
41826 // SHUFP(LHS, RHS) -> SHUFP(RHS, LHS) iff LHS is foldable + RHS is not.
41827 auto commuteSHUFP = [&VT, &DL, &DAG](SDValue Parent, SDValue V) {
41828 if (V.getOpcode() != X86ISD::SHUFP || !Parent->isOnlyUserOf(V.getNode()))
41829 return SDValue();
41830 SDValue N0 = V.getOperand(0);
41831 SDValue N1 = V.getOperand(1);
41832 unsigned Imm = V.getConstantOperandVal(2);
41833 const X86Subtarget &Subtarget = DAG.getSubtarget<X86Subtarget>();
41834 if (!X86::mayFoldLoad(peekThroughOneUseBitcasts(N0), Subtarget) ||
41835 X86::mayFoldLoad(peekThroughOneUseBitcasts(N1), Subtarget))
41836 return SDValue();
41837 Imm = ((Imm & 0x0F) << 4) | ((Imm & 0xF0) >> 4);
41838 return DAG.getNode(X86ISD::SHUFP, DL, VT, N1, N0,
41839 DAG.getTargetConstant(Imm, DL, MVT::i8));
41840 };
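 // A worked example of the immediate swap above: Imm = 0xE4 selects
 // {LHS[0],LHS[1],RHS[2],RHS[3]}; after commuting, Imm = 0x4E selects
 // {RHS[2],RHS[3],LHS[0],LHS[1]}, i.e. the same data with its 64-bit halves
 // swapped. The callers below undo that swap by flipping bit 1 of the
 // affected index fields (Imm ^ 0xAA / 0x0A / 0xA0).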
41841
41842 switch (N.getOpcode()) {
41843 case X86ISD::VPERMILPI:
41844 if (SDValue NewSHUFP = commuteSHUFP(N, N.getOperand(0))) {
41845 unsigned Imm = N.getConstantOperandVal(1);
41846 return DAG.getNode(X86ISD::VPERMILPI, DL, VT, NewSHUFP,
41847 DAG.getTargetConstant(Imm ^ 0xAA, DL, MVT::i8));
41848 }
41849 break;
41850 case X86ISD::SHUFP: {
41851 SDValue N0 = N.getOperand(0);
41852 SDValue N1 = N.getOperand(1);
41853 unsigned Imm = N.getConstantOperandVal(2);
41854 if (N0 == N1) {
41855 if (SDValue NewSHUFP = commuteSHUFP(N, N0))
41856 return DAG.getNode(X86ISD::SHUFP, DL, VT, NewSHUFP, NewSHUFP,
41857 DAG.getTargetConstant(Imm ^ 0xAA, DL, MVT::i8));
41858 } else if (SDValue NewSHUFP = commuteSHUFP(N, N0)) {
41859 return DAG.getNode(X86ISD::SHUFP, DL, VT, NewSHUFP, N1,
41860 DAG.getTargetConstant(Imm ^ 0x0A, DL, MVT::i8));
41861 } else if (SDValue NewSHUFP = commuteSHUFP(N, N1)) {
41862 return DAG.getNode(X86ISD::SHUFP, DL, VT, N0, NewSHUFP,
41863 DAG.getTargetConstant(Imm ^ 0xA0, DL, MVT::i8));
41864 }
41865 break;
41866 }
41867 }
41868
41869 return SDValue();
41870}
41871
41872// Attempt to fold BLEND(PERMUTE(X),PERMUTE(Y)) -> PERMUTE(BLEND(X,Y))
41873// iff we don't demand the same element index for both X and Y.
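 // For example, blending permute<2,3,0,1>(X) and permute<2,3,0,1>(Y) with
 // blend mask <0,5,2,7> reads X[2],X[0] and Y[3],Y[1] - disjoint indices -
 // so it can be rewritten as permute<2,3,0,1>(blend<0,5,2,7>(X,Y)).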
41874static SDValue
41875combineBlendOfPermutes(MVT VT, SDValue N0, SDValue N1, ArrayRef<int> BlendMask,
41876 const APInt &DemandedElts, SelectionDAG &DAG,
41877 const X86Subtarget &Subtarget, const SDLoc &DL) {
41878 assert(isBlendOrUndef(BlendMask) && "Blend shuffle expected");
41879 if (!N0.hasOneUse() || !N1.hasOneUse())
41880 return SDValue();
41881
41882 unsigned NumElts = VT.getVectorNumElements();
41883 SDValue BC0 = peekThroughOneUseBitcasts(N0);
41884 SDValue BC1 = peekThroughOneUseBitcasts(N1);
41885
41886 // See if both operands are shuffles, and that we can scale the shuffle masks
41887 // to the same width as the blend mask.
41888 // TODO: Support SM_SentinelZero?
41889 SmallVector<SDValue, 2> Ops0, Ops1;
41890 SmallVector<int, 32> Mask0, Mask1, ScaledMask0, ScaledMask1;
41891 if (!getTargetShuffleMask(BC0, /*AllowSentinelZero=*/false, Ops0, Mask0) ||
41892 !getTargetShuffleMask(BC1, /*AllowSentinelZero=*/false, Ops1, Mask1) ||
41893 !scaleShuffleElements(Mask0, NumElts, ScaledMask0) ||
41894 !scaleShuffleElements(Mask1, NumElts, ScaledMask1))
41895 return SDValue();
41896
41897 // Determine the demanded elts from both permutes.
41898 APInt Demanded0, DemandedLHS0, DemandedRHS0;
41899 APInt Demanded1, DemandedLHS1, DemandedRHS1;
41900 if (!getShuffleDemandedElts(NumElts, BlendMask, DemandedElts, Demanded0,
41901 Demanded1,
41902 /*AllowUndefElts=*/true) ||
41903 !getShuffleDemandedElts(NumElts, ScaledMask0, Demanded0, DemandedLHS0,
41904 DemandedRHS0, /*AllowUndefElts=*/true) ||
41905 !getShuffleDemandedElts(NumElts, ScaledMask1, Demanded1, DemandedLHS1,
41906 DemandedRHS1, /*AllowUndefElts=*/true))
41907 return SDValue();
41908
41909 // Confirm that we only use a single operand from both permutes and that we
41910 // don't demand the same index from both.
41911 if (!DemandedRHS0.isZero() || !DemandedRHS1.isZero() ||
41912 DemandedLHS0.intersects(DemandedLHS1))
41913 return SDValue();
41914
41915 // Use the permute demanded elts masks as the new blend mask.
41916 // Create the new permute mask as a blend of the 2 original permute masks.
41917 SmallVector<int, 32> NewBlendMask(NumElts, SM_SentinelUndef);
41918 SmallVector<int, 32> NewPermuteMask(NumElts, SM_SentinelUndef);
41919 for (unsigned I = 0; I != NumElts; ++I) {
41920 if (Demanded0[I]) {
41921 int M = ScaledMask0[I];
41922 if (0 <= M) {
41923 assert(isUndefOrEqual(NewBlendMask[M], M) &&
41924 "BlendMask demands LHS AND RHS");
41925 NewBlendMask[M] = M;
41926 NewPermuteMask[I] = M;
41927 }
41928 } else if (Demanded1[I]) {
41929 int M = ScaledMask1[I];
41930 if (0 <= M) {
41931 assert(isUndefOrEqual(NewBlendMask[M], M + NumElts) &&
41932 "BlendMask demands LHS AND RHS");
41933 NewBlendMask[M] = M + NumElts;
41934 NewPermuteMask[I] = M;
41935 }
41936 }
41937 }
41938 assert(isBlendOrUndef(NewBlendMask) && "Bad blend");
41939 assert(isUndefOrInRange(NewPermuteMask, 0, NumElts) && "Bad permute");
41940
41941 // v16i16 shuffles can explode in complexity very easily, only accept them if
41942 // the blend mask is the same in the 128-bit subvectors (or can widen to
41943 // v8i32) and the permute can be widened as well.
41944 if (VT == MVT::v16i16) {
41945 if (!is128BitLaneRepeatedShuffleMask(VT, NewBlendMask) &&
41946 !canWidenShuffleElements(NewBlendMask))
41947 return SDValue();
41948 if (!canWidenShuffleElements(NewPermuteMask))
41949 return SDValue();
41950 }
41951
41952 // Don't introduce lane-crossing permutes without AVX2, unless it can be
41953 // widened to a lane permute (vperm2f128).
41954 if (VT.is256BitVector() && !Subtarget.hasAVX2() &&
41955 !isLaneCrossingShuffleMask(128, VT.getScalarSizeInBits(),
41956 NewPermuteMask) &&
41957 !canScaleShuffleElements(NewPermuteMask, 2))
41958 return SDValue();
41959
41960 SDValue NewBlend =
41961 DAG.getVectorShuffle(VT, DL, DAG.getBitcast(VT, Ops0[0]),
41962 DAG.getBitcast(VT, Ops1[0]), NewBlendMask);
41963 return DAG.getVectorShuffle(VT, DL, NewBlend, DAG.getUNDEF(VT),
41964 NewPermuteMask);
41965}
41966
41967// TODO - move this to TLI like isBinOp?
41968static bool isUnaryOp(unsigned Opcode) {
41969 switch (Opcode) {
41970 case ISD::CTLZ:
41971 case ISD::CTTZ:
41972 case ISD::CTPOP:
41973 return true;
41974 }
41975 return false;
41976}
41977
41978// Canonicalize SHUFFLE(UNARYOP(X)) -> UNARYOP(SHUFFLE(X)).
41979// Canonicalize SHUFFLE(BINOP(X,Y)) -> BINOP(SHUFFLE(X),SHUFFLE(Y)).
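 // e.g. pshufd(and(x,c)) -> and(pshufd(x),pshufd(c)): the shuffle of the
 // constant folds away and the remaining shuffle may combine further with
 // whatever feeds x.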
41980static SDValue canonicalizeShuffleWithOp(SDValue N, SelectionDAG &DAG,
41981 const SDLoc &DL) {
41982 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
41983 EVT ShuffleVT = N.getValueType();
41984 unsigned Opc = N.getOpcode();
41985
41986 auto IsMergeableWithShuffle = [Opc, &DAG](SDValue Op, bool FoldShuf = true) {
41987 // AllZeros/AllOnes constants are freely shuffled and will peek through
41988 // bitcasts. Other constant build vectors do not peek through bitcasts. Only
41989 // merge with target shuffles if it has one use so shuffle combining is
41990 // likely to kick in. Shuffles of splats are expected to be removed.
41991 return ISD::isBuildVectorAllOnes(Op.getNode()) ||
41992 ISD::isBuildVectorAllZeros(Op.getNode()) ||
41996 (Op.getOpcode() == Opc && Op->hasOneUse()) ||
41997 (Op.getOpcode() == ISD::INSERT_SUBVECTOR && Op->hasOneUse()) ||
41998 (Op.getOpcode() == ISD::CONCAT_VECTORS && Op->hasOneUse()) ||
41999 (FoldShuf && isTargetShuffle(Op.getOpcode()) && Op->hasOneUse()) ||
42000 DAG.isSplatValue(Op, /*AllowUndefs*/ false);
42001 };
42002 auto IsSafeToMoveShuffle = [ShuffleVT](SDValue Op, unsigned BinOp) {
42003 // Ensure we only shuffle whole vector src elements, unless it's a logical
42004 // binop where we can more aggressively move shuffles from dst to src.
42005 return isLogicOp(BinOp) ||
42006 (Op.getScalarValueSizeInBits() <= ShuffleVT.getScalarSizeInBits());
42007 };
42008
42009 switch (Opc) {
42010 // Unary and Unary+Permute Shuffles.
42011 case X86ISD::PSHUFB: {
42012 // Don't merge PSHUFB if it contains zero'd elements.
42013 SmallVector<int> Mask;
42014 SmallVector<SDValue> Ops;
42015 if (!getTargetShuffleMask(N, false, Ops, Mask))
42016 break;
42017 [[fallthrough]];
42018 }
42019 case X86ISD::VBROADCAST:
42020 case X86ISD::MOVDDUP:
42021 case X86ISD::PSHUFD:
42022 case X86ISD::PSHUFHW:
42023 case X86ISD::PSHUFLW:
42024 case X86ISD::VPERMV:
42025 case X86ISD::VPERMI:
42026 case X86ISD::VPERMILPI: {
42027 unsigned SrcIdx = Opc == X86ISD::VPERMV ? 1 : 0;
42028 if (N.getOperand(SrcIdx).getValueType() == ShuffleVT &&
42029 N->isOnlyUserOf(N.getOperand(SrcIdx).getNode())) {
42030 SDValue N0 = peekThroughOneUseBitcasts(N.getOperand(SrcIdx));
42031 unsigned SrcOpcode = N0.getOpcode();
42032 EVT OpVT = N0.getValueType();
42033 if (TLI.isBinOp(SrcOpcode) && IsSafeToMoveShuffle(N0, SrcOpcode)) {
42034 SDValue Op00 = peekThroughOneUseBitcasts(N0.getOperand(0));
42035 SDValue Op01 = peekThroughOneUseBitcasts(N0.getOperand(1));
42036 bool FoldShuf = Opc != X86ISD::VPERMI && Opc != X86ISD::VPERMV;
42037 if (IsMergeableWithShuffle(Op00, FoldShuf) ||
42038 IsMergeableWithShuffle(Op01, FoldShuf)) {
42039 SDValue LHS, RHS;
42040 Op00 = DAG.getBitcast(ShuffleVT, Op00);
42041 Op01 = DAG.getBitcast(ShuffleVT, Op01);
42042 if (Opc == X86ISD::VPERMV) {
42043 LHS = DAG.getNode(Opc, DL, ShuffleVT, N.getOperand(0), Op00);
42044 RHS = DAG.getNode(Opc, DL, ShuffleVT, N.getOperand(0), Op01);
42045 } else if (N.getNumOperands() == 2) {
42046 LHS = DAG.getNode(Opc, DL, ShuffleVT, Op00, N.getOperand(1));
42047 RHS = DAG.getNode(Opc, DL, ShuffleVT, Op01, N.getOperand(1));
42048 } else {
42049 LHS = DAG.getNode(Opc, DL, ShuffleVT, Op00);
42050 RHS = DAG.getNode(Opc, DL, ShuffleVT, Op01);
42051 }
42052 return DAG.getBitcast(ShuffleVT,
42053 DAG.getNode(SrcOpcode, DL, OpVT,
42054 DAG.getBitcast(OpVT, LHS),
42055 DAG.getBitcast(OpVT, RHS)));
42056 }
42057 }
42058 if (SrcOpcode == ISD::SINT_TO_FP && IsSafeToMoveShuffle(N0, SrcOpcode) &&
42059 OpVT.getScalarSizeInBits() ==
42060 N0.getOperand(0).getScalarValueSizeInBits()) {
42061 SDValue Res = DAG.getBitcast(ShuffleVT, N0.getOperand(0));
42062 if (Opc == X86ISD::VPERMV)
42063 Res = DAG.getNode(Opc, DL, ShuffleVT, N.getOperand(0), Res);
42064 else if (N.getNumOperands() == 2)
42065 Res = DAG.getNode(Opc, DL, ShuffleVT, Res, N.getOperand(1));
42066 else
42067 Res = DAG.getNode(Opc, DL, ShuffleVT, Res);
42068 Res = DAG.getBitcast(N0.getOperand(0).getValueType(), Res);
42069 return DAG.getBitcast(ShuffleVT, DAG.getNode(SrcOpcode, DL, OpVT, Res));
42070 }
42071 }
42072 break;
42073 }
42074 // Binary and Binary+Permute Shuffles.
42075 case X86ISD::INSERTPS: {
42076 // Don't merge INSERTPS if it contains zero'd elements.
42077 unsigned InsertPSMask = N.getConstantOperandVal(2);
42078 unsigned ZeroMask = InsertPSMask & 0xF;
42079 if (ZeroMask != 0)
42080 break;
42081 [[fallthrough]];
42082 }
42083 case X86ISD::MOVSD:
42084 case X86ISD::MOVSS:
42085 case X86ISD::BLENDI:
42086 case X86ISD::SHUFP:
42087 case X86ISD::UNPCKH:
42088 case X86ISD::UNPCKL: {
42089 if (N->isOnlyUserOf(N.getOperand(0).getNode()) &&
42090 N->isOnlyUserOf(N.getOperand(1).getNode())) {
42091 SDValue N0 = peekThroughOneUseBitcasts(N.getOperand(0));
42092 SDValue N1 = peekThroughOneUseBitcasts(N.getOperand(1));
42093 unsigned SrcOpcode = N0.getOpcode();
42094 if (TLI.isBinOp(SrcOpcode) && N1.getOpcode() == SrcOpcode &&
42095 N0.getValueType() == N1.getValueType() &&
42096 IsSafeToMoveShuffle(N0, SrcOpcode) &&
42097 IsSafeToMoveShuffle(N1, SrcOpcode)) {
42098 SDValue Op00 = peekThroughOneUseBitcasts(N0.getOperand(0));
42099 SDValue Op10 = peekThroughOneUseBitcasts(N1.getOperand(0));
42100 SDValue Op01 = peekThroughOneUseBitcasts(N0.getOperand(1));
42101 SDValue Op11 = peekThroughOneUseBitcasts(N1.getOperand(1));
42102 // Ensure the total number of shuffles doesn't increase by folding this
42103 // shuffle through to the source ops.
42104 if (((IsMergeableWithShuffle(Op00) && IsMergeableWithShuffle(Op10)) ||
42105 (IsMergeableWithShuffle(Op01) && IsMergeableWithShuffle(Op11))) ||
42106 ((IsMergeableWithShuffle(Op00) || IsMergeableWithShuffle(Op10)) &&
42107 (IsMergeableWithShuffle(Op01) || IsMergeableWithShuffle(Op11)))) {
42108 SDValue LHS, RHS;
42109 Op00 = DAG.getBitcast(ShuffleVT, Op00);
42110 Op10 = DAG.getBitcast(ShuffleVT, Op10);
42111 Op01 = DAG.getBitcast(ShuffleVT, Op01);
42112 Op11 = DAG.getBitcast(ShuffleVT, Op11);
42113 if (N.getNumOperands() == 3) {
42114 LHS = DAG.getNode(Opc, DL, ShuffleVT, Op00, Op10, N.getOperand(2));
42115 RHS = DAG.getNode(Opc, DL, ShuffleVT, Op01, Op11, N.getOperand(2));
42116 } else {
42117 LHS = DAG.getNode(Opc, DL, ShuffleVT, Op00, Op10);
42118 RHS = DAG.getNode(Opc, DL, ShuffleVT, Op01, Op11);
42119 }
42120 EVT OpVT = N0.getValueType();
42121 return DAG.getBitcast(ShuffleVT,
42122 DAG.getNode(SrcOpcode, DL, OpVT,
42123 DAG.getBitcast(OpVT, LHS),
42124 DAG.getBitcast(OpVT, RHS)));
42125 }
42126 }
42127 if (isUnaryOp(SrcOpcode) && N1.getOpcode() == SrcOpcode &&
42128 N0.getValueType() == N1.getValueType() &&
42129 IsSafeToMoveShuffle(N0, SrcOpcode) &&
42130 IsSafeToMoveShuffle(N1, SrcOpcode)) {
42131 SDValue Op00 = peekThroughOneUseBitcasts(N0.getOperand(0));
42132 SDValue Op10 = peekThroughOneUseBitcasts(N1.getOperand(0));
42133 SDValue Res;
42134 Op00 = DAG.getBitcast(ShuffleVT, Op00);
42135 Op10 = DAG.getBitcast(ShuffleVT, Op10);
42136 if (N.getNumOperands() == 3) {
42137 Res = DAG.getNode(Opc, DL, ShuffleVT, Op00, Op10, N.getOperand(2));
42138 } else {
42139 Res = DAG.getNode(Opc, DL, ShuffleVT, Op00, Op10);
42140 }
42141 EVT OpVT = N0.getValueType();
42142 return DAG.getBitcast(
42143 ShuffleVT,
42144 DAG.getNode(SrcOpcode, DL, OpVT, DAG.getBitcast(OpVT, Res)));
42145 }
42146 // TODO: We can generalize this for other shuffles/conversions.
42147 if (Opc == X86ISD::UNPCKL && SrcOpcode == X86ISD::CVTPH2PS &&
42148 N1.getOpcode() == SrcOpcode &&
42149 N0.getValueType() == N1.getValueType() &&
42150 N0.getOperand(0).getValueType() == N1.getOperand(0).getValueType() &&
42151 ShuffleVT.getScalarSizeInBits() == N0.getScalarValueSizeInBits() &&
42152 IsSafeToMoveShuffle(N0, SrcOpcode) &&
42153 IsSafeToMoveShuffle(N1, SrcOpcode)) {
42154 EVT OpSrcVT = N0.getOperand(0).getValueType();
42155 EVT OpDstVT = N0.getValueType();
42156 SDValue Res =
42157 DAG.getNode(Opc, DL, OpSrcVT, N0.getOperand(0), N1.getOperand(0));
42158 return DAG.getBitcast(ShuffleVT,
42159 DAG.getNode(SrcOpcode, DL, OpDstVT, Res));
42160 }
42161 }
42162 break;
42163 }
42164 }
42165 return SDValue();
42166}
42167
42168/// Attempt to fold vpermf128(op(),op()) -> op(vpermf128(),vpermf128()).
42169static SDValue canonicalizeLaneShuffleWithRepeatedOps(SDValue V,
42170 SelectionDAG &DAG,
42171 const SDLoc &DL) {
42172 assert(V.getOpcode() == X86ISD::VPERM2X128 && "Unknown lane shuffle");
42173
42174 MVT VT = V.getSimpleValueType();
42175 SDValue Src0 = peekThroughBitcasts(V.getOperand(0));
42176 SDValue Src1 = peekThroughBitcasts(V.getOperand(1));
42177 unsigned SrcOpc0 = Src0.getOpcode();
42178 unsigned SrcOpc1 = Src1.getOpcode();
42179 EVT SrcVT0 = Src0.getValueType();
42180 EVT SrcVT1 = Src1.getValueType();
42181
42182 if (!Src1.isUndef() && (SrcVT0 != SrcVT1 || SrcOpc0 != SrcOpc1))
42183 return SDValue();
42184
42185 switch (SrcOpc0) {
42186 case X86ISD::MOVDDUP: {
42187 SDValue LHS = Src0.getOperand(0);
42188 SDValue RHS = Src1.isUndef() ? Src1 : Src1.getOperand(0);
42189 SDValue Res =
42190 DAG.getNode(X86ISD::VPERM2X128, DL, SrcVT0, LHS, RHS, V.getOperand(2));
42191 Res = DAG.getNode(SrcOpc0, DL, SrcVT0, Res);
42192 return DAG.getBitcast(VT, Res);
42193 }
42194 case X86ISD::VPERMILPI:
42195 // TODO: Handle v4f64 permutes with different low/high lane masks.
42196 if (SrcVT0 == MVT::v4f64) {
42197 uint64_t Mask = Src0.getConstantOperandVal(1);
42198 if ((Mask & 0x3) != ((Mask >> 2) & 0x3))
42199 break;
42200 }
42201 [[fallthrough]];
42202 case X86ISD::VSHLI:
42203 case X86ISD::VSRLI:
42204 case X86ISD::VSRAI:
42205 case X86ISD::PSHUFD:
42206 if (Src1.isUndef() || Src0.getOperand(1) == Src1.getOperand(1)) {
42207 SDValue LHS = Src0.getOperand(0);
42208 SDValue RHS = Src1.isUndef() ? Src1 : Src1.getOperand(0);
42209 SDValue Res = DAG.getNode(X86ISD::VPERM2X128, DL, SrcVT0, LHS, RHS,
42210 V.getOperand(2));
42211 Res = DAG.getNode(SrcOpc0, DL, SrcVT0, Res, Src0.getOperand(1));
42212 return DAG.getBitcast(VT, Res);
42213 }
42214 break;
42215 }
42216
42217 return SDValue();
42218}
42219
42220/// Try to combine x86 target specific shuffles.
42221static SDValue combineTargetShuffle(SDValue N, const SDLoc &DL,
42222 SelectionDAG &DAG,
42223 TargetLowering::DAGCombinerInfo &DCI,
42224 const X86Subtarget &Subtarget) {
42225 using namespace SDPatternMatch;
42226
42227 MVT VT = N.getSimpleValueType();
42228 unsigned NumElts = VT.getVectorNumElements();
42229 SmallVector<int, 4> Mask;
42230 unsigned Opcode = N.getOpcode();
42231 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
42232
42233 if (SDValue R = combineCommutableSHUFP(N, VT, DL, DAG))
42234 return R;
42235
42236 // Handle specific target shuffles.
42237 switch (Opcode) {
42238 case X86ISD::MOVDDUP: {
42239 SDValue Src = N.getOperand(0);
42240 // Turn a 128-bit MOVDDUP of a full vector load into movddup+vzload.
42241 if (VT == MVT::v2f64 && Src.hasOneUse() &&
42242 ISD::isNormalLoad(Src.getNode())) {
42243 LoadSDNode *LN = cast<LoadSDNode>(Src);
42244 if (SDValue VZLoad = narrowLoadToVZLoad(LN, MVT::f64, MVT::v2f64, DAG)) {
42245 SDValue Movddup = DAG.getNode(X86ISD::MOVDDUP, DL, MVT::v2f64, VZLoad);
42246 DCI.CombineTo(N.getNode(), Movddup);
42247 DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), VZLoad.getValue(1));
42248 DCI.recursivelyDeleteUnusedNodes(LN);
42249 return N; // Return N so it doesn't get rechecked!
42250 }
42251 }
42252
42253 return SDValue();
42254 }
42255 case X86ISD::VBROADCAST: {
42256 SDValue Src = N.getOperand(0);
42257 SDValue BC = peekThroughBitcasts(Src);
42258 EVT SrcVT = Src.getValueType();
42259 EVT BCVT = BC.getValueType();
42260
42261 // If broadcasting from another shuffle, attempt to simplify it.
42262 // TODO - we really need a general SimplifyDemandedVectorElts mechanism.
42263 if (isTargetShuffle(BC.getOpcode()) &&
42264 VT.getScalarSizeInBits() % BCVT.getScalarSizeInBits() == 0) {
42265 unsigned Scale = VT.getScalarSizeInBits() / BCVT.getScalarSizeInBits();
42266 SmallVector<int, 16> DemandedMask(BCVT.getVectorNumElements(),
42267 SM_SentinelUndef);
42268 for (unsigned i = 0; i != Scale; ++i)
42269 DemandedMask[i] = i;
42270 if (SDValue Res = combineX86ShufflesRecursively(
42271 {BC}, 0, BC.getOpcode(), BC.getSimpleValueType(), DemandedMask,
42272 {}, /*Depth=*/0, X86::MaxShuffleCombineDepth,
42273 /*AllowVariableCrossLaneMask=*/true,
42274 /*AllowVariablePerLaneMask=*/true,
42275 /*IsMaskedShuffle=*/false, DAG, DL, Subtarget))
42276 return DAG.getNode(X86ISD::VBROADCAST, DL, VT,
42277 DAG.getBitcast(SrcVT, Res));
42278 }
42279
42280 // broadcast(bitcast(src)) -> bitcast(broadcast(src))
42281 // 32-bit targets have to bitcast i64 to f64, so better to bitcast upward.
42282 if (Src.getOpcode() == ISD::BITCAST &&
42283 SrcVT.getScalarSizeInBits() == BCVT.getScalarSizeInBits() &&
42284 TLI.isTypeLegal(BCVT) &&
42285 FixedVectorType::isValidElementType(
42286 BCVT.getScalarType().getTypeForEVT(*DAG.getContext()))) {
42287 EVT NewVT = EVT::getVectorVT(*DAG.getContext(), BCVT.getScalarType(),
42288 VT.getVectorNumElements());
42289 return DAG.getBitcast(VT, DAG.getNode(X86ISD::VBROADCAST, DL, NewVT, BC));
42290 }
42291
42292 // vbroadcast(bitcast(vbroadcast(src))) -> bitcast(vbroadcast(src))
42293 // If we're re-broadcasting a smaller type then broadcast with that type and
42294 // bitcast.
42295 // TODO: Do this for any splat?
42296 if (Src.getOpcode() == ISD::BITCAST &&
42297 (BC.getOpcode() == X86ISD::VBROADCAST ||
42298 BC.getOpcode() == X86ISD::VBROADCAST_LOAD) &&
42299 (VT.getScalarSizeInBits() % BCVT.getScalarSizeInBits()) == 0 &&
42300 (VT.getSizeInBits() % BCVT.getSizeInBits()) == 0) {
42301 MVT NewVT =
42302 MVT::getVectorVT(BCVT.getSimpleVT().getScalarType(),
42303 VT.getSizeInBits() / BCVT.getScalarSizeInBits());
42304 return DAG.getBitcast(VT, DAG.getNode(X86ISD::VBROADCAST, DL, NewVT, BC));
42305 }
42306
42307 // Reduce broadcast source vector to lowest 128-bits.
42308 if (SrcVT.getSizeInBits() > 128)
42309 return DAG.getNode(X86ISD::VBROADCAST, DL, VT,
42310 extract128BitVector(Src, 0, DAG, DL));
42311
42312 // broadcast(scalar_to_vector(x)) -> broadcast(x).
42313 if (Src.getOpcode() == ISD::SCALAR_TO_VECTOR &&
42314 Src.getValueType().getScalarType() == Src.getOperand(0).getValueType())
42315 return DAG.getNode(X86ISD::VBROADCAST, DL, VT, Src.getOperand(0));
42316
42317 // broadcast(extract_vector_elt(x, 0)) -> broadcast(x).
42318 if (Src.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
42319 isNullConstant(Src.getOperand(1)) &&
42320 Src.getValueType() ==
42321 Src.getOperand(0).getValueType().getScalarType() &&
42322 TLI.isTypeLegal(Src.getOperand(0).getValueType()))
42323 return DAG.getNode(X86ISD::VBROADCAST, DL, VT, Src.getOperand(0));
42324
42325 // Share broadcast with the longest vector and extract low subvector (free).
42326 // Ensure the same SDValue from the SDNode use is being used.
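 // e.g. if another user already broadcasts the same scalar to a wider
 // vector, reuse that node; extracting its low subvector is free.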
42327 for (SDNode *User : Src->users())
42328 if (User != N.getNode() && User->getOpcode() == X86ISD::VBROADCAST &&
42329 Src == User->getOperand(0) &&
42330 User->getValueSizeInBits(0).getFixedValue() >
42331 VT.getFixedSizeInBits()) {
42332 return extractSubVector(SDValue(User, 0), 0, DAG, DL,
42333 VT.getSizeInBits());
42334 }
42335
42336 // vbroadcast(scalarload X) -> vbroadcast_load X
42337 // For float loads, extract other uses of the scalar from the broadcast.
42338 if (!SrcVT.isVector() && (Src.hasOneUse() || VT.isFloatingPoint()) &&
42339 ISD::isNormalLoad(Src.getNode())) {
42340 LoadSDNode *LN = cast<LoadSDNode>(Src);
42341 SDVTList Tys = DAG.getVTList(VT, MVT::Other);
42342 SDValue Ops[] = { LN->getChain(), LN->getBasePtr() };
42343 SDValue BcastLd =
42344 DAG.getMemIntrinsicNode(X86ISD::VBROADCAST_LOAD, DL, Tys, Ops,
42345 LN->getMemoryVT(), LN->getMemOperand());
42346 // If the load value is used only by N, replace it via CombineTo N.
42347 bool NoReplaceExtract = Src.hasOneUse();
42348 DCI.CombineTo(N.getNode(), BcastLd);
42349 if (NoReplaceExtract) {
42350 DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), BcastLd.getValue(1));
42351 DCI.recursivelyDeleteUnusedNodes(LN);
42352 } else {
42353 SDValue Scl = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, SrcVT, BcastLd,
42354 DAG.getVectorIdxConstant(0, DL));
42355 DCI.CombineTo(LN, Scl, BcastLd.getValue(1));
42356 }
42357 return N; // Return N so it doesn't get rechecked!
42358 }
42359
42360 // Due to isTypeDesirableForOp, we won't always shrink a load truncated to
42361 // i16. So shrink it ourselves if we can make a broadcast_load.
42362 if (SrcVT == MVT::i16 && Src.getOpcode() == ISD::TRUNCATE &&
42363 Src.hasOneUse() && Src.getOperand(0).hasOneUse()) {
42364 assert(Subtarget.hasAVX2() && "Expected AVX2");
42365 SDValue TruncIn = Src.getOperand(0);
42366
42367 // If this is a truncate of a non-extending load, we can just narrow it to
42368 // use a broadcast_load.
42369 if (ISD::isNormalLoad(TruncIn.getNode())) {
42370 LoadSDNode *LN = cast<LoadSDNode>(TruncIn);
42371 // Unless its volatile or atomic.
42372 if (LN->isSimple()) {
42373 SDVTList Tys = DAG.getVTList(VT, MVT::Other);
42374 SDValue Ops[] = { LN->getChain(), LN->getBasePtr() };
42375 SDValue BcastLd = DAG.getMemIntrinsicNode(
42376 X86ISD::VBROADCAST_LOAD, DL, Tys, Ops, MVT::i16,
42377 LN->getPointerInfo(), LN->getBaseAlign(),
42378 LN->getMemOperand()->getFlags());
42379 DCI.CombineTo(N.getNode(), BcastLd);
42380 DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), BcastLd.getValue(1));
42381 DCI.recursivelyDeleteUnusedNodes(Src.getNode());
42382 return N; // Return N so it doesn't get rechecked!
42383 }
42384 }
42385
42386 // If this is a truncate of an i16 extload, we can directly replace it.
42387 if (ISD::isUNINDEXEDLoad(Src.getOperand(0).getNode()) &&
42388 ISD::isEXTLoad(Src.getOperand(0).getNode())) {
42389 LoadSDNode *LN = cast<LoadSDNode>(Src.getOperand(0));
42390 if (LN->getMemoryVT().getSizeInBits() == 16) {
42391 SDVTList Tys = DAG.getVTList(VT, MVT::Other);
42392 SDValue Ops[] = { LN->getChain(), LN->getBasePtr() };
42393 SDValue BcastLd =
42394 DAG.getMemIntrinsicNode(X86ISD::VBROADCAST_LOAD, DL, Tys, Ops,
42395 LN->getMemoryVT(), LN->getMemOperand());
42396 DCI.CombineTo(N.getNode(), BcastLd);
42397 DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), BcastLd.getValue(1));
42398 DCI.recursivelyDeleteUnusedNodes(Src.getNode());
42399 return N; // Return N so it doesn't get rechecked!
42400 }
42401 }
42402
42403 // If this is a truncate of a load that has been shifted right, we can
42404 // offset the pointer and use a narrower load.
42405 if (TruncIn.getOpcode() == ISD::SRL &&
42406 TruncIn.getOperand(0).hasOneUse() &&
42407 isa<ConstantSDNode>(TruncIn.getOperand(1)) &&
42408 ISD::isNormalLoad(TruncIn.getOperand(0).getNode())) {
42409 LoadSDNode *LN = cast<LoadSDNode>(TruncIn.getOperand(0));
42410 unsigned ShiftAmt = TruncIn.getConstantOperandVal(1);
42411 // Make sure the shift amount and the load size are divisible by 16.
42412 // Don't do this if the load is volatile or atomic.
42413 if (ShiftAmt % 16 == 0 && TruncIn.getValueSizeInBits() % 16 == 0 &&
42414 LN->isSimple()) {
42415 unsigned Offset = ShiftAmt / 8;
42416 SDVTList Tys = DAG.getVTList(VT, MVT::Other);
42417 SDValue Ptr = DAG.getMemBasePlusOffset(LN->getBasePtr(),
42418 TypeSize::getFixed(Offset), DL);
42419 SDValue Ops[] = { LN->getChain(), Ptr };
42420 SDValue BcastLd = DAG.getMemIntrinsicNode(
42421 X86ISD::VBROADCAST_LOAD, DL, Tys, Ops, MVT::i16,
42422 LN->getPointerInfo().getWithOffset(Offset), LN->getBaseAlign(),
42423 LN->getMemOperand()->getFlags());
42424 DCI.CombineTo(N.getNode(), BcastLd);
42425 DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), BcastLd.getValue(1));
42426 DCI.recursivelyDeleteUnusedNodes(Src.getNode());
42427 return N; // Return N so it doesn't get rechecked!
42428 }
42429 }
42430 }
42431
42432 // vbroadcast(vzload X) -> vbroadcast_load X
42433 if (Src.getOpcode() == X86ISD::VZEXT_LOAD && Src.hasOneUse()) {
42434 auto *LN = cast<MemSDNode>(Src);
42435 if (LN->getMemoryVT().getSizeInBits() == VT.getScalarSizeInBits()) {
42436 SDVTList Tys = DAG.getVTList(VT, MVT::Other);
42437 SDValue Ops[] = { LN->getChain(), LN->getBasePtr() };
42438 SDValue BcastLd =
42439 DAG.getMemIntrinsicNode(X86ISD::VBROADCAST_LOAD, DL, Tys, Ops,
42440 LN->getMemoryVT(), LN->getMemOperand());
42441 DCI.CombineTo(N.getNode(), BcastLd);
42442 DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), BcastLd.getValue(1));
42443 DCI.recursivelyDeleteUnusedNodes(LN);
42444 return N; // Return N so it doesn't get rechecked!
42445 }
42446 }
42447
42448 // vbroadcast(vector load X) -> vbroadcast_load
42449 if (Src.hasOneUse() && ISD::isNormalLoad(Src.getNode())) {
42450 LoadSDNode *LN = cast<LoadSDNode>(Src);
42451 // Unless the load is volatile or atomic.
42452 if (LN->isSimple()) {
42453 SDVTList Tys = DAG.getVTList(VT, MVT::Other);
42454 SDValue Ops[] = {LN->getChain(), LN->getBasePtr()};
42455 SDValue BcastLd = DAG.getMemIntrinsicNode(
42456 X86ISD::VBROADCAST_LOAD, DL, Tys, Ops, LN->getMemoryVT(),
42457 LN->getPointerInfo(), LN->getBaseAlign(),
42458 LN->getMemOperand()->getFlags());
42459 DCI.CombineTo(N.getNode(), BcastLd);
42460 DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), BcastLd.getValue(1));
42461 DCI.recursivelyDeleteUnusedNodes(LN);
42462 return N; // Return N so it doesn't get rechecked!
42463 }
42464 }
42465
42466 return SDValue();
42467 }
42468 case X86ISD::VZEXT_MOVL: {
42469 SDValue N0 = N.getOperand(0);
42470
42471 // Fold (vzmovl (shift x, y)) -> (shift (vzmovl x), y)
42472 // Zeroing out the upper elements means we're just shifting a zero value.
42473 // TODO: Try harder to move vzmovl upward towards SCALAR_TO_VECTOR nodes.
42474 // TODO: Move this to canonicalizeShuffleWithOp once we add zero handling.
42475 if (N0.getOpcode() == X86ISD::VSHL || N0.getOpcode() == X86ISD::VSHLI ||
42476 N0.getOpcode() == X86ISD::VSRL || N0.getOpcode() == X86ISD::VSRLI ||
42477 N0.getOpcode() == X86ISD::VSRA || N0.getOpcode() == X86ISD::VSRAI) {
42478 if (N0.hasOneUse())
42479 return DAG.getNode(
42480 N0.getOpcode(), DL, VT,
42481 DAG.getNode(X86ISD::VZEXT_MOVL, DL, VT, N0.getOperand(0)),
42482 N0.getOperand(1));
42483 }
42484
42485 // If this is a vzmovl of a full vector load, replace it with a vzload, unless
42486 // the load is volatile.
42487 if (N0.hasOneUse() && ISD::isNormalLoad(N0.getNode())) {
42488 auto *LN = cast<LoadSDNode>(N0);
42489 if (SDValue VZLoad =
42490 narrowLoadToVZLoad(LN, VT.getVectorElementType(), VT, DAG)) {
42491 DCI.CombineTo(N.getNode(), VZLoad);
42492 DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), VZLoad.getValue(1));
42493 DCI.recursivelyDeleteUnusedNodes(LN);
42494 return N;
42495 }
42496 }
42497
42498 // If this is a VZEXT_MOVL of a VBROADCAST_LOAD, we don't need the broadcast
42499 // and can just use a VZEXT_LOAD.
42500 // FIXME: Is there some way to do this with SimplifyDemandedVectorElts?
42501 if (N0.hasOneUse() && N0.getOpcode() == X86ISD::VBROADCAST_LOAD) {
42502 auto *LN = cast<MemSDNode>(N0);
42503 if (VT.getScalarSizeInBits() == LN->getMemoryVT().getSizeInBits()) {
42504 SDVTList Tys = DAG.getVTList(VT, MVT::Other);
42505 SDValue Ops[] = {LN->getChain(), LN->getBasePtr()};
42506 SDValue VZLoad =
42507 DAG.getMemIntrinsicNode(X86ISD::VZEXT_LOAD, DL, Tys, Ops,
42508 LN->getMemoryVT(), LN->getMemOperand());
42509 DCI.CombineTo(N.getNode(), VZLoad);
42510 DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), VZLoad.getValue(1));
42511 DCI.recursivelyDeleteUnusedNodes(LN);
42512 return N;
42513 }
42514 }
42515
42516 // Turn (v2i64 (vzext_movl (scalar_to_vector (i64 X)))) into
42517 // (v2i64 (bitcast (v4i32 (vzext_movl (scalar_to_vector (i32 (trunc X)))))))
42518 // if the upper bits of the i64 are zero.
42519 if (N0.hasOneUse() && N0.getOpcode() == ISD::SCALAR_TO_VECTOR &&
42520 N0.getOperand(0).hasOneUse() &&
42521 N0.getOperand(0).getValueType() == MVT::i64) {
42522 SDValue In = N0.getOperand(0);
42523 APInt Mask = APInt::getHighBitsSet(64, 32);
42524 if (DAG.MaskedValueIsZero(In, Mask)) {
42525 SDValue Trunc = DAG.getNode(ISD::TRUNCATE, DL, MVT::i32, In);
42526 MVT VecVT = MVT::getVectorVT(MVT::i32, NumElts * 2);
42527 SDValue SclVec = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecVT, Trunc);
42528 SDValue Movl = DAG.getNode(X86ISD::VZEXT_MOVL, DL, VecVT, SclVec);
42529 return DAG.getBitcast(VT, Movl);
42530 }
42531 }
42532
42533 // Load a scalar integer constant directly to XMM instead of transferring an
42534 // immediate value from GPR.
42535 // vzext_movl (scalar_to_vector C) --> load [C,0...]
42536 if (N0.getOpcode() == ISD::SCALAR_TO_VECTOR) {
42537 if (auto *C = dyn_cast<ConstantSDNode>(N0.getOperand(0))) {
42538 // Create a vector constant - scalar constant followed by zeros.
42539 EVT ScalarVT = N0.getOperand(0).getValueType();
42540 Type *ScalarTy = ScalarVT.getTypeForEVT(*DAG.getContext());
42541 Constant *Zero = ConstantInt::getNullValue(ScalarTy);
42542 SmallVector<Constant *, 32> ConstantVec(NumElts, Zero);
42543 ConstantVec[0] = const_cast<ConstantInt *>(C->getConstantIntValue());
42544
42545 // Load the vector constant from constant pool.
42546 MVT PVT = TLI.getPointerTy(DAG.getDataLayout());
42547 SDValue CP = DAG.getConstantPool(ConstantVector::get(ConstantVec), PVT);
42548 MachinePointerInfo MPI =
42549 MachinePointerInfo::getConstantPool(DAG.getMachineFunction());
42550 Align Alignment = cast<ConstantPoolSDNode>(CP)->getAlign();
42551 return DAG.getLoad(VT, DL, DAG.getEntryNode(), CP, MPI, Alignment,
42552 MachineMemOperand::MOLoad);
42553 }
42554 }
42555
42556 // Pull subvector inserts into undef through VZEXT_MOVL by making it an
42557 // insert into a zero vector. This helps get VZEXT_MOVL closer to
42558 // scalar_to_vectors where 256/512 are canonicalized to an insert and a
42559 // 128-bit scalar_to_vector. This reduces the number of isel patterns.
42560 if (!DCI.isBeforeLegalizeOps() && N0.hasOneUse()) {
42561 SDValue V = peekThroughOneUseBitcasts(N0);
42562
42563 if (V.getOpcode() == ISD::INSERT_SUBVECTOR && V.getOperand(0).isUndef() &&
42564 isNullConstant(V.getOperand(2))) {
42565 SDValue In = V.getOperand(1);
42566 MVT SubVT = MVT::getVectorVT(VT.getVectorElementType(),
42567 In.getValueSizeInBits() /
42568 VT.getScalarSizeInBits());
42569 In = DAG.getBitcast(SubVT, In);
42570 SDValue Movl = DAG.getNode(X86ISD::VZEXT_MOVL, DL, SubVT, In);
42571 return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT,
42572 getZeroVector(VT, Subtarget, DAG, DL), Movl,
42573 V.getOperand(2));
42574 }
42575 }
42576
42577 return SDValue();
42578 }
42579 case X86ISD::BLENDI: {
42580 SDValue N0 = N.getOperand(0);
42581 SDValue N1 = N.getOperand(1);
42582 unsigned EltBits = VT.getScalarSizeInBits();
42583
42584 if (N0.getOpcode() == ISD::BITCAST && N1.getOpcode() == ISD::BITCAST) {
42585 // blend(bitcast(x),bitcast(y)) -> bitcast(blend(x,y)) to narrower types.
42586 // TODO: Handle MVT::v16i16 repeated blend mask.
42587 if (N0.getOperand(0).getValueType() == N1.getOperand(0).getValueType()) {
42588 MVT SrcVT = N0.getOperand(0).getSimpleValueType();
42589 unsigned SrcBits = SrcVT.getScalarSizeInBits();
42590 if ((EltBits % SrcBits) == 0 && SrcBits >= 32) {
42591 unsigned NewSize = SrcVT.getVectorNumElements();
42592 APInt BlendMask = getBLENDIBlendMask(N);
42593 APInt NewBlendMask = APIntOps::ScaleBitMask(BlendMask, NewSize);
42594 return DAG.getBitcast(
42595 VT, DAG.getNode(X86ISD::BLENDI, DL, SrcVT, N0.getOperand(0),
42596 N1.getOperand(0),
42597 DAG.getTargetConstant(NewBlendMask.getZExtValue(),
42598 DL, MVT::i8)));
42599 }
42600 }
42601 // Share PSHUFB masks:
42602 // blend(pshufb(x,m1),pshufb(y,m2))
42603 // --> m3 = blend(m1,m2)
42604 // blend(pshufb(x,m3),pshufb(y,m3))
42605 if (N0.hasOneUse() && N1.hasOneUse()) {
42606 SmallVector<int> Mask, ByteMask;
42607 SmallVector<SDValue> Ops;
42608 SDValue LHS = peekThroughOneUseBitcasts(N0);
42609 SDValue RHS = peekThroughOneUseBitcasts(N1);
42610 if (LHS.getOpcode() == X86ISD::PSHUFB &&
42611 RHS.getOpcode() == X86ISD::PSHUFB &&
42612 LHS.getOperand(1) != RHS.getOperand(1) &&
42613 LHS.getOperand(1).hasOneUse() && RHS.getOperand(1).hasOneUse() &&
42614 getTargetShuffleMask(N, /*AllowSentinelZero=*/false, Ops, Mask)) {
42615 assert(Ops.size() == 2 && LHS == peekThroughOneUseBitcasts(Ops[0]) &&
42616 RHS == peekThroughOneUseBitcasts(Ops[1]) &&
42617 "BLENDI decode mismatch");
42618 MVT ShufVT = LHS.getSimpleValueType();
42619 SDValue MaskLHS = LHS.getOperand(1);
42620 SDValue MaskRHS = RHS.getOperand(1);
42621 llvm::narrowShuffleMaskElts(EltBits / 8, Mask, ByteMask);
42622 if (SDValue NewMask = combineX86ShufflesConstants(
42623 ShufVT, {MaskLHS, MaskRHS}, ByteMask,
42624 {LHS.getNode(), RHS.getNode()}, DAG, DL, Subtarget)) {
42625 SDValue NewLHS = DAG.getNode(X86ISD::PSHUFB, DL, ShufVT,
42626 LHS.getOperand(0), NewMask);
42627 SDValue NewRHS = DAG.getNode(X86ISD::PSHUFB, DL, ShufVT,
42628 RHS.getOperand(0), NewMask);
42629 return DAG.getNode(X86ISD::BLENDI, DL, VT,
42630 DAG.getBitcast(VT, NewLHS),
42631 DAG.getBitcast(VT, NewRHS), N.getOperand(2));
42632 }
42633 }
42634 }
42635 }
42636 return SDValue();
42637 }
42638 case X86ISD::SHUFP: {
42639 // Fold shufps(shuffle(x),shuffle(y)) -> shufps(x,y).
42640 // This is a more relaxed shuffle combiner that can ignore oneuse limits.
42641 // TODO: Support types other than v4f32.
42642 if (VT == MVT::v4f32) {
42643 bool Updated = false;
42644 SmallVector<int> Mask;
42645 SmallVector<SDValue> Ops;
42646 if (getTargetShuffleMask(N, false, Ops, Mask) && Ops.size() == 2) {
42647 for (int i = 0; i != 2; ++i) {
42648 SmallVector<SDValue> SubOps;
42649 SmallVector<int> SubMask, SubScaledMask;
42650 SDValue Sub = peekThroughBitcasts(Ops[i]);
42651 // TODO: Scaling might be easier if we specify the demanded elts.
42652 if (getTargetShuffleInputs(Sub, SubOps, SubMask, DAG, 0, false) &&
42653 scaleShuffleElements(SubMask, 4, SubScaledMask) &&
42654 SubOps.size() == 1 && isUndefOrInRange(SubScaledMask, 0, 4)) {
42655 int Ofs = i * 2;
42656 Mask[Ofs + 0] = SubScaledMask[Mask[Ofs + 0] % 4] + (i * 4);
42657 Mask[Ofs + 1] = SubScaledMask[Mask[Ofs + 1] % 4] + (i * 4);
42658 Ops[i] = DAG.getBitcast(VT, SubOps[0]);
42659 Updated = true;
42660 }
42661 }
42662 }
42663 if (Updated) {
42664 for (int &M : Mask)
42665 M %= 4;
42666 Ops.push_back(getV4X86ShuffleImm8ForMask(Mask, DL, DAG));
42667 return DAG.getNode(X86ISD::SHUFP, DL, VT, Ops);
42668 }
42669 }
42670 return SDValue();
42671 }
42672 case X86ISD::VPERMI: {
42673 // vpermi(bitcast(x)) -> bitcast(vpermi(x)) for same number of elements.
42674 // TODO: Remove when we have preferred domains in combineX86ShuffleChain.
42675 SDValue N0 = N.getOperand(0);
42676 SDValue N1 = N.getOperand(1);
42677 unsigned EltSizeInBits = VT.getScalarSizeInBits();
42678 if (N0.getOpcode() == ISD::BITCAST &&
42679 N0.getOperand(0).getScalarValueSizeInBits() == EltSizeInBits) {
42680 SDValue Src = N0.getOperand(0);
42681 EVT SrcVT = Src.getValueType();
42682 SDValue Res = DAG.getNode(X86ISD::VPERMI, DL, SrcVT, Src, N1);
42683 return DAG.getBitcast(VT, Res);
42684 }
42685 return SDValue();
42686 }
42687 case X86ISD::SHUF128: {
42688 // If we're permuting the upper 256-bit subvectors of a concatenation, then
42689 // see if we can peek through and access the subvector directly.
42690 if (VT.is512BitVector()) {
42691 // 512-bit mask uses 4 x i2 indices - if the msb is always set then only
42692 // the upper subvector is used.
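 // e.g. (Mask & 0x0A) == 0x0A means both LHS selectors have their high bit
 // set, so only the upper 256 bits of LHS are referenced.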
42693 SDValue LHS = peekThroughBitcasts(N->getOperand(0));
42694 SDValue RHS = peekThroughBitcasts(N->getOperand(1));
42695 uint64_t Mask = N->getConstantOperandVal(2);
42696 SmallVector<SDValue> LHSOps, RHSOps;
42697 SDValue NewLHS, NewRHS;
42698 if ((Mask & 0x0A) == 0x0A &&
42699 collectConcatOps(LHS.getNode(), LHSOps, DAG) && LHSOps.size() == 2) {
42700 NewLHS = widenSubVector(LHSOps[1], false, Subtarget, DAG, DL, 512);
42701 Mask &= ~0x0A;
42702 }
42703 if ((Mask & 0xA0) == 0xA0 &&
42704 collectConcatOps(RHS.getNode(), RHSOps, DAG) && RHSOps.size() == 2) {
42705 NewRHS = widenSubVector(RHSOps[1], false, Subtarget, DAG, DL, 512);
42706 Mask &= ~0xA0;
42707 }
42708 if (NewLHS || NewRHS)
42709 return DAG.getNode(X86ISD::SHUF128, DL, VT,
42710 DAG.getBitcast(VT, NewLHS ? NewLHS : LHS),
42711 DAG.getBitcast(VT, NewRHS ? NewRHS : RHS),
42712 DAG.getTargetConstant(Mask, DL, MVT::i8));
42713 }
42714 return SDValue();
42715 }
42716 case X86ISD::VPERM2X128: {
42717 SDValue LHS = N->getOperand(0);
42718 SDValue RHS = N->getOperand(1);
42719 unsigned Imm = N.getConstantOperandVal(2) & 255;
42720
42721 // Canonicalize unary/repeated operands to LHS.
42722 if (LHS.isUndef() && !RHS.isUndef())
42723 return DAG.getNode(X86ISD::VPERM2X128, DL, VT, RHS, LHS,
42724 DAG.getTargetConstant(Imm ^ 0x22, DL, MVT::i8));
42725 if (LHS == RHS)
42726 return DAG.getNode(X86ISD::VPERM2X128, DL, VT, LHS, DAG.getUNDEF(VT),
42727 DAG.getTargetConstant(Imm & ~0x22, DL, MVT::i8));
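 // The low two bits of each selector nibble encode 0/1 = LHS.lo/hi and
 // 2/3 = RHS.lo/hi, so XORing with 0x22 retargets both selectors to the
 // swapped operand and clearing those bits points them back at LHS.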
42728
42729 // Fold vperm2x128(bitcast(x),bitcast(y),c) -> bitcast(vperm2x128(x,y,c)).
42730 if (LHS.getOpcode() == ISD::BITCAST &&
42731 (RHS.getOpcode() == ISD::BITCAST || RHS.isUndef())) {
42732 EVT SrcVT = LHS.getOperand(0).getValueType();
42733 if (RHS.isUndef() || SrcVT == RHS.getOperand(0).getValueType()) {
42734 return DAG.getBitcast(VT, DAG.getNode(X86ISD::VPERM2X128, DL, SrcVT,
42735 DAG.getBitcast(SrcVT, LHS),
42736 DAG.getBitcast(SrcVT, RHS),
42737 N->getOperand(2)));
42738 }
42739 }
42740
42741 // Fold vperm2x128(op(),op()) -> op(vperm2x128(),vperm2x128()).
42742 if (SDValue Res = canonicalizeLaneShuffleWithRepeatedOps(N, DAG, DL))
42743 return Res;
42744
42745 // Fold vperm2x128 subvector shuffle with an inner concat pattern.
42746 // vperm2x128(concat(X,Y),concat(Z,W)) --> concat X,Y etc.
42747 auto FindSubVector128 = [&](unsigned Idx) {
42748 if (Idx > 3)
42749 return SDValue();
42750 SDValue Src = peekThroughBitcasts(N.getOperand(Idx < 2 ? 0 : 1));
42751 SmallVector<SDValue> SubOps;
42752 if (collectConcatOps(Src.getNode(), SubOps, DAG) && SubOps.size() == 2)
42753 return SubOps[Idx & 1];
42754 unsigned NumElts = Src.getValueType().getVectorNumElements();
42755 if ((Idx & 1) == 1 && Src.getOpcode() == ISD::INSERT_SUBVECTOR &&
42756 Src.getOperand(1).getValueSizeInBits() == 128 &&
42757 Src.getConstantOperandAPInt(2) == (NumElts / 2)) {
42758 return Src.getOperand(1);
42759 }
42760 return SDValue();
42761 };
42762 if (SDValue SubLo = FindSubVector128(Imm & 0x0F)) {
42763 if (SDValue SubHi = FindSubVector128((Imm & 0xF0) >> 4)) {
42764 MVT SubVT = VT.getHalfNumVectorElementsVT();
42765 SubLo = DAG.getBitcast(SubVT, SubLo);
42766 SubHi = DAG.getBitcast(SubVT, SubHi);
42767 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, SubLo, SubHi);
42768 }
42769 }
42770
42771 // Attempt to match VBROADCAST*128 subvector broadcast load.
42772 if (RHS.isUndef()) {
42773 SmallVector<int, 4> Mask;
42774 DecodeVPERM2X128Mask(4, Imm, Mask);
42775 if (isUndefOrInRange(Mask, 0, 4)) {
42776 bool SplatLo = isShuffleEquivalent(Mask, {0, 1, 0, 1}, LHS);
42777 bool SplatHi = isShuffleEquivalent(Mask, {2, 3, 2, 3}, LHS);
42778 if ((SplatLo || SplatHi) && !Subtarget.hasAVX512() &&
42779 X86::mayFoldLoad(LHS, Subtarget, /*AssumeSingleUse=*/true)) {
42780 MVT MemVT = VT.getHalfNumVectorElementsVT();
42781 unsigned Ofs = SplatLo ? 0 : MemVT.getStoreSize();
42782 return getBROADCAST_LOAD(X86ISD::SUBV_BROADCAST_LOAD, DL, VT, MemVT,
42783 cast<LoadSDNode>(LHS), Ofs, DAG);
42784 }
42785 }
42786 }
42787
42788 return SDValue();
42789 }
42790 case X86ISD::PSHUFD:
42791 case X86ISD::PSHUFLW:
42792 case X86ISD::PSHUFHW: {
42793 SDValue N0 = N.getOperand(0);
42794 SDValue N1 = N.getOperand(1);
42795 if (N0->hasOneUse()) {
42796 SDValue V = peekThroughOneUseBitcasts(N0);
42797 switch (V.getOpcode()) {
42798 case X86ISD::VSHL:
42799 case X86ISD::VSRL:
42800 case X86ISD::VSRA:
42801 case X86ISD::VSHLI:
42802 case X86ISD::VSRLI:
42803 case X86ISD::VSRAI:
42804 case X86ISD::VROTLI:
42805 case X86ISD::VROTRI: {
42806 MVT InnerVT = V.getSimpleValueType();
42807 if (InnerVT.getScalarSizeInBits() <= VT.getScalarSizeInBits()) {
42808 SDValue Res = DAG.getNode(Opcode, DL, VT,
42809 DAG.getBitcast(VT, V.getOperand(0)), N1);
42810 Res = DAG.getBitcast(InnerVT, Res);
42811 Res = DAG.getNode(V.getOpcode(), DL, InnerVT, Res, V.getOperand(1));
42812 return DAG.getBitcast(VT, Res);
42813 }
42814 break;
42815 }
42816 }
42817 }
42818
42819 Mask = getPSHUFShuffleMask(N);
42820 assert(Mask.size() == 4);
42821 break;
42822 }
42823 case X86ISD::MOVSD:
42824 case X86ISD::MOVSH:
42825 case X86ISD::MOVSS: {
42826 SDValue N0 = N.getOperand(0);
42827 SDValue N1 = N.getOperand(1);
42828
42829 // Canonicalize scalar FPOps:
42830 // MOVS*(N0, OP(N0, N1)) --> MOVS*(N0, SCALAR_TO_VECTOR(OP(N0[0], N1[0])))
42831 // If commutable, allow OP(N1[0], N0[0]).
42832 unsigned Opcode1 = N1.getOpcode();
42833 if (Opcode1 == ISD::FADD || Opcode1 == ISD::FMUL || Opcode1 == ISD::FSUB ||
42834 Opcode1 == ISD::FDIV) {
42835 SDValue N10 = N1.getOperand(0);
42836 SDValue N11 = N1.getOperand(1);
42837 if (N10 == N0 ||
42838 (N11 == N0 && (Opcode1 == ISD::FADD || Opcode1 == ISD::FMUL))) {
42839 if (N10 != N0)
42840 std::swap(N10, N11);
42841 MVT SVT = VT.getVectorElementType();
42842 SDValue ZeroIdx = DAG.getVectorIdxConstant(0, DL);
42843 N10 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, SVT, N10, ZeroIdx);
42844 N11 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, SVT, N11, ZeroIdx);
42845 SDValue Scl = DAG.getNode(Opcode1, DL, SVT, N10, N11);
42846 SDValue SclVec = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VT, Scl);
42847 return DAG.getNode(Opcode, DL, VT, N0, SclVec);
42848 }
42849 }
42850
42851 return SDValue();
42852 }
42853 case X86ISD::INSERTPS: {
42854 assert(VT == MVT::v4f32 && "INSERTPS ValueType must be MVT::v4f32");
42855 SDValue Op0 = N.getOperand(0);
42856 SDValue Op1 = N.getOperand(1);
42857 unsigned InsertPSMask = N.getConstantOperandVal(2);
42858 unsigned SrcIdx = (InsertPSMask >> 6) & 0x3;
42859 unsigned DstIdx = (InsertPSMask >> 4) & 0x3;
42860 unsigned ZeroMask = InsertPSMask & 0xF;
42861
42862 // If we zero out all elements from Op0 then we don't need to reference it.
42863 if (((ZeroMask | (1u << DstIdx)) == 0xF) && !Op0.isUndef())
42864 return DAG.getNode(X86ISD::INSERTPS, DL, VT, DAG.getUNDEF(VT), Op1,
42865 DAG.getTargetConstant(InsertPSMask, DL, MVT::i8));
42866
42867 // If we zero out the element from Op1 then we don't need to reference it.
42868 if ((ZeroMask & (1u << DstIdx)) && !Op1.isUndef())
42869 return DAG.getNode(X86ISD::INSERTPS, DL, VT, Op0, DAG.getUNDEF(VT),
42870 DAG.getTargetConstant(InsertPSMask, DL, MVT::i8));
42871
42872 // Attempt to merge insertps Op1 with an inner target shuffle node.
42873 SmallVector<int, 8> TargetMask1;
42874 SmallVector<SDValue, 2> Ops1;
42875 APInt KnownUndef1, KnownZero1;
42876 if (getTargetShuffleAndZeroables(Op1, TargetMask1, Ops1, KnownUndef1,
42877 KnownZero1)) {
42878 if (KnownUndef1[SrcIdx] || KnownZero1[SrcIdx]) {
42879 // Zero/UNDEF insertion - zero out element and remove dependency.
42880 InsertPSMask |= (1u << DstIdx);
42881 return DAG.getNode(X86ISD::INSERTPS, DL, VT, Op0, DAG.getUNDEF(VT),
42882 DAG.getTargetConstant(InsertPSMask, DL, MVT::i8));
42883 }
42884 // Update insertps mask srcidx and reference the source input directly.
42885 int M = TargetMask1[SrcIdx];
42886 assert(0 <= M && M < 8 && "Shuffle index out of range");
42887 InsertPSMask = (InsertPSMask & 0x3f) | ((M & 0x3) << 6);
42888 Op1 = Ops1[M < 4 ? 0 : 1];
42889 return DAG.getNode(X86ISD::INSERTPS, DL, VT, Op0, Op1,
42890 DAG.getTargetConstant(InsertPSMask, DL, MVT::i8));
42891 }
42892
42893 // Attempt to merge insertps Op0 with an inner target shuffle node.
42894 SmallVector<int, 8> TargetMask0;
42895 SmallVector<SDValue, 2> Ops0;
42896 APInt KnownUndef0, KnownZero0;
42897 if (getTargetShuffleAndZeroables(Op0, TargetMask0, Ops0, KnownUndef0,
42898 KnownZero0)) {
42899 bool Updated = false;
42900 bool UseInput00 = false;
42901 bool UseInput01 = false;
42902 for (int i = 0; i != 4; ++i) {
42903 if ((InsertPSMask & (1u << i)) || (i == (int)DstIdx)) {
42904 // No change if element is already zero or the inserted element.
42905 continue;
42906 }
42907
42908 if (KnownUndef0[i] || KnownZero0[i]) {
42909 // If the target mask is undef/zero then we must zero the element.
42910 InsertPSMask |= (1u << i);
42911 Updated = true;
42912 continue;
42913 }
42914
42915 // The input vector element must be inline.
42916 int M = TargetMask0[i];
42917 if (M != i && M != (i + 4))
42918 return SDValue();
42919
42920 // Determine which inputs of the target shuffle we're using.
42921 UseInput00 |= (0 <= M && M < 4);
42922 UseInput01 |= (4 <= M);
42923 }
42924
42925 // If we're not using both inputs of the target shuffle then use the
42926 // referenced input directly.
42927 if (UseInput00 && !UseInput01) {
42928 Updated = true;
42929 Op0 = Ops0[0];
42930 } else if (!UseInput00 && UseInput01) {
42931 Updated = true;
42932 Op0 = Ops0[1];
42933 }
42934
42935 if (Updated)
42936 return DAG.getNode(X86ISD::INSERTPS, DL, VT, Op0, Op1,
42937 DAG.getTargetConstant(InsertPSMask, DL, MVT::i8));
42938 }
42939
42940 // If we're inserting an element from a vbroadcast load, fold the
42941 // load into the X86insertps instruction. We need to convert the scalar
42942 // load to a vector and clear the source lane of the INSERTPS control.
42943 if (Op1.getOpcode() == X86ISD::VBROADCAST_LOAD && Op1.hasOneUse()) {
42944 auto *MemIntr = cast<MemIntrinsicSDNode>(Op1);
42945 if (MemIntr->getMemoryVT().getScalarSizeInBits() == 32) {
42946 SDValue Load = DAG.getLoad(MVT::f32, DL, MemIntr->getChain(),
42947 MemIntr->getBasePtr(),
42948 MemIntr->getMemOperand());
42949 SDValue Insert = DAG.getNode(X86ISD::INSERTPS, DL, VT, Op0,
42950 DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VT,
42951 Load),
42952 DAG.getTargetConstant(InsertPSMask & 0x3f, DL, MVT::i8));
42953 DAG.ReplaceAllUsesOfValueWith(SDValue(MemIntr, 1), Load.getValue(1));
42954 return Insert;
42955 }
42956 }
42957
42958 return SDValue();
42959 }
42960 case X86ISD::VPERMV: {
42961 // Combine VPERMV to VPERMV3 if the source operand can be freely split.
42962 SmallVector<int, 32> Mask;
42963 SmallVector<SDValue, 2> SrcOps, SubOps;
42964 SDValue Src = peekThroughBitcasts(N.getOperand(1));
42965 if ((Subtarget.hasVLX() || VT.is512BitVector()) &&
42966 getTargetShuffleMask(N, /*AllowSentinelZero=*/false, SrcOps, Mask) &&
42967 collectConcatOps(Src.getNode(), SubOps, DAG)) {
42968 assert(Mask.size() == NumElts && "Unexpected shuffle mask size");
42969 assert(SrcOps.size() == 1 && "Unexpected shuffle ops");
42970 assert((SubOps.size() == 2 || SubOps.size() == 4) &&
42971 "Unexpected split ops");
42972 // Bail if we were permuting a widened vector.
42973 if (SubOps[1].isUndef() &&
42974 (SubOps.size() == 2 || (SubOps[2].isUndef() && SubOps[3].isUndef())))
42975 return SDValue();
42976 // Bail if any subops would have folded into the concat.
42977 if (any_of(SubOps, isShuffleFoldableLoad))
42978 return SDValue();
42979 // Concat 4x128 back to 2x256.
42980 if (SubOps.size() == 4) {
42981 SubOps[0] = concatSubVectors(SubOps[0], SubOps[1], DAG, DL);
42982 SubOps[1] = concatSubVectors(SubOps[2], SubOps[3], DAG, DL);
42983 }
42984 // Convert mask to 2 operand shuffle.
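 // Upper-half elements now come from the second source, whose index space
 // starts at NumElts, so rebase: NumElts + (M - HalfElts) == M + HalfElts.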
42985 int HalfElts = NumElts / 2;
42986 for (int &M : Mask)
42987 M += M >= HalfElts ? HalfElts : 0;
42988 SDValue Lo = widenSubVector(SubOps[0], false, Subtarget, DAG, DL,
42989 VT.getSizeInBits());
42990 SDValue Hi = widenSubVector(SubOps[1], false, Subtarget, DAG, DL,
42991 VT.getSizeInBits());
42992 return lowerShuffleWithPERMV(DL, VT, Mask, DAG.getBitcast(VT, Lo),
42993 DAG.getBitcast(VT, Hi), Subtarget, DAG);
42994 }
42995 return SDValue();
42996 }
42997 case X86ISD::VPERMV3: {
42998 MVT WideVT = VT.getDoubleNumVectorElementsVT();
42999 bool CanConcat = VT.is128BitVector() ||
43000 (VT.is256BitVector() && Subtarget.useAVX512Regs());
43001 SmallVector<SDValue, 2> SrcOps;
43002 SmallVector<int, 32> Mask;
43003 if (getTargetShuffleMask(N, /*AllowSentinelZero=*/false, SrcOps, Mask)) {
43004 assert(Mask.size() == NumElts && "Unexpected shuffle mask size");
43005 SDValue V1 = peekThroughBitcasts(N.getOperand(0));
43006 SDValue V2 = peekThroughBitcasts(N.getOperand(2));
43007 // Canonicalize to VPERMV if both sources are the same.
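 // Indices in [NumElts, 2*NumElts) refer to the second source; masking with
 // (NumElts - 1) folds them back onto the single shared source.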
43008 if (V1 == V2) {
43009 for (int &M : Mask)
43010 M = (M < 0 ? M : (M & (NumElts - 1)));
43011 return lowerShuffleWithPERMV(DL, VT, Mask, N.getOperand(0),
43012 DAG.getUNDEF(VT), Subtarget, DAG);
43013 }
43014 // If sources are half width, then concat and use VPERMV with adjusted
43015 // mask.
43016 SDValue Ops[2];
43017 MVT HalfVT = VT.getHalfNumVectorElementsVT();
43018 if (sd_match(V1,
43019 m_InsertSubvector(m_Undef(), m_Value(Ops[0]), m_Zero())) &&
43020 sd_match(V2,
43021 m_InsertSubvector(m_Undef(), m_Value(Ops[1]), m_Zero())) &&
43022 Ops[0].getValueType() == HalfVT && Ops[1].getValueType() == HalfVT) {
43023 if (SDValue ConcatSrc =
43024 combineConcatVectorOps(DL, VT, Ops, DAG, Subtarget)) {
43025 for (int &M : Mask)
43026 M = (M < (int)NumElts ? M : (M - (NumElts / 2)));
43027 return lowerShuffleWithPERMV(DL, VT, Mask, ConcatSrc,
43028 DAG.getUNDEF(VT), Subtarget, DAG);
43029 }
43030 }
43031 // Commute foldable source to the RHS.
43032 if (isShuffleFoldableLoad(N.getOperand(0)) &&
43033 !isShuffleFoldableLoad(N.getOperand(2))) {
43034 ShuffleVectorSDNode::commuteMask(Mask);
43035 return lowerShuffleWithPERMV(DL, VT, Mask, N.getOperand(2),
43036 N.getOperand(0), Subtarget, DAG);
43037 }
43038 // Combine VPERMV3 to widened VPERMV if the two source operands can be
43039 // freely concatenated, with a commuted shuffle mask.
43040 if (CanConcat) {
43041 if (SDValue ConcatSrc = combineConcatVectorOps(
43042 DL, WideVT, {N.getOperand(2), N.getOperand(0)}, DAG,
43043 Subtarget)) {
43044 ShuffleVectorSDNode::commuteMask(Mask);
43045 Mask.append(NumElts, SM_SentinelUndef);
43046 SDValue Perm =
43047 lowerShuffleWithPERMV(DL, WideVT, Mask, ConcatSrc,
43048 DAG.getUNDEF(WideVT), Subtarget, DAG);
43049 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Perm,
43050 DAG.getVectorIdxConstant(0, DL));
43051 }
43052 }
43053 }
43054 // Combine VPERMV3 to widened VPERMV if the two source operands can be
43055 // freely concatenated.
43056 if (CanConcat) {
43057 if (SDValue ConcatSrc = combineConcatVectorOps(
43058 DL, WideVT, {N.getOperand(0), N.getOperand(2)}, DAG, Subtarget)) {
43059 SDValue Mask = widenSubVector(N.getOperand(1), false, Subtarget, DAG,
43060 DL, WideVT.getSizeInBits());
43061 SDValue Perm = DAG.getNode(X86ISD::VPERMV, DL, WideVT, Mask, ConcatSrc);
43062 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Perm,
43063 DAG.getVectorIdxConstant(0, DL));
43064 }
43065 }
43066 return SDValue();
43067 }
43068 default:
43069 return SDValue();
43070 }
43071
43072 // Nuke no-op shuffles that show up after combining.
43073 if (isNoopShuffleMask(Mask))
43074 return N.getOperand(0);
43075
43076 // Look for simplifications involving one or two shuffle instructions.
43077 SDValue V = N.getOperand(0);
43078 switch (N.getOpcode()) {
43079 default:
43080 break;
43081 case X86ISD::PSHUFLW:
43082 case X86ISD::PSHUFHW:
43083 assert(VT.getVectorElementType() == MVT::i16 && "Bad word shuffle type!");
43084
43085 // See if this reduces to a PSHUFD which is no more expensive and can
43086 // combine with more operations. Note that it has to at least flip the
43087 // dwords as otherwise it would have been removed as a no-op.
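 // e.g. PSHUFLW<2,3,0,1> swaps words {0,1} with {2,3}, which is the same as
 // PSHUFD<1,0,2,3> on the register viewed as v4i32.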
43088 if (ArrayRef<int>(Mask).equals({2, 3, 0, 1})) {
43089 int DMask[] = {0, 1, 2, 3};
43090 int DOffset = N.getOpcode() == X86ISD::PSHUFLW ? 0 : 2;
43091 DMask[DOffset + 0] = DOffset + 1;
43092 DMask[DOffset + 1] = DOffset + 0;
43093 MVT DVT = MVT::getVectorVT(MVT::i32, NumElts / 2);
43094 V = DAG.getBitcast(DVT, V);
43095 V = DAG.getNode(X86ISD::PSHUFD, DL, DVT, V,
43096 getV4X86ShuffleImm8ForMask(DMask, DL, DAG));
43097 return DAG.getBitcast(VT, V);
43098 }
43099
43100 // Look for shuffle patterns which can be implemented as a single unpack.
43101 // FIXME: This doesn't handle the location of the PSHUFD generically, and
43102 // only works when we have a PSHUFD followed by two half-shuffles.
43103 if (Mask[0] == Mask[1] && Mask[2] == Mask[3] &&
43104 (V.getOpcode() == X86ISD::PSHUFLW ||
43105 V.getOpcode() == X86ISD::PSHUFHW) &&
43106 V.getOpcode() != N.getOpcode() &&
43107 V.hasOneUse() && V.getOperand(0).hasOneUse()) {
43108 SDValue D = peekThroughOneUseBitcasts(V.getOperand(0));
43109 if (D.getOpcode() == X86ISD::PSHUFD) {
43110 SmallVector<int, 4> VMask = getPSHUFShuffleMask(V);
43111 SmallVector<int, 4> DMask = getPSHUFShuffleMask(D);
43112 int NOffset = N.getOpcode() == X86ISD::PSHUFLW ? 0 : 4;
43113 int VOffset = V.getOpcode() == X86ISD::PSHUFLW ? 0 : 4;
43114 int WordMask[8];
43115 for (int i = 0; i < 4; ++i) {
43116 WordMask[i + NOffset] = Mask[i] + NOffset;
43117 WordMask[i + VOffset] = VMask[i] + VOffset;
43118 }
43119 // Map the word mask through the DWord mask.
43120 int MappedMask[8];
43121 for (int i = 0; i < 8; ++i)
43122 MappedMask[i] = 2 * DMask[WordMask[i] / 2] + WordMask[i] % 2;
43123 if (ArrayRef<int>(MappedMask).equals({0, 0, 1, 1, 2, 2, 3, 3}) ||
43124 ArrayRef<int>(MappedMask).equals({4, 4, 5, 5, 6, 6, 7, 7})) {
43125 // We can replace all three shuffles with an unpack.
43126 V = DAG.getBitcast(VT, D.getOperand(0));
43127 return DAG.getNode(MappedMask[0] == 0 ? X86ISD::UNPCKL
43128 : X86ISD::UNPCKH,
43129 DL, VT, V, V);
43130 }
43131 }
43132 }
43133
43134 break;
43135
43136 case X86ISD::PSHUFD:
43137 if (SDValue NewN = combineRedundantDWordShuffle(N, Mask, DL, DAG))
43138 return NewN;
43139
43140 break;
43141 }
43142
43143 return SDValue();
43144}
43145
43146/// Checks if the shuffle mask takes subsequent elements
43147/// alternately from two vectors.
43148/// For example <0, 5, 2, 7> or <8, 1, 10, 3, 12, 5, 14, 7> are both correct.
43149static bool isAddSubOrSubAddMask(ArrayRef<int> Mask, bool &Op0Even) {
43150
43151 int ParitySrc[2] = {-1, -1};
43152 unsigned Size = Mask.size();
43153 for (unsigned i = 0; i != Size; ++i) {
43154 int M = Mask[i];
43155 if (M < 0)
43156 continue;
43157
43158 // Make sure we are using the matching element from the input.
43159 if ((M % Size) != i)
43160 return false;
43161
43162 // Make sure we use the same input for all elements of the same parity.
43163 int Src = M / Size;
43164 if (ParitySrc[i % 2] >= 0 && ParitySrc[i % 2] != Src)
43165 return false;
43166 ParitySrc[i % 2] = Src;
43167 }
43168
43169 // Make sure each input is used.
43170 if (ParitySrc[0] < 0 || ParitySrc[1] < 0 || ParitySrc[0] == ParitySrc[1])
43171 return false;
43172
43173 Op0Even = ParitySrc[0] == 0;
43174 return true;
43175}
43176
43177/// Returns true iff the shuffle node \p N can be replaced with ADDSUB(SUBADD)
43178/// operation. If true is returned then the operands of ADDSUB(SUBADD) operation
43179/// are written to the parameters \p Opnd0 and \p Opnd1.
43180///
43181/// We combine shuffle to ADDSUB(SUBADD) directly on the abstract vector shuffle nodes
43182/// so it is easier to generically match. We also insert dummy vector shuffle
43183/// nodes for the operands which explicitly discard the lanes which are unused
43184/// by this operation to try to flow through the rest of the combiner
43185/// the fact that they're unused.
43186static bool isAddSubOrSubAdd(SDNode *N, const X86Subtarget &Subtarget,
43187 SelectionDAG &DAG, SDValue &Opnd0, SDValue &Opnd1,
43188 bool &IsSubAdd, bool &HasAllowContract) {
43189
43190 EVT VT = N->getValueType(0);
43191 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
43192 if (!Subtarget.hasSSE3() || !TLI.isTypeLegal(VT) ||
43193 !VT.getSimpleVT().isFloatingPoint())
43194 return false;
43195
43196 // We only handle target-independent shuffles.
43197 // FIXME: It would be easy and harmless to use the target shuffle mask
43198 // extraction tool to support more.
43199 if (N->getOpcode() != ISD::VECTOR_SHUFFLE)
43200 return false;
43201
43202 SDValue V1 = N->getOperand(0);
43203 SDValue V2 = N->getOperand(1);
43204
43205 // Make sure we have an FADD and an FSUB.
43206 if ((V1.getOpcode() != ISD::FADD && V1.getOpcode() != ISD::FSUB) ||
43207 (V2.getOpcode() != ISD::FADD && V2.getOpcode() != ISD::FSUB) ||
43208 V1.getOpcode() == V2.getOpcode())
43209 return false;
43210
43211 // If there are other uses of these operations we can't fold them.
43212 if (!V1->hasOneUse() || !V2->hasOneUse())
43213 return false;
43214
43215 // Ensure that both operations have the same operands. Note that we can
43216 // commute the FADD operands.
43217 SDValue LHS, RHS;
43218 if (V1.getOpcode() == ISD::FSUB) {
43219 LHS = V1->getOperand(0); RHS = V1->getOperand(1);
43220 if ((V2->getOperand(0) != LHS || V2->getOperand(1) != RHS) &&
43221 (V2->getOperand(0) != RHS || V2->getOperand(1) != LHS))
43222 return false;
43223 } else {
43224 assert(V2.getOpcode() == ISD::FSUB && "Unexpected opcode");
43225 LHS = V2->getOperand(0); RHS = V2->getOperand(1);
43226 if ((V1->getOperand(0) != LHS || V1->getOperand(1) != RHS) &&
43227 (V1->getOperand(0) != RHS || V1->getOperand(1) != LHS))
43228 return false;
43229 }
43230
43231 ArrayRef<int> Mask = cast<ShuffleVectorSDNode>(N)->getMask();
43232 bool Op0Even;
43233 if (!isAddSubOrSubAddMask(Mask, Op0Even))
43234 return false;
43235
43236 // It's a subadd if the vector in the even parity is an FADD.
43237 IsSubAdd = Op0Even ? V1->getOpcode() == ISD::FADD
43238 : V2->getOpcode() == ISD::FADD;
43239 HasAllowContract =
43240 V1->getFlags().hasAllowContract() && V2->getFlags().hasAllowContract();
43241
43242 Opnd0 = LHS;
43243 Opnd1 = RHS;
43244 return true;
43245}
43246
43247/// Combine shuffle of two fma nodes into FMAddSub or FMSubAdd.
43248static SDValue combineShuffleToFMAddSub(SDNode *N, const SDLoc &DL,
43249 const X86Subtarget &Subtarget,
43250 SelectionDAG &DAG) {
43251 // We only handle target-independent shuffles.
43252 // FIXME: It would be easy and harmless to use the target shuffle mask
43253 // extraction tool to support more.
43254 if (N->getOpcode() != ISD::VECTOR_SHUFFLE)
43255 return SDValue();
43256
43257 MVT VT = N->getSimpleValueType(0);
43258 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
43259 if (!Subtarget.hasAnyFMA() || !TLI.isTypeLegal(VT))
43260 return SDValue();
43261
43262 // We're trying to match (shuffle fma(a, b, c), X86Fmsub(a, b, c)).
43263 SDValue Op0 = N->getOperand(0);
43264 SDValue Op1 = N->getOperand(1);
43265 SDValue FMAdd = Op0, FMSub = Op1;
43266 if (FMSub.getOpcode() != X86ISD::FMSUB)
43267 std::swap(FMAdd, FMSub);
43268
43269 if (FMAdd.getOpcode() != ISD::FMA || FMSub.getOpcode() != X86ISD::FMSUB ||
43270 FMAdd.getOperand(0) != FMSub.getOperand(0) || !FMAdd.hasOneUse() ||
43271 FMAdd.getOperand(1) != FMSub.getOperand(1) || !FMSub.hasOneUse() ||
43272 FMAdd.getOperand(2) != FMSub.getOperand(2))
43273 return SDValue();
43274
43275 // Check for correct shuffle mask.
43276 ArrayRef<int> Mask = cast<ShuffleVectorSDNode>(N)->getMask();
43277 bool Op0Even;
43278 if (!isAddSubOrSubAddMask(Mask, Op0Even))
43279 return SDValue();
43280
43281 // FMAddSub takes zeroth operand from FMSub node.
43282 bool IsSubAdd = Op0Even ? Op0 == FMAdd : Op1 == FMAdd;
43283 unsigned Opcode = IsSubAdd ? X86ISD::FMSUBADD : X86ISD::FMADDSUB;
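// FMADDSUB subtracts the addend in even lanes and adds it in odd lanes
// (a*b-c, a*b+c, ...), while FMSUBADD does the opposite. So if the even
// result lanes are taken from the FMA (the adding node), the combined node
// must be FMSUBADD; otherwise FMADDSUB.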
43284 return DAG.getNode(Opcode, DL, VT, FMAdd.getOperand(0), FMAdd.getOperand(1),
43285 FMAdd.getOperand(2));
43286}
43287
43288/// Try to combine a shuffle into a target-specific add-sub or
43289/// mul-add-sub node.
43290 static SDValue combineShuffleToAddSubOrFMAddSub(SDNode *N, const SDLoc &DL,
43291 const X86Subtarget &Subtarget,
43292 SelectionDAG &DAG) {
43293 if (SDValue V = combineShuffleToFMAddSub(N, DL, Subtarget, DAG))
43294 return V;
43295
43296 SDValue Opnd0, Opnd1;
43297 bool IsSubAdd;
43298 bool HasAllowContract;
43299 if (!isAddSubOrSubAdd(N, Subtarget, DAG, Opnd0, Opnd1, IsSubAdd,
43300 HasAllowContract))
43301 return SDValue();
43302
43303 MVT VT = N->getSimpleValueType(0);
43304
43305 // Try to generate X86ISD::FMADDSUB node here.
43306 SDValue Opnd2;
43307 if (isFMAddSubOrFMSubAdd(Subtarget, DAG, Opnd0, Opnd1, Opnd2, 2,
43308 HasAllowContract)) {
43309 unsigned Opc = IsSubAdd ? X86ISD::FMSUBADD : X86ISD::FMADDSUB;
43310 return DAG.getNode(Opc, DL, VT, Opnd0, Opnd1, Opnd2);
43311 }
43312
43313 if (IsSubAdd)
43314 return SDValue();
43315
43316 // Do not generate X86ISD::ADDSUB node for 512-bit types even though
43317 // the ADDSUB idiom has been successfully recognized. There are no known
43318 // X86 targets with 512-bit ADDSUB instructions!
43319 if (VT.is512BitVector())
43320 return SDValue();
43321
43322 // Do not generate X86ISD::ADDSUB node for FP16's vector types even though
43323 // the ADDSUB idiom has been successfully recognized. There are no known
43324 // X86 targets with FP16 ADDSUB instructions!
43325 if (VT.getVectorElementType() == MVT::f16)
43326 return SDValue();
43327
43328 return DAG.getNode(X86ISD::ADDSUB, DL, VT, Opnd0, Opnd1);
43329}
43330
43331/// If we have a shuffle of AVX/AVX512 (256/512 bit) vectors that only uses the
43332/// low half of each source vector and does not set any high half elements in
43333/// the destination vector, narrow the shuffle to half its original size.
43334 static SDValue narrowShuffle(ShuffleVectorSDNode *Shuf, SelectionDAG &DAG) {
43335 EVT VT = Shuf->getValueType(0);
43336 if (!DAG.getTargetLoweringInfo().isTypeLegal(Shuf->getValueType(0)))
43337 return SDValue();
43338 if (!VT.is256BitVector() && !VT.is512BitVector())
43339 return SDValue();
43340
43341 // See if we can ignore all of the high elements of the shuffle.
43342 ArrayRef<int> Mask = Shuf->getMask();
43343 if (!isUndefUpperHalf(Mask))
43344 return SDValue();
43345
43346 // Check if the shuffle mask accesses only the low half of each input vector
43347 // (half-index output is 0 or 2).
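// For example, a v8f32 shuffle with mask <0,1,8,9,u,u,u,u> reads only the
// low v4f32 half of each source and writes only the low half of the result,
// so it can be performed as a v4f32 shuffle of the two low halves.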
43348 int HalfIdx1, HalfIdx2;
43349 SmallVector<int, 8> HalfMask(Mask.size() / 2);
43350 if (!getHalfShuffleMask(Mask, HalfMask, HalfIdx1, HalfIdx2) ||
43351 (HalfIdx1 % 2 == 1) || (HalfIdx2 % 2 == 1))
43352 return SDValue();
43353
43354 // Create a half-width shuffle to replace the unnecessarily wide shuffle.
43355 // The trick is knowing that all of the insert/extract are actually free
43356 // subregister (zmm<->ymm or ymm<->xmm) ops. That leaves us with a shuffle
43357 // of narrow inputs into a narrow output, and that is always cheaper than
43358 // the wide shuffle that we started with.
43359 return getShuffleHalfVectors(SDLoc(Shuf), Shuf->getOperand(0),
43360 Shuf->getOperand(1), HalfMask, HalfIdx1,
43361 HalfIdx2, false, DAG, /*UseConcat*/ true);
43362}
43363
43364 static SDValue combineShuffle(SDNode *N, SelectionDAG &DAG,
43365 TargetLowering::DAGCombinerInfo &DCI,
43366 const X86Subtarget &Subtarget) {
43367 if (auto *Shuf = dyn_cast<ShuffleVectorSDNode>(N))
43368 if (SDValue V = narrowShuffle(Shuf, DAG))
43369 return V;
43370
43371 // If we have legalized the vector types, look for blends of FADD and FSUB
43372 // nodes that we can fuse into an ADDSUB, FMADDSUB, or FMSUBADD node.
43373 SDLoc dl(N);
43374 EVT VT = N->getValueType(0);
43375 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
43376 if (TLI.isTypeLegal(VT) && !isSoftF16(VT, Subtarget))
43377 if (SDValue AddSub =
43378 combineShuffleToAddSubOrFMAddSub(N, dl, Subtarget, DAG))
43379 return AddSub;
43380
43381 // Attempt to combine into a vector load/broadcast.
43382 if (SDValue LD = combineToConsecutiveLoads(
43383 VT, SDValue(N, 0), dl, DAG, Subtarget, /*IsAfterLegalize*/ true))
43384 return LD;
43385
43386 if (isTargetShuffle(N->getOpcode())) {
43387 SDValue Op(N, 0);
43388 if (SDValue Shuffle = combineTargetShuffle(Op, dl, DAG, DCI, Subtarget))
43389 return Shuffle;
43390
43391 // Try recursively combining arbitrary sequences of x86 shuffle
43392 // instructions into higher-order shuffles. We do this after combining
43393 // specific PSHUF instruction sequences into their minimal form so that we
43394 // can evaluate how many specialized shuffle instructions are involved in
43395 // a particular chain.
43396 if (SDValue Res = combineX86ShufflesRecursively(Op, DAG, Subtarget))
43397 return Res;
43398
43399 // Simplify source operands based on shuffle mask.
43400 // TODO - merge this into combineX86ShufflesRecursively.
43401 APInt DemandedElts = APInt::getAllOnes(VT.getVectorNumElements());
43402 if (TLI.SimplifyDemandedVectorElts(Op, DemandedElts, DCI))
43403 return SDValue(N, 0);
43404
43405 // Canonicalize SHUFFLE(UNARYOP(X)) -> UNARYOP(SHUFFLE(X)).
43406 // Canonicalize SHUFFLE(BINOP(X,Y)) -> BINOP(SHUFFLE(X),SHUFFLE(Y)).
43407 // Perform this after other shuffle combines to allow inner shuffles to be
43408 // combined away first.
43409 if (SDValue BinOp = canonicalizeShuffleWithOp(Op, DAG, dl))
43410 return BinOp;
43411 }
43412
43413 return SDValue();
43414}
43415
43416// Simplify variable target shuffle masks based on the demanded elements.
43417// TODO: Handle DemandedBits in mask indices as well?
43418 bool X86TargetLowering::SimplifyDemandedVectorEltsForTargetShuffle(
43419 SDValue Op, const APInt &DemandedElts, unsigned MaskIndex,
43420 TargetLowering::TargetLoweringOpt &TLO, unsigned Depth) const {
43421 // If we're demanding all elements don't bother trying to simplify the mask.
43422 unsigned NumElts = DemandedElts.getBitWidth();
43423 if (DemandedElts.isAllOnes())
43424 return false;
43425
43426 SDValue Mask = Op.getOperand(MaskIndex);
43427 if (!Mask.hasOneUse())
43428 return false;
43429
43430 // Attempt to generically simplify the variable shuffle mask.
43431 APInt MaskUndef, MaskZero;
43432 if (SimplifyDemandedVectorElts(Mask, DemandedElts, MaskUndef, MaskZero, TLO,
43433 Depth + 1))
43434 return true;
43435
43436 // Attempt to extract+simplify a (constant pool load) shuffle mask.
43437 // TODO: Support other types from getTargetShuffleMaskIndices?
43438 SDValue BC = peekThroughOneUseBitcasts(Mask);
43439 EVT BCVT = BC.getValueType();
43440 auto *Load = dyn_cast<LoadSDNode>(BC);
43441 if (!Load || !Load->getBasePtr().hasOneUse())
43442 return false;
43443
43444 const Constant *C = getTargetConstantFromNode(Load);
43445 if (!C)
43446 return false;
43447
43448 Type *CTy = C->getType();
43449 if (!CTy->isVectorTy() ||
43450 CTy->getPrimitiveSizeInBits() != Mask.getValueSizeInBits())
43451 return false;
43452
43453 // Handle scaling for i64 elements on 32-bit targets.
43454 unsigned NumCstElts = cast<FixedVectorType>(CTy)->getNumElements();
43455 if (NumCstElts != NumElts && NumCstElts != (NumElts * 2))
43456 return false;
43457 unsigned Scale = NumCstElts / NumElts;
43458
43459 // Simplify mask if we have an undemanded element that is not undef.
43460 bool Simplified = false;
43461 SmallVector<Constant *, 32> ConstVecOps;
43462 for (unsigned i = 0; i != NumCstElts; ++i) {
43463 Constant *Elt = C->getAggregateElement(i);
43464 if (!DemandedElts[i / Scale] && !isa<UndefValue>(Elt)) {
43465 ConstVecOps.push_back(UndefValue::get(Elt->getType()));
43466 Simplified = true;
43467 continue;
43468 }
43469 ConstVecOps.push_back(Elt);
43470 }
43471 if (!Simplified)
43472 return false;
43473
43474 // Generate new constant pool entry + legalize immediately for the load.
43475 SDLoc DL(Op);
43476 SDValue CV = TLO.DAG.getConstantPool(ConstantVector::get(ConstVecOps), BCVT);
43477 SDValue LegalCV = LowerConstantPool(CV, TLO.DAG);
43478 SDValue NewMask = TLO.DAG.getLoad(
43479 BCVT, DL, TLO.DAG.getEntryNode(), LegalCV,
43480 MachinePointerInfo::getConstantPool(TLO.DAG.getMachineFunction()),
43481 Load->getAlign());
43482 return TLO.CombineTo(Mask, TLO.DAG.getBitcast(Mask.getValueType(), NewMask));
43483}
43484
43485 bool X86TargetLowering::SimplifyDemandedVectorEltsForTargetNode(
43486 SDValue Op, const APInt &DemandedElts, APInt &KnownUndef, APInt &KnownZero,
43487 TargetLoweringOpt &TLO, unsigned Depth) const {
43488 int NumElts = DemandedElts.getBitWidth();
43489 unsigned Opc = Op.getOpcode();
43490 EVT VT = Op.getValueType();
43491
43492 // Handle special case opcodes.
43493 switch (Opc) {
43494 case X86ISD::PMULDQ:
43495 case X86ISD::PMULUDQ: {
43496 APInt LHSUndef, LHSZero;
43497 APInt RHSUndef, RHSZero;
43498 SDValue LHS = Op.getOperand(0);
43499 SDValue RHS = Op.getOperand(1);
43500 if (SimplifyDemandedVectorElts(LHS, DemandedElts, LHSUndef, LHSZero, TLO,
43501 Depth + 1))
43502 return true;
43503 if (SimplifyDemandedVectorElts(RHS, DemandedElts, RHSUndef, RHSZero, TLO,
43504 Depth + 1))
43505 return true;
43506 // Multiply by zero.
43507 KnownZero = LHSZero | RHSZero;
43508 break;
43509 }
43510 case X86ISD::VPMADDUBSW:
43511 case X86ISD::VPMADDWD: {
43512 APInt LHSUndef, LHSZero;
43513 APInt RHSUndef, RHSZero;
43514 SDValue LHS = Op.getOperand(0);
43515 SDValue RHS = Op.getOperand(1);
43516 APInt DemandedSrcElts = APIntOps::ScaleBitMask(DemandedElts, 2 * NumElts);
43517
43518 if (SimplifyDemandedVectorElts(LHS, DemandedSrcElts, LHSUndef, LHSZero, TLO,
43519 Depth + 1))
43520 return true;
43521 if (SimplifyDemandedVectorElts(RHS, DemandedSrcElts, RHSUndef, RHSZero, TLO,
43522 Depth + 1))
43523 return true;
43524
43525 // TODO: Multiply by zero.
43526
43527 // If RHS/LHS elements are known zero then we don't need the LHS/RHS equivalent.
43528 APInt DemandedLHSElts = DemandedSrcElts & ~RHSZero;
43529 if (SimplifyDemandedVectorElts(LHS, DemandedLHSElts, LHSUndef, LHSZero, TLO,
43530 Depth + 1))
43531 return true;
43532 APInt DemandedRHSElts = DemandedSrcElts & ~LHSZero;
43533 if (SimplifyDemandedVectorElts(RHS, DemandedRHSElts, RHSUndef, RHSZero, TLO,
43534 Depth + 1))
43535 return true;
43536 break;
43537 }
43538 case X86ISD::PSADBW: {
43539 SDValue LHS = Op.getOperand(0);
43540 SDValue RHS = Op.getOperand(1);
43541 assert(VT.getScalarType() == MVT::i64 &&
43542 LHS.getValueType() == RHS.getValueType() &&
43543 LHS.getValueType().getScalarType() == MVT::i8 &&
43544 "Unexpected PSADBW types");
43545
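// Each i64 result element of PSADBW is a sum of absolute differences over
// the 8 corresponding i8 elements of each source, so demanding one result
// element demands a group of 8 source elements - hence the scaled
// demanded-elements mask below.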
43546 // Aggressively peek through ops to get at the demanded elts.
43547 if (!DemandedElts.isAllOnes()) {
43548 unsigned NumSrcElts = LHS.getValueType().getVectorNumElements();
43549 APInt DemandedSrcElts = APIntOps::ScaleBitMask(DemandedElts, NumSrcElts);
43550 SDValue NewLHS = SimplifyMultipleUseDemandedVectorElts(
43551 LHS, DemandedSrcElts, TLO.DAG, Depth + 1);
43552 SDValue NewRHS = SimplifyMultipleUseDemandedVectorElts(
43553 RHS, DemandedSrcElts, TLO.DAG, Depth + 1);
43554 if (NewLHS || NewRHS) {
43555 NewLHS = NewLHS ? NewLHS : LHS;
43556 NewRHS = NewRHS ? NewRHS : RHS;
43557 return TLO.CombineTo(
43558 Op, TLO.DAG.getNode(Opc, SDLoc(Op), VT, NewLHS, NewRHS));
43559 }
43560 }
43561 break;
43562 }
43563 case X86ISD::VSHL:
43564 case X86ISD::VSRL:
43565 case X86ISD::VSRA: {
43566 // We only need the bottom 64-bits of the (128-bit) shift amount.
43567 SDValue Amt = Op.getOperand(1);
43568 MVT AmtVT = Amt.getSimpleValueType();
43569 assert(AmtVT.is128BitVector() && "Unexpected value type");
43570
43571 // If we reuse the shift amount just for sse shift amounts then we know that
43572 // only the bottom 64-bits are ever used.
43573 bool AssumeSingleUse = llvm::all_of(Amt->users(), [&Amt](SDNode *Use) {
43574 unsigned UseOpc = Use->getOpcode();
43575 return (UseOpc == X86ISD::VSHL || UseOpc == X86ISD::VSRL ||
43576 UseOpc == X86ISD::VSRA) &&
43577 Use->getOperand(0) != Amt;
43578 });
43579
43580 APInt AmtUndef, AmtZero;
43581 unsigned NumAmtElts = AmtVT.getVectorNumElements();
43582 APInt AmtElts = APInt::getLowBitsSet(NumAmtElts, NumAmtElts / 2);
43583 if (SimplifyDemandedVectorElts(Amt, AmtElts, AmtUndef, AmtZero, TLO,
43584 Depth + 1, AssumeSingleUse))
43585 return true;
43586 [[fallthrough]];
43587 }
43588 case X86ISD::VSHLI:
43589 case X86ISD::VSRLI:
43590 case X86ISD::VSRAI: {
43591 SDValue Src = Op.getOperand(0);
43592 APInt SrcUndef;
43593 if (SimplifyDemandedVectorElts(Src, DemandedElts, SrcUndef, KnownZero, TLO,
43594 Depth + 1))
43595 return true;
43596
43597 // Fold shift(0,x) -> 0
43598 if (DemandedElts.isSubsetOf(KnownZero))
43599 return TLO.CombineTo(
43600 Op, getZeroVector(VT.getSimpleVT(), Subtarget, TLO.DAG, SDLoc(Op)));
43601
43602 // Aggressively peek through ops to get at the demanded elts.
43603 if (!DemandedElts.isAllOnes())
43604 if (SDValue NewSrc = SimplifyMultipleUseDemandedVectorElts(
43605 Src, DemandedElts, TLO.DAG, Depth + 1))
43606 return TLO.CombineTo(
43607 Op, TLO.DAG.getNode(Opc, SDLoc(Op), VT, NewSrc, Op.getOperand(1)));
43608 break;
43609 }
43610 case X86ISD::VPSHA:
43611 case X86ISD::VPSHL:
43612 case X86ISD::VSHLV:
43613 case X86ISD::VSRLV:
43614 case X86ISD::VSRAV: {
43615 APInt LHSUndef, LHSZero;
43616 APInt RHSUndef, RHSZero;
43617 SDValue LHS = Op.getOperand(0);
43618 SDValue RHS = Op.getOperand(1);
43619 if (SimplifyDemandedVectorElts(LHS, DemandedElts, LHSUndef, LHSZero, TLO,
43620 Depth + 1))
43621 return true;
43622
43623 // Fold shift(0,x) -> 0
43624 if (DemandedElts.isSubsetOf(LHSZero))
43625 return TLO.CombineTo(
43626 Op, getZeroVector(VT.getSimpleVT(), Subtarget, TLO.DAG, SDLoc(Op)));
43627
43628 if (SimplifyDemandedVectorElts(RHS, DemandedElts, RHSUndef, RHSZero, TLO,
43629 Depth + 1))
43630 return true;
43631
43632 KnownZero = LHSZero;
43633 break;
43634 }
43635 case X86ISD::CMPM:
43636 case X86ISD::CMPP: {
43637 // Scalarize packed fp comparison if we only require element 0.
43638 if (DemandedElts == 1) {
43639 SDLoc dl(Op);
43640 MVT VT = Op.getSimpleValueType();
43641 MVT OpSVT = Op.getOperand(0).getSimpleValueType().getScalarType();
43642 SDValue LHS = TLO.DAG.getExtractVectorElt(dl, OpSVT, Op.getOperand(0), 0);
43643 SDValue RHS = TLO.DAG.getExtractVectorElt(dl, OpSVT, Op.getOperand(1), 0);
43644 SDValue CC = Op.getOperand(2);
43645 if (Opc == X86ISD::CMPM) {
43646 SDValue Cmp =
43647 TLO.DAG.getNode(X86ISD::FSETCCM, dl, MVT::v1i1, LHS, RHS, CC);
43648 return TLO.CombineTo(
43649 Op, TLO.DAG.getInsertSubvector(dl, TLO.DAG.getUNDEF(VT), Cmp, 0));
43650 }
43651 SDValue Cmp = TLO.DAG.getNode(X86ISD::FSETCC, dl, OpSVT, LHS, RHS, CC);
43652 return TLO.CombineTo(Op,
43653 TLO.DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Cmp));
43654 }
43655 break;
43656 }
43657 case X86ISD::PCMPEQ:
43658 case X86ISD::PCMPGT: {
43659 APInt LHSUndef, LHSZero;
43660 APInt RHSUndef, RHSZero;
43661 SDValue LHS = Op.getOperand(0);
43662 SDValue RHS = Op.getOperand(1);
43663 if (SimplifyDemandedVectorElts(LHS, DemandedElts, LHSUndef, LHSZero, TLO,
43664 Depth + 1))
43665 return true;
43666 if (SimplifyDemandedVectorElts(RHS, DemandedElts, RHSUndef, RHSZero, TLO,
43667 Depth + 1))
43668 return true;
43669 break;
43670 }
43671 case X86ISD::KSHIFTL: {
43672 SDValue Src = Op.getOperand(0);
43673 auto *Amt = cast<ConstantSDNode>(Op.getOperand(1));
43674 assert(Amt->getAPIntValue().ult(NumElts) && "Out of range shift amount");
43675 unsigned ShiftAmt = Amt->getZExtValue();
43676
43677 if (ShiftAmt == 0)
43678 return TLO.CombineTo(Op, Src);
43679
43680 // If this is ((X >>u C1) << ShAmt), see if we can simplify this into a
43681 // single shift. We can do this if the bottom bits (which are shifted
43682 // out) are never demanded.
43683 if (Src.getOpcode() == X86ISD::KSHIFTR) {
43684 if (!DemandedElts.intersects(APInt::getLowBitsSet(NumElts, ShiftAmt))) {
43685 unsigned C1 = Src.getConstantOperandVal(1);
43686 unsigned NewOpc = X86ISD::KSHIFTL;
43687 int Diff = ShiftAmt - C1;
43688 if (Diff < 0) {
43689 Diff = -Diff;
43690 NewOpc = X86ISD::KSHIFTR;
43691 }
43692
43693 SDLoc dl(Op);
43694 SDValue NewSA = TLO.DAG.getTargetConstant(Diff, dl, MVT::i8);
43695 return TLO.CombineTo(
43696 Op, TLO.DAG.getNode(NewOpc, dl, VT, Src.getOperand(0), NewSA));
43697 }
43698 }
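// For example, kshiftl(kshiftr(X, 4), 6) becomes kshiftl(X, 2) when the low
// 6 mask lanes are not demanded; if the inner shift amount is larger the
// pair folds to a single kshiftr instead.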
43699
43700 APInt DemandedSrc = DemandedElts.lshr(ShiftAmt);
43701 if (SimplifyDemandedVectorElts(Src, DemandedSrc, KnownUndef, KnownZero, TLO,
43702 Depth + 1))
43703 return true;
43704
43705 KnownUndef <<= ShiftAmt;
43706 KnownZero <<= ShiftAmt;
43707 KnownZero.setLowBits(ShiftAmt);
43708 break;
43709 }
43710 case X86ISD::KSHIFTR: {
43711 SDValue Src = Op.getOperand(0);
43712 auto *Amt = cast<ConstantSDNode>(Op.getOperand(1));
43713 assert(Amt->getAPIntValue().ult(NumElts) && "Out of range shift amount");
43714 unsigned ShiftAmt = Amt->getZExtValue();
43715
43716 if (ShiftAmt == 0)
43717 return TLO.CombineTo(Op, Src);
43718
43719 // If this is ((X << C1) >>u ShAmt), see if we can simplify this into a
43720 // single shift. We can do this if the top bits (which are shifted
43721 // out) are never demanded.
43722 if (Src.getOpcode() == X86ISD::KSHIFTL) {
43723 if (!DemandedElts.intersects(APInt::getHighBitsSet(NumElts, ShiftAmt))) {
43724 unsigned C1 = Src.getConstantOperandVal(1);
43725 unsigned NewOpc = X86ISD::KSHIFTR;
43726 int Diff = ShiftAmt - C1;
43727 if (Diff < 0) {
43728 Diff = -Diff;
43729 NewOpc = X86ISD::KSHIFTL;
43730 }
43731
43732 SDLoc dl(Op);
43733 SDValue NewSA = TLO.DAG.getTargetConstant(Diff, dl, MVT::i8);
43734 return TLO.CombineTo(
43735 Op, TLO.DAG.getNode(NewOpc, dl, VT, Src.getOperand(0), NewSA));
43736 }
43737 }
43738
43739 APInt DemandedSrc = DemandedElts.shl(ShiftAmt);
43740 if (SimplifyDemandedVectorElts(Src, DemandedSrc, KnownUndef, KnownZero, TLO,
43741 Depth + 1))
43742 return true;
43743
43744 KnownUndef.lshrInPlace(ShiftAmt);
43745 KnownZero.lshrInPlace(ShiftAmt);
43746 KnownZero.setHighBits(ShiftAmt);
43747 break;
43748 }
43749 case X86ISD::ANDNP: {
43750 // ANDNP = (~LHS & RHS);
43751 SDValue LHS = Op.getOperand(0);
43752 SDValue RHS = Op.getOperand(1);
43753
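// Since ANDNP = (~LHS & RHS), a lane where RHS is constant zero makes the
// matching LHS lane irrelevant, and a lane where LHS is constant all-ones
// makes the matching RHS lane irrelevant. The helper below computes which
// bits/elements of the other operand are still needed.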
43754 auto GetDemandedMasks = [&](SDValue Op, bool Invert = false) {
43755 APInt UndefElts;
43756 SmallVector<APInt> EltBits;
43757 int NumElts = VT.getVectorNumElements();
43758 int EltSizeInBits = VT.getScalarSizeInBits();
43759 APInt OpBits = APInt::getAllOnes(EltSizeInBits);
43760 APInt OpElts = DemandedElts;
43761 if (getTargetConstantBitsFromNode(Op, EltSizeInBits, UndefElts,
43762 EltBits)) {
43763 OpBits.clearAllBits();
43764 OpElts.clearAllBits();
43765 for (int I = 0; I != NumElts; ++I) {
43766 if (!DemandedElts[I])
43767 continue;
43768 if (UndefElts[I]) {
43769 // We can't assume an undef src element gives an undef dst - the
43770 // other src might be zero.
43771 OpBits.setAllBits();
43772 OpElts.setBit(I);
43773 } else if ((Invert && !EltBits[I].isAllOnes()) ||
43774 (!Invert && !EltBits[I].isZero())) {
43775 OpBits |= Invert ? ~EltBits[I] : EltBits[I];
43776 OpElts.setBit(I);
43777 }
43778 }
43779 }
43780 return std::make_pair(OpBits, OpElts);
43781 };
43782 APInt BitsLHS, EltsLHS;
43783 APInt BitsRHS, EltsRHS;
43784 std::tie(BitsLHS, EltsLHS) = GetDemandedMasks(RHS);
43785 std::tie(BitsRHS, EltsRHS) = GetDemandedMasks(LHS, true);
43786
43787 APInt LHSUndef, LHSZero;
43788 APInt RHSUndef, RHSZero;
43789 if (SimplifyDemandedVectorElts(LHS, EltsLHS, LHSUndef, LHSZero, TLO,
43790 Depth + 1))
43791 return true;
43792 if (SimplifyDemandedVectorElts(RHS, EltsRHS, RHSUndef, RHSZero, TLO,
43793 Depth + 1))
43794 return true;
43795
43796 if (!DemandedElts.isAllOnes()) {
43797 SDValue NewLHS = SimplifyMultipleUseDemandedBits(LHS, BitsLHS, EltsLHS,
43798 TLO.DAG, Depth + 1);
43799 SDValue NewRHS = SimplifyMultipleUseDemandedBits(RHS, BitsRHS, EltsRHS,
43800 TLO.DAG, Depth + 1);
43801 if (NewLHS || NewRHS) {
43802 NewLHS = NewLHS ? NewLHS : LHS;
43803 NewRHS = NewRHS ? NewRHS : RHS;
43804 return TLO.CombineTo(
43805 Op, TLO.DAG.getNode(Opc, SDLoc(Op), VT, NewLHS, NewRHS));
43806 }
43807 }
43808 break;
43809 }
43810 case X86ISD::CVTSI2P:
43811 case X86ISD::CVTUI2P:
43812 case X86ISD::CVTPH2PS:
43813 case X86ISD::CVTPS2PH: {
43814 SDValue Src = Op.getOperand(0);
43815 EVT SrcVT = Src.getValueType();
43816 APInt SrcUndef, SrcZero;
43817 APInt SrcElts = DemandedElts.zextOrTrunc(SrcVT.getVectorNumElements());
43818 if (SimplifyDemandedVectorElts(Src, SrcElts, SrcUndef, SrcZero, TLO,
43819 Depth + 1))
43820 return true;
43821 break;
43822 }
43823 case X86ISD::PACKSS:
43824 case X86ISD::PACKUS: {
43825 SDValue N0 = Op.getOperand(0);
43826 SDValue N1 = Op.getOperand(1);
43827
43828 APInt DemandedLHS, DemandedRHS;
43829 getPackDemandedElts(VT, DemandedElts, DemandedLHS, DemandedRHS);
43830
43831 APInt LHSUndef, LHSZero;
43832 if (SimplifyDemandedVectorElts(N0, DemandedLHS, LHSUndef, LHSZero, TLO,
43833 Depth + 1))
43834 return true;
43835 APInt RHSUndef, RHSZero;
43836 if (SimplifyDemandedVectorElts(N1, DemandedRHS, RHSUndef, RHSZero, TLO,
43837 Depth + 1))
43838 return true;
43839
43840 // TODO - pass on known zero/undef.
43841
43842 // Aggressively peek through ops to get at the demanded elts.
43843 // TODO - we should do this for all target/faux shuffles ops.
43844 if (!DemandedElts.isAllOnes()) {
43845 SDValue NewN0 = SimplifyMultipleUseDemandedVectorElts(N0, DemandedLHS,
43846 TLO.DAG, Depth + 1);
43847 SDValue NewN1 = SimplifyMultipleUseDemandedVectorElts(N1, DemandedRHS,
43848 TLO.DAG, Depth + 1);
43849 if (NewN0 || NewN1) {
43850 NewN0 = NewN0 ? NewN0 : N0;
43851 NewN1 = NewN1 ? NewN1 : N1;
43852 return TLO.CombineTo(Op,
43853 TLO.DAG.getNode(Opc, SDLoc(Op), VT, NewN0, NewN1));
43854 }
43855 }
43856 break;
43857 }
43858 case X86ISD::HADD:
43859 case X86ISD::HSUB:
43860 case X86ISD::FHADD:
43861 case X86ISD::FHSUB: {
43862 SDValue N0 = Op.getOperand(0);
43863 SDValue N1 = Op.getOperand(1);
43864
43865 APInt DemandedLHS, DemandedRHS;
43866 getHorizDemandedElts(VT, DemandedElts, DemandedLHS, DemandedRHS);
43867
43868 APInt LHSUndef, LHSZero;
43869 if (SimplifyDemandedVectorElts(N0, DemandedLHS, LHSUndef, LHSZero, TLO,
43870 Depth + 1))
43871 return true;
43872 APInt RHSUndef, RHSZero;
43873 if (SimplifyDemandedVectorElts(N1, DemandedRHS, RHSUndef, RHSZero, TLO,
43874 Depth + 1))
43875 return true;
43876
43877 // TODO - pass on known zero/undef.
43878
43879 // Aggressively peek through ops to get at the demanded elts.
43880 // TODO: Handle repeated operands.
43881 if (N0 != N1 && !DemandedElts.isAllOnes()) {
43882 SDValue NewN0 = SimplifyMultipleUseDemandedVectorElts(N0, DemandedLHS,
43883 TLO.DAG, Depth + 1);
43884 SDValue NewN1 = SimplifyMultipleUseDemandedVectorElts(N1, DemandedRHS,
43885 TLO.DAG, Depth + 1);
43886 if (NewN0 || NewN1) {
43887 NewN0 = NewN0 ? NewN0 : N0;
43888 NewN1 = NewN1 ? NewN1 : N1;
43889 return TLO.CombineTo(Op,
43890 TLO.DAG.getNode(Opc, SDLoc(Op), VT, NewN0, NewN1));
43891 }
43892 }
43893 break;
43894 }
43895 case X86ISD::VTRUNC:
43896 case X86ISD::VTRUNCS:
43897 case X86ISD::VTRUNCUS: {
43898 SDValue Src = Op.getOperand(0);
43899 MVT SrcVT = Src.getSimpleValueType();
43900 APInt DemandedSrc = DemandedElts.zextOrTrunc(SrcVT.getVectorNumElements());
43901 APInt SrcUndef, SrcZero;
43902 if (SimplifyDemandedVectorElts(Src, DemandedSrc, SrcUndef, SrcZero, TLO,
43903 Depth + 1))
43904 return true;
43905 KnownZero = SrcZero.zextOrTrunc(NumElts);
43906 KnownUndef = SrcUndef.zextOrTrunc(NumElts);
43907 break;
43908 }
43909 case X86ISD::BLENDI: {
43910 SmallVector<int, 16> BlendMask;
43911 DecodeBLENDMask(NumElts, Op.getConstantOperandVal(2), BlendMask);
43912 if (SDValue R = combineBlendOfPermutes(
43913 VT.getSimpleVT(), Op.getOperand(0), Op.getOperand(1), BlendMask,
43914 DemandedElts, TLO.DAG, Subtarget, SDLoc(Op)))
43915 return TLO.CombineTo(Op, R);
43916 break;
43917 }
43918 case X86ISD::BLENDV: {
43919 APInt SelUndef, SelZero;
43920 if (SimplifyDemandedVectorElts(Op.getOperand(0), DemandedElts, SelUndef,
43921 SelZero, TLO, Depth + 1))
43922 return true;
43923
43924 // TODO: Use SelZero to adjust LHS/RHS DemandedElts.
43925 APInt LHSUndef, LHSZero;
43926 if (SimplifyDemandedVectorElts(Op.getOperand(1), DemandedElts, LHSUndef,
43927 LHSZero, TLO, Depth + 1))
43928 return true;
43929
43930 APInt RHSUndef, RHSZero;
43931 if (SimplifyDemandedVectorElts(Op.getOperand(2), DemandedElts, RHSUndef,
43932 RHSZero, TLO, Depth + 1))
43933 return true;
43934
43935 KnownZero = LHSZero & RHSZero;
43936 KnownUndef = LHSUndef & RHSUndef;
43937 break;
43938 }
43939 case X86ISD::VZEXT_MOVL: {
43940 // If upper demanded elements are already zero then we have nothing to do.
43941 SDValue Src = Op.getOperand(0);
43942 APInt DemandedUpperElts = DemandedElts;
43943 DemandedUpperElts.clearLowBits(1);
43944 if (TLO.DAG.MaskedVectorIsZero(Src, DemandedUpperElts, Depth + 1))
43945 return TLO.CombineTo(Op, Src);
43946 break;
43947 }
43948 case X86ISD::VZEXT_LOAD: {
43949 // If upper demanded elements are not demanded then simplify to a
43950 // scalar_to_vector(load()).
43951 MVT SVT = VT.getSimpleVT().getVectorElementType();
43952 if (DemandedElts == 1 && Op.getValue(1).use_empty() && isTypeLegal(SVT)) {
43953 SDLoc DL(Op);
43954 auto *Mem = cast<MemSDNode>(Op);
43955 SDValue Elt = TLO.DAG.getLoad(SVT, DL, Mem->getChain(), Mem->getBasePtr(),
43956 Mem->getMemOperand());
43957 SDValue Vec = TLO.DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VT, Elt);
43958 return TLO.CombineTo(Op, TLO.DAG.getBitcast(VT, Vec));
43959 }
43960 break;
43961 }
43962 case X86ISD::VBROADCAST: {
43963 SDValue Src = Op.getOperand(0);
43964 MVT SrcVT = Src.getSimpleValueType();
43965 // Don't bother broadcasting if we just need the 0'th element.
43966 if (DemandedElts == 1) {
43967 if (!SrcVT.isVector())
43968 Src = TLO.DAG.getNode(ISD::SCALAR_TO_VECTOR, SDLoc(Op), VT, Src);
43969 else if (Src.getValueType() != VT)
43970 Src = widenSubVector(VT.getSimpleVT(), Src, false, Subtarget, TLO.DAG,
43971 SDLoc(Op));
43972 return TLO.CombineTo(Op, Src);
43973 }
43974 if (!SrcVT.isVector())
43975 break;
43976 APInt SrcUndef, SrcZero;
43977 APInt SrcElts = APInt::getOneBitSet(SrcVT.getVectorNumElements(), 0);
43978 if (SimplifyDemandedVectorElts(Src, SrcElts, SrcUndef, SrcZero, TLO,
43979 Depth + 1))
43980 return true;
43981 // Aggressively peek through src to get at the demanded elt.
43982 // TODO - we should do this for all target/faux shuffles ops.
43983 if (SDValue NewSrc = SimplifyMultipleUseDemandedVectorElts(
43984 Src, SrcElts, TLO.DAG, Depth + 1))
43985 return TLO.CombineTo(Op, TLO.DAG.getNode(Opc, SDLoc(Op), VT, NewSrc));
43986 break;
43987 }
43988 case X86ISD::VPERMV:
43989 if (SimplifyDemandedVectorEltsForTargetShuffle(Op, DemandedElts, 0, TLO,
43990 Depth))
43991 return true;
43992 break;
43993 case X86ISD::PSHUFB:
43994 case X86ISD::VPERMV3:
43995 case X86ISD::VPERMILPV:
43996 if (SimplifyDemandedVectorEltsForTargetShuffle(Op, DemandedElts, 1, TLO,
43997 Depth))
43998 return true;
43999 break;
44000 case X86ISD::VPPERM:
44001 case X86ISD::VPERMIL2:
44002 if (SimplifyDemandedVectorEltsForTargetShuffle(Op, DemandedElts, 2, TLO,
44003 Depth))
44004 return true;
44005 break;
44006 }
44007
44008 // For 256/512-bit ops that are 128/256-bit ops glued together, if we do not
44009 // demand any of the high elements, then narrow the op to 128/256-bits: e.g.
44010 // (op ymm0, ymm1) --> insert undef, (op xmm0, xmm1), 0
44011 if ((VT.is256BitVector() || VT.is512BitVector()) &&
44012 DemandedElts.lshr(NumElts / 2) == 0) {
44013 unsigned SizeInBits = VT.getSizeInBits();
44014 unsigned ExtSizeInBits = SizeInBits / 2;
44015
44016 // See if 512-bit ops only use the bottom 128-bits.
44017 if (VT.is512BitVector() && DemandedElts.lshr(NumElts / 4) == 0)
44018 ExtSizeInBits = SizeInBits / 4;
44019
44020 switch (Opc) {
44021 // Scalar broadcast.
44022 case X86ISD::VBROADCAST: {
44023 SDLoc DL(Op);
44024 SDValue Src = Op.getOperand(0);
44025 if (Src.getValueSizeInBits() > ExtSizeInBits)
44026 Src = extractSubVector(Src, 0, TLO.DAG, DL, ExtSizeInBits);
44027 EVT BcstVT = EVT::getVectorVT(*TLO.DAG.getContext(), VT.getScalarType(),
44028 ExtSizeInBits / VT.getScalarSizeInBits());
44029 SDValue Bcst = TLO.DAG.getNode(X86ISD::VBROADCAST, DL, BcstVT, Src);
44030 return TLO.CombineTo(Op, insertSubVector(TLO.DAG.getUNDEF(VT), Bcst, 0,
44031 TLO.DAG, DL, ExtSizeInBits));
44032 }
44033 case X86ISD::VBROADCAST_LOAD: {
44034 SDLoc DL(Op);
44035 auto *MemIntr = cast<MemIntrinsicSDNode>(Op);
44036 EVT BcstVT = EVT::getVectorVT(*TLO.DAG.getContext(), VT.getScalarType(),
44037 ExtSizeInBits / VT.getScalarSizeInBits());
44038 SDVTList Tys = TLO.DAG.getVTList(BcstVT, MVT::Other);
44039 SDValue Ops[] = {MemIntr->getOperand(0), MemIntr->getOperand(1)};
44040 SDValue Bcst = TLO.DAG.getMemIntrinsicNode(
44041 X86ISD::VBROADCAST_LOAD, DL, Tys, Ops, MemIntr->getMemoryVT(),
44042 MemIntr->getMemOperand());
44043 TLO.DAG.makeEquivalentMemoryOrdering(SDValue(MemIntr, 1),
44044 Bcst.getValue(1));
44045 return TLO.CombineTo(Op, insertSubVector(TLO.DAG.getUNDEF(VT), Bcst, 0,
44046 TLO.DAG, DL, ExtSizeInBits));
44047 }
44048 // Subvector broadcast.
44049 case X86ISD::SUBV_BROADCAST_LOAD: {
44050 auto *MemIntr = cast<MemIntrinsicSDNode>(Op);
44051 EVT MemVT = MemIntr->getMemoryVT();
44052 if (ExtSizeInBits == MemVT.getStoreSizeInBits()) {
44053 SDLoc DL(Op);
44054 SDValue Ld =
44055 TLO.DAG.getLoad(MemVT, DL, MemIntr->getChain(),
44056 MemIntr->getBasePtr(), MemIntr->getMemOperand());
44057 TLO.DAG.makeEquivalentMemoryOrdering(SDValue(MemIntr, 1),
44058 Ld.getValue(1));
44059 return TLO.CombineTo(Op, insertSubVector(TLO.DAG.getUNDEF(VT), Ld, 0,
44060 TLO.DAG, DL, ExtSizeInBits));
44061 } else if ((ExtSizeInBits % MemVT.getStoreSizeInBits()) == 0) {
44062 SDLoc DL(Op);
44063 EVT BcstVT = EVT::getVectorVT(*TLO.DAG.getContext(), VT.getScalarType(),
44064 ExtSizeInBits / VT.getScalarSizeInBits());
44065 if (SDValue BcstLd =
44066 getBROADCAST_LOAD(Opc, DL, BcstVT, MemVT, MemIntr, 0, TLO.DAG))
44067 return TLO.CombineTo(Op,
44068 insertSubVector(TLO.DAG.getUNDEF(VT), BcstLd, 0,
44069 TLO.DAG, DL, ExtSizeInBits));
44070 }
44071 break;
44072 }
44073 // Byte shifts by immediate.
44074 case X86ISD::VSHLDQ:
44075 case X86ISD::VSRLDQ:
44076 // Shift by uniform.
44077 case X86ISD::VSHL:
44078 case X86ISD::VSRL:
44079 case X86ISD::VSRA:
44080 // Shift by immediate.
44081 case X86ISD::VSHLI:
44082 case X86ISD::VSRLI:
44083 case X86ISD::VSRAI: {
44084 SDLoc DL(Op);
44085 SDValue Ext0 =
44086 extractSubVector(Op.getOperand(0), 0, TLO.DAG, DL, ExtSizeInBits);
44087 SDValue ExtOp =
44088 TLO.DAG.getNode(Opc, DL, Ext0.getValueType(), Ext0, Op.getOperand(1));
44089 SDValue UndefVec = TLO.DAG.getUNDEF(VT);
44090 SDValue Insert =
44091 insertSubVector(UndefVec, ExtOp, 0, TLO.DAG, DL, ExtSizeInBits);
44092 return TLO.CombineTo(Op, Insert);
44093 }
44094 case X86ISD::VPERMI: {
44095 // Simplify 256-bit PERMPD/PERMQ to extract_subvector.
44096 // TODO: This should be done in shuffle combining.
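// For example, PERMQ with immediate 0xEE (mask <2,3,2,3>) only needs the
// upper 128 bits of the source when just the low result half is demanded,
// so it reduces to an extract of that subvector.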
44097 if (VT == MVT::v4f64 || VT == MVT::v4i64) {
44098 SmallVector<int, 8> Mask;
44099 DecodeVPERMMask(NumElts, Op.getConstantOperandVal(1), Mask);
44100 if (isUndefOrEqual(Mask[0], 2) && isUndefOrEqual(Mask[1], 3)) {
44101 SDLoc DL(Op);
44102 SDValue Ext = extractSubVector(Op.getOperand(0), 2, TLO.DAG, DL, 128);
44103 SDValue UndefVec = TLO.DAG.getUNDEF(VT);
44104 SDValue Insert = insertSubVector(UndefVec, Ext, 0, TLO.DAG, DL, 128);
44105 return TLO.CombineTo(Op, Insert);
44106 }
44107 }
44108 // Simplify 512-bit PERMPD/PERMQ to 256-bit variant on lower half.
44109 if (VT == MVT::v8f64 || VT == MVT::v8i64) {
44110 SDLoc DL(Op);
44111 SDValue Ext0 = extractSubVector(Op.getOperand(0), 0, TLO.DAG, DL, 256);
44112 SDValue ExtOp = TLO.DAG.getNode(Opc, DL, Ext0.getValueType(), Ext0,
44113 Op.getOperand(1));
44114 SDValue UndefVec = TLO.DAG.getUNDEF(VT);
44115 SDValue Insert = insertSubVector(UndefVec, ExtOp, 0, TLO.DAG, DL, 256);
44116 return TLO.CombineTo(Op, Insert);
44117 }
44118 break;
44119 }
44120 case X86ISD::VPERMV: {
44121 SmallVector<SDValue, 2> Ops;
44122 SmallVector<int, 16> Mask;
44123 // We can always split v16i32/v16f32 AVX512 to v8i32/v8f32 AVX2 variants.
44124 if ((VT.is256BitVector() || Subtarget.hasVLX() || VT == MVT::v16i32 ||
44125 VT == MVT::v16f32) &&
44126 getTargetShuffleMask(Op, /*AllowSentinelZero=*/false, Ops, Mask)) {
44127 // For lane-crossing shuffles, only split in half in case we're still
44128 // referencing higher elements.
44129 unsigned HalfElts = NumElts / 2;
44130 unsigned HalfSize = SizeInBits / 2;
44131 Mask.resize(HalfElts);
44132 if (all_of(Mask,
44133 [&](int M) { return isUndefOrInRange(M, 0, HalfElts); })) {
44134 MVT HalfVT = VT.getSimpleVT().getHalfNumVectorElementsVT();
44135 SDLoc DL(Op);
44136 SDValue Ext;
44137 SDValue M =
44138 extractSubVector(Op.getOperand(0), 0, TLO.DAG, DL, HalfSize);
44139 SDValue V =
44140 extractSubVector(Op.getOperand(1), 0, TLO.DAG, DL, HalfSize);
44141 // For 128-bit v2X64/v4X32 instructions, use VPERMILPD/VPERMILPS.
44142 if (VT.is512BitVector() || VT.getScalarSizeInBits() <= 16)
44143 Ext = TLO.DAG.getNode(Opc, DL, HalfVT, M, V);
44144 else {
44145 MVT ShufSVT = MVT::getFloatingPointVT(VT.getScalarSizeInBits());
44146 MVT ShufVT = HalfVT.changeVectorElementType(ShufSVT);
44147 Ext = TLO.DAG.getNode(X86ISD::VPERMILPV, DL, ShufVT,
44148 TLO.DAG.getBitcast(ShufVT, V), M);
44149 Ext = TLO.DAG.getBitcast(HalfVT, Ext);
44150 }
44151 SDValue Insert = widenSubVector(Ext, /*ZeroNewElements=*/false,
44152 Subtarget, TLO.DAG, DL, SizeInBits);
44153 return TLO.CombineTo(Op, Insert);
44154 }
44155 }
44156 break;
44157 }
44158 case X86ISD::VPERMV3: {
44159 SmallVector<SDValue, 2> Ops;
44160 SmallVector<int, 16> Mask;
44161 if (Subtarget.hasVLX() &&
44162 getTargetShuffleMask(Op, /*AllowSentinelZero=*/false, Ops, Mask)) {
44163 // For lane-crossing shuffles, only split in half in case we're still
44164 // referencing higher elements.
44165 unsigned HalfElts = NumElts / 2;
44166 unsigned HalfSize = SizeInBits / 2;
44167 Mask.resize(HalfElts);
44168 if (all_of(Mask, [&](int M) {
44169 return isUndefOrInRange(M, 0, HalfElts) ||
44170 isUndefOrInRange(M, NumElts, NumElts + HalfElts);
44171 })) {
44172 // Adjust mask elements for 2nd operand to point to half width.
44173 for (int &M : Mask)
44174 M = (M < NumElts) ? M : (M - HalfElts);
44175 MVT HalfVT = VT.getSimpleVT().getHalfNumVectorElementsVT();
44176 MVT HalfIntVT = HalfVT.changeVectorElementTypeToInteger();
44177 SDLoc DL(Op);
44178 SDValue Ext = TLO.DAG.getNode(
44179 Opc, DL, HalfVT,
44180 extractSubVector(Op.getOperand(0), 0, TLO.DAG, DL, HalfSize),
44181 getConstVector(Mask, HalfIntVT, TLO.DAG, DL, /*IsMask=*/true),
44182 extractSubVector(Op.getOperand(2), 0, TLO.DAG, DL, HalfSize));
44183 SDValue Insert = widenSubVector(Ext, /*ZeroNewElements=*/false,
44184 Subtarget, TLO.DAG, DL, SizeInBits);
44185 return TLO.CombineTo(Op, Insert);
44186 }
44187 }
44188 break;
44189 }
44190 case X86ISD::VPERM2X128: {
44191 // Simplify VPERM2F128/VPERM2I128 to extract_subvector.
44192 SDLoc DL(Op);
44193 unsigned LoMask = Op.getConstantOperandVal(2) & 0xF;
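// The low nibble of the VPERM2X128 immediate controls the low 128-bit half
// of the result: bit 3 zeroes it, bit 1 selects the first or second source
// operand, and bit 0 selects that operand's low or high 128-bit lane.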
44194 if (LoMask & 0x8)
44195 return TLO.CombineTo(
44196 Op, getZeroVector(VT.getSimpleVT(), Subtarget, TLO.DAG, DL));
44197 unsigned EltIdx = (LoMask & 0x1) * (NumElts / 2);
44198 unsigned SrcIdx = (LoMask & 0x2) >> 1;
44199 SDValue ExtOp =
44200 extractSubVector(Op.getOperand(SrcIdx), EltIdx, TLO.DAG, DL, 128);
44201 SDValue UndefVec = TLO.DAG.getUNDEF(VT);
44202 SDValue Insert =
44203 insertSubVector(UndefVec, ExtOp, 0, TLO.DAG, DL, ExtSizeInBits);
44204 return TLO.CombineTo(Op, Insert);
44205 }
44206 // Conversions.
44207 // TODO: Add more CVT opcodes when we have test coverage.
44208 case X86ISD::CVTTP2UI: {
44209 if (!Subtarget.hasVLX())
44210 break;
44211 [[fallthrough]];
44212 }
44213 case X86ISD::CVTTP2SI: {
44214 if (Op.getOperand(0).getValueType().getVectorElementType() == MVT::f16 &&
44215 !Subtarget.hasVLX())
44216 break;
44217 [[fallthrough]];
44218 }
44219 case X86ISD::CVTPH2PS: {
44220 SDLoc DL(Op);
44221 unsigned Scale = SizeInBits / ExtSizeInBits;
44222 SDValue SrcOp = Op.getOperand(0);
44223 MVT SrcVT = SrcOp.getSimpleValueType();
44224 unsigned SrcExtSize =
44225 std::max<unsigned>(SrcVT.getSizeInBits() / Scale, 128);
44226 MVT ExtVT = MVT::getVectorVT(VT.getSimpleVT().getScalarType(),
44227 ExtSizeInBits / VT.getScalarSizeInBits());
44228 SDValue ExtOp = TLO.DAG.getNode(
44229 Opc, DL, ExtVT, extractSubVector(SrcOp, 0, TLO.DAG, DL, SrcExtSize));
44230 SDValue UndefVec = TLO.DAG.getUNDEF(VT);
44231 SDValue Insert =
44232 insertSubVector(UndefVec, ExtOp, 0, TLO.DAG, DL, ExtSizeInBits);
44233 return TLO.CombineTo(Op, Insert);
44234 }
44235 // Zero upper elements.
44236 case X86ISD::VZEXT_MOVL:
44237 // Variable blend.
44238 case X86ISD::BLENDV:
44239 // Target unary shuffles:
44240 case X86ISD::MOVDDUP:
44241 // Target unary shuffles by immediate:
44242 case X86ISD::PSHUFD:
44243 case X86ISD::PSHUFLW:
44244 case X86ISD::PSHUFHW:
44245 case X86ISD::VPERMILPI:
44246 // (Non-Lane Crossing) Target Shuffles.
44247 case X86ISD::VPERMILPV:
44248 case X86ISD::VPERMIL2:
44249 case X86ISD::PSHUFB:
44250 case X86ISD::UNPCKL:
44251 case X86ISD::UNPCKH:
44252 case X86ISD::BLENDI:
44253 // Integer ops.
44254 case X86ISD::PACKSS:
44255 case X86ISD::PACKUS:
44256 case X86ISD::PCMPEQ:
44257 case X86ISD::PCMPGT:
44258 case X86ISD::PMULUDQ:
44259 case X86ISD::PMULDQ:
44260 case X86ISD::VSHLV:
44261 case X86ISD::VSRLV:
44262 case X86ISD::VSRAV:
44263 // Float ops.
44264 case X86ISD::FMAX:
44265 case X86ISD::FMIN:
44266 case X86ISD::FMAXC:
44267 case X86ISD::FMINC:
44268 case X86ISD::FRSQRT:
44269 case X86ISD::FRCP:
44270 // Horizontal Ops.
44271 case X86ISD::HADD:
44272 case X86ISD::HSUB:
44273 case X86ISD::FHADD:
44274 case X86ISD::FHSUB: {
44275 SDLoc DL(Op);
44276 SmallVector<SDValue, 4> Ops;
44277 for (unsigned i = 0, e = Op.getNumOperands(); i != e; ++i) {
44278 SDValue SrcOp = Op.getOperand(i);
44279 EVT SrcVT = SrcOp.getValueType();
44280 assert((!SrcVT.isVector() || SrcVT.getSizeInBits() == SizeInBits) &&
44281 "Unsupported vector size");
44282 Ops.push_back(SrcVT.isVector() ? extractSubVector(SrcOp, 0, TLO.DAG, DL,
44283 ExtSizeInBits)
44284 : SrcOp);
44285 }
44286 MVT ExtVT = VT.getSimpleVT();
44287 ExtVT = MVT::getVectorVT(ExtVT.getScalarType(),
44288 ExtSizeInBits / ExtVT.getScalarSizeInBits());
44289 SDValue ExtOp = TLO.DAG.getNode(Opc, DL, ExtVT, Ops);
44290 SDValue UndefVec = TLO.DAG.getUNDEF(VT);
44291 SDValue Insert =
44292 insertSubVector(UndefVec, ExtOp, 0, TLO.DAG, DL, ExtSizeInBits);
44293 return TLO.CombineTo(Op, Insert);
44294 }
44295 }
44296 }
44297
44298 // For splats, unless we *only* demand the 0'th element,
44299 // stop attempts at simplification here: we aren't going to improve things,
44300 // and this is better than any potential shuffle.
44301 if (!DemandedElts.isOne() && TLO.DAG.isSplatValue(Op, /*AllowUndefs*/false))
44302 return false;
44303
44304 // Get target/faux shuffle mask.
44305 APInt OpUndef, OpZero;
44306 SmallVector<int, 64> OpMask;
44307 SmallVector<SDValue, 2> OpInputs;
44308 if (!getTargetShuffleInputs(Op, DemandedElts, OpInputs, OpMask, OpUndef,
44309 OpZero, TLO.DAG, Depth, false))
44310 return false;
44311
44312 // Shuffle inputs must be the same size as the result.
44313 if (OpMask.size() != (unsigned)NumElts ||
44314 llvm::any_of(OpInputs, [VT](SDValue V) {
44315 return VT.getSizeInBits() != V.getValueSizeInBits() ||
44316 !V.getValueType().isVector();
44317 }))
44318 return false;
44319
44320 KnownZero = OpZero;
44321 KnownUndef = OpUndef;
44322
44323 // Check if shuffle mask can be simplified to undef/zero/identity.
44324 int NumSrcs = OpInputs.size();
44325 for (int i = 0; i != NumElts; ++i)
44326 if (!DemandedElts[i])
44327 OpMask[i] = SM_SentinelUndef;
44328
44329 if (isUndefInRange(OpMask, 0, NumElts)) {
44330 KnownUndef.setAllBits();
44331 return TLO.CombineTo(Op, TLO.DAG.getUNDEF(VT));
44332 }
44333 if (isUndefOrZeroInRange(OpMask, 0, NumElts)) {
44334 KnownZero.setAllBits();
44335 return TLO.CombineTo(
44336 Op, getZeroVector(VT.getSimpleVT(), Subtarget, TLO.DAG, SDLoc(Op)));
44337 }
44338 for (int Src = 0; Src != NumSrcs; ++Src)
44339 if (isSequentialOrUndefInRange(OpMask, 0, NumElts, Src * NumElts))
44340 return TLO.CombineTo(Op, TLO.DAG.getBitcast(VT, OpInputs[Src]));
44341
44342 // Attempt to simplify inputs.
44343 for (int Src = 0; Src != NumSrcs; ++Src) {
44344 // TODO: Support inputs of different types.
44345 if (OpInputs[Src].getValueType() != VT)
44346 continue;
44347
44348 int Lo = Src * NumElts;
44349 APInt SrcElts = APInt::getZero(NumElts);
44350 for (int i = 0; i != NumElts; ++i)
44351 if (DemandedElts[i]) {
44352 int M = OpMask[i] - Lo;
44353 if (0 <= M && M < NumElts)
44354 SrcElts.setBit(M);
44355 }
44356
44357 // TODO - Propagate input undef/zero elts.
44358 APInt SrcUndef, SrcZero;
44359 if (SimplifyDemandedVectorElts(OpInputs[Src], SrcElts, SrcUndef, SrcZero,
44360 TLO, Depth + 1))
44361 return true;
44362 }
44363
44364 // If we don't demand all elements, then attempt to combine to a simpler
44365 // shuffle.
44366 // We need to convert the depth to something combineX86ShufflesRecursively
44367 // can handle - so pretend its Depth == 0 again, and reduce the max depth
44368 // to match. This prevents combineX86ShuffleChain from returning a
44369 // combined shuffle that's the same as the original root, causing an
44370 // infinite loop.
44371 if (!DemandedElts.isAllOnes()) {
44372 assert(Depth < X86::MaxShuffleCombineDepth && "Depth out of range");
44373
44374 SmallVector<int, 64> DemandedMask(NumElts, SM_SentinelUndef);
44375 for (int i = 0; i != NumElts; ++i)
44376 if (DemandedElts[i])
44377 DemandedMask[i] = i;
44378
44379 SDValue NewShuffle = combineX86ShufflesRecursively(
44380 {Op}, 0, Op.getOpcode(), Op.getSimpleValueType(), DemandedMask, {}, 0,
44381 X86::MaxShuffleCombineDepth - Depth,
44382 /*AllowVariableCrossLaneMask=*/true,
44383 /*AllowVariablePerLaneMask=*/true, isMaskableNode(Op, Subtarget),
44384 TLO.DAG, SDLoc(Op), Subtarget);
44385 if (NewShuffle)
44386 return TLO.CombineTo(Op, NewShuffle);
44387 }
44388
44389 return false;
44390}
44391
44392 bool X86TargetLowering::SimplifyDemandedBitsForTargetNode(
44393 SDValue Op, const APInt &OriginalDemandedBits,
44394 const APInt &OriginalDemandedElts, KnownBits &Known, TargetLoweringOpt &TLO,
44395 unsigned Depth) const {
44396 EVT VT = Op.getValueType();
44397 unsigned BitWidth = OriginalDemandedBits.getBitWidth();
44398 unsigned Opc = Op.getOpcode();
44399 switch(Opc) {
44400 case X86ISD::VTRUNC: {
44401 KnownBits KnownOp;
44402 SDValue Src = Op.getOperand(0);
44403 MVT SrcVT = Src.getSimpleValueType();
44404
44405 // Simplify the input, using demanded bit information.
44406 APInt TruncMask = OriginalDemandedBits.zext(SrcVT.getScalarSizeInBits());
44407 APInt DemandedElts = OriginalDemandedElts.trunc(SrcVT.getVectorNumElements());
44408 if (SimplifyDemandedBits(Src, TruncMask, DemandedElts, KnownOp, TLO, Depth + 1))
44409 return true;
44410 break;
44411 }
44412 case X86ISD::PMULDQ:
44413 case X86ISD::PMULUDQ: {
44414 // PMULDQ/PMULUDQ only uses lower 32 bits from each vector element.
44415 KnownBits KnownLHS, KnownRHS;
44416 SDValue LHS = Op.getOperand(0);
44417 SDValue RHS = Op.getOperand(1);
44418
44419 // Don't mask bits on 32-bit AVX512 targets which might lose a broadcast.
44420 // FIXME: Can we bound this better?
44421 APInt DemandedMask = APInt::getLowBitsSet(64, 32);
44422 APInt DemandedMaskLHS = APInt::getAllOnes(64);
44423 APInt DemandedMaskRHS = APInt::getAllOnes(64);
44424
44425 bool Is32BitAVX512 = !Subtarget.is64Bit() && Subtarget.hasAVX512();
44426 if (!Is32BitAVX512 || !TLO.DAG.isSplatValue(LHS))
44427 DemandedMaskLHS = DemandedMask;
44428 if (!Is32BitAVX512 || !TLO.DAG.isSplatValue(RHS))
44429 DemandedMaskRHS = DemandedMask;
44430
44431 if (SimplifyDemandedBits(LHS, DemandedMaskLHS, OriginalDemandedElts,
44432 KnownLHS, TLO, Depth + 1))
44433 return true;
44434 if (SimplifyDemandedBits(RHS, DemandedMaskRHS, OriginalDemandedElts,
44435 KnownRHS, TLO, Depth + 1))
44436 return true;
44437
44438 // PMULUDQ(X,1) -> AND(X,(1<<32)-1) 'getZeroExtendInReg'.
44439 KnownRHS = KnownRHS.trunc(32);
44440 if (Opc == X86ISD::PMULUDQ && KnownRHS.isConstant() &&
44441 KnownRHS.getConstant().isOne()) {
44442 SDLoc DL(Op);
44443 SDValue Mask = TLO.DAG.getConstant(DemandedMask, DL, VT);
44444 return TLO.CombineTo(Op, TLO.DAG.getNode(ISD::AND, DL, VT, LHS, Mask));
44445 }
44446
44447 // Aggressively peek through ops to get at the demanded low bits.
44448 SDValue DemandedLHS = SimplifyMultipleUseDemandedBits(
44449 LHS, DemandedMaskLHS, OriginalDemandedElts, TLO.DAG, Depth + 1);
44450 SDValue DemandedRHS = SimplifyMultipleUseDemandedBits(
44451 RHS, DemandedMaskRHS, OriginalDemandedElts, TLO.DAG, Depth + 1);
44452 if (DemandedLHS || DemandedRHS) {
44453 DemandedLHS = DemandedLHS ? DemandedLHS : LHS;
44454 DemandedRHS = DemandedRHS ? DemandedRHS : RHS;
44455 return TLO.CombineTo(
44456 Op, TLO.DAG.getNode(Opc, SDLoc(Op), VT, DemandedLHS, DemandedRHS));
44457 }
44458 break;
44459 }
44460 case X86ISD::ANDNP: {
44461 KnownBits Known2;
44462 SDValue Op0 = Op.getOperand(0);
44463 SDValue Op1 = Op.getOperand(1);
44464
44465 if (SimplifyDemandedBits(Op1, OriginalDemandedBits, OriginalDemandedElts,
44466 Known, TLO, Depth + 1))
44467 return true;
44468
44469 if (SimplifyDemandedBits(Op0, ~Known.Zero & OriginalDemandedBits,
44470 OriginalDemandedElts, Known2, TLO, Depth + 1))
44471 return true;
44472
44473 // If the RHS is a constant, see if we can simplify it.
44474 if (ShrinkDemandedConstant(Op, ~Known2.One & OriginalDemandedBits,
44475 OriginalDemandedElts, TLO))
44476 return true;
44477
44478 // ANDNP = (~Op0 & Op1);
44479 Known.One &= Known2.Zero;
44480 Known.Zero |= Known2.One;
44481 break;
44482 }
44483 case X86ISD::VSHLI: {
44484 SDValue Op0 = Op.getOperand(0);
44485 SDValue Op1 = Op.getOperand(1);
44486
44487 unsigned ShAmt = Op1->getAsZExtVal();
44488 if (ShAmt >= BitWidth)
44489 break;
44490
44491 APInt DemandedMask = OriginalDemandedBits.lshr(ShAmt);
44492
44493 // If this is ((X >>u C1) << ShAmt), see if we can simplify this into a
44494 // single shift. We can do this if the bottom bits (which are shifted
44495 // out) are never demanded.
44496 if (Op0.getOpcode() == X86ISD::VSRLI &&
44497 OriginalDemandedBits.countr_zero() >= ShAmt) {
44498 unsigned Shift2Amt = Op0.getConstantOperandVal(1);
44499 if (Shift2Amt < BitWidth) {
44500 int Diff = ShAmt - Shift2Amt;
44501 if (Diff == 0)
44502 return TLO.CombineTo(Op, Op0.getOperand(0));
44503
44504 unsigned NewOpc = Diff < 0 ? X86ISD::VSRLI : X86ISD::VSHLI;
44505 SDValue NewShift = TLO.DAG.getNode(
44506 NewOpc, SDLoc(Op), VT, Op0.getOperand(0),
44507 TLO.DAG.getTargetConstant(std::abs(Diff), SDLoc(Op), MVT::i8));
44508 return TLO.CombineTo(Op, NewShift);
44509 }
44510 }
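// For example, if the low 5 result bits are not demanded, vshli(vsrli(X, 3), 5)
// can be rewritten as vshli(X, 2); with the shift amounts swapped (and the
// low 3 bits not demanded) the pair folds to a single vsrli(X, 2) instead.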
44511
44512 // If we are only demanding sign bits then we can use the shift source directly.
44513 unsigned NumSignBits =
44514 TLO.DAG.ComputeNumSignBits(Op0, OriginalDemandedElts, Depth + 1);
44515 unsigned UpperDemandedBits = BitWidth - OriginalDemandedBits.countr_zero();
44516 if (NumSignBits > ShAmt && (NumSignBits - ShAmt) >= UpperDemandedBits)
44517 return TLO.CombineTo(Op, Op0);
44518
44519 if (SimplifyDemandedBits(Op0, DemandedMask, OriginalDemandedElts, Known,
44520 TLO, Depth + 1))
44521 return true;
44522
44523 Known <<= ShAmt;
44524
44525 // Low bits known zero.
44526 Known.Zero.setLowBits(ShAmt);
44527
44528 if (!OriginalDemandedBits.isSubsetOf(Known.Zero | Known.One)) {
44529 // Attempt to avoid multi-use ops if we don't need anything from them.
44530 if (SDValue DemandedOp0 = SimplifyMultipleUseDemandedBits(
44531 Op0, DemandedMask, OriginalDemandedElts, TLO.DAG, Depth + 1)) {
44532 SDValue NewOp =
44533 TLO.DAG.getNode(Op.getOpcode(), SDLoc(Op), VT, DemandedOp0, Op1);
44534 return TLO.CombineTo(Op, NewOp);
44535 }
44536 }
44537 return false;
44538 }
44539 case X86ISD::VSRLI: {
44540 SDValue Op0 = Op.getOperand(0);
44541 SDValue Op1 = Op.getOperand(1);
44542
44543 unsigned ShAmt = Op1->getAsZExtVal();
44544 if (ShAmt >= BitWidth)
44545 break;
44546
44547 APInt DemandedMask = OriginalDemandedBits << ShAmt;
44548
44549 if (SimplifyDemandedBits(Op0, DemandedMask, OriginalDemandedElts, Known,
44550 TLO, Depth + 1))
44551 return true;
44552
44553 Known >>= ShAmt;
44554
44555 // High bits known zero.
44556 Known.Zero.setHighBits(ShAmt);
44557
44558 if (!OriginalDemandedBits.isSubsetOf(Known.Zero | Known.One)) {
44559 // Attempt to avoid multi-use ops if we don't need anything from them.
44560 if (SDValue DemandedOp0 = SimplifyMultipleUseDemandedBits(
44561 Op0, DemandedMask, OriginalDemandedElts, TLO.DAG, Depth + 1)) {
44562 SDValue NewOp =
44563 TLO.DAG.getNode(Op.getOpcode(), SDLoc(Op), VT, DemandedOp0, Op1);
44564 return TLO.CombineTo(Op, NewOp);
44565 }
44566 }
44567 return false;
44568 }
44569 case X86ISD::VSRAI: {
44570 SDValue Op0 = Op.getOperand(0);
44571 SDValue Op1 = Op.getOperand(1);
44572
44573 unsigned ShAmt = Op1->getAsZExtVal();
44574 if (ShAmt >= BitWidth)
44575 break;
44576
44577 APInt DemandedMask = OriginalDemandedBits << ShAmt;
44578
44579 // If we just want the sign bit then we don't need to shift it.
44580 if (OriginalDemandedBits.isSignMask())
44581 return TLO.CombineTo(Op, Op0);
44582
44583 // fold (VSRAI (VSHLI X, C1), C1) --> X iff NumSignBits(X) > C1
44584 if (Op0.getOpcode() == X86ISD::VSHLI && Op1 == Op0.getOperand(1)) {
44585 SDValue Op00 = Op0.getOperand(0);
44586 unsigned NumSignBits =
44587 TLO.DAG.ComputeNumSignBits(Op00, OriginalDemandedElts);
44588 if (ShAmt < NumSignBits)
44589 return TLO.CombineTo(Op, Op00);
44590 }
44591
44592 // If any of the demanded bits are produced by the sign extension, we also
44593 // demand the input sign bit.
44594 if (OriginalDemandedBits.countl_zero() < ShAmt)
44595 DemandedMask.setSignBit();
44596
44597 if (SimplifyDemandedBits(Op0, DemandedMask, OriginalDemandedElts, Known,
44598 TLO, Depth + 1))
44599 return true;
44600
44601 Known >>= ShAmt;
44602
44603 // If the input sign bit is known to be zero, or if none of the top bits
44604 // are demanded, turn this into an unsigned shift right.
44605 if (Known.Zero[BitWidth - ShAmt - 1] ||
44606 OriginalDemandedBits.countl_zero() >= ShAmt)
44607 return TLO.CombineTo(
44608 Op, TLO.DAG.getNode(X86ISD::VSRLI, SDLoc(Op), VT, Op0, Op1));
44609
44610 // High bits are known one.
44611 if (Known.One[BitWidth - ShAmt - 1])
44612 Known.One.setHighBits(ShAmt);
44613
44614 if (!OriginalDemandedBits.isSubsetOf(Known.Zero | Known.One)) {
44615 // Attempt to avoid multi-use ops if we don't need anything from them.
44616 if (SDValue DemandedOp0 = SimplifyMultipleUseDemandedBits(
44617 Op0, DemandedMask, OriginalDemandedElts, TLO.DAG, Depth + 1)) {
44618 SDValue NewOp =
44619 TLO.DAG.getNode(Op.getOpcode(), SDLoc(Op), VT, DemandedOp0, Op1);
44620 return TLO.CombineTo(Op, NewOp);
44621 }
44622 }
44623 return false;
44624 }
44625 case X86ISD::BLENDI: {
44626 SDValue LHS = Op.getOperand(0);
44627 SDValue RHS = Op.getOperand(1);
44628 APInt Mask = getBLENDIBlendMask(Op);
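// Each bit of the BLENDI immediate selects the corresponding element from
// RHS when set and from LHS when clear, so each operand only needs the
// lanes the blend actually takes from it.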
44629
44630 APInt DemandedEltsLHS = OriginalDemandedElts & ~Mask;
44631 if (SimplifyDemandedBits(LHS, OriginalDemandedBits, DemandedEltsLHS, Known,
44632 TLO, Depth + 1))
44633 return true;
44634
44635 APInt DemandedEltsRHS = OriginalDemandedElts & Mask;
44636 if (SimplifyDemandedBits(RHS, OriginalDemandedBits, DemandedEltsRHS, Known,
44637 TLO, Depth + 1))
44638 return true;
44639
44640 // Attempt to avoid multi-use ops if we don't need anything from them.
44641 SDValue NewLHS = SimplifyMultipleUseDemandedBits(
44642 LHS, OriginalDemandedBits, DemandedEltsLHS, TLO.DAG, Depth + 1);
44643 SDValue NewRHS = SimplifyMultipleUseDemandedBits(
44644 RHS, OriginalDemandedBits, DemandedEltsRHS, TLO.DAG, Depth + 1);
44645 if (NewLHS || NewRHS) {
44646 NewLHS = NewLHS ? NewLHS : LHS;
44647 NewRHS = NewRHS ? NewRHS : RHS;
44648 return TLO.CombineTo(Op,
44649 TLO.DAG.getNode(Op.getOpcode(), SDLoc(Op), VT,
44650 NewLHS, NewRHS, Op.getOperand(2)));
44651 }
44652 break;
44653 }
44654 case X86ISD::BLENDV: {
44655 SDValue Sel = Op.getOperand(0);
44656 SDValue LHS = Op.getOperand(1);
44657 SDValue RHS = Op.getOperand(2);
44658
44659 APInt SignMask = APInt::getSignMask(BitWidth);
44660 SDValue NewSel = SimplifyMultipleUseDemandedBits(
44661 Sel, SignMask, OriginalDemandedElts, TLO.DAG, Depth + 1);
44662 SDValue NewLHS = SimplifyMultipleUseDemandedBits(
44663 LHS, OriginalDemandedBits, OriginalDemandedElts, TLO.DAG, Depth + 1);
44664 SDValue NewRHS = SimplifyMultipleUseDemandedBits(
44665 RHS, OriginalDemandedBits, OriginalDemandedElts, TLO.DAG, Depth + 1);
44666
44667 if (NewSel || NewLHS || NewRHS) {
44668 NewSel = NewSel ? NewSel : Sel;
44669 NewLHS = NewLHS ? NewLHS : LHS;
44670 NewRHS = NewRHS ? NewRHS : RHS;
44671 return TLO.CombineTo(Op, TLO.DAG.getNode(X86ISD::BLENDV, SDLoc(Op), VT,
44672 NewSel, NewLHS, NewRHS));
44673 }
44674 break;
44675 }
44676 case X86ISD::PEXTRB:
44677 case X86ISD::PEXTRW: {
44678 SDValue Vec = Op.getOperand(0);
44679 auto *CIdx = dyn_cast<ConstantSDNode>(Op.getOperand(1));
44680 MVT VecVT = Vec.getSimpleValueType();
44681 unsigned NumVecElts = VecVT.getVectorNumElements();
44682
44683 if (CIdx && CIdx->getAPIntValue().ult(NumVecElts)) {
44684 unsigned Idx = CIdx->getZExtValue();
44685 unsigned VecBitWidth = VecVT.getScalarSizeInBits();
44686
44687 // If we demand no bits from the vector then we must have demanded
44688 // bits from the implicit zext - simplify to zero.
44689 APInt DemandedVecBits = OriginalDemandedBits.trunc(VecBitWidth);
44690 if (DemandedVecBits == 0)
44691 return TLO.CombineTo(Op, TLO.DAG.getConstant(0, SDLoc(Op), VT));
44692
44693 APInt KnownUndef, KnownZero;
44694 APInt DemandedVecElts = APInt::getOneBitSet(NumVecElts, Idx);
44695 if (SimplifyDemandedVectorElts(Vec, DemandedVecElts, KnownUndef,
44696 KnownZero, TLO, Depth + 1))
44697 return true;
44698
44699 KnownBits KnownVec;
44700 if (SimplifyDemandedBits(Vec, DemandedVecBits, DemandedVecElts,
44701 KnownVec, TLO, Depth + 1))
44702 return true;
44703
44704 if (SDValue V = SimplifyMultipleUseDemandedBits(
44705 Vec, DemandedVecBits, DemandedVecElts, TLO.DAG, Depth + 1))
44706 return TLO.CombineTo(
44707 Op, TLO.DAG.getNode(Opc, SDLoc(Op), VT, V, Op.getOperand(1)));
44708
44709 Known = KnownVec.zext(BitWidth);
44710 return false;
44711 }
44712 break;
44713 }
44714 case X86ISD::PINSRB:
44715 case X86ISD::PINSRW: {
44716 SDValue Vec = Op.getOperand(0);
44717 SDValue Scl = Op.getOperand(1);
44718 auto *CIdx = dyn_cast<ConstantSDNode>(Op.getOperand(2));
44719 MVT VecVT = Vec.getSimpleValueType();
44720
44721 if (CIdx && CIdx->getAPIntValue().ult(VecVT.getVectorNumElements())) {
44722 unsigned Idx = CIdx->getZExtValue();
44723 if (!OriginalDemandedElts[Idx])
44724 return TLO.CombineTo(Op, Vec);
44725
44726 KnownBits KnownVec;
44727 APInt DemandedVecElts(OriginalDemandedElts);
44728 DemandedVecElts.clearBit(Idx);
44729 if (SimplifyDemandedBits(Vec, OriginalDemandedBits, DemandedVecElts,
44730 KnownVec, TLO, Depth + 1))
44731 return true;
44732
44733 KnownBits KnownScl;
44734 unsigned NumSclBits = Scl.getScalarValueSizeInBits();
44735 APInt DemandedSclBits = OriginalDemandedBits.zext(NumSclBits);
44736 if (SimplifyDemandedBits(Scl, DemandedSclBits, KnownScl, TLO, Depth + 1))
44737 return true;
44738
44739 KnownScl = KnownScl.trunc(VecVT.getScalarSizeInBits());
44740 Known = KnownVec.intersectWith(KnownScl);
44741 return false;
44742 }
44743 break;
44744 }
44745 case X86ISD::PACKSS:
44746 // PACKSS saturates to MIN/MAX integer values. So if we just want the
44747 // sign bit then we can just ask for the source operands' sign bit.
44748 // TODO - add known bits handling.
44749 if (OriginalDemandedBits.isSignMask()) {
44750 APInt DemandedLHS, DemandedRHS;
44751 getPackDemandedElts(VT, OriginalDemandedElts, DemandedLHS, DemandedRHS);
44752
44753 KnownBits KnownLHS, KnownRHS;
44754 APInt SignMask = APInt::getSignMask(BitWidth * 2);
44755 if (SimplifyDemandedBits(Op.getOperand(0), SignMask, DemandedLHS,
44756 KnownLHS, TLO, Depth + 1))
44757 return true;
44758 if (SimplifyDemandedBits(Op.getOperand(1), SignMask, DemandedRHS,
44759 KnownRHS, TLO, Depth + 1))
44760 return true;
44761
44762 // Attempt to avoid multi-use ops if we don't need anything from them.
44763 SDValue DemandedOp0 = SimplifyMultipleUseDemandedBits(
44764 Op.getOperand(0), SignMask, DemandedLHS, TLO.DAG, Depth + 1);
44765 SDValue DemandedOp1 = SimplifyMultipleUseDemandedBits(
44766 Op.getOperand(1), SignMask, DemandedRHS, TLO.DAG, Depth + 1);
44767 if (DemandedOp0 || DemandedOp1) {
44768 SDValue Op0 = DemandedOp0 ? DemandedOp0 : Op.getOperand(0);
44769 SDValue Op1 = DemandedOp1 ? DemandedOp1 : Op.getOperand(1);
44770 return TLO.CombineTo(Op, TLO.DAG.getNode(Opc, SDLoc(Op), VT, Op0, Op1));
44771 }
44772 }
44773 // TODO - add general PACKSS/PACKUS SimplifyDemandedBits support.
44774 break;
44775 case X86ISD::VBROADCAST: {
44776 SDValue Src = Op.getOperand(0);
44777 MVT SrcVT = Src.getSimpleValueType();
44778 APInt DemandedElts = APInt::getOneBitSet(
44779 SrcVT.isVector() ? SrcVT.getVectorNumElements() : 1, 0);
44780 if (SimplifyDemandedBits(Src, OriginalDemandedBits, DemandedElts, Known,
44781 TLO, Depth + 1))
44782 return true;
44783 // If we don't need the upper bits, attempt to narrow the broadcast source.
44784 // Don't attempt this on AVX512 as it might affect broadcast folding.
44785 // TODO: Should we attempt this for i32/i16 splats? They tend to be slower.
44786 if ((BitWidth == 64) && SrcVT.isScalarInteger() && !Subtarget.hasAVX512() &&
44787 OriginalDemandedBits.countl_zero() >= (BitWidth / 2) &&
44788 Src->hasOneUse()) {
44789 MVT NewSrcVT = MVT::getIntegerVT(BitWidth / 2);
44790 SDValue NewSrc =
44791 TLO.DAG.getNode(ISD::TRUNCATE, SDLoc(Src), NewSrcVT, Src);
44792 MVT NewVT = MVT::getVectorVT(NewSrcVT, VT.getVectorNumElements() * 2);
44793 SDValue NewBcst =
44794 TLO.DAG.getNode(X86ISD::VBROADCAST, SDLoc(Op), NewVT, NewSrc);
44795 return TLO.CombineTo(Op, TLO.DAG.getBitcast(VT, NewBcst));
44796 }
44797 break;
44798 }
44799 case X86ISD::PCMPGT:
44800 // icmp sgt(0, R) == ashr(R, BitWidth-1).
44801 // iff we only need the sign bit then we can use R directly.
44802 if (OriginalDemandedBits.isSignMask() &&
44803 ISD::isBuildVectorAllZeros(Op.getOperand(0).getNode()))
44804 return TLO.CombineTo(Op, Op.getOperand(1));
44805 break;
44806 case X86ISD::MOVMSK: {
44807 SDValue Src = Op.getOperand(0);
44808 MVT SrcVT = Src.getSimpleValueType();
44809 unsigned SrcBits = SrcVT.getScalarSizeInBits();
44810 unsigned NumElts = SrcVT.getVectorNumElements();
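// MOVMSK packs the sign bit of each source element into the low NumElts
// bits of the scalar result and zeroes the rest, so result bit i depends
// only on the sign bit of source element i.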
44811
44812 // If we don't need the sign bits at all just return zero.
44813 if (OriginalDemandedBits.countr_zero() >= NumElts)
44814 return TLO.CombineTo(Op, TLO.DAG.getConstant(0, SDLoc(Op), VT));
44815
44816 // See if we only demand bits from the lower 128-bit vector.
44817 if (SrcVT.is256BitVector() &&
44818 OriginalDemandedBits.getActiveBits() <= (NumElts / 2)) {
44819 SDValue NewSrc = extract128BitVector(Src, 0, TLO.DAG, SDLoc(Src));
44820 return TLO.CombineTo(Op, TLO.DAG.getNode(Opc, SDLoc(Op), VT, NewSrc));
44821 }
44822
44823 // Only demand the vector elements of the sign bits we need.
44824 APInt KnownUndef, KnownZero;
44825 APInt DemandedElts = OriginalDemandedBits.zextOrTrunc(NumElts);
44826 if (SimplifyDemandedVectorElts(Src, DemandedElts, KnownUndef, KnownZero,
44827 TLO, Depth + 1))
44828 return true;
44829
44830 Known.Zero = KnownZero.zext(BitWidth);
44831 Known.Zero.setHighBits(BitWidth - NumElts);
44832
44833 // MOVMSK only uses the MSB from each vector element.
44834 KnownBits KnownSrc;
44835 APInt DemandedSrcBits = APInt::getSignMask(SrcBits);
44836 if (SimplifyDemandedBits(Src, DemandedSrcBits, DemandedElts, KnownSrc, TLO,
44837 Depth + 1))
44838 return true;
44839
44840 if (KnownSrc.One[SrcBits - 1])
44841 Known.One.setLowBits(NumElts);
44842 else if (KnownSrc.Zero[SrcBits - 1])
44843 Known.Zero.setLowBits(NumElts);
44844
44845 // Attempt to avoid multi-use ops if we don't need anything from it.
44846 if (SDValue NewSrc = SimplifyMultipleUseDemandedBits(
44847 Src, DemandedSrcBits, DemandedElts, TLO.DAG, Depth + 1))
44848 return TLO.CombineTo(Op, TLO.DAG.getNode(Opc, SDLoc(Op), VT, NewSrc));
44849 return false;
44850 }
44851 case X86ISD::TESTP: {
44852 SDValue Op0 = Op.getOperand(0);
44853 SDValue Op1 = Op.getOperand(1);
44854 MVT OpVT = Op0.getSimpleValueType();
44855 assert((OpVT.getVectorElementType() == MVT::f32 ||
44856 OpVT.getVectorElementType() == MVT::f64) &&
44857 "Illegal vector type for X86ISD::TESTP");
44858
44859 // TESTPS/TESTPD only demands the sign bits of ALL the elements.
44860 KnownBits KnownSrc;
44861 APInt SignMask = APInt::getSignMask(OpVT.getScalarSizeInBits());
44862 bool AssumeSingleUse = (Op0 == Op1) && Op->isOnlyUserOf(Op0.getNode());
44863 return SimplifyDemandedBits(Op0, SignMask, KnownSrc, TLO, Depth + 1,
44864 AssumeSingleUse) ||
44865 SimplifyDemandedBits(Op1, SignMask, KnownSrc, TLO, Depth + 1,
44866 AssumeSingleUse);
44867 }
44868 case X86ISD::CMOV: {
44869 KnownBits Known2;
44870 if (SimplifyDemandedBits(Op.getOperand(1), OriginalDemandedBits,
44871 OriginalDemandedElts, Known2, TLO, Depth + 1))
44872 return true;
44873 if (SimplifyDemandedBits(Op.getOperand(0), OriginalDemandedBits,
44874 OriginalDemandedElts, Known, TLO, Depth + 1))
44875 return true;
44876
44877 // Only known if known in both the LHS and RHS.
44878 Known = Known.intersectWith(Known2);
44879 return false;
44880 }
44881 case X86ISD::BEXTR:
44882 case X86ISD::BEXTRI: {
44883 SDValue Op0 = Op.getOperand(0);
44884 SDValue Op1 = Op.getOperand(1);
44885
44886 // Only bottom 16-bits of the control bits are required.
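// (Control bits 7:0 hold the start index and bits 15:8 the extract length;
// the instruction ignores the rest.)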
44887 if (auto *Cst1 = dyn_cast<ConstantSDNode>(Op1)) {
44888 // NOTE: SimplifyDemandedBits won't do this for constants.
44889 uint64_t Val1 = Cst1->getZExtValue();
44890 uint64_t MaskedVal1 = Val1 & 0xFFFF;
44891 if (Opc == X86ISD::BEXTR && MaskedVal1 != Val1) {
44892 SDLoc DL(Op);
44893 return TLO.CombineTo(
44894 Op, TLO.DAG.getNode(X86ISD::BEXTR, DL, VT, Op0,
44895 TLO.DAG.getConstant(MaskedVal1, DL, VT)));
44896 }
44897
44898 unsigned Shift = Cst1->getAPIntValue().extractBitsAsZExtValue(8, 0);
44899 unsigned Length = Cst1->getAPIntValue().extractBitsAsZExtValue(8, 8);
44900
44901 // If the length is 0, the result is 0.
44902 if (Length == 0) {
44903 Known.setAllZero();
44904 return false;
44905 }
44906
44907 if ((Shift + Length) <= BitWidth) {
44908 APInt DemandedMask = APInt::getBitsSet(BitWidth, Shift, Shift + Length);
44909 if (SimplifyDemandedBits(Op0, DemandedMask, Known, TLO, Depth + 1))
44910 return true;
44911
44912 Known = Known.extractBits(Length, Shift);
44913 Known = Known.zextOrTrunc(BitWidth);
44914 return false;
44915 }
44916 } else {
44917 assert(Opc == X86ISD::BEXTR && "Unexpected opcode!");
44918 KnownBits Known1;
44919 APInt DemandedMask(APInt::getLowBitsSet(BitWidth, 16));
44920 if (SimplifyDemandedBits(Op1, DemandedMask, Known1, TLO, Depth + 1))
44921 return true;
44922
44923 // If the length is 0, replace with 0.
44924 KnownBits LengthBits = Known1.extractBits(8, 8);
44925 if (LengthBits.isZero())
44926 return TLO.CombineTo(Op, TLO.DAG.getConstant(0, SDLoc(Op), VT));
44927 }
44928
44929 break;
44930 }
44931 case X86ISD::PDEP: {
44932 SDValue Op0 = Op.getOperand(0);
44933 SDValue Op1 = Op.getOperand(1);
44934
44935 unsigned DemandedBitsLZ = OriginalDemandedBits.countl_zero();
44936 APInt LoMask = APInt::getLowBitsSet(BitWidth, BitWidth - DemandedBitsLZ);
44937
44938 // If the demanded bits have leading zeroes, we don't demand those from the
44939 // mask.
44940 if (SimplifyDemandedBits(Op1, LoMask, Known, TLO, Depth + 1))
44941 return true;
44942
44943 // The number of possible 1s in the mask determines the number of LSBs of
44944 // operand 0 used. Undemanded bits from the mask don't matter so filter
44945 // them before counting.
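// e.g. if at most three of the demanded mask bits can be 1, only the low
// three bits of operand 0 can ever be deposited into the result.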
44946 KnownBits Known2;
44947 uint64_t Count = (~Known.Zero & LoMask).popcount();
44948 APInt DemandedMask(APInt::getLowBitsSet(BitWidth, Count));
44949 if (SimplifyDemandedBits(Op0, DemandedMask, Known2, TLO, Depth + 1))
44950 return true;
44951
44952 // Zeroes are retained from the mask, but not ones.
44953 Known.One.clearAllBits();
44954 // The result will have at least as many trailing zeros as the non-mask
44955 // operand since bits can only map to the same or higher bit position.
44956 Known.Zero.setLowBits(Known2.countMinTrailingZeros());
44957 return false;
44958 }
44959 case X86ISD::VPMADD52L:
44960 case X86ISD::VPMADD52H: {
44961 KnownBits KnownOp0, KnownOp1, KnownOp2;
44962 SDValue Op0 = Op.getOperand(0);
44963 SDValue Op1 = Op.getOperand(1);
44964 SDValue Op2 = Op.getOperand(2);
44965 // Only demand the lower 52-bits of operands 0 / 1 (and all 64-bits of
44966 // operand 2).
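// (VPMADD52L/H produce the low/high 52 bits of the 104-bit product of the
// low 52 bits of operands 0 and 1, then add the full 64-bit operand 2.)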
44967 APInt Low52Bits = APInt::getLowBitsSet(BitWidth, 52);
44968 if (SimplifyDemandedBits(Op0, Low52Bits, OriginalDemandedElts, KnownOp0,
44969 TLO, Depth + 1))
44970 return true;
44971
44972 if (SimplifyDemandedBits(Op1, Low52Bits, OriginalDemandedElts, KnownOp1,
44973 TLO, Depth + 1))
44974 return true;
44975
44976 if (SimplifyDemandedBits(Op2, APInt::getAllOnes(64), OriginalDemandedElts,
44977 KnownOp2, TLO, Depth + 1))
44978 return true;
44979
44980 KnownBits KnownMul;
44981 KnownOp0 = KnownOp0.trunc(52);
44982 KnownOp1 = KnownOp1.trunc(52);
44983 KnownMul = Opc == X86ISD::VPMADD52L ? KnownBits::mul(KnownOp0, KnownOp1)
44984 : KnownBits::mulhu(KnownOp0, KnownOp1);
44985 KnownMul = KnownMul.zext(64);
44986
44987 // lo/hi(X * Y) + Z --> C + Z
44988 if (KnownMul.isConstant()) {
44989 SDLoc DL(Op);
44990 SDValue C = TLO.DAG.getConstant(KnownMul.getConstant(), DL, VT);
44991 return TLO.CombineTo(Op, TLO.DAG.getNode(ISD::ADD, DL, VT, C, Op2));
44992 }
44993
44994 Known = KnownBits::add(KnownMul, KnownOp2);
44995 return false;
44996 }
44997 }
44998
44999 return TargetLowering::SimplifyDemandedBitsForTargetNode(
45000 Op, OriginalDemandedBits, OriginalDemandedElts, Known, TLO, Depth);
45001}
45002
45003SDValue X86TargetLowering::SimplifyMultipleUseDemandedBitsForTargetNode(
45004 SDValue Op, const APInt &DemandedBits, const APInt &DemandedElts,
45005 SelectionDAG &DAG, unsigned Depth) const {
45006 int NumElts = DemandedElts.getBitWidth();
45007 unsigned Opc = Op.getOpcode();
45008 EVT VT = Op.getValueType();
45009
45010 switch (Opc) {
45011 case X86ISD::PINSRB:
45012 case X86ISD::PINSRW: {
45013 // If we don't demand the inserted element, return the base vector.
45014 SDValue Vec = Op.getOperand(0);
45015 auto *CIdx = dyn_cast<ConstantSDNode>(Op.getOperand(2));
45016 MVT VecVT = Vec.getSimpleValueType();
45017 if (CIdx && CIdx->getAPIntValue().ult(VecVT.getVectorNumElements()) &&
45018 !DemandedElts[CIdx->getZExtValue()])
45019 return Vec;
45020 break;
45021 }
45022 case X86ISD::VSHLI: {
45023 // If we are only demanding sign bits then we can use the shift source
45024 // directly.
45025 SDValue Op0 = Op.getOperand(0);
45026 unsigned ShAmt = Op.getConstantOperandVal(1);
45027 unsigned BitWidth = DemandedBits.getBitWidth();
45028 unsigned NumSignBits = DAG.ComputeNumSignBits(Op0, DemandedElts, Depth + 1);
45029 unsigned UpperDemandedBits = BitWidth - DemandedBits.countr_zero();
45030 if (NumSignBits > ShAmt && (NumSignBits - ShAmt) >= UpperDemandedBits)
45031 return Op0;
45032 break;
45033 }
45034 case X86ISD::VSRAI:
45035 // iff we only need the sign bit then we can use the source directly.
45036 // TODO: generalize where we only demand extended signbits.
45037 if (DemandedBits.isSignMask())
45038 return Op.getOperand(0);
45039 break;
45040 case X86ISD::PCMPGT:
45041 // icmp sgt(0, R) == ashr(R, BitWidth-1).
45042 // iff we only need the sign bit then we can use R directly.
45043 if (DemandedBits.isSignMask() &&
45044 ISD::isBuildVectorAllZeros(Op.getOperand(0).getNode()))
45045 return Op.getOperand(1);
45046 break;
45047 case X86ISD::BLENDV: {
45048 // BLENDV: Cond (MSB) ? LHS : RHS
45049 SDValue Cond = Op.getOperand(0);
45050 SDValue LHS = Op.getOperand(1);
45051 SDValue RHS = Op.getOperand(2);
45052
45053 KnownBits CondKnown = DAG.computeKnownBits(Cond, DemandedElts, Depth + 1);
45054 if (CondKnown.isNegative())
45055 return LHS;
45056 if (CondKnown.isNonNegative())
45057 return RHS;
45058 break;
45059 }
45060 case X86ISD::ANDNP: {
45061 // ANDNP = (~LHS & RHS);
45062 SDValue LHS = Op.getOperand(0);
45063 SDValue RHS = Op.getOperand(1);
45064
45065 KnownBits LHSKnown = DAG.computeKnownBits(LHS, DemandedElts, Depth + 1);
45066 KnownBits RHSKnown = DAG.computeKnownBits(RHS, DemandedElts, Depth + 1);
45067
45068 // If all of the demanded bits are known 0 on LHS and known 0 on RHS, then
45069 // the (inverted) LHS bits cannot contribute to the result of the 'andn' in
45070 // this context, so return RHS.
45071 if (DemandedBits.isSubsetOf(RHSKnown.Zero | LHSKnown.Zero))
45072 return RHS;
45073 break;
45074 }
45075 }
45076
45077 APInt ShuffleUndef, ShuffleZero;
45078 SmallVector<int, 16> ShuffleMask;
45079 SmallVector<SDValue, 2> ShuffleOps;
45080 if (getTargetShuffleInputs(Op, DemandedElts, ShuffleOps, ShuffleMask,
45081 ShuffleUndef, ShuffleZero, DAG, Depth, false)) {
45082 // If all the demanded elts are from one operand and are inline,
45083 // then we can use the operand directly.
45084 int NumOps = ShuffleOps.size();
45085 if (ShuffleMask.size() == (unsigned)NumElts &&
45086 llvm::all_of(ShuffleOps, [VT](SDValue V) {
45087 return VT.getSizeInBits() == V.getValueSizeInBits();
45088 })) {
45089
45090 if (DemandedElts.isSubsetOf(ShuffleUndef))
45091 return DAG.getUNDEF(VT);
45092 if (DemandedElts.isSubsetOf(ShuffleUndef | ShuffleZero))
45093 return getZeroVector(VT.getSimpleVT(), Subtarget, DAG, SDLoc(Op));
45094
45095 // Bitmask that indicates which ops have only been accessed 'inline'.
45096 APInt IdentityOp = APInt::getAllOnes(NumOps);
45097 for (int i = 0; i != NumElts; ++i) {
45098 int M = ShuffleMask[i];
45099 if (!DemandedElts[i] || ShuffleUndef[i])
45100 continue;
45101 int OpIdx = M / NumElts;
45102 int EltIdx = M % NumElts;
45103 if (M < 0 || EltIdx != i) {
45104 IdentityOp.clearAllBits();
45105 break;
45106 }
45107 IdentityOp &= APInt::getOneBitSet(NumOps, OpIdx);
45108 if (IdentityOp == 0)
45109 break;
45110 }
45111 assert((IdentityOp == 0 || IdentityOp.popcount() == 1) &&
45112 "Multiple identity shuffles detected");
45113
45114 if (IdentityOp != 0)
45115 return DAG.getBitcast(VT, ShuffleOps[IdentityOp.countr_zero()]);
45116 }
45117 }
45118
45119 return TargetLowering::SimplifyMultipleUseDemandedBitsForTargetNode(
45120 Op, DemandedBits, DemandedElts, DAG, Depth);
45121}
45122
45123bool X86TargetLowering::isGuaranteedNotToBeUndefOrPoisonForTargetNode(
45124 SDValue Op, const APInt &DemandedElts, const SelectionDAG &DAG,
45125 bool PoisonOnly, unsigned Depth) const {
45126 unsigned NumElts = DemandedElts.getBitWidth();
45127
45128 switch (Op.getOpcode()) {
45129 case X86ISD::GlobalBaseReg:
45130 case X86ISD::Wrapper:
45131 case X86ISD::WrapperRIP:
45132 return true;
45133 case X86ISD::BLENDI:
45134 case X86ISD::PSHUFD:
45135 case X86ISD::UNPCKL:
45136 case X86ISD::UNPCKH:
45137 case X86ISD::VPERMILPI:
45138 case X86ISD::VPERMV3: {
45139 SmallVector<int, 8> Mask;
45140 SmallVector<SDValue, 2> Ops;
45141 if (getTargetShuffleMask(Op, true, Ops, Mask)) {
45142 SmallVector<APInt, 2> DemandedSrcElts(Ops.size(),
45143 APInt::getZero(NumElts));
45144 for (auto M : enumerate(Mask)) {
45145 if (!DemandedElts[M.index()] || M.value() == SM_SentinelZero)
45146 continue;
45147 if (M.value() == SM_SentinelUndef)
45148 return false;
45149 assert(0 <= M.value() && M.value() < (int)(Ops.size() * NumElts) &&
45150 "Shuffle mask index out of range");
45151 DemandedSrcElts[M.value() / NumElts].setBit(M.value() % NumElts);
45152 }
45153 for (auto Op : enumerate(Ops))
45154 if (!DemandedSrcElts[Op.index()].isZero() &&
45155 !DAG.isGuaranteedNotToBeUndefOrPoison(
45156 Op.value(), DemandedSrcElts[Op.index()], PoisonOnly, Depth + 1))
45157 return false;
45158 return true;
45159 }
45160 break;
45161 }
45162 }
45163 return TargetLowering::isGuaranteedNotToBeUndefOrPoisonForTargetNode(
45164 Op, DemandedElts, DAG, PoisonOnly, Depth);
45165}
45166
45167bool X86TargetLowering::canCreateUndefOrPoisonForTargetNode(
45168 SDValue Op, const APInt &DemandedElts, const SelectionDAG &DAG,
45169 bool PoisonOnly, bool ConsiderFlags, unsigned Depth) const {
45170
45171 switch (Op.getOpcode()) {
45172 // SSE bit logic.
45173 case X86ISD::FAND:
45174 case X86ISD::FOR:
45175 case X86ISD::FXOR:
45176 case X86ISD::FANDN:
45177 case X86ISD::ANDNP:
45178 case X86ISD::VPTERNLOG:
45179 return false;
45180 // SSE vector insert/extracts use modulo indices.
45181 case X86ISD::PINSRB:
45182 case X86ISD::PINSRW:
45183 case X86ISD::PEXTRB:
45184 case X86ISD::PEXTRW:
45185 return false;
45186 // SSE vector multiplies are either inbounds or saturate.
45187 case X86ISD::VPMADDUBSW:
45188 case X86ISD::VPMADDWD:
45189 return false;
45190 // SSE vector shifts handle out of bounds shift amounts.
45191 case X86ISD::VSHLI:
45192 case X86ISD::VSRLI:
45193 case X86ISD::VSRAI:
45194 return false;
45195 // SSE blends.
45196 case X86ISD::BLENDI:
45197 case X86ISD::BLENDV:
45198 return false;
45199 // SSE target shuffles.
45200 case X86ISD::PSHUFD:
45201 case X86ISD::UNPCKL:
45202 case X86ISD::UNPCKH:
45203 case X86ISD::VPERMILPI:
45204 case X86ISD::VPERMV3:
45205 return false;
45206 // SSE comparisons handle all icmp/fcmp cases.
45207 // TODO: Add CMPM/MM with test coverage.
45208 case X86ISD::CMPP:
45209 case X86ISD::PCMPEQ:
45210 case X86ISD::PCMPGT:
45211 return false;
45212 // SSE signbit extraction.
45213 case X86ISD::MOVMSK:
45214 return false;
45215 // GFNI instructions.
45216 case X86ISD::GF2P8AFFINEINVQB:
45217 case X86ISD::GF2P8AFFINEQB:
45218 case X86ISD::GF2P8MULB:
45219 return false;
45220 case ISD::INTRINSIC_WO_CHAIN:
45221 switch (Op->getConstantOperandVal(0)) {
45222 case Intrinsic::x86_sse2_pmadd_wd:
45223 case Intrinsic::x86_avx2_pmadd_wd:
45224 case Intrinsic::x86_avx512_pmaddw_d_512:
45225 case Intrinsic::x86_ssse3_pmadd_ub_sw_128:
45226 case Intrinsic::x86_avx2_pmadd_ub_sw:
45227 case Intrinsic::x86_avx512_pmaddubs_w_512:
45228 return false;
45229 case Intrinsic::x86_avx512_vpermi2var_d_128:
45230 case Intrinsic::x86_avx512_vpermi2var_d_256:
45231 case Intrinsic::x86_avx512_vpermi2var_d_512:
45232 case Intrinsic::x86_avx512_vpermi2var_hi_128:
45233 case Intrinsic::x86_avx512_vpermi2var_hi_256:
45234 case Intrinsic::x86_avx512_vpermi2var_hi_512:
45235 case Intrinsic::x86_avx512_vpermi2var_pd_128:
45236 case Intrinsic::x86_avx512_vpermi2var_pd_256:
45237 case Intrinsic::x86_avx512_vpermi2var_pd_512:
45238 case Intrinsic::x86_avx512_vpermi2var_ps_128:
45239 case Intrinsic::x86_avx512_vpermi2var_ps_256:
45240 case Intrinsic::x86_avx512_vpermi2var_ps_512:
45241 case Intrinsic::x86_avx512_vpermi2var_q_128:
45242 case Intrinsic::x86_avx512_vpermi2var_q_256:
45243 case Intrinsic::x86_avx512_vpermi2var_q_512:
45244 case Intrinsic::x86_avx512_vpermi2var_qi_128:
45245 case Intrinsic::x86_avx512_vpermi2var_qi_256:
45246 case Intrinsic::x86_avx512_vpermi2var_qi_512:
45247 return false;
45248 }
45249 }
45250 return TargetLowering::canCreateUndefOrPoisonForTargetNode(
45251 Op, DemandedElts, DAG, PoisonOnly, ConsiderFlags, Depth);
45252}
45253
45254bool X86TargetLowering::isSplatValueForTargetNode(SDValue Op,
45255 const APInt &DemandedElts,
45256 APInt &UndefElts,
45257 const SelectionDAG &DAG,
45258 unsigned Depth) const {
45259 unsigned NumElts = DemandedElts.getBitWidth();
45260 unsigned Opc = Op.getOpcode();
45261
45262 switch (Opc) {
45263 case X86ISD::VBROADCAST:
45264 case X86ISD::VBROADCAST_LOAD:
45265 UndefElts = APInt::getZero(NumElts);
45266 return true;
45267 }
45268
45269 return TargetLowering::isSplatValueForTargetNode(Op, DemandedElts, UndefElts,
45270 DAG, Depth);
45271}
45272
45273// Helper to peek through bitops/trunc/setcc to determine size of source vector.
45274// Allows combineBitcastvxi1 to determine what size vector generated a <X x i1>.
45275static bool checkBitcastSrcVectorSize(SDValue Src, unsigned Size,
45276 bool AllowTruncate, unsigned Depth) {
45277 // Limit recursion.
45278 if (Depth >= SelectionDAG::MaxRecursionDepth)
45279 return false;
45280 switch (Src.getOpcode()) {
45281 case ISD::TRUNCATE:
45282 if (!AllowTruncate)
45283 return false;
45284 [[fallthrough]];
45285 case ISD::SETCC:
45286 return Src.getOperand(0).getValueSizeInBits() == Size;
45287 case ISD::FREEZE:
45288 return checkBitcastSrcVectorSize(Src.getOperand(0), Size, AllowTruncate,
45289 Depth + 1);
45290 case ISD::AND:
45291 case ISD::XOR:
45292 case ISD::OR:
45293 return checkBitcastSrcVectorSize(Src.getOperand(0), Size, AllowTruncate,
45294 Depth + 1) &&
45295 checkBitcastSrcVectorSize(Src.getOperand(1), Size, AllowTruncate,
45296 Depth + 1);
45297 case ISD::SELECT:
45298 case ISD::VSELECT:
45299 return Src.getOperand(0).getScalarValueSizeInBits() == 1 &&
45300 checkBitcastSrcVectorSize(Src.getOperand(1), Size, AllowTruncate,
45301 Depth + 1) &&
45302 checkBitcastSrcVectorSize(Src.getOperand(2), Size, AllowTruncate,
45303 Depth + 1);
45304 case ISD::BUILD_VECTOR:
45305 return ISD::isBuildVectorAllZeros(Src.getNode()) ||
45306 ISD::isBuildVectorAllOnes(Src.getNode());
45307 }
45308 return false;
45309}
45310
45311// Helper to flip between AND/OR/XOR opcodes and their X86ISD FP equivalents.
45312static unsigned getAltBitOpcode(unsigned Opcode) {
45313 switch(Opcode) {
45314 // clang-format off
45315 case ISD::AND: return X86ISD::FAND;
45316 case ISD::OR: return X86ISD::FOR;
45317 case ISD::XOR: return X86ISD::FXOR;
45318 case X86ISD::ANDNP: return X86ISD::FANDN;
45319 // clang-format on
45320 }
45321 llvm_unreachable("Unknown bitwise opcode");
45322}
45323
45324// Helper to adjust v4i32 MOVMSK expansion to work with SSE1-only targets.
45325static SDValue adjustBitcastSrcVectorSSE1(SelectionDAG &DAG, SDValue Src,
45326 const SDLoc &DL) {
45327 EVT SrcVT = Src.getValueType();
45328 if (SrcVT != MVT::v4i1)
45329 return SDValue();
45330
45331 switch (Src.getOpcode()) {
45332 case ISD::SETCC:
45333 if (Src.getOperand(0).getValueType() == MVT::v4i32 &&
45334 ISD::isBuildVectorAllZeros(Src.getOperand(1).getNode()) &&
45335 cast<CondCodeSDNode>(Src.getOperand(2))->get() == ISD::SETLT) {
45336 SDValue Op0 = Src.getOperand(0);
45337 if (ISD::isNormalLoad(Op0.getNode()))
45338 return DAG.getBitcast(MVT::v4f32, Op0);
45339 if (Op0.getOpcode() == ISD::BITCAST &&
45340 Op0.getOperand(0).getValueType() == MVT::v4f32)
45341 return Op0.getOperand(0);
45342 }
45343 break;
45344 case ISD::AND:
45345 case ISD::XOR:
45346 case ISD::OR: {
45347 SDValue Op0 = adjustBitcastSrcVectorSSE1(DAG, Src.getOperand(0), DL);
45348 SDValue Op1 = adjustBitcastSrcVectorSSE1(DAG, Src.getOperand(1), DL);
45349 if (Op0 && Op1)
45350 return DAG.getNode(getAltBitOpcode(Src.getOpcode()), DL, MVT::v4f32, Op0,
45351 Op1);
45352 break;
45353 }
45354 }
45355 return SDValue();
45356}
45357
45358// Helper to push sign extension of vXi1 SETCC result through bitops.
45359static SDValue signExtendBitcastSrcVector(SelectionDAG &DAG, EVT SExtVT,
45360 SDValue Src, const SDLoc &DL) {
45361 switch (Src.getOpcode()) {
45362 case ISD::SETCC:
45363 case ISD::FREEZE:
45364 case ISD::TRUNCATE:
45365 case ISD::BUILD_VECTOR:
45366 return DAG.getNode(ISD::SIGN_EXTEND, DL, SExtVT, Src);
45367 case ISD::AND:
45368 case ISD::XOR:
45369 case ISD::OR:
45370 return DAG.getNode(
45371 Src.getOpcode(), DL, SExtVT,
45372 signExtendBitcastSrcVector(DAG, SExtVT, Src.getOperand(0), DL),
45373 signExtendBitcastSrcVector(DAG, SExtVT, Src.getOperand(1), DL));
45374 case ISD::SELECT:
45375 case ISD::VSELECT:
45376 return DAG.getSelect(
45377 DL, SExtVT, Src.getOperand(0),
45378 signExtendBitcastSrcVector(DAG, SExtVT, Src.getOperand(1), DL),
45379 signExtendBitcastSrcVector(DAG, SExtVT, Src.getOperand(2), DL));
45380 }
45381 llvm_unreachable("Unexpected node type for vXi1 sign extension");
45382}
45383
45384// Try to match patterns such as
45385// (i16 bitcast (v16i1 x))
45386// ->
45387// (i16 movmsk (16i8 sext (v16i1 x)))
45388// before the illegal vector is scalarized on subtargets that don't have legal
45389// vxi1 types.
45390static SDValue combineBitcastvxi1(SelectionDAG &DAG, EVT VT, SDValue Src,
45391 const SDLoc &DL,
45392 const X86Subtarget &Subtarget) {
45393 EVT SrcVT = Src.getValueType();
45394 if (!SrcVT.isSimple() || SrcVT.getScalarType() != MVT::i1)
45395 return SDValue();
45396
45397 // Recognize the IR pattern for the movmsk intrinsic under SSE1 before type
45398 // legalization destroys the v4i32 type.
45399 if (Subtarget.hasSSE1() && !Subtarget.hasSSE2()) {
45400 if (SDValue V = adjustBitcastSrcVectorSSE1(DAG, Src, DL)) {
45401 V = DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32,
45402 DAG.getBitcast(MVT::v4f32, V));
45403 return DAG.getZExtOrTrunc(V, DL, VT);
45404 }
45405 }
45406
45407 // If the input is a truncate from v16i8, v32i8 or v64i8 go ahead and use a
45408 // movmskb even with avx512. This will be better than truncating to vXi1 and
45409 // using a kmov. This can especially help KNL if the input is a v16i8/v32i8
45410 // vpcmpeqb/vpcmpgtb.
45411 bool PreferMovMsk = Src.getOpcode() == ISD::TRUNCATE && Src.hasOneUse() &&
45412 (Src.getOperand(0).getValueType() == MVT::v16i8 ||
45413 Src.getOperand(0).getValueType() == MVT::v32i8 ||
45414 Src.getOperand(0).getValueType() == MVT::v64i8);
45415
45416 // Prefer movmsk for AVX512 for (bitcast (setlt X, 0)) which can be handled
45417 // directly with vpmovmskb/vmovmskps/vmovmskpd.
45418 if (Src.getOpcode() == ISD::SETCC && Src.hasOneUse() &&
45419 cast<CondCodeSDNode>(Src.getOperand(2))->get() == ISD::SETLT &&
45420 ISD::isBuildVectorAllZeros(Src.getOperand(1).getNode())) {
45421 EVT CmpVT = Src.getOperand(0).getValueType();
45422 EVT EltVT = CmpVT.getVectorElementType();
45423 if (CmpVT.getSizeInBits() <= 256 &&
45424 (EltVT == MVT::i8 || EltVT == MVT::i32 || EltVT == MVT::i64))
45425 PreferMovMsk = true;
45426 }
45427
45428 // With AVX512 vxi1 types are legal and we prefer using k-regs.
45429 // MOVMSK is supported in SSE2 or later.
45430 if (!Subtarget.hasSSE2() || (Subtarget.hasAVX512() && !PreferMovMsk))
45431 return SDValue();
45432
45433 // If the upper ops of a concatenation are undef, then try to bitcast the
45434 // lower op and extend.
45435 SmallVector<SDValue, 4> SubSrcOps;
45436 if (collectConcatOps(Src.getNode(), SubSrcOps, DAG) &&
45437 SubSrcOps.size() >= 2) {
45438 SDValue LowerOp = SubSrcOps[0];
45439 ArrayRef<SDValue> UpperOps(std::next(SubSrcOps.begin()), SubSrcOps.end());
45440 if (LowerOp.getOpcode() == ISD::SETCC &&
45441 all_of(UpperOps, [](SDValue Op) { return Op.isUndef(); })) {
45442 EVT SubVT = VT.getIntegerVT(
45443 *DAG.getContext(), LowerOp.getValueType().getVectorMinNumElements());
45444 if (SDValue V = combineBitcastvxi1(DAG, SubVT, LowerOp, DL, Subtarget)) {
45445 EVT IntVT = VT.getIntegerVT(*DAG.getContext(), VT.getSizeInBits());
45446 return DAG.getBitcast(VT, DAG.getNode(ISD::ANY_EXTEND, DL, IntVT, V));
45447 }
45448 }
45449 }
45450
45451 // There are MOVMSK flavors for types v16i8, v32i8, v4f32, v8f32, v2f64 and
45452 // v4f64. So all legal 128-bit and 256-bit vectors are covered except for
45453 // v8i16 and v16i16.
45454 // For these two cases, we can shuffle the upper element bytes to a
45455 // consecutive sequence at the start of the vector and treat the results as
45456 // v16i8 or v32i8, and for v16i8 this is the preferable solution. However,
45457 // for v16i16 this is not the case, because the shuffle is expensive, so we
45458 // avoid sign-extending to this type entirely.
45459 // For example, t0 := (v8i16 sext(v8i1 x)) needs to be shuffled as:
45460 // (v16i8 shuffle <0,2,4,6,8,10,12,14,u,u,...,u> (v16i8 bitcast t0), undef)
45461 MVT SExtVT;
45462 bool PropagateSExt = false;
45463 switch (SrcVT.getSimpleVT().SimpleTy) {
45464 default:
45465 return SDValue();
45466 case MVT::v2i1:
45467 SExtVT = MVT::v2i64;
45468 break;
45469 case MVT::v4i1:
45470 SExtVT = MVT::v4i32;
45471 // For cases such as (i4 bitcast (v4i1 setcc v4i64 v1, v2))
45472 // sign-extend to a 256-bit operation to avoid truncation.
45473 if (Subtarget.hasAVX() &&
45474 checkBitcastSrcVectorSize(Src, 256, Subtarget.hasAVX2(), 0)) {
45475 SExtVT = MVT::v4i64;
45476 PropagateSExt = true;
45477 }
45478 break;
45479 case MVT::v8i1:
45480 SExtVT = MVT::v8i16;
45481 // For cases such as (i8 bitcast (v8i1 setcc v8i32 v1, v2)),
45482 // sign-extend to a 256-bit operation to match the compare.
45483 // If the setcc operand is 128-bit, prefer sign-extending to 128-bit over
45484 // 256-bit because the shuffle is cheaper than sign extending the result of
45485 // the compare.
45486 if (Subtarget.hasAVX() && (checkBitcastSrcVectorSize(Src, 256, true, 0) ||
45487 checkBitcastSrcVectorSize(Src, 512, true, 0))) {
45488 SExtVT = MVT::v8i32;
45489 PropagateSExt = true;
45490 }
45491 break;
45492 case MVT::v16i1:
45493 SExtVT = MVT::v16i8;
45494 // For the case (i16 bitcast (v16i1 setcc v16i16 v1, v2)),
45495 // it is not profitable to sign-extend to 256-bit because this will
45496 // require an extra cross-lane shuffle which is more expensive than
45497 // truncating the result of the compare to 128-bits.
45498 break;
45499 case MVT::v32i1:
45500 SExtVT = MVT::v32i8;
45501 break;
45502 case MVT::v64i1:
45503 // If we have AVX512F but not AVX512BW, the input must be a truncate from
45504 // v64i8 (checked earlier), so split the input and make two pmovmskbs.
45505 if (Subtarget.hasAVX512()) {
45506 if (Subtarget.hasBWI())
45507 return SDValue();
45508 SExtVT = MVT::v64i8;
45509 break;
45510 }
45511 // Split if this is a <64 x i8> comparison result.
45512 if (checkBitcastSrcVectorSize(Src, 512, false, 0)) {
45513 SExtVT = MVT::v64i8;
45514 break;
45515 }
45516 return SDValue();
45517 };
45518
45519 SDValue V = PropagateSExt ? signExtendBitcastSrcVector(DAG, SExtVT, Src, DL)
45520 : DAG.getNode(ISD::SIGN_EXTEND, DL, SExtVT, Src);
45521
45522 if (SExtVT == MVT::v16i8 || SExtVT == MVT::v32i8 || SExtVT == MVT::v64i8) {
45523 V = getPMOVMSKB(DL, V, DAG, Subtarget);
45524 } else {
45525 if (SExtVT == MVT::v8i16) {
45526 V = widenSubVector(V, false, Subtarget, DAG, DL, 256);
45527 V = DAG.getNode(ISD::TRUNCATE, DL, MVT::v16i8, V);
45528 }
45529 V = DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, V);
45530 }
45531
45532 EVT IntVT =
45533 EVT::getIntegerVT(*DAG.getContext(), SrcVT.getVectorNumElements());
45534 V = DAG.getZExtOrTrunc(V, DL, IntVT);
45535 return DAG.getBitcast(VT, V);
45536}
45537
45538// Convert a vXi1 constant build vector to the same width scalar integer.
45539static SDValue combinevXi1ConstantToInteger(SDValue Op, SelectionDAG &DAG) {
45540 EVT SrcVT = Op.getValueType();
45541 assert(SrcVT.getVectorElementType() == MVT::i1 &&
45542 "Expected a vXi1 vector");
45544 "Expected a constant build vector");
45545
45546 APInt Imm(SrcVT.getVectorNumElements(), 0);
45547 for (unsigned Idx = 0, e = Op.getNumOperands(); Idx < e; ++Idx) {
45548 SDValue In = Op.getOperand(Idx);
45549 if (!In.isUndef() && (In->getAsZExtVal() & 0x1))
45550 Imm.setBit(Idx);
45551 }
45552 EVT IntVT = EVT::getIntegerVT(*DAG.getContext(), Imm.getBitWidth());
45553 return DAG.getConstant(Imm, SDLoc(Op), IntVT);
45554}
45555
45556static SDValue combineCastedMaskArithmetic(SDNode *N, SelectionDAG &DAG,
45557 TargetLowering::DAGCombinerInfo &DCI,
45558 const X86Subtarget &Subtarget) {
45559 using namespace SDPatternMatch;
45560 assert(N->getOpcode() == ISD::BITCAST && "Expected a bitcast");
45561
45562 if (!DCI.isBeforeLegalizeOps())
45563 return SDValue();
45564
45565 // Only do this if we have k-registers.
45566 if (!Subtarget.hasAVX512())
45567 return SDValue();
45568
45569 EVT DstVT = N->getValueType(0);
45570 SDValue Op = N->getOperand(0);
45571 EVT SrcVT = Op.getValueType();
45572
45573 // Make sure we have a bitcast between mask registers and a scalar type.
45574 if (!(SrcVT.isVector() && SrcVT.getVectorElementType() == MVT::i1 &&
45575 DstVT.isScalarInteger()) &&
45576 !(DstVT.isVector() && DstVT.getVectorElementType() == MVT::i1 &&
45577 SrcVT.isScalarInteger()))
45578 return SDValue();
45579
45580 SDValue LHS, RHS;
45581
45582 // Look for logic ops.
45584 return SDValue();
45585
45586 // If either operand was bitcast from DstVT, then perform logic with DstVT (at
45587 // least one of the getBitcast() will fold away).
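// e.g. with DstVT == v16i1 and SrcVT == i16:
//   (v16i1 bitcast (and (i16 bitcast (v16i1 X)), Y))
//   --> (and X, (v16i1 bitcast Y))
// which keeps the mask arithmetic in k-registers.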
45588 if (sd_match(LHS, m_OneUse(m_BitCast(m_SpecificVT(DstVT)))) ||
45589 sd_match(RHS, m_OneUse(m_BitCast(m_SpecificVT(DstVT)))))
45590 return DAG.getNode(Op.getOpcode(), SDLoc(N), DstVT,
45591 DAG.getBitcast(DstVT, LHS), DAG.getBitcast(DstVT, RHS));
45592
45593 // If the RHS is a vXi1 build vector, this is a good reason to flip too.
45594 // Most of these have to move a constant from the scalar domain anyway.
45595 if (ISD::isBuildVectorOfConstantSDNodes(RHS.getNode())) {
45596 RHS = combinevXi1ConstantToInteger(RHS, DAG);
45597 return DAG.getNode(Op.getOpcode(), SDLoc(N), DstVT,
45598 DAG.getBitcast(DstVT, LHS), RHS);
45599 }
45600
45601 return SDValue();
45602}
45603
45604static SDValue createMMXBuildVector(BuildVectorSDNode *BV, SelectionDAG &DAG,
45605 const X86Subtarget &Subtarget) {
45606 SDLoc DL(BV);
45607 unsigned NumElts = BV->getNumOperands();
45608 SDValue Splat = BV->getSplatValue();
45609
45610 // Build MMX element from integer GPR or SSE float values.
45611 auto CreateMMXElement = [&](SDValue V) {
45612 if (V.isUndef())
45613 return DAG.getUNDEF(MVT::x86mmx);
45614 if (V.getValueType().isFloatingPoint()) {
45615 if (Subtarget.hasSSE1() && !isa<ConstantFPSDNode>(V)) {
45616 V = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v4f32, V);
45617 V = DAG.getBitcast(MVT::v2i64, V);
45618 return DAG.getNode(X86ISD::MOVDQ2Q, DL, MVT::x86mmx, V);
45619 }
45620 V = DAG.getBitcast(MVT::i32, V);
45621 } else {
45622 V = DAG.getAnyExtOrTrunc(V, DL, MVT::i32);
45623 }
45624 return DAG.getNode(X86ISD::MMX_MOVW2D, DL, MVT::x86mmx, V);
45625 };
45626
45627 // Convert build vector ops to MMX data in the bottom elements.
45628 SmallVector<SDValue, 8> Ops;
45629
45630 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
45631
45632 // Broadcast - use (PUNPCKL+)PSHUFW to broadcast single element.
45633 if (Splat) {
45634 if (Splat.isUndef())
45635 return DAG.getUNDEF(MVT::x86mmx);
45636
45637 Splat = CreateMMXElement(Splat);
45638
45639 if (Subtarget.hasSSE1()) {
45640 // Unpack v8i8 to splat i8 elements to lowest 16-bits.
45641 if (NumElts == 8)
45642 Splat = DAG.getNode(
45643 ISD::INTRINSIC_WO_CHAIN, DL, MVT::x86mmx,
45644 DAG.getTargetConstant(Intrinsic::x86_mmx_punpcklbw, DL,
45645 TLI.getPointerTy(DAG.getDataLayout())),
45646 Splat, Splat);
45647
45648 // Use PSHUFW to repeat 16-bit elements.
45649 unsigned ShufMask = (NumElts > 2 ? 0 : 0x44);
45650 return DAG.getNode(
45651 ISD::INTRINSIC_WO_CHAIN, DL, MVT::x86mmx,
45652 DAG.getTargetConstant(Intrinsic::x86_sse_pshuf_w, DL,
45653 TLI.getPointerTy(DAG.getDataLayout())),
45654 Splat, DAG.getTargetConstant(ShufMask, DL, MVT::i8));
45655 }
45656 Ops.append(NumElts, Splat);
45657 } else {
45658 for (unsigned i = 0; i != NumElts; ++i)
45659 Ops.push_back(CreateMMXElement(BV->getOperand(i)));
45660 }
45661
45662 // Use tree of PUNPCKLs to build up general MMX vector.
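// e.g. for 8 x i8 elements: punpcklbw pairs the bytes (8 ops -> 4),
// punpcklwd pairs the resulting words (4 -> 2), and punpckldq merges the
// last two dwords (2 -> 1).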
45663 while (Ops.size() > 1) {
45664 unsigned NumOps = Ops.size();
45665 unsigned IntrinOp =
45666 (NumOps == 2 ? Intrinsic::x86_mmx_punpckldq
45667 : (NumOps == 4 ? Intrinsic::x86_mmx_punpcklwd
45668 : Intrinsic::x86_mmx_punpcklbw));
45669 SDValue Intrin = DAG.getTargetConstant(
45670 IntrinOp, DL, TLI.getPointerTy(DAG.getDataLayout()));
45671 for (unsigned i = 0; i != NumOps; i += 2)
45672 Ops[i / 2] = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, MVT::x86mmx, Intrin,
45673 Ops[i], Ops[i + 1]);
45674 Ops.resize(NumOps / 2);
45675 }
45676
45677 return Ops[0];
45678}
45679
45680// Recursive function that attempts to determine if a scalar integer (that is
45681// being bitcast to a bool vector) originally came from a vector/float/double
45682// that was truncated/extended/bitcast to/from a scalar integer. If so, replace
45683// the scalar ops with bool vector equivalents back down the chain.
45684static SDValue combineBitcastToBoolVector(EVT VT, SDValue V, const SDLoc &DL,
45685 SelectionDAG &DAG,
45686 const X86Subtarget &Subtarget,
45687 unsigned Depth = 0) {
45688 if (Depth >= SelectionDAG::MaxRecursionDepth)
45689 return SDValue(); // Limit search depth.
45690
45691 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
45692 unsigned Opc = V.getOpcode();
45693 switch (Opc) {
45694 case ISD::BITCAST: {
45695 // Bitcast from a vector/float/double, we can cheaply bitcast to VT.
45696 SDValue Src = V.getOperand(0);
45697 EVT SrcVT = Src.getValueType();
45698 if (SrcVT.isVector() || SrcVT.isFloatingPoint())
45699 return DAG.getBitcast(VT, Src);
45700 break;
45701 }
45702 case ISD::Constant: {
45703 auto *C = cast<ConstantSDNode>(V);
45704 if (C->isZero())
45705 return DAG.getConstant(0, DL, VT);
45706 if (C->isAllOnes())
45707 return DAG.getAllOnesConstant(DL, VT);
45708 break;
45709 }
45710 case ISD::TRUNCATE: {
45711 // If we find a suitable source, a truncated scalar becomes a subvector.
45712 SDValue Src = V.getOperand(0);
45713 EVT NewSrcVT =
45714 EVT::getVectorVT(*DAG.getContext(), MVT::i1, Src.getValueSizeInBits());
45715 if (TLI.isTypeLegal(NewSrcVT))
45716 if (SDValue N0 = combineBitcastToBoolVector(NewSrcVT, Src, DL, DAG,
45717 Subtarget, Depth + 1))
45718 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, N0,
45719 DAG.getVectorIdxConstant(0, DL));
45720 break;
45721 }
45722 case ISD::ANY_EXTEND:
45723 case ISD::ZERO_EXTEND: {
45724 // If we find a suitable source, an extended scalar becomes a subvector.
45725 SDValue Src = V.getOperand(0);
45726 EVT NewSrcVT = EVT::getVectorVT(*DAG.getContext(), MVT::i1,
45727 Src.getScalarValueSizeInBits());
45728 if (TLI.isTypeLegal(NewSrcVT))
45729 if (SDValue N0 = combineBitcastToBoolVector(NewSrcVT, Src, DL, DAG,
45730 Subtarget, Depth + 1))
45731 return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT,
45732 Opc == ISD::ANY_EXTEND ? DAG.getUNDEF(VT)
45733 : DAG.getConstant(0, DL, VT),
45734 N0, DAG.getVectorIdxConstant(0, DL));
45735 break;
45736 }
45737 case ISD::OR:
45738 case ISD::XOR: {
45739 // If we find suitable sources, we can just move the op to the vector
45740 // domain.
45741 if (SDValue N0 = combineBitcastToBoolVector(VT, V.getOperand(0), DL, DAG,
45742 Subtarget, Depth + 1))
45743 if (SDValue N1 = combineBitcastToBoolVector(VT, V.getOperand(1), DL, DAG,
45744 Subtarget, Depth + 1))
45745 return DAG.getNode(Opc, DL, VT, N0, N1);
45746 break;
45747 }
45748 case ISD::SHL: {
45749 // If we find a suitable source, a SHL becomes a KSHIFTL.
45750 SDValue Src0 = V.getOperand(0);
45751 if ((VT == MVT::v8i1 && !Subtarget.hasDQI()) ||
45752 ((VT == MVT::v32i1 || VT == MVT::v64i1) && !Subtarget.hasBWI()))
45753 break;
45754
45755 if (auto *Amt = dyn_cast<ConstantSDNode>(V.getOperand(1)))
45756 if (SDValue N0 = combineBitcastToBoolVector(VT, Src0, DL, DAG, Subtarget,
45757 Depth + 1))
45758 return DAG.getNode(
45759 X86ISD::KSHIFTL, DL, VT, N0,
45760 DAG.getTargetConstant(Amt->getZExtValue(), DL, MVT::i8));
45761 break;
45762 }
45763 }
45764
45765 // Does the inner bitcast already exist?
45766 if (Depth > 0)
45767 if (SDNode *Alt = DAG.getNodeIfExists(ISD::BITCAST, DAG.getVTList(VT), {V}))
45768 return SDValue(Alt, 0);
45769
45770 return SDValue();
45771}
45772
45773static SDValue combineBitcast(SDNode *N, SelectionDAG &DAG,
45774 TargetLowering::DAGCombinerInfo &DCI,
45775 const X86Subtarget &Subtarget) {
45776 SDValue N0 = N->getOperand(0);
45777 EVT VT = N->getValueType(0);
45778 EVT SrcVT = N0.getValueType();
45779 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
45780
45781 // Try to match patterns such as
45782 // (i16 bitcast (v16i1 x))
45783 // ->
45784 // (i16 movmsk (16i8 sext (v16i1 x)))
45785 // before the setcc result is scalarized on subtargets that don't have legal
45786 // vxi1 types.
45787 if (DCI.isBeforeLegalize()) {
45788 SDLoc dl(N);
45789 if (SDValue V = combineBitcastvxi1(DAG, VT, N0, dl, Subtarget))
45790 return V;
45791
45792 // If this is a bitcast between a MVT::v4i1/v2i1 and an illegal integer
45793 // type, widen both sides to avoid a trip through memory.
45794 if ((VT == MVT::v4i1 || VT == MVT::v2i1) && SrcVT.isScalarInteger() &&
45795 Subtarget.hasAVX512()) {
45796 N0 = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i8, N0);
45797 N0 = DAG.getBitcast(MVT::v8i1, N0);
45798 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, N0,
45799 DAG.getVectorIdxConstant(0, dl));
45800 }
45801
45802 // If this is a bitcast between a MVT::v4i1/v2i1 and an illegal integer
45803 // type, widen both sides to avoid a trip through memory.
45804 if ((SrcVT == MVT::v4i1 || SrcVT == MVT::v2i1) && VT.isScalarInteger() &&
45805 Subtarget.hasAVX512()) {
45806 // Use zeros for the widening if we already have some zeroes. This can
45807 // allow SimplifyDemandedBits to remove scalar ANDs that may be down
45808 // stream of this.
45809 // FIXME: It might make sense to detect a concat_vectors with a mix of
45810 // zeroes and undef and turn it into insert_subvector for i1 vectors as
45811 // a separate combine. What we can't do is canonicalize the operands of
45812 // such a concat or we'll get into a loop with SimplifyDemandedBits.
45813 if (N0.getOpcode() == ISD::CONCAT_VECTORS) {
45814 SDValue LastOp = N0.getOperand(N0.getNumOperands() - 1);
45815 if (ISD::isBuildVectorAllZeros(LastOp.getNode())) {
45816 SrcVT = LastOp.getValueType();
45817 unsigned NumConcats = 8 / SrcVT.getVectorNumElements();
45818 SmallVector<SDValue, 4> Ops(N0->op_begin(), N0->op_end());
45819 Ops.resize(NumConcats, DAG.getConstant(0, dl, SrcVT));
45820 N0 = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v8i1, Ops);
45821 N0 = DAG.getBitcast(MVT::i8, N0);
45822 return DAG.getNode(ISD::TRUNCATE, dl, VT, N0);
45823 }
45824 }
45825
45826 unsigned NumConcats = 8 / SrcVT.getVectorNumElements();
45827 SmallVector<SDValue, 4> Ops(NumConcats, DAG.getUNDEF(SrcVT));
45828 Ops[0] = N0;
45829 N0 = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v8i1, Ops);
45830 N0 = DAG.getBitcast(MVT::i8, N0);
45831 return DAG.getNode(ISD::TRUNCATE, dl, VT, N0);
45832 }
45833 } else if (DCI.isAfterLegalizeDAG()) {
45834 // If we're bitcasting from iX to vXi1, see if the integer originally
45835 // began as a vXi1 and whether we can remove the bitcast entirely.
45836 if (VT.isVector() && VT.getScalarType() == MVT::i1 &&
45837 SrcVT.isScalarInteger() && TLI.isTypeLegal(VT)) {
45838 if (SDValue V =
45839 combineBitcastToBoolVector(VT, N0, SDLoc(N), DAG, Subtarget))
45840 return V;
45841 }
45842 }
45843
45844 // Look for (i8 (bitcast (v8i1 (extract_subvector (v16i1 X), 0)))) and
45845 // replace with (i8 (trunc (i16 (bitcast (v16i1 X))))). This can occur
45846 // due to insert_subvector legalization on KNL. By promoting the copy to i16
45847 // we can help with known bits propagation from the vXi1 domain to the
45848 // scalar domain.
45849 if (VT == MVT::i8 && SrcVT == MVT::v8i1 && Subtarget.hasAVX512() &&
45850 !Subtarget.hasDQI() && N0.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
45851 N0.getOperand(0).getValueType() == MVT::v16i1 &&
45852 isNullConstant(N0.getOperand(1)))
45853 return DAG.getNode(ISD::TRUNCATE, SDLoc(N), VT,
45854 DAG.getBitcast(MVT::i16, N0.getOperand(0)));
45855
45856 // Canonicalize (bitcast (vbroadcast_load)) so that the output of the bitcast
45857 // and the vbroadcast_load are both integer or both fp. In some cases this
45858 // will remove the bitcast entirely.
45859 if (N0.getOpcode() == X86ISD::VBROADCAST_LOAD && N0.hasOneUse() &&
45860 VT.isFloatingPoint() != SrcVT.isFloatingPoint() && VT.isVector()) {
45861 auto *BCast = cast<MemIntrinsicSDNode>(N0);
45862 unsigned SrcVTSize = SrcVT.getScalarSizeInBits();
45863 unsigned MemSize = BCast->getMemoryVT().getScalarSizeInBits();
45864 // Don't swap i8/i16 since don't have fp types that size.
45865 if (MemSize >= 32) {
45866 MVT MemVT = VT.isFloatingPoint() ? MVT::getFloatingPointVT(MemSize)
45867 : MVT::getIntegerVT(MemSize);
45868 MVT LoadVT = VT.isFloatingPoint() ? MVT::getFloatingPointVT(SrcVTSize)
45869 : MVT::getIntegerVT(SrcVTSize);
45870 LoadVT = MVT::getVectorVT(LoadVT, SrcVT.getVectorNumElements());
45871
45872 SDVTList Tys = DAG.getVTList(LoadVT, MVT::Other);
45873 SDValue Ops[] = { BCast->getChain(), BCast->getBasePtr() };
45874 SDValue ResNode =
45875 DAG.getMemIntrinsicNode(X86ISD::VBROADCAST_LOAD, SDLoc(N), Tys, Ops,
45876 MemVT, BCast->getMemOperand());
45877 DAG.ReplaceAllUsesOfValueWith(SDValue(BCast, 1), ResNode.getValue(1));
45878 return DAG.getBitcast(VT, ResNode);
45879 }
45880 }
45881
45882 // Attempt to peek through f16 bitcasted extractions hidden by truncation.
45883 if (VT == MVT::f16 && SrcVT == MVT::i16) {
45884 SDValue Src = peekThroughTruncates(N0);
45885 if (Src.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
45886 Src.getOperand(0).getValueSizeInBits() == 128 &&
45887 isNullConstant(Src.getOperand(1))) {
45888 SDLoc DL(N);
45889 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT,
45890 DAG.getBitcast(MVT::v8f16, Src.getOperand(0)),
45891 DAG.getVectorIdxConstant(0, DL));
45892 }
45893 }
45894
45895 // Since MMX types are special and don't usually play with other vector types,
45896 // it's better to handle them early to be sure we emit efficient code by
45897 // avoiding store-load conversions.
45898 if (VT == MVT::x86mmx) {
45899 // Detect MMX constant vectors.
45900 APInt UndefElts;
45901 SmallVector<APInt, 1> EltBits;
45902 if (getTargetConstantBitsFromNode(N0, 64, UndefElts, EltBits,
45903 /*AllowWholeUndefs*/ true,
45904 /*AllowPartialUndefs*/ true)) {
45905 SDLoc DL(N0);
45906 // Handle zero-extension of i32 with MOVD.
45907 if (EltBits[0].countl_zero() >= 32)
45908 return DAG.getNode(X86ISD::MMX_MOVW2D, DL, VT,
45909 DAG.getConstant(EltBits[0].trunc(32), DL, MVT::i32));
45910 // Else, bitcast to a double.
45911 // TODO - investigate supporting sext 32-bit immediates on x86_64.
45912 APFloat F64(APFloat::IEEEdouble(), EltBits[0]);
45913 return DAG.getBitcast(VT, DAG.getConstantFP(F64, DL, MVT::f64));
45914 }
45915
45916 // Detect bitcasts to x86mmx low word.
45917 if (N0.getOpcode() == ISD::BUILD_VECTOR &&
45918 (SrcVT == MVT::v2i32 || SrcVT == MVT::v4i16 || SrcVT == MVT::v8i8) &&
45919 N0.getOperand(0).getValueType() == SrcVT.getScalarType()) {
45920 bool LowUndef = true, AllUndefOrZero = true;
45921 for (unsigned i = 1, e = SrcVT.getVectorNumElements(); i != e; ++i) {
45922 SDValue Op = N0.getOperand(i);
45923 LowUndef &= Op.isUndef() || (i >= e/2);
45924 AllUndefOrZero &= isNullConstantOrUndef(Op);
45925 }
45926 if (AllUndefOrZero) {
45927 SDValue N00 = N0.getOperand(0);
45928 SDLoc dl(N00);
45929 N00 = LowUndef ? DAG.getAnyExtOrTrunc(N00, dl, MVT::i32)
45930 : DAG.getZExtOrTrunc(N00, dl, MVT::i32);
45931 return DAG.getNode(X86ISD::MMX_MOVW2D, dl, VT, N00);
45932 }
45933 }
45934
45935 // Detect bitcasts of 64-bit build vectors and convert to a
45936 // MMX UNPCK/PSHUFW which takes MMX type inputs with the value in the
45937 // lowest element.
45938 if (N0.getOpcode() == ISD::BUILD_VECTOR &&
45939 (SrcVT == MVT::v2f32 || SrcVT == MVT::v2i32 || SrcVT == MVT::v4i16 ||
45940 SrcVT == MVT::v8i8))
45941 return createMMXBuildVector(cast<BuildVectorSDNode>(N0), DAG, Subtarget);
45942
45943 // Detect bitcasts between element or subvector extraction to x86mmx.
45944 if ((N0.getOpcode() == ISD::EXTRACT_VECTOR_ELT ||
45945 N0.getOpcode() == ISD::EXTRACT_SUBVECTOR) &&
45946 isNullConstant(N0.getOperand(1))) {
45947 SDValue N00 = N0.getOperand(0);
45948 if (N00.getValueType().is128BitVector())
45949 return DAG.getNode(X86ISD::MOVDQ2Q, SDLoc(N00), VT,
45950 DAG.getBitcast(MVT::v2i64, N00));
45951 }
45952
45953 // Detect bitcasts from FP_TO_SINT to x86mmx.
45954 if (SrcVT == MVT::v2i32 && N0.getOpcode() == ISD::FP_TO_SINT) {
45955 SDLoc DL(N0);
45956 SDValue Res = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v4i32, N0,
45957 DAG.getUNDEF(MVT::v2i32));
45958 return DAG.getNode(X86ISD::MOVDQ2Q, DL, VT,
45959 DAG.getBitcast(MVT::v2i64, Res));
45960 }
45961 }
45962
45963 // Try to remove a bitcast of constant vXi1 vector. We have to legalize
45964 // most of these to scalar anyway.
45965 if (Subtarget.hasAVX512() && VT.isScalarInteger() &&
45966 SrcVT.isVector() && SrcVT.getVectorElementType() == MVT::i1 &&
45967 ISD::isBuildVectorOfConstantSDNodes(N0.getNode())) {
45968 return combinevXi1ConstantToInteger(N0, DAG);
45969 }
45970
45971 if (Subtarget.hasAVX512() && SrcVT.isScalarInteger() && VT.isVector() &&
45972 VT.getVectorElementType() == MVT::i1) {
45973 if (auto *C = dyn_cast<ConstantSDNode>(N0)) {
45974 if (C->isAllOnes())
45975 return DAG.getConstant(1, SDLoc(N0), VT);
45976 if (C->isZero())
45977 return DAG.getConstant(0, SDLoc(N0), VT);
45978 }
45979 }
45980
45981 // Look for MOVMSK that is maybe truncated and then bitcasted to vXi1.
45982 // Turn it into a sign bit compare that produces a k-register. This avoids
45983 // a trip through a GPR.
45984 if (Subtarget.hasAVX512() && SrcVT.isScalarInteger() &&
45985 VT.isVector() && VT.getVectorElementType() == MVT::i1 &&
45986 isPowerOf2_32(VT.getVectorNumElements())) {
45987 unsigned NumElts = VT.getVectorNumElements();
45988 SDValue Src = N0;
45989
45990 // Peek through truncate.
45991 if (N0.getOpcode() == ISD::TRUNCATE && N0.hasOneUse())
45992 Src = N0.getOperand(0);
45993
45994 if (Src.getOpcode() == X86ISD::MOVMSK && Src.hasOneUse()) {
45995 SDValue MovmskIn = Src.getOperand(0);
45996 MVT MovmskVT = MovmskIn.getSimpleValueType();
45997 unsigned MovMskElts = MovmskVT.getVectorNumElements();
45998
45999 // We allow extra bits of the movmsk to be used since they are known zero.
46000 // We can't convert a VPMOVMSKB without avx512bw.
46001 if (MovMskElts <= NumElts &&
46002 (Subtarget.hasBWI() || MovmskVT.getVectorElementType() != MVT::i8)) {
46003 EVT IntVT = EVT(MovmskVT).changeVectorElementTypeToInteger();
46004 MovmskIn = DAG.getBitcast(IntVT, MovmskIn);
46005 SDLoc dl(N);
46006 MVT CmpVT = MVT::getVectorVT(MVT::i1, MovMskElts);
46007 SDValue Cmp = DAG.getSetCC(dl, CmpVT, MovmskIn,
46008 DAG.getConstant(0, dl, IntVT), ISD::SETLT);
46009 if (EVT(CmpVT) == VT)
46010 return Cmp;
46011
46012 // Pad with zeroes up to original VT to replace the zeroes that were
46013 // being used from the MOVMSK.
46014 unsigned NumConcats = NumElts / MovMskElts;
46015 SmallVector<SDValue, 4> Ops(NumConcats, DAG.getConstant(0, dl, CmpVT));
46016 Ops[0] = Cmp;
46017 return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, Ops);
46018 }
46019 }
46020 }
46021
46022 // Try to remove bitcasts from input and output of mask arithmetic to
46023 // remove GPR<->K-register crossings.
46024 if (SDValue V = combineCastedMaskArithmetic(N, DAG, DCI, Subtarget))
46025 return V;
46026
46027 // bitcast(v1Ty insert_vector_elt(X, Y, 0)) --> Y
46028 if (N0.getOpcode() == ISD::INSERT_VECTOR_ELT && SrcVT.getScalarType() == VT &&
46029 SrcVT.getVectorNumElements() == 1)
46030 return N0.getOperand(1);
46031
46032 // Convert a bitcasted integer logic operation that has one bitcasted
46033 // floating-point operand into a floating-point logic operation. This may
46034 // create a load of a constant, but that is cheaper than materializing the
46035 // constant in an integer register and transferring it to an SSE register or
46036 // transferring the SSE operand to integer register and back.
46037 unsigned FPOpcode;
46038 switch (N0.getOpcode()) {
46039 // clang-format off
46040 case ISD::AND: FPOpcode = X86ISD::FAND; break;
46041 case ISD::OR: FPOpcode = X86ISD::FOR; break;
46042 case ISD::XOR: FPOpcode = X86ISD::FXOR; break;
46043 default: return SDValue();
46044 // clang-format on
46045 }
46046
46047 // Check if we have a bitcast from another integer type as well.
46048 if (!((Subtarget.hasSSE1() && VT == MVT::f32) ||
46049 (Subtarget.hasSSE2() && VT == MVT::f64) ||
46050 (Subtarget.hasFP16() && VT == MVT::f16) ||
46051 (Subtarget.hasSSE2() && VT.isInteger() && VT.isVector() &&
46052 TLI.isTypeLegal(VT))))
46053 return SDValue();
46054
46055 SDValue LogicOp0 = N0.getOperand(0);
46056 SDValue LogicOp1 = N0.getOperand(1);
46057 SDLoc DL0(N0);
46058
46059 // bitcast(logic(bitcast(X), Y)) --> logic'(X, bitcast(Y))
46060 if (N0.hasOneUse() && LogicOp0.getOpcode() == ISD::BITCAST &&
46061 LogicOp0.hasOneUse() && LogicOp0.getOperand(0).hasOneUse() &&
46062 LogicOp0.getOperand(0).getValueType() == VT &&
46063 !isa<ConstantSDNode>(LogicOp0.getOperand(0))) {
46064 SDValue CastedOp1 = DAG.getBitcast(VT, LogicOp1);
46065 unsigned Opcode = VT.isFloatingPoint() ? FPOpcode : N0.getOpcode();
46066 return DAG.getNode(Opcode, DL0, VT, LogicOp0.getOperand(0), CastedOp1);
46067 }
46068 // bitcast(logic(X, bitcast(Y))) --> logic'(bitcast(X), Y)
46069 if (N0.hasOneUse() && LogicOp1.getOpcode() == ISD::BITCAST &&
46070 LogicOp1.hasOneUse() && LogicOp1.getOperand(0).hasOneUse() &&
46071 LogicOp1.getOperand(0).getValueType() == VT &&
46072 !isa<ConstantSDNode>(LogicOp1.getOperand(0))) {
46073 SDValue CastedOp0 = DAG.getBitcast(VT, LogicOp0);
46074 unsigned Opcode = VT.isFloatingPoint() ? FPOpcode : N0.getOpcode();
46075 return DAG.getNode(Opcode, DL0, VT, LogicOp1.getOperand(0), CastedOp0);
46076 }
46077
46078 return SDValue();
46079}
46080
46081// (mul (zext a), (sext b))
46082static bool detectExtMul(SelectionDAG &DAG, const SDValue &Mul, SDValue &Op0,
46083 SDValue &Op1) {
46084 Op0 = Mul.getOperand(0);
46085 Op1 = Mul.getOperand(1);
46086
46087 // Operand 1 should be the sign-extended value.
46088 if (Op0.getOpcode() == ISD::SIGN_EXTEND)
46089 std::swap(Op0, Op1);
46090
46091 auto IsFreeTruncation = [](SDValue &Op) -> bool {
46092 if ((Op.getOpcode() == ISD::ZERO_EXTEND ||
46093 Op.getOpcode() == ISD::SIGN_EXTEND) &&
46094 Op.getOperand(0).getScalarValueSizeInBits() <= 8)
46095 return true;
46096
46097 auto *BV = dyn_cast<BuildVectorSDNode>(Op);
46098 return (BV && BV->isConstant());
46099 };
46100
46101 // (dpbusd (zext a), (sext b)). Since the first operand should be an unsigned
46102 // value, we need to check that Op0 is a zero-extended value. Op1 should be a
46103 // signed value, so we just check its sign bits.
46104 if ((IsFreeTruncation(Op0) &&
46105 DAG.computeKnownBits(Op0).countMaxActiveBits() <= 8) &&
46106 (IsFreeTruncation(Op1) && DAG.ComputeMaxSignificantBits(Op1) <= 8))
46107 return true;
46108
46109 return false;
46110}
46111
46112static SDValue createVPDPBUSD(SelectionDAG &DAG, SDValue LHS, SDValue RHS,
46113 unsigned &LogBias, const SDLoc &DL,
46114 const X86Subtarget &Subtarget) {
46115 // Extend or truncate to MVT::i8 first.
46116 MVT Vi8VT =
46117 MVT::getVectorVT(MVT::i8, LHS.getValueType().getVectorElementCount());
46118 LHS = DAG.getZExtOrTrunc(LHS, DL, Vi8VT);
46119 RHS = DAG.getSExtOrTrunc(RHS, DL, Vi8VT);
46120
46121 // VPDPBUSD(<16 x i32>C, <16 x i8>A, <16 x i8>B). For each dst element
46122 // C[0] = C[0] + A[0]B[0] + A[1]B[1] + A[2]B[2] + A[3]B[3].
46123 // The src A, B element type is i8, but the dst C element type is i32.
46124 // When we calculate the reduce stage, we use src vector type vXi8 for it
46125 // so we need logbias 2 to avoid extra 2 stages.
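// e.g. a v16i8 input yields v4i32 partial sums, i.e. each i32 lane already
// accumulates four byte products, saving log2(4) == 2 reduction stages.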
46126 LogBias = 2;
46127
46128 unsigned RegSize = std::max(128u, (unsigned)Vi8VT.getSizeInBits());
46129 if (Subtarget.hasVNNI() && !Subtarget.hasVLX())
46130 RegSize = std::max(512u, RegSize);
46131
46132 // "Zero-extend" the i8 vectors. This is not a per-element zext, rather we
46133 // fill in the missing vector elements with 0.
46134 unsigned NumConcat = RegSize / Vi8VT.getSizeInBits();
46135 SmallVector<SDValue, 16> Ops(NumConcat, DAG.getConstant(0, DL, Vi8VT));
46136 Ops[0] = LHS;
46137 MVT ExtendedVT = MVT::getVectorVT(MVT::i8, RegSize / 8);
46138 SDValue DpOp0 = DAG.getNode(ISD::CONCAT_VECTORS, DL, ExtendedVT, Ops);
46139 Ops[0] = RHS;
46140 SDValue DpOp1 = DAG.getNode(ISD::CONCAT_VECTORS, DL, ExtendedVT, Ops);
46141
46142 // Actually build the DotProduct, split as 256/512 bits for
46143 // AVXVNNI/AVX512VNNI.
46144 auto DpBuilder = [](SelectionDAG &DAG, const SDLoc &DL,
46145 ArrayRef<SDValue> Ops) {
46146 MVT VT = MVT::getVectorVT(MVT::i32, Ops[0].getValueSizeInBits() / 32);
46147 return DAG.getNode(X86ISD::VPDPBUSD, DL, VT, Ops);
46148 };
46149 MVT DpVT = MVT::getVectorVT(MVT::i32, RegSize / 32);
46150 SDValue Zero = DAG.getConstant(0, DL, DpVT);
46151
46152 return SplitOpsAndApply(DAG, Subtarget, DL, DpVT, {Zero, DpOp0, DpOp1},
46153 DpBuilder, false);
46154}
46155
46156// Create a PSADBW given two sources representable as zexts of vXi8.
46157static SDValue createPSADBW(SelectionDAG &DAG, SDValue N0, SDValue N1,
46158 const SDLoc &DL, const X86Subtarget &Subtarget) {
46159 // Find the appropriate width for the PSADBW.
46160 EVT DstVT = N0.getValueType();
46161 EVT SrcVT = EVT::getVectorVT(*DAG.getContext(), MVT::i8,
46162 DstVT.getVectorElementCount());
46163 unsigned RegSize = std::max(128u, (unsigned)SrcVT.getSizeInBits());
46164
46165 // Widen the vXi8 vectors, padding with zero vector elements.
46166 unsigned NumConcat = RegSize / SrcVT.getSizeInBits();
46167 SmallVector<SDValue, 16> Ops(NumConcat, DAG.getConstant(0, DL, SrcVT));
46168 Ops[0] = DAG.getZExtOrTrunc(N0, DL, SrcVT);
46169 MVT ExtendedVT = MVT::getVectorVT(MVT::i8, RegSize / 8);
46170 SDValue SadOp0 = DAG.getNode(ISD::CONCAT_VECTORS, DL, ExtendedVT, Ops);
46171 Ops[0] = DAG.getZExtOrTrunc(N1, DL, SrcVT);
46172 SDValue SadOp1 = DAG.getNode(ISD::CONCAT_VECTORS, DL, ExtendedVT, Ops);
46173
46174 // Actually build the SAD, split as 128/256/512 bits for SSE/AVX2/AVX512BW.
46175 auto PSADBWBuilder = [](SelectionDAG &DAG, const SDLoc &DL,
46176 ArrayRef<SDValue> Ops) {
46177 MVT VT = MVT::getVectorVT(MVT::i64, Ops[0].getValueSizeInBits() / 64);
46178 return DAG.getNode(X86ISD::PSADBW, DL, VT, Ops);
46179 };
46180 MVT SadVT = MVT::getVectorVT(MVT::i64, RegSize / 64);
46181 return SplitOpsAndApply(DAG, Subtarget, DL, SadVT, {SadOp0, SadOp1},
46182 PSADBWBuilder);
46183}
46184
46185// Attempt to replace a min/max v8i16/v16i8 horizontal reduction with
46186// PHMINPOSUW.
46187static SDValue combineMinMaxReduction(SDNode *Extract, SelectionDAG &DAG,
46188 const X86Subtarget &Subtarget) {
46189 // Bail without SSE41.
46190 if (!Subtarget.hasSSE41())
46191 return SDValue();
46192
46193 EVT ExtractVT = Extract->getValueType(0);
46194 if (ExtractVT != MVT::i16 && ExtractVT != MVT::i8)
46195 return SDValue();
46196
46197 // Check for SMAX/SMIN/UMAX/UMIN horizontal reduction patterns.
46198 ISD::NodeType BinOp;
46199 SDValue Src = DAG.matchBinOpReduction(
46200 Extract, BinOp, {ISD::SMAX, ISD::SMIN, ISD::UMAX, ISD::UMIN}, true);
46201 if (!Src)
46202 return SDValue();
46203
46204 EVT SrcVT = Src.getValueType();
46205 EVT SrcSVT = SrcVT.getScalarType();
46206 if (SrcSVT != ExtractVT || (SrcVT.getSizeInBits() % 128) != 0)
46207 return SDValue();
46208
46209 SDLoc DL(Extract);
46210 SDValue MinPos = Src;
46211
46212 // First, reduce the source down to 128-bit, applying BinOp to lo/hi.
46213 while (SrcVT.getSizeInBits() > 128) {
46214 SDValue Lo, Hi;
46215 std::tie(Lo, Hi) = splitVector(MinPos, DAG, DL);
46216 SrcVT = Lo.getValueType();
46217 MinPos = DAG.getNode(BinOp, DL, SrcVT, Lo, Hi);
46218 }
46219 assert(((SrcVT == MVT::v8i16 && ExtractVT == MVT::i16) ||
46220 (SrcVT == MVT::v16i8 && ExtractVT == MVT::i8)) &&
46221 "Unexpected value type");
46222
46223 // PHMINPOSUW applies to UMIN(v8i16), for SMIN/SMAX/UMAX we must apply a mask
46224 // to flip the value accordingly.
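// e.g. for UMAX we XOR with all-ones: x -> ~x reverses the unsigned order,
// so the UMIN/PHMINPOS of the flipped values locates the UMAX element, and
// XORing the result again recovers the original value.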
46225 SDValue Mask;
46226 unsigned MaskEltsBits = ExtractVT.getSizeInBits();
46227 if (BinOp == ISD::SMAX)
46228 Mask = DAG.getConstant(APInt::getSignedMaxValue(MaskEltsBits), DL, SrcVT);
46229 else if (BinOp == ISD::SMIN)
46230 Mask = DAG.getConstant(APInt::getSignedMinValue(MaskEltsBits), DL, SrcVT);
46231 else if (BinOp == ISD::UMAX)
46232 Mask = DAG.getAllOnesConstant(DL, SrcVT);
46233
46234 if (Mask)
46235 MinPos = DAG.getNode(ISD::XOR, DL, SrcVT, Mask, MinPos);
46236
46237 // For v16i8 cases we need to perform UMIN on pairs of byte elements,
46238 // shuffling each upper element down and insert zeros. This means that the
46239 // v16i8 UMIN will leave the upper element as zero, performing zero-extension
46240 // ready for the PHMINPOS.
46241 if (ExtractVT == MVT::i8) {
46242 SDValue Upper = DAG.getVectorShuffle(
46243 SrcVT, DL, MinPos, DAG.getConstant(0, DL, MVT::v16i8),
46244 {1, 16, 3, 16, 5, 16, 7, 16, 9, 16, 11, 16, 13, 16, 15, 16});
46245 MinPos = DAG.getNode(ISD::UMIN, DL, SrcVT, MinPos, Upper);
46246 }
46247
46248 // Perform the PHMINPOS on a v8i16 vector,
46249 MinPos = DAG.getBitcast(MVT::v8i16, MinPos);
46250 MinPos = DAG.getNode(X86ISD::PHMINPOS, DL, MVT::v8i16, MinPos);
46251 MinPos = DAG.getBitcast(SrcVT, MinPos);
46252
46253 if (Mask)
46254 MinPos = DAG.getNode(ISD::XOR, DL, SrcVT, Mask, MinPos);
46255
46256 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ExtractVT, MinPos,
46257 DAG.getVectorIdxConstant(0, DL));
46258}
46259
46260// Attempt to replace an all_of/any_of/parity style horizontal reduction with a MOVMSK.
46261static SDValue combinePredicateReduction(SDNode *Extract, SelectionDAG &DAG,
46262 const X86Subtarget &Subtarget) {
46263 // Bail without SSE2.
46264 if (!Subtarget.hasSSE2())
46265 return SDValue();
46266
46267 EVT ExtractVT = Extract->getValueType(0);
46268 unsigned BitWidth = ExtractVT.getSizeInBits();
46269 if (ExtractVT != MVT::i64 && ExtractVT != MVT::i32 && ExtractVT != MVT::i16 &&
46270 ExtractVT != MVT::i8 && ExtractVT != MVT::i1)
46271 return SDValue();
46272
46273 // Check for OR(any_of)/AND(all_of)/XOR(parity) horizontal reduction patterns.
46274 ISD::NodeType BinOp;
46275 SDValue Match = DAG.matchBinOpReduction(Extract, BinOp, {ISD::OR, ISD::AND});
46276 if (!Match && ExtractVT == MVT::i1)
46277 Match = DAG.matchBinOpReduction(Extract, BinOp, {ISD::XOR});
46278 if (!Match)
46279 return SDValue();
46280
46281 // EXTRACT_VECTOR_ELT can require implicit extension of the vector element
46282 // which we can't support here for now.
46283 if (Match.getScalarValueSizeInBits() != BitWidth)
46284 return SDValue();
46285
46286 SDValue Movmsk;
46287 SDLoc DL(Extract);
46288 EVT MatchVT = Match.getValueType();
46289 unsigned NumElts = MatchVT.getVectorNumElements();
46290 unsigned MaxElts = Subtarget.hasInt256() ? 32 : 16;
46291 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
46292 LLVMContext &Ctx = *DAG.getContext();
46293
46294 if (ExtractVT == MVT::i1) {
46295 // Special case for (pre-legalization) vXi1 reductions.
46296 if (NumElts > 64 || !isPowerOf2_32(NumElts))
46297 return SDValue();
46298 if (Match.getOpcode() == ISD::SETCC) {
46299 ISD::CondCode CC = cast<CondCodeSDNode>(Match.getOperand(2))->get();
46300 if ((BinOp == ISD::AND && CC == ISD::CondCode::SETEQ) ||
46301 (BinOp == ISD::OR && CC == ISD::CondCode::SETNE)) {
46302 // For all_of(setcc(x,y,eq)) - use (iX)x == (iX)y.
46303 // For any_of(setcc(x,y,ne)) - use (iX)x != (iX)y.
46304 X86::CondCode X86CC;
46305 SDValue LHS = DAG.getFreeze(Match.getOperand(0));
46306 SDValue RHS = DAG.getFreeze(Match.getOperand(1));
46307 APInt Mask = APInt::getAllOnes(LHS.getScalarValueSizeInBits());
46308 if (SDValue V = LowerVectorAllEqual(DL, LHS, RHS, CC, Mask, Subtarget,
46309 DAG, X86CC))
46310 return DAG.getNode(ISD::TRUNCATE, DL, ExtractVT,
46311 getSETCC(X86CC, V, DL, DAG));
46312 }
46313 }
46314 if (TLI.isTypeLegal(MatchVT)) {
46315 // If this is a legal AVX512 predicate type then we can just bitcast.
46316 EVT MovmskVT = EVT::getIntegerVT(Ctx, NumElts);
46317 Movmsk = DAG.getBitcast(MovmskVT, Match);
46318 } else {
46319 // Use combineBitcastvxi1 to create the MOVMSK.
46320 while (NumElts > MaxElts) {
46321 SDValue Lo, Hi;
46322 std::tie(Lo, Hi) = DAG.SplitVector(Match, DL);
46323 Match = DAG.getNode(BinOp, DL, Lo.getValueType(), Lo, Hi);
46324 NumElts /= 2;
46325 }
46326 EVT MovmskVT = EVT::getIntegerVT(Ctx, NumElts);
46327 Movmsk = combineBitcastvxi1(DAG, MovmskVT, Match, DL, Subtarget);
46328 }
46329 if (!Movmsk)
46330 return SDValue();
46331 Movmsk = DAG.getZExtOrTrunc(Movmsk, DL, NumElts > 32 ? MVT::i64 : MVT::i32);
46332 } else {
46333 // FIXME: Better handling of k-registers or 512-bit vectors?
46334 unsigned MatchSizeInBits = Match.getValueSizeInBits();
46335 if (!(MatchSizeInBits == 128 ||
46336 (MatchSizeInBits == 256 && Subtarget.hasAVX())))
46337 return SDValue();
46338
46339 // Make sure this isn't a vector of 1 element. The perf win from using
46340 // MOVMSK diminishes with fewer elements in the reduction, but it is
46341 // generally better to get the comparison over to the GPRs as soon as
46342 // possible to reduce the number of vector ops.
46343 if (Match.getValueType().getVectorNumElements() < 2)
46344 return SDValue();
46345
46346 // Check that we are extracting a reduction of all sign bits.
46347 if (DAG.ComputeNumSignBits(Match) != BitWidth)
46348 return SDValue();
46349
46350 if (MatchSizeInBits == 256 && BitWidth < 32 && !Subtarget.hasInt256()) {
46351 SDValue Lo, Hi;
46352 std::tie(Lo, Hi) = DAG.SplitVector(Match, DL);
46353 Match = DAG.getNode(BinOp, DL, Lo.getValueType(), Lo, Hi);
46354 MatchSizeInBits = Match.getValueSizeInBits();
46355 }
46356
46357 // For 32/64 bit comparisons use MOVMSKPS/MOVMSKPD, else PMOVMSKB.
46358 MVT MaskSrcVT;
46359 if (64 == BitWidth || 32 == BitWidth)
46360 MaskSrcVT = MVT::getVectorVT(MVT::getFloatingPointVT(BitWidth),
46361 MatchSizeInBits / BitWidth);
46362 else
46363 MaskSrcVT = MVT::getVectorVT(MVT::i8, MatchSizeInBits / 8);
46364
46365 SDValue BitcastLogicOp = DAG.getBitcast(MaskSrcVT, Match);
46366 Movmsk = getPMOVMSKB(DL, BitcastLogicOp, DAG, Subtarget);
46367 NumElts = MaskSrcVT.getVectorNumElements();
46368 }
46369 assert((NumElts <= 32 || NumElts == 64) &&
46370 "Not expecting more than 64 elements");
46371
46372 MVT CmpVT = NumElts == 64 ? MVT::i64 : MVT::i32;
46373 if (BinOp == ISD::XOR) {
46374 // parity -> (PARITY(MOVMSK X))
46375 SDValue Result = DAG.getNode(ISD::PARITY, DL, CmpVT, Movmsk);
46376 return DAG.getZExtOrTrunc(Result, DL, ExtractVT);
46377 }
46378
46379 SDValue CmpC;
46380 ISD::CondCode CondCode;
46381 if (BinOp == ISD::OR) {
46382 // any_of -> MOVMSK != 0
46383 CmpC = DAG.getConstant(0, DL, CmpVT);
46384 CondCode = ISD::CondCode::SETNE;
46385 } else {
46386 // all_of -> MOVMSK == ((1 << NumElts) - 1)
46387 CmpC = DAG.getConstant(APInt::getLowBitsSet(CmpVT.getSizeInBits(), NumElts),
46388 DL, CmpVT);
46389 CondCode = ISD::CondCode::SETEQ;
46390 }
46391
46392 // The setcc produces an i8 of 0/1, so extend that to the result width and
46393 // negate to get the final 0/-1 mask value.
46394 EVT SetccVT = TLI.getSetCCResultType(DAG.getDataLayout(), Ctx, CmpVT);
46395 SDValue Setcc = DAG.getSetCC(DL, SetccVT, Movmsk, CmpC, CondCode);
46396 SDValue Zext = DAG.getZExtOrTrunc(Setcc, DL, ExtractVT);
46397 return DAG.getNegative(Zext, DL, ExtractVT);
46398}
46399
46400 static SDValue combineVPDPBUSDPattern(SDNode *Extract, SelectionDAG &DAG,
46401 const X86Subtarget &Subtarget) {
46402 if (!Subtarget.hasVNNI() && !Subtarget.hasAVXVNNI())
46403 return SDValue();
46404
46405 EVT ExtractVT = Extract->getValueType(0);
46406 // Verify the type we're extracting is i32, as the output element type of
46407 // vpdpbusd is i32.
46408 if (ExtractVT != MVT::i32)
46409 return SDValue();
46410
46411 EVT VT = Extract->getOperand(0).getValueType();
46412 if (!isPowerOf2_32(VT.getVectorNumElements()))
46413 return SDValue();
46414
46415 // Match shuffle + add pyramid.
46416 ISD::NodeType BinOp;
46417 SDValue Root = DAG.matchBinOpReduction(Extract, BinOp, {ISD::ADD});
46418
46419 // We can't combine to vpdpbusd for zext, because each of the 4 multiplies
46420 // done by vpdpbusd computes a signed 16-bit product that will be sign extended
46421 // before adding into the accumulator.
46422 // TODO:
46423 // We also need to verify that the multiply has at least 2x the number of bits
46424 // of the input. We shouldn't match
46425 // (sign_extend (mul (vXi9 (zext (vXi8 X))), (vXi9 (zext (vXi8 Y))))).
46426 // if (Root && (Root.getOpcode() == ISD::SIGN_EXTEND))
46427 // Root = Root.getOperand(0);
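// For example (illustrative): extracting element 0 of a shuffle+add pyramid
// over (mul (sext vXi8 A), (zext vXi8 B)) can instead feed the unsigned and
// signed byte inputs into VPDPBUSD with a zero accumulator, which sums four
// byte products per i32 lane, leaving only the smaller i32 shuffle+add
// pyramid built below.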
46428
46429 // If there was a match, we want Root to be a mul.
46430 if (!Root || Root.getOpcode() != ISD::MUL)
46431 return SDValue();
46432
46433 // Check whether we have an extend and mul pattern.
46434 SDValue LHS, RHS;
46435 if (!detectExtMul(DAG, Root, LHS, RHS))
46436 return SDValue();
46437
46438 // Create the dot product instruction.
46439 SDLoc DL(Extract);
46440 unsigned StageBias;
46441 SDValue DP = createVPDPBUSD(DAG, LHS, RHS, StageBias, DL, Subtarget);
46442
46443 // If the original vector was wider than 4 elements, sum over the results
46444 // in the DP vector.
46445 unsigned Stages = Log2_32(VT.getVectorNumElements());
46446 EVT DpVT = DP.getValueType();
46447
46448 if (Stages > StageBias) {
46449 unsigned DpElems = DpVT.getVectorNumElements();
46450
46451 for (unsigned i = Stages - StageBias; i > 0; --i) {
46452 SmallVector<int, 16> Mask(DpElems, -1);
46453 for (unsigned j = 0, MaskEnd = 1 << (i - 1); j < MaskEnd; ++j)
46454 Mask[j] = MaskEnd + j;
46455
46456 SDValue Shuffle =
46457 DAG.getVectorShuffle(DpVT, DL, DP, DAG.getUNDEF(DpVT), Mask);
46458 DP = DAG.getNode(ISD::ADD, DL, DpVT, DP, Shuffle);
46459 }
46460 }
46461
46462 // Return the lowest ExtractSizeInBits bits.
46463 EVT ResVT =
46464 EVT::getVectorVT(*DAG.getContext(), ExtractVT,
46465 DpVT.getSizeInBits() / ExtractVT.getSizeInBits());
46466 DP = DAG.getBitcast(ResVT, DP);
46467 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ExtractVT, DP,
46468 Extract->getOperand(1));
46469}
46470
46471 static SDValue combineBasicSADPattern(SDNode *Extract, SelectionDAG &DAG,
46472 const X86Subtarget &Subtarget) {
46473 using namespace SDPatternMatch;
46474
46475 // PSADBW is only supported on SSE2 and up.
46476 if (!Subtarget.hasSSE2())
46477 return SDValue();
46478
46479 EVT ExtractVT = Extract->getValueType(0);
46480 if (ExtractVT != MVT::i8 && ExtractVT != MVT::i16 && ExtractVT != MVT::i32 &&
46481 ExtractVT != MVT::i64)
46482 return SDValue();
46483
46484 EVT VT = Extract->getOperand(0).getValueType();
46485 if (!isPowerOf2_32(VT.getVectorNumElements()))
46486 return SDValue();
46487
46488 // Match shuffle + add pyramid.
46489 ISD::NodeType BinOp;
46490 SDValue Root = DAG.matchBinOpReduction(Extract, BinOp, {ISD::ADD});
46491 if (!Root)
46492 return SDValue();
46493
46494 // The operand is expected to be zero-extended from i8.
46495 // To convert to i64 and above, an additional any/zero/sign extend is
46496 // expected.
46497 // The zero extend from 32 bits has no mathematical effect on the result,
46498 // and the sign extend behaves like a zero extend here
46499 // (it extends the sign bit, which is zero).
46500 // So it is correct to skip the sign/zero extend instruction.
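// For example (illustrative): if the reduced vector is
// (zero_extend vXi32 (sub (umax A, B), (umin A, B))) with vXi8 inputs, the
// extend only widens values whose high bits are already zero, so it can be
// looked through before matching the vXi8 ABDU pattern below.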
46501 if (Root.getOpcode() == ISD::SIGN_EXTEND ||
46502 Root.getOpcode() == ISD::ZERO_EXTEND ||
46503 Root.getOpcode() == ISD::ANY_EXTEND)
46504 Root = Root.getOperand(0);
46505
46506 // Check whether we have a vXi8 abdu pattern.
46507 // TODO: Just match ISD::ABDU once the DAG is topological sorted.
46508 SDValue Src0, Src1;
46509 if (!sd_match(
46510 Root,
46511 m_AnyOf(
46512 m_SpecificVectorElementVT(
46513 MVT::i8, m_c_BinOp(ISD::ABDU, m_Value(Src0), m_Value(Src1))),
46514 m_SpecificVectorElementVT(
46515 MVT::i8, m_Sub(m_UMax(m_Value(Src0), m_Value(Src1)),
46516 m_UMin(m_Deferred(Src0), m_Deferred(Src1)))),
46517 m_Abs(
46518 m_Sub(m_AllOf(m_Value(Src0),
46519 m_ZExt(m_SpecificVectorElementVT(MVT::i8))),
46520 m_AllOf(m_Value(Src1),
46521 m_ZExt(m_SpecificVectorElementVT(MVT::i8))))))))
46522 return SDValue();
46523
46524 // Create the SAD instruction.
46525 SDLoc DL(Extract);
46526 SDValue SAD = createPSADBW(DAG, Src0, Src1, DL, Subtarget);
46527
46528 // If the original vector was wider than 8 elements, sum over the results
46529 // in the SAD vector.
46530 unsigned Stages = Log2_32(VT.getVectorNumElements());
46531 EVT SadVT = SAD.getValueType();
46532 if (Stages > 3) {
46533 unsigned SadElems = SadVT.getVectorNumElements();
46534
46535 for (unsigned i = Stages - 3; i > 0; --i) {
46536 SmallVector<int, 16> Mask(SadElems, -1);
46537 for (unsigned j = 0, MaskEnd = 1 << (i - 1); j < MaskEnd; ++j)
46538 Mask[j] = MaskEnd + j;
46539
46540 SDValue Shuffle =
46541 DAG.getVectorShuffle(SadVT, DL, SAD, DAG.getUNDEF(SadVT), Mask);
46542 SAD = DAG.getNode(ISD::ADD, DL, SadVT, SAD, Shuffle);
46543 }
46544 }
46545
46546 unsigned ExtractSizeInBits = ExtractVT.getSizeInBits();
46547 // Return the lowest ExtractSizeInBits bits.
46548 EVT ResVT = EVT::getVectorVT(*DAG.getContext(), ExtractVT,
46549 SadVT.getSizeInBits() / ExtractSizeInBits);
46550 SAD = DAG.getBitcast(ResVT, SAD);
46551 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ExtractVT, SAD,
46552 Extract->getOperand(1));
46553}
46554
46555// If this extract is from a loaded vector value and will be used as an
46556// integer, that requires a potentially expensive XMM -> GPR transfer.
46557// Additionally, if we can convert to a scalar integer load, that will likely
46558// be folded into a subsequent integer op.
46559// Note: SrcVec might not have a VecVT type, but it must be the same size.
46560// Note: Unlike the related fold for this in DAGCombiner, this is not limited
46561// to a single-use of the loaded vector. For the reasons above, we
46562// expect this to be profitable even if it creates an extra load.
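// For example (illustrative): (i32 extract_vector_elt (v4i32 load %ptr), 2)
// becomes (i32 load %ptr+8), avoiding an XMM->GPR transfer and often folding
// straight into the integer user.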
46563static SDValue
46564 combineExtractFromVectorLoad(SDNode *N, EVT VecVT, SDValue SrcVec, uint64_t Idx,
46565 const SDLoc &dl, SelectionDAG &DAG,
46566 TargetLowering::DAGCombinerInfo &DCI) {
46567 assert(N->getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
46568 "Only EXTRACT_VECTOR_ELT supported so far");
46569
46570 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
46571 EVT VT = N->getValueType(0);
46572
46573 bool LikelyUsedAsVector = any_of(N->users(), [](SDNode *Use) {
46574 return Use->getOpcode() == ISD::STORE ||
46575 Use->getOpcode() == ISD::INSERT_VECTOR_ELT ||
46576 Use->getOpcode() == ISD::SCALAR_TO_VECTOR;
46577 });
46578
46579 auto *LoadVec = dyn_cast<LoadSDNode>(SrcVec);
46580 if (LoadVec && ISD::isNormalLoad(LoadVec) && VT.isInteger() &&
46581 VecVT.getVectorElementType() == VT &&
46582 VecVT.getSizeInBits() == SrcVec.getValueSizeInBits() &&
46583 DCI.isAfterLegalizeDAG() && !LikelyUsedAsVector && LoadVec->isSimple()) {
46584 SDValue NewPtr = TLI.getVectorElementPointer(
46585 DAG, LoadVec->getBasePtr(), VecVT, DAG.getVectorIdxConstant(Idx, dl));
46586 unsigned PtrOff = VT.getSizeInBits() * Idx / 8;
46587 MachinePointerInfo MPI = LoadVec->getPointerInfo().getWithOffset(PtrOff);
46588 Align Alignment = commonAlignment(LoadVec->getAlign(), PtrOff);
46589 SDValue Load =
46590 DAG.getLoad(VT, dl, LoadVec->getChain(), NewPtr, MPI, Alignment,
46591 LoadVec->getMemOperand()->getFlags(), LoadVec->getAAInfo());
46592 DAG.makeEquivalentMemoryOrdering(LoadVec, Load);
46593 return Load;
46594 }
46595
46596 return SDValue();
46597}
46598
46599// Attempt to peek through a target shuffle and extract the scalar from the
46600// source.
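// For example (illustrative): extractelt (shuffle X, undef, <2,3,0,1>), 0
// resolves through the shuffle mask to element 2 of X, which can then be
// extracted directly (or via PEXTRW/PEXTRB for narrow element types).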
46601 static SDValue combineExtractWithShuffle(SDNode *N, SelectionDAG &DAG,
46602 TargetLowering::DAGCombinerInfo &DCI,
46603 const X86Subtarget &Subtarget) {
46604 if (DCI.isBeforeLegalizeOps())
46605 return SDValue();
46606
46607 SDLoc dl(N);
46608 SDValue Src = N->getOperand(0);
46609 SDValue Idx = N->getOperand(1);
46610
46611 EVT VT = N->getValueType(0);
46612 EVT SrcVT = Src.getValueType();
46613 EVT SrcSVT = SrcVT.getVectorElementType();
46614 unsigned SrcEltBits = SrcSVT.getSizeInBits();
46615 unsigned NumSrcElts = SrcVT.getVectorNumElements();
46616
46617 // Don't attempt this for boolean mask vectors or unknown extraction indices.
46618 if (SrcSVT == MVT::i1 || !isa<ConstantSDNode>(Idx))
46619 return SDValue();
46620
46621 const APInt &IdxC = N->getConstantOperandAPInt(1);
46622 if (IdxC.uge(NumSrcElts))
46623 return SDValue();
46624
46625 SDValue SrcBC = peekThroughBitcasts(Src);
46626
46627 // Handle extract(bitcast(broadcast(scalar_value))).
46628 if (X86ISD::VBROADCAST == SrcBC.getOpcode()) {
46629 SDValue SrcOp = SrcBC.getOperand(0);
46630 EVT SrcOpVT = SrcOp.getValueType();
46631 if (SrcOpVT.isScalarInteger() && VT.isInteger() &&
46632 (SrcOpVT.getSizeInBits() % SrcEltBits) == 0) {
46633 unsigned Scale = SrcOpVT.getSizeInBits() / SrcEltBits;
46634 unsigned Offset = IdxC.urem(Scale) * SrcEltBits;
46635 // TODO support non-zero offsets.
46636 if (Offset == 0) {
46637 SrcOp = DAG.getZExtOrTrunc(SrcOp, dl, SrcVT.getScalarType());
46638 SrcOp = DAG.getZExtOrTrunc(SrcOp, dl, VT);
46639 return SrcOp;
46640 }
46641 }
46642 }
46643
46644 // If we're extracting a single element from a broadcast load and there are
46645 // no other users, just create a single load.
46646 if (X86ISD::VBROADCAST_LOAD == SrcBC.getOpcode() &&
46647 SrcBC.hasOneUse()) {
46648 auto *MemIntr = cast<MemIntrinsicSDNode>(SrcBC);
46649 unsigned SrcBCWidth = SrcBC.getScalarValueSizeInBits();
46650 if (MemIntr->getMemoryVT().getSizeInBits() == SrcBCWidth &&
46651 VT.getSizeInBits() == SrcBCWidth && SrcEltBits == SrcBCWidth) {
46652 SDValue Load =
46653 DAG.getLoad(VT, dl, MemIntr->getChain(), MemIntr->getBasePtr(),
46654 MemIntr->getPointerInfo(), MemIntr->getBaseAlign(),
46655 MemIntr->getMemOperand()->getFlags());
46656 DAG.ReplaceAllUsesOfValueWith(SDValue(MemIntr, 1), Load.getValue(1));
46657 return Load;
46658 }
46659 }
46660
46661 // Handle extract(bitcast(scalar_to_vector(scalar_value))) for integers.
46662 // TODO: Move to DAGCombine?
46663 if (SrcBC.getOpcode() == ISD::SCALAR_TO_VECTOR && VT.isInteger() &&
46664 SrcBC.getValueType().isInteger() &&
46665 (SrcBC.getScalarValueSizeInBits() % SrcEltBits) == 0 &&
46666 SrcBC.getScalarValueSizeInBits() ==
46667 SrcBC.getOperand(0).getValueSizeInBits()) {
46668 unsigned Scale = SrcBC.getScalarValueSizeInBits() / SrcEltBits;
46669 if (IdxC.ult(Scale)) {
46670 unsigned Offset = IdxC.getZExtValue() * SrcVT.getScalarSizeInBits();
46671 SDValue Scl = SrcBC.getOperand(0);
46672 EVT SclVT = Scl.getValueType();
46673 if (Offset) {
46674 Scl = DAG.getNode(ISD::SRL, dl, SclVT, Scl,
46675 DAG.getShiftAmountConstant(Offset, SclVT, dl));
46676 }
46677 Scl = DAG.getZExtOrTrunc(Scl, dl, SrcVT.getScalarType());
46678 Scl = DAG.getZExtOrTrunc(Scl, dl, VT);
46679 return Scl;
46680 }
46681 }
46682
46683 // Handle extract(truncate(x)) for 0'th index.
46684 // TODO: Treat this as a faux shuffle?
46685 // TODO: When can we use this for general indices?
46686 if (ISD::TRUNCATE == Src.getOpcode() && IdxC == 0 &&
46687 (SrcVT.getSizeInBits() % 128) == 0) {
46688 Src = extract128BitVector(Src.getOperand(0), 0, DAG, dl);
46689 MVT ExtractVT = MVT::getVectorVT(SrcSVT.getSimpleVT(), 128 / SrcEltBits);
46690 return DAG.getNode(N->getOpcode(), dl, VT, DAG.getBitcast(ExtractVT, Src),
46691 Idx);
46692 }
46693
46694 // We can only legally extract other elements from 128-bit vectors and in
46695 // certain circumstances, depending on SSE-level.
46696 // TODO: Investigate float/double extraction if it will be just stored.
46697 auto GetLegalExtract = [&Subtarget, &DAG, &dl](SDValue Vec, EVT VecVT,
46698 unsigned Idx) {
46699 EVT VecSVT = VecVT.getScalarType();
46700 if ((VecVT.is256BitVector() || VecVT.is512BitVector()) &&
46701 (VecSVT == MVT::i8 || VecSVT == MVT::i16 || VecSVT == MVT::i32 ||
46702 VecSVT == MVT::i64)) {
46703 unsigned EltSizeInBits = VecSVT.getSizeInBits();
46704 unsigned NumEltsPerLane = 128 / EltSizeInBits;
46705 unsigned LaneOffset = (Idx & ~(NumEltsPerLane - 1)) * EltSizeInBits;
46706 unsigned LaneIdx = LaneOffset / Vec.getScalarValueSizeInBits();
46707 VecVT = EVT::getVectorVT(*DAG.getContext(), VecSVT, NumEltsPerLane);
46708 Vec = extract128BitVector(Vec, LaneIdx, DAG, dl);
46709 Idx &= (NumEltsPerLane - 1);
46710 }
46711 if ((VecVT == MVT::v4i32 || VecVT == MVT::v2i64) &&
46712 ((Idx == 0 && Subtarget.hasSSE2()) || Subtarget.hasSSE41())) {
46713 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VecVT.getScalarType(),
46714 DAG.getBitcast(VecVT, Vec),
46715 DAG.getVectorIdxConstant(Idx, dl));
46716 }
46717 if ((VecVT == MVT::v8i16 && Subtarget.hasSSE2()) ||
46718 (VecVT == MVT::v16i8 && Subtarget.hasSSE41())) {
46719 unsigned OpCode = (VecVT == MVT::v8i16 ? X86ISD::PEXTRW : X86ISD::PEXTRB);
46720 return DAG.getNode(OpCode, dl, MVT::i32, DAG.getBitcast(VecVT, Vec),
46721 DAG.getTargetConstant(Idx, dl, MVT::i8));
46722 }
46723 return SDValue();
46724 };
46725
46726 // Resolve the target shuffle inputs and mask.
46727 SmallVector<int, 16> Mask;
46728 SmallVector<SDValue, 2> Ops;
46729 if (!getTargetShuffleInputs(SrcBC, Ops, Mask, DAG))
46730 return SDValue();
46731
46732 // Shuffle inputs must be the same size as the result.
46733 if (llvm::any_of(Ops, [SrcVT](SDValue Op) {
46734 return SrcVT.getSizeInBits() != Op.getValueSizeInBits();
46735 }))
46736 return SDValue();
46737
46738 // Attempt to narrow/widen the shuffle mask to the correct size.
46739 if (Mask.size() != NumSrcElts) {
46740 if ((NumSrcElts % Mask.size()) == 0) {
46741 SmallVector<int, 16> ScaledMask;
46742 int Scale = NumSrcElts / Mask.size();
46743 narrowShuffleMaskElts(Scale, Mask, ScaledMask);
46744 Mask = std::move(ScaledMask);
46745 } else if ((Mask.size() % NumSrcElts) == 0) {
46746 // Simplify Mask based on demanded element.
46747 int ExtractIdx = (int)IdxC.getZExtValue();
46748 int Scale = Mask.size() / NumSrcElts;
46749 int Lo = Scale * ExtractIdx;
46750 int Hi = Scale * (ExtractIdx + 1);
46751 for (int i = 0, e = (int)Mask.size(); i != e; ++i)
46752 if (i < Lo || Hi <= i)
46753 Mask[i] = SM_SentinelUndef;
46754
46755 SmallVector<int, 16> WidenedMask;
46756 while (Mask.size() > NumSrcElts &&
46757 canWidenShuffleElements(Mask, WidenedMask))
46758 Mask = std::move(WidenedMask);
46759 }
46760 }
46761
46762 // If narrowing/widening failed, see if we can extract+zero-extend.
46763 int ExtractIdx;
46764 EVT ExtractVT;
46765 if (Mask.size() == NumSrcElts) {
46766 ExtractIdx = Mask[IdxC.getZExtValue()];
46767 ExtractVT = SrcVT;
46768 } else {
46769 unsigned Scale = Mask.size() / NumSrcElts;
46770 if ((Mask.size() % NumSrcElts) != 0 || SrcVT.isFloatingPoint())
46771 return SDValue();
46772 unsigned ScaledIdx = Scale * IdxC.getZExtValue();
46773 if (!isUndefOrZeroInRange(Mask, ScaledIdx + 1, Scale - 1))
46774 return SDValue();
46775 ExtractIdx = Mask[ScaledIdx];
46776 EVT ExtractSVT = EVT::getIntegerVT(*DAG.getContext(), SrcEltBits / Scale);
46777 ExtractVT = EVT::getVectorVT(*DAG.getContext(), ExtractSVT, Mask.size());
46778 assert(SrcVT.getSizeInBits() == ExtractVT.getSizeInBits() &&
46779 "Failed to widen vector type");
46780 }
46781
46782 // If the shuffle source element is undef/zero then we can just accept it.
46783 if (ExtractIdx == SM_SentinelUndef)
46784 return DAG.getUNDEF(VT);
46785
46786 if (ExtractIdx == SM_SentinelZero)
46787 return VT.isFloatingPoint() ? DAG.getConstantFP(0.0, dl, VT)
46788 : DAG.getConstant(0, dl, VT);
46789
46790 SDValue SrcOp = Ops[ExtractIdx / Mask.size()];
46791 ExtractIdx = ExtractIdx % Mask.size();
46792 if (SDValue V = GetLegalExtract(SrcOp, ExtractVT, ExtractIdx))
46793 return DAG.getZExtOrTrunc(V, dl, VT);
46794
46795 if (N->getOpcode() == ISD::EXTRACT_VECTOR_ELT && ExtractVT == SrcVT)
46796 if (SDValue V = combineExtractFromVectorLoad(
46797 N, SrcVT, peekThroughBitcasts(SrcOp), ExtractIdx, dl, DAG, DCI))
46798 return V;
46799
46800 return SDValue();
46801}
46802
46803/// Extracting a scalar FP value from vector element 0 is free, so extract each
46804/// operand first, then perform the math as a scalar op.
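/// For example (illustrative):
///   extractelt (fadd X, Y), 0 --> fadd (extractelt X, 0), (extractelt Y, 0)
/// so the scalar FADD reads the low element of each vector register for free.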
46805 static SDValue scalarizeExtEltFP(SDNode *ExtElt, SelectionDAG &DAG,
46806 const X86Subtarget &Subtarget,
46807 TargetLowering::DAGCombinerInfo &DCI) {
46808 assert(ExtElt->getOpcode() == ISD::EXTRACT_VECTOR_ELT && "Expected extract");
46809 SDValue Vec = ExtElt->getOperand(0);
46810 SDValue Index = ExtElt->getOperand(1);
46811 EVT VT = ExtElt->getValueType(0);
46812 EVT VecVT = Vec.getValueType();
46813
46814 // TODO: If this is a unary/expensive/expand op, allow extraction from a
46815 // non-zero element because the shuffle+scalar op will be cheaper?
46816 if (!Vec.hasOneUse() || !isNullConstant(Index) || VecVT.getScalarType() != VT)
46817 return SDValue();
46818
46819 // Vector FP compares don't fit the pattern of FP math ops (propagate, not
46820 // extract, the condition code), so deal with those as a special-case.
46821 if (Vec.getOpcode() == ISD::SETCC && VT == MVT::i1) {
46822 EVT OpVT = Vec.getOperand(0).getValueType().getScalarType();
46823 if (OpVT != MVT::f32 && OpVT != MVT::f64)
46824 return SDValue();
46825
46826 // extract (setcc X, Y, CC), 0 --> setcc (extract X, 0), (extract Y, 0), CC
46827 SDLoc DL(ExtElt);
46828 SDValue Ext0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, OpVT,
46829 Vec.getOperand(0), Index);
46830 SDValue Ext1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, OpVT,
46831 Vec.getOperand(1), Index);
46832 return DAG.getNode(Vec.getOpcode(), DL, VT, Ext0, Ext1, Vec.getOperand(2));
46833 }
46834
46835 if (!(VT == MVT::f16 && Subtarget.hasFP16()) && VT != MVT::f32 &&
46836 VT != MVT::f64)
46837 return SDValue();
46838
46839 // Vector FP selects don't fit the pattern of FP math ops (because the
46840 // condition has a different type and we have to change the opcode), so deal
46841 // with those here.
46842 // FIXME: This is restricted to pre type legalization. If we loosen this we
46843 // need to convert vector bool to a scalar bool.
46844 if (DCI.isBeforeLegalize() && Vec.getOpcode() == ISD::VSELECT &&
46845 Vec.getOperand(0).getOpcode() == ISD::SETCC &&
46846 Vec.getOperand(0).getOperand(0).getValueType() == VecVT &&
46847 Vec.getOperand(0).getValueType().getScalarType() == MVT::i1) {
46848 // ext (sel Cond, X, Y), 0 --> sel (ext Cond, 0), (ext X, 0), (ext Y, 0)
46849 SDLoc DL(ExtElt);
46850 SDValue Ext0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL,
46851 Vec.getOperand(0).getValueType().getScalarType(),
46852 Vec.getOperand(0), Index);
46853 SDValue Ext1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT,
46854 Vec.getOperand(1), Index);
46855 SDValue Ext2 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT,
46856 Vec.getOperand(2), Index);
46857 return DAG.getNode(ISD::SELECT, DL, VT, Ext0, Ext1, Ext2);
46858 }
46859
46860 // TODO: This switch could include FNEG and the x86-specific FP logic ops
46861 // (FAND, FANDN, FOR, FXOR). But that may require enhancements to avoid
46862 // missed load folding and fma+fneg combining.
46863 switch (Vec.getOpcode()) {
46864 case ISD::FMA: // Begin 3 operands
46865 case ISD::FMAD:
46866 case ISD::FADD: // Begin 2 operands
46867 case ISD::FSUB:
46868 case ISD::FMUL:
46869 case ISD::FDIV:
46870 case ISD::FREM:
46871 case ISD::FCOPYSIGN:
46872 case ISD::FMINNUM:
46873 case ISD::FMAXNUM:
46874 case ISD::FMINNUM_IEEE:
46875 case ISD::FMAXNUM_IEEE:
46876 case ISD::FMAXIMUM:
46877 case ISD::FMINIMUM:
46878 case ISD::FMAXIMUMNUM:
46879 case ISD::FMINIMUMNUM:
46880 case X86ISD::FMAX:
46881 case X86ISD::FMIN:
46882 case ISD::FABS: // Begin 1 operand
46883 case ISD::FSQRT:
46884 case ISD::FRINT:
46885 case ISD::FCEIL:
46886 case ISD::FTRUNC:
46887 case ISD::FNEARBYINT:
46888 case ISD::FROUNDEVEN:
46889 case ISD::FROUND:
46890 case ISD::FFLOOR:
46891 case X86ISD::FRCP:
46892 case X86ISD::FRSQRT: {
46893 // extract (fp X, Y, ...), 0 --> fp (extract X, 0), (extract Y, 0), ...
46894 SDLoc DL(ExtElt);
46895 SmallVector<SDValue, 4> ExtOps;
46896 for (SDValue Op : Vec->ops())
46897 ExtOps.push_back(DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, Op, Index));
46898 return DAG.getNode(Vec.getOpcode(), DL, VT, ExtOps);
46899 }
46900 default:
46901 return SDValue();
46902 }
46903 llvm_unreachable("All opcodes should return within switch");
46904}
46905
46906/// Try to convert a vector reduction sequence composed of binops and shuffles
46907/// into horizontal ops.
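/// For example (illustrative): a v4i32 add reduction extracted at index 0 can
/// become two PHADDD ops when horizontal ops are deemed profitable, and a
/// vXi8 add reduction can instead be summed with PSADBW against zero.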
46908 static SDValue combineArithReduction(SDNode *ExtElt, SelectionDAG &DAG,
46909 const X86Subtarget &Subtarget) {
46910 assert(ExtElt->getOpcode() == ISD::EXTRACT_VECTOR_ELT && "Unexpected caller");
46911
46912 // We need at least SSE2 to do anything here.
46913 if (!Subtarget.hasSSE2())
46914 return SDValue();
46915
46917 SDValue Rdx = DAG.matchBinOpReduction(ExtElt, Opc,
46918 {ISD::ADD, ISD::MUL, ISD::FADD}, true);
46919 if (!Rdx)
46920 return SDValue();
46921
46922 SDValue Index = ExtElt->getOperand(1);
46923 assert(isNullConstant(Index) &&
46924 "Reduction doesn't end in an extract from index 0");
46925
46926 EVT VT = ExtElt->getValueType(0);
46927 EVT VecVT = Rdx.getValueType();
46928 if (VecVT.getScalarType() != VT)
46929 return SDValue();
46930
46931 SDLoc DL(ExtElt);
46932 unsigned NumElts = VecVT.getVectorNumElements();
46933 unsigned EltSizeInBits = VecVT.getScalarSizeInBits();
46934
46935 // Extend v4i8/v8i8 vector to v16i8, with undef upper 64-bits.
46936 auto WidenToV16I8 = [&](SDValue V, bool ZeroExtend) {
46937 if (V.getValueType() == MVT::v4i8) {
46938 if (ZeroExtend && Subtarget.hasSSE41()) {
46939 V = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, MVT::v4i32,
46940 DAG.getConstant(0, DL, MVT::v4i32),
46941 DAG.getBitcast(MVT::i32, V),
46942 DAG.getVectorIdxConstant(0, DL));
46943 return DAG.getBitcast(MVT::v16i8, V);
46944 }
46945 V = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v8i8, V,
46946 ZeroExtend ? DAG.getConstant(0, DL, MVT::v4i8)
46947 : DAG.getUNDEF(MVT::v4i8));
46948 }
46949 return DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v16i8, V,
46950 DAG.getUNDEF(MVT::v8i8));
46951 };
46952
46953 // vXi8 mul reduction - promote to vXi16 mul reduction.
46954 if (Opc == ISD::MUL) {
46955 if (VT != MVT::i8 || NumElts < 4 || !isPowerOf2_32(NumElts))
46956 return SDValue();
46957 if (VecVT.getSizeInBits() >= 128) {
46958 EVT WideVT = EVT::getVectorVT(*DAG.getContext(), MVT::i16, NumElts / 2);
46959 SDValue Lo = getUnpackl(DAG, DL, VecVT, Rdx, DAG.getUNDEF(VecVT));
46960 SDValue Hi = getUnpackh(DAG, DL, VecVT, Rdx, DAG.getUNDEF(VecVT));
46961 Lo = DAG.getBitcast(WideVT, Lo);
46962 Hi = DAG.getBitcast(WideVT, Hi);
46963 Rdx = DAG.getNode(Opc, DL, WideVT, Lo, Hi);
46964 while (Rdx.getValueSizeInBits() > 128) {
46965 std::tie(Lo, Hi) = splitVector(Rdx, DAG, DL);
46966 Rdx = DAG.getNode(Opc, DL, Lo.getValueType(), Lo, Hi);
46967 }
46968 } else {
46969 Rdx = WidenToV16I8(Rdx, false);
46970 Rdx = getUnpackl(DAG, DL, MVT::v16i8, Rdx, DAG.getUNDEF(MVT::v16i8));
46971 Rdx = DAG.getBitcast(MVT::v8i16, Rdx);
46972 }
46973 if (NumElts >= 8)
46974 Rdx = DAG.getNode(Opc, DL, MVT::v8i16, Rdx,
46975 DAG.getVectorShuffle(MVT::v8i16, DL, Rdx, Rdx,
46976 {4, 5, 6, 7, -1, -1, -1, -1}));
46977 Rdx = DAG.getNode(Opc, DL, MVT::v8i16, Rdx,
46978 DAG.getVectorShuffle(MVT::v8i16, DL, Rdx, Rdx,
46979 {2, 3, -1, -1, -1, -1, -1, -1}));
46980 Rdx = DAG.getNode(Opc, DL, MVT::v8i16, Rdx,
46981 DAG.getVectorShuffle(MVT::v8i16, DL, Rdx, Rdx,
46982 {1, -1, -1, -1, -1, -1, -1, -1}));
46983 Rdx = DAG.getBitcast(MVT::v16i8, Rdx);
46984 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, Rdx, Index);
46985 }
46986
46987 // vXi8 add reduction - sub 128-bit vector.
46988 if (VecVT == MVT::v4i8 || VecVT == MVT::v8i8) {
46989 Rdx = WidenToV16I8(Rdx, true);
46990 Rdx = DAG.getNode(X86ISD::PSADBW, DL, MVT::v2i64, Rdx,
46991 DAG.getConstant(0, DL, MVT::v16i8));
46992 Rdx = DAG.getBitcast(MVT::v16i8, Rdx);
46993 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, Rdx, Index);
46994 }
46995
46996 // Must be a >=128-bit vector with pow2 elements.
46997 if ((VecVT.getSizeInBits() % 128) != 0 || !isPowerOf2_32(NumElts))
46998 return SDValue();
46999
47000 // vXi8 add reduction - sum lo/hi halves then use PSADBW.
47001 if (VT == MVT::i8) {
47002 while (Rdx.getValueSizeInBits() > 128) {
47003 SDValue Lo, Hi;
47004 std::tie(Lo, Hi) = splitVector(Rdx, DAG, DL);
47005 VecVT = Lo.getValueType();
47006 Rdx = DAG.getNode(ISD::ADD, DL, VecVT, Lo, Hi);
47007 }
47008 assert(VecVT == MVT::v16i8 && "v16i8 reduction expected");
47009
47010 SDValue Hi = DAG.getVectorShuffle(
47011 MVT::v16i8, DL, Rdx, Rdx,
47012 {8, 9, 10, 11, 12, 13, 14, 15, -1, -1, -1, -1, -1, -1, -1, -1});
47013 Rdx = DAG.getNode(ISD::ADD, DL, MVT::v16i8, Rdx, Hi);
47014 Rdx = DAG.getNode(X86ISD::PSADBW, DL, MVT::v2i64, Rdx,
47015 getZeroVector(MVT::v16i8, Subtarget, DAG, DL));
47016 Rdx = DAG.getBitcast(MVT::v16i8, Rdx);
47017 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, Rdx, Index);
47018 }
47019
47020 // See if we can use vXi8 PSADBW add reduction for larger zext types.
47021 // If the source vector values are 0-255, then we can use PSADBW to
47022 // sum+zext v8i8 subvectors to vXi64, then perform the reduction.
47023 // TODO: See if it's worth avoiding vXi16/i32 truncations?
47024 if (Opc == ISD::ADD && NumElts >= 4 && EltSizeInBits >= 16 &&
47025 DAG.computeKnownBits(Rdx).getMaxValue().ule(255) &&
47026 (EltSizeInBits == 16 || Rdx.getOpcode() == ISD::ZERO_EXTEND ||
47027 Subtarget.hasAVX512())) {
47028 if (Rdx.getValueType() == MVT::v8i16) {
47029 Rdx = DAG.getNode(X86ISD::PACKUS, DL, MVT::v16i8, Rdx,
47030 DAG.getUNDEF(MVT::v8i16));
47031 } else {
47032 EVT ByteVT = VecVT.changeVectorElementType(MVT::i8);
47033 Rdx = DAG.getNode(ISD::TRUNCATE, DL, ByteVT, Rdx);
47034 if (ByteVT.getSizeInBits() < 128)
47035 Rdx = WidenToV16I8(Rdx, true);
47036 }
47037
47038 // Build the PSADBW, split as 128/256/512 bits for SSE/AVX2/AVX512BW.
47039 auto PSADBWBuilder = [](SelectionDAG &DAG, const SDLoc &DL,
47040 ArrayRef<SDValue> Ops) {
47041 MVT VT = MVT::getVectorVT(MVT::i64, Ops[0].getValueSizeInBits() / 64);
47042 SDValue Zero = DAG.getConstant(0, DL, Ops[0].getValueType());
47043 return DAG.getNode(X86ISD::PSADBW, DL, VT, Ops[0], Zero);
47044 };
47045 MVT SadVT = MVT::getVectorVT(MVT::i64, Rdx.getValueSizeInBits() / 64);
47046 Rdx = SplitOpsAndApply(DAG, Subtarget, DL, SadVT, {Rdx}, PSADBWBuilder);
47047
47048 // TODO: We could truncate to vXi16/vXi32 before performing the reduction.
47049 while (Rdx.getValueSizeInBits() > 128) {
47050 SDValue Lo, Hi;
47051 std::tie(Lo, Hi) = splitVector(Rdx, DAG, DL);
47052 VecVT = Lo.getValueType();
47053 Rdx = DAG.getNode(ISD::ADD, DL, VecVT, Lo, Hi);
47054 }
47055 assert(Rdx.getValueType() == MVT::v2i64 && "v2i64 reduction expected");
47056
47057 if (NumElts > 8) {
47058 SDValue RdxHi = DAG.getVectorShuffle(MVT::v2i64, DL, Rdx, Rdx, {1, -1});
47059 Rdx = DAG.getNode(ISD::ADD, DL, MVT::v2i64, Rdx, RdxHi);
47060 }
47061
47062 VecVT = MVT::getVectorVT(VT.getSimpleVT(), 128 / VT.getSizeInBits());
47063 Rdx = DAG.getBitcast(VecVT, Rdx);
47064 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, Rdx, Index);
47065 }
47066
47067 // Only use (F)HADD opcodes if they aren't microcoded or when minimizing codesize.
47068 if (!shouldUseHorizontalOp(true, DAG, Subtarget))
47069 return SDValue();
47070
47071 unsigned HorizOpcode = Opc == ISD::ADD ? X86ISD::HADD : X86ISD::FHADD;
47072
47073 // 256-bit horizontal instructions operate on 128-bit chunks rather than
47074 // across the whole vector, so we need an extract + hop preliminary stage.
47075 // This is the only step where the operands of the hop are not the same value.
47076 // TODO: We could extend this to handle 512-bit or even longer vectors.
47077 if (((VecVT == MVT::v16i16 || VecVT == MVT::v8i32) && Subtarget.hasSSSE3()) ||
47078 ((VecVT == MVT::v8f32 || VecVT == MVT::v4f64) && Subtarget.hasSSE3())) {
47079 unsigned NumElts = VecVT.getVectorNumElements();
47080 SDValue Hi = extract128BitVector(Rdx, NumElts / 2, DAG, DL);
47081 SDValue Lo = extract128BitVector(Rdx, 0, DAG, DL);
47082 Rdx = DAG.getNode(HorizOpcode, DL, Lo.getValueType(), Hi, Lo);
47083 VecVT = Rdx.getValueType();
47084 }
47085 if (!((VecVT == MVT::v8i16 || VecVT == MVT::v4i32) && Subtarget.hasSSSE3()) &&
47086 !((VecVT == MVT::v4f32 || VecVT == MVT::v2f64) && Subtarget.hasSSE3()))
47087 return SDValue();
47088
47089 // extract (add (shuf X), X), 0 --> extract (hadd X, X), 0
47090 unsigned ReductionSteps = Log2_32(VecVT.getVectorNumElements());
47091 for (unsigned i = 0; i != ReductionSteps; ++i)
47092 Rdx = DAG.getNode(HorizOpcode, DL, VecVT, Rdx, Rdx);
47093
47094 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, Rdx, Index);
47095}
47096
47097/// Detect vector gather/scatter index generation and convert it from being a
47098/// bunch of shuffles and extracts into a somewhat faster sequence.
47099/// For i686, the best sequence is apparently storing the value and loading
47100/// scalars back, while for x64 we should use 64-bit extracts and shifts.
47101 static SDValue combineExtractVectorElt(SDNode *N, SelectionDAG &DAG,
47102 TargetLowering::DAGCombinerInfo &DCI,
47103 const X86Subtarget &Subtarget) {
47104 if (SDValue NewOp = combineExtractWithShuffle(N, DAG, DCI, Subtarget))
47105 return NewOp;
47106
47107 SDValue InputVector = N->getOperand(0);
47108 SDValue EltIdx = N->getOperand(1);
47109 auto *CIdx = dyn_cast<ConstantSDNode>(EltIdx);
47110
47111 EVT SrcVT = InputVector.getValueType();
47112 EVT VT = N->getValueType(0);
47113 SDLoc dl(InputVector);
47114 bool IsPextr = N->getOpcode() != ISD::EXTRACT_VECTOR_ELT;
47115 unsigned NumSrcElts = SrcVT.getVectorNumElements();
47116 unsigned NumEltBits = VT.getScalarSizeInBits();
47117 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
47118
47119 if (CIdx && CIdx->getAPIntValue().uge(NumSrcElts))
47120 return IsPextr ? DAG.getConstant(0, dl, VT) : DAG.getUNDEF(VT);
47121
47122 // Integer Constant Folding.
47123 if (CIdx && VT.isInteger()) {
47124 APInt UndefVecElts;
47125 SmallVector<APInt, 16> EltBits;
47126 unsigned VecEltBitWidth = SrcVT.getScalarSizeInBits();
47127 if (getTargetConstantBitsFromNode(InputVector, VecEltBitWidth, UndefVecElts,
47128 EltBits, /*AllowWholeUndefs*/ true,
47129 /*AllowPartialUndefs*/ false)) {
47130 uint64_t Idx = CIdx->getZExtValue();
47131 if (UndefVecElts[Idx])
47132 return IsPextr ? DAG.getConstant(0, dl, VT) : DAG.getUNDEF(VT);
47133 return DAG.getConstant(EltBits[Idx].zext(NumEltBits), dl, VT);
47134 }
47135
47136 // Convert extract_element(bitcast(<X x i1>)) -> bitcast(extract_subvector()).
47137 // Improves lowering of bool masks on Rust, which splits them into a byte array.
47138 if (InputVector.getOpcode() == ISD::BITCAST && (NumEltBits % 8) == 0) {
47139 SDValue Src = peekThroughBitcasts(InputVector);
47140 if (Src.getValueType().getScalarType() == MVT::i1 &&
47141 TLI.isTypeLegal(Src.getValueType())) {
47142 MVT SubVT = MVT::getVectorVT(MVT::i1, NumEltBits);
47143 SDValue Sub = DAG.getNode(
47144 ISD::EXTRACT_SUBVECTOR, dl, SubVT, Src,
47145 DAG.getVectorIdxConstant(CIdx->getZExtValue() * NumEltBits, dl));
47146 return DAG.getBitcast(VT, Sub);
47147 }
47148 }
47149 }
47150
47151 if (IsPextr) {
47152 if (TLI.SimplifyDemandedBits(SDValue(N, 0), APInt::getAllOnes(NumEltBits),
47153 DCI))
47154 return SDValue(N, 0);
47155
47156 // PEXTR*(PINSR*(v, s, c), c) -> s (with implicit zext handling).
47157 if ((InputVector.getOpcode() == X86ISD::PINSRB ||
47158 InputVector.getOpcode() == X86ISD::PINSRW) &&
47159 InputVector.getOperand(2) == EltIdx) {
47160 assert(SrcVT == InputVector.getOperand(0).getValueType() &&
47161 "Vector type mismatch");
47162 SDValue Scl = InputVector.getOperand(1);
47163 Scl = DAG.getNode(ISD::TRUNCATE, dl, SrcVT.getScalarType(), Scl);
47164 return DAG.getZExtOrTrunc(Scl, dl, VT);
47165 }
47166
47167 // TODO - Remove this once we can handle the implicit zero-extension of
47168 // X86ISD::PEXTRW/X86ISD::PEXTRB in combinePredicateReduction and
47169 // combineBasicSADPattern.
47170 return SDValue();
47171 }
47172
47173 // Detect mmx extraction of all bits as a i64. It works better as a bitcast.
47174 if (VT == MVT::i64 && SrcVT == MVT::v1i64 &&
47175 InputVector.getOpcode() == ISD::BITCAST &&
47176 InputVector.getOperand(0).getValueType() == MVT::x86mmx &&
47177 isNullConstant(EltIdx) && InputVector.hasOneUse())
47178 return DAG.getBitcast(VT, InputVector);
47179
47180 // Detect mmx to i32 conversion through a v2i32 elt extract.
47181 if (VT == MVT::i32 && SrcVT == MVT::v2i32 &&
47182 InputVector.getOpcode() == ISD::BITCAST &&
47183 InputVector.getOperand(0).getValueType() == MVT::x86mmx &&
47184 isNullConstant(EltIdx) && InputVector.hasOneUse())
47185 return DAG.getNode(X86ISD::MMX_MOVD2W, dl, MVT::i32,
47186 InputVector.getOperand(0));
47187
47188 // Check whether this extract is the root of a sum of absolute differences
47189 // pattern. This has to be done here because we really want it to happen
47190 // pre-legalization.
47191 if (SDValue SAD = combineBasicSADPattern(N, DAG, Subtarget))
47192 return SAD;
47193
47194 if (SDValue VPDPBUSD = combineVPDPBUSDPattern(N, DAG, Subtarget))
47195 return VPDPBUSD;
47196
47197 // Attempt to replace an all_of/any_of horizontal reduction with a MOVMSK.
47198 if (SDValue Cmp = combinePredicateReduction(N, DAG, Subtarget))
47199 return Cmp;
47200
47201 // Attempt to replace min/max v8i16/v16i8 reductions with PHMINPOSUW.
47202 if (SDValue MinMax = combineMinMaxReduction(N, DAG, Subtarget))
47203 return MinMax;
47204
47205 // Attempt to optimize ADD/FADD/MUL reductions with HADD, promotion etc..
47206 if (SDValue V = combineArithReduction(N, DAG, Subtarget))
47207 return V;
47208
47209 if (SDValue V = scalarizeExtEltFP(N, DAG, Subtarget, DCI))
47210 return V;
47211
47212 if (CIdx)
47213 if (SDValue V = combineExtractFromVectorLoad(
47214 N, InputVector.getValueType(), InputVector, CIdx->getZExtValue(),
47215 dl, DAG, DCI))
47216 return V;
47217
47218 // Attempt to extract a i1 element by using MOVMSK to extract the signbits
47219 // and then testing the relevant element.
47220 //
47221 // Note that we only combine extracts on the *same* result number, i.e.
47222 // t0 = merge_values a0, a1, a2, a3
47223 // i1 = extract_vector_elt t0, Constant:i64<2>
47224 // i1 = extract_vector_elt t0, Constant:i64<3>
47225 // but not
47226 // i1 = extract_vector_elt t0:1, Constant:i64<2>
47227 // since the latter would need its own MOVMSK.
47228 if (SrcVT.getScalarType() == MVT::i1) {
47229 bool IsVar = !CIdx;
47230 SmallVector<SDNode *, 16> BoolExtracts;
47231 unsigned ResNo = InputVector.getResNo();
47232 auto IsBoolExtract = [&BoolExtracts, &ResNo, &IsVar](SDNode *Use) {
47233 if (Use->getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
47234 Use->getOperand(0).getResNo() == ResNo &&
47235 Use->getValueType(0) == MVT::i1) {
47236 BoolExtracts.push_back(Use);
47237 IsVar |= !isa<ConstantSDNode>(Use->getOperand(1));
47238 return true;
47239 }
47240 return false;
47241 };
47242 // TODO: Can we drop the oneuse check for constant extracts?
47243 if (all_of(InputVector->users(), IsBoolExtract) &&
47244 (IsVar || BoolExtracts.size() > 1)) {
47245 EVT BCVT = EVT::getIntegerVT(*DAG.getContext(), NumSrcElts);
47246 if (SDValue BC =
47247 combineBitcastvxi1(DAG, BCVT, InputVector, dl, Subtarget)) {
47248 for (SDNode *Use : BoolExtracts) {
47249 // extractelement vXi1 X, MaskIdx --> ((movmsk X) & Mask) == Mask
47250 // Mask = 1 << MaskIdx
47251 SDValue MaskIdx = DAG.getZExtOrTrunc(Use->getOperand(1), dl, MVT::i8);
47252 SDValue MaskBit = DAG.getConstant(1, dl, BCVT);
47253 SDValue Mask = DAG.getNode(ISD::SHL, dl, BCVT, MaskBit, MaskIdx);
47254 SDValue Res = DAG.getNode(ISD::AND, dl, BCVT, BC, Mask);
47255 Res = DAG.getSetCC(dl, MVT::i1, Res, Mask, ISD::SETEQ);
47256 DCI.CombineTo(Use, Res);
47257 }
47258 return SDValue(N, 0);
47259 }
47260 }
47261 }
47262
47263 // Attempt to fold extract(trunc(x),c) -> trunc(extract(x,c)).
47264 if (CIdx && InputVector.getOpcode() == ISD::TRUNCATE) {
47265 SDValue TruncSrc = InputVector.getOperand(0);
47266 EVT TruncSVT = TruncSrc.getValueType().getScalarType();
47267 if (DCI.isBeforeLegalize() && TLI.isTypeLegal(TruncSVT)) {
47268 SDValue NewExt =
47269 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, TruncSVT, TruncSrc, EltIdx);
47270 return DAG.getAnyExtOrTrunc(NewExt, dl, VT);
47271 }
47272 }
47273
47274 return SDValue();
47275}
47276
47277// Convert (vXiY *ext(vXi1 bitcast(iX))) to extend_in_reg(broadcast(iX)).
47278// This is more or less the reverse of combineBitcastvxi1.
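// For example (illustrative): (v8i16 sext (v8i1 bitcast (i8 X))) is lowered
// below by broadcasting X into all 8 lanes, ANDing each lane with its bit
// mask <1,2,4,...,128>, comparing SETEQ against the same mask, and
// sign-extending the result (for zero-extension the result is shifted right
// instead).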
47279 static SDValue combineToExtendBoolVectorInReg(
47280 unsigned Opcode, const SDLoc &DL, EVT VT, SDValue N0, SelectionDAG &DAG,
47281 TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget) {
47282 if (Opcode != ISD::SIGN_EXTEND && Opcode != ISD::ZERO_EXTEND &&
47283 Opcode != ISD::ANY_EXTEND)
47284 return SDValue();
47285 if (!DCI.isBeforeLegalizeOps())
47286 return SDValue();
47287 if (!Subtarget.hasSSE2() || Subtarget.hasAVX512())
47288 return SDValue();
47289
47290 EVT SVT = VT.getScalarType();
47291 EVT InSVT = N0.getValueType().getScalarType();
47292 unsigned EltSizeInBits = SVT.getSizeInBits();
47293
47294 // Input type must be extending a bool vector (bit-casted from a scalar
47295 // integer) to legal integer types.
47296 if (!VT.isVector())
47297 return SDValue();
47298 if (SVT != MVT::i64 && SVT != MVT::i32 && SVT != MVT::i16 && SVT != MVT::i8)
47299 return SDValue();
47300 if (InSVT != MVT::i1 || N0.getOpcode() != ISD::BITCAST)
47301 return SDValue();
47302
47303 SDValue N00 = N0.getOperand(0);
47304 EVT SclVT = N00.getValueType();
47305 if (!SclVT.isScalarInteger())
47306 return SDValue();
47307
47308 SDValue Vec;
47309 SmallVector<int> ShuffleMask;
47310 unsigned NumElts = VT.getVectorNumElements();
47311 assert(NumElts == SclVT.getSizeInBits() && "Unexpected bool vector size");
47312
47313 // Broadcast the scalar integer to the vector elements.
47314 if (NumElts > EltSizeInBits) {
47315 // If the scalar integer is greater than the vector element size, then we
47316 // must split it down into sub-sections for broadcasting. For example:
47317 // i16 -> v16i8 (i16 -> v8i16 -> v16i8) with 2 sub-sections.
47318 // i32 -> v32i8 (i32 -> v8i32 -> v32i8) with 4 sub-sections.
47319 assert((NumElts % EltSizeInBits) == 0 && "Unexpected integer scale");
47320 unsigned Scale = NumElts / EltSizeInBits;
47321 EVT BroadcastVT = EVT::getVectorVT(*DAG.getContext(), SclVT, EltSizeInBits);
47322 bool UseBroadcast = Subtarget.hasInt256() &&
47323 (!BroadcastVT.is128BitVector() || isa<LoadSDNode>(N00));
47324 Vec = UseBroadcast
47325 ? DAG.getSplat(BroadcastVT, DL, N00)
47326 : DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, BroadcastVT, N00);
47327 Vec = DAG.getBitcast(VT, Vec);
47328
47329 for (unsigned i = 0; i != Scale; ++i) {
47330 int Offset = UseBroadcast ? (i * EltSizeInBits) : 0;
47331 ShuffleMask.append(EltSizeInBits, i + Offset);
47332 }
47333 Vec = DAG.getVectorShuffle(VT, DL, Vec, Vec, ShuffleMask);
47334 } else if (Subtarget.hasAVX2() && NumElts < EltSizeInBits &&
47335 (SclVT == MVT::i8 || SclVT == MVT::i16 || SclVT == MVT::i32)) {
47336 // If we have register broadcast instructions, use the scalar size as the
47337 // element type for the shuffle. Then cast to the wider element type. The
47338 // widened bits won't be used, and this might allow the use of a broadcast
47339 // load.
47340 assert((EltSizeInBits % NumElts) == 0 && "Unexpected integer scale");
47341 EVT BroadcastVT = EVT::getVectorVT(*DAG.getContext(), SclVT,
47342 (NumElts * EltSizeInBits) / NumElts);
47343 Vec = DAG.getBitcast(VT, DAG.getSplat(BroadcastVT, DL, N00));
47344 } else {
47345 // For smaller scalar integers, we can simply any-extend it to the vector
47346 // element size (we don't care about the upper bits) and broadcast it to all
47347 // elements.
47348 Vec = DAG.getSplat(VT, DL, DAG.getAnyExtOrTrunc(N00, DL, SVT));
47349 }
47350
47351 // Now, mask the relevant bit in each element.
47352 SmallVector<SDValue, 32> Bits;
47353 for (unsigned i = 0; i != NumElts; ++i) {
47354 int BitIdx = (i % EltSizeInBits);
47355 APInt Bit = APInt::getBitsSet(EltSizeInBits, BitIdx, BitIdx + 1);
47356 Bits.push_back(DAG.getConstant(Bit, DL, SVT));
47357 }
47358 SDValue BitMask = DAG.getBuildVector(VT, DL, Bits);
47359 Vec = DAG.getNode(ISD::AND, DL, VT, Vec, BitMask);
47360
47361 // Compare against the bitmask and extend the result.
47362 EVT CCVT = EVT::getVectorVT(*DAG.getContext(), MVT::i1, NumElts);
47363 Vec = DAG.getSetCC(DL, CCVT, Vec, BitMask, ISD::SETEQ);
47364 Vec = DAG.getSExtOrTrunc(Vec, DL, VT);
47365
47366 // For SEXT, this is now done, otherwise shift the result down for
47367 // zero-extension.
47368 if (Opcode == ISD::SIGN_EXTEND)
47369 return Vec;
47370 return DAG.getNode(ISD::SRL, DL, VT, Vec,
47371 DAG.getConstant(EltSizeInBits - 1, DL, VT));
47372}
47373
47374/// If both arms of a vector select are concatenated vectors, split the select,
47375/// and concatenate the result to eliminate a wide (256-bit) vector instruction:
47376/// vselect Cond, (concat T0, T1), (concat F0, F1) -->
47377/// concat (vselect (split Cond), T0, F0), (vselect (split Cond), T1, F1)
47378 static SDValue narrowVectorSelect(SDNode *N, SelectionDAG &DAG, const SDLoc &DL,
47379 const X86Subtarget &Subtarget) {
47380 unsigned Opcode = N->getOpcode();
47381 if (Opcode != X86ISD::BLENDV && Opcode != ISD::VSELECT)
47382 return SDValue();
47383
47384 // TODO: Split 512-bit vectors too?
47385 EVT VT = N->getValueType(0);
47386 if (!VT.is256BitVector())
47387 return SDValue();
47388
47389 // TODO: Split as long as any 2 of the 3 operands are concatenated?
47390 SDValue Cond = N->getOperand(0);
47391 SDValue TVal = N->getOperand(1);
47392 SDValue FVal = N->getOperand(2);
47393 if (!TVal.hasOneUse() || !FVal.hasOneUse() ||
47394 !isFreeToSplitVector(TVal, DAG) || !isFreeToSplitVector(FVal, DAG))
47395 return SDValue();
47396
47397 auto makeBlend = [Opcode](SelectionDAG &DAG, const SDLoc &DL,
47398 ArrayRef<SDValue> Ops) {
47399 return DAG.getNode(Opcode, DL, Ops[1].getValueType(), Ops);
47400 };
47401 return SplitOpsAndApply(DAG, Subtarget, DL, VT, {Cond, TVal, FVal}, makeBlend,
47402 /*CheckBWI*/ false);
47403}
47404
47405 static SDValue combineSelectOfTwoConstants(SDNode *N, SelectionDAG &DAG,
47406 const SDLoc &DL) {
47407 SDValue Cond = N->getOperand(0);
47408 SDValue LHS = N->getOperand(1);
47409 SDValue RHS = N->getOperand(2);
47410
47411 auto *TrueC = dyn_cast<ConstantSDNode>(LHS);
47412 auto *FalseC = dyn_cast<ConstantSDNode>(RHS);
47413 if (!TrueC || !FalseC)
47414 return SDValue();
47415
47416 // Don't do this for crazy integer types.
47417 EVT VT = N->getValueType(0);
47418 if (!DAG.getTargetLoweringInfo().isTypeLegal(VT))
47419 return SDValue();
47420
47421 // We're going to use the condition bit in math or logic ops. We could allow
47422 // this with a wider condition value (post-legalization it becomes an i8),
47423 // but if nothing is creating selects that late, it doesn't matter.
47424 if (Cond.getValueType() != MVT::i1)
47425 return SDValue();
47426
47427 // A power-of-2 multiply is just a shift. LEA also cheaply handles multiply by
47428 // 3, 5, or 9 with i32/i64, so those get transformed too.
47429 // TODO: For constants that overflow or do not differ by power-of-2 or small
47430 // multiplier, convert to 'and' + 'add'.
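// For example (illustrative): select i1 %c, i32 7, i32 3 becomes
// ((zext %c) << 2) + 3, and select i1 %c, i32 10, i32 1 becomes an LEA
// computing (zext %c) * 9 + 1.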
47431 const APInt &TrueVal = TrueC->getAPIntValue();
47432 const APInt &FalseVal = FalseC->getAPIntValue();
47433
47434 // We have a more efficient lowering for "(X == 0) ? Y : -1" using SBB.
47435 if ((TrueVal.isAllOnes() || FalseVal.isAllOnes()) &&
47436 Cond.getOpcode() == ISD::SETCC && isNullConstant(Cond.getOperand(1))) {
47437 ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get();
47438 if (CC == ISD::SETEQ || CC == ISD::SETNE)
47439 return SDValue();
47440 }
47441
47442 bool OV;
47443 APInt Diff = TrueVal.ssub_ov(FalseVal, OV);
47444 if (OV)
47445 return SDValue();
47446
47447 APInt AbsDiff = Diff.abs();
47448 if (AbsDiff.isPowerOf2() ||
47449 ((VT == MVT::i32 || VT == MVT::i64) &&
47450 (AbsDiff == 3 || AbsDiff == 5 || AbsDiff == 9))) {
47451
47452 // We need a positive multiplier constant for shift/LEA codegen. The 'not'
47453 // of the condition can usually be folded into a compare predicate, but even
47454 // without that, the sequence should be cheaper than a CMOV alternative.
47455 if (TrueVal.slt(FalseVal)) {
47456 Cond = DAG.getNOT(DL, Cond, MVT::i1);
47457 std::swap(TrueC, FalseC);
47458 }
47459
47460 // select Cond, TC, FC --> (zext(Cond) * (TC - FC)) + FC
47461 SDValue R = DAG.getNode(ISD::ZERO_EXTEND, DL, VT, Cond);
47462
47463 // Multiply condition by the difference if non-one.
47464 if (!AbsDiff.isOne())
47465 R = DAG.getNode(ISD::MUL, DL, VT, R, DAG.getConstant(AbsDiff, DL, VT));
47466
47467 // Add the base if non-zero.
47468 if (!FalseC->isZero())
47469 R = DAG.getNode(ISD::ADD, DL, VT, R, SDValue(FalseC, 0));
47470
47471 return R;
47472 }
47473
47474 return SDValue();
47475}
47476
47477/// If this is a *dynamic* select (non-constant condition) and we can match
47478/// this node with one of the variable blend instructions, restructure the
47479/// condition so that blends can use the high (sign) bit of each element.
47480/// This function will also call SimplifyDemandedBits on already created
47481/// BLENDV to perform additional simplifications.
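/// For example (illustrative): for (vselect (v16i8 C), X, Y) on SSE4.1, only
/// the sign bit of each byte of C is demanded once the users are re-targeted
/// to X86ISD::BLENDV (PBLENDVB), so SimplifyDemandedBits can strip the rest
/// of the condition computation.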
47482 static SDValue combineVSelectToBLENDV(SDNode *N, SelectionDAG &DAG,
47483 const SDLoc &DL,
47484 TargetLowering::DAGCombinerInfo &DCI,
47485 const X86Subtarget &Subtarget) {
47486 SDValue Cond = N->getOperand(0);
47487 if ((N->getOpcode() != ISD::VSELECT &&
47488 N->getOpcode() != X86ISD::BLENDV) ||
47489 ISD::isBuildVectorOfConstantSDNodes(Cond.getNode()))
47490 return SDValue();
47491
47492 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
47493 unsigned BitWidth = Cond.getScalarValueSizeInBits();
47494 EVT VT = N->getValueType(0);
47495
47496 // We can only handle the cases where VSELECT is directly legal on the
47497 // subtarget. We custom lower VSELECT nodes with constant conditions and
47498 // this makes it hard to see whether a dynamic VSELECT will correctly
47499 // lower, so we both check the operation's status and explicitly handle the
47500 // cases where a *dynamic* blend will fail even though a constant-condition
47501 // blend could be custom lowered.
47502 // FIXME: We should find a better way to handle this class of problems.
47503 // Potentially, we should combine constant-condition vselect nodes
47504 // pre-legalization into shuffles and not mark as many types as custom
47505 // lowered.
47506 if (!TLI.isOperationLegalOrCustom(ISD::VSELECT, VT))
47507 return SDValue();
47508 // FIXME: We don't support i16-element blends currently. We could and
47509 // should support them by making *all* the bits in the condition be set
47510 // rather than just the high bit and using an i8-element blend.
47511 if (VT.getVectorElementType() == MVT::i16)
47512 return SDValue();
47513 // Dynamic blending was only available from SSE4.1 onward.
47514 if (VT.is128BitVector() && !Subtarget.hasSSE41())
47515 return SDValue();
47516 // Byte blends are only available in AVX2
47517 if (VT == MVT::v32i8 && !Subtarget.hasAVX2())
47518 return SDValue();
47519 // There are no 512-bit blend instructions that use sign bits.
47520 if (VT.is512BitVector())
47521 return SDValue();
47522
47523 // Don't optimize before the condition has been transformed to a legal type
47524 // and don't ever optimize vector selects that map to AVX512 mask-registers.
47525 if (BitWidth < 8 || BitWidth > 64)
47526 return SDValue();
47527
47528 auto OnlyUsedAsSelectCond = [](SDValue Cond) {
47529 for (SDUse &Use : Cond->uses())
47530 if ((Use.getUser()->getOpcode() != ISD::VSELECT &&
47531 Use.getUser()->getOpcode() != X86ISD::BLENDV) ||
47532 Use.getOperandNo() != 0)
47533 return false;
47534
47535 return true;
47536 };
47537
47538 APInt DemandedBits(APInt::getSignMask(BitWidth));
47539
47540 if (OnlyUsedAsSelectCond(Cond)) {
47541 KnownBits Known;
47542 TargetLowering::TargetLoweringOpt TLO(DAG, !DCI.isBeforeLegalize(),
47543 !DCI.isBeforeLegalizeOps());
47544 if (!TLI.SimplifyDemandedBits(Cond, DemandedBits, Known, TLO, 0, true))
47545 return SDValue();
47546
47547 // If we changed the computation somewhere in the DAG, this change will
47548 // affect all users of Cond. Update all the nodes so that we do not use
47549 // the generic VSELECT anymore. Otherwise, we may perform wrong
47550 // optimizations as we messed with the actual expectation for the vector
47551 // boolean values.
47552 for (SDNode *U : Cond->users()) {
47553 if (U->getOpcode() == X86ISD::BLENDV)
47554 continue;
47555
47556 SDValue SB = DAG.getNode(X86ISD::BLENDV, SDLoc(U), U->getValueType(0),
47557 Cond, U->getOperand(1), U->getOperand(2));
47558 DAG.ReplaceAllUsesOfValueWith(SDValue(U, 0), SB);
47559 DCI.AddToWorklist(U);
47560 }
47561 DCI.CommitTargetLoweringOpt(TLO);
47562 return SDValue(N, 0);
47563 }
47564
47565 // Otherwise we can still at least try to simplify multiple use bits.
47566 if (SDValue V = TLI.SimplifyMultipleUseDemandedBits(Cond, DemandedBits, DAG))
47567 return DAG.getNode(X86ISD::BLENDV, DL, N->getValueType(0), V,
47568 N->getOperand(1), N->getOperand(2));
47569
47570 return SDValue();
47571}
47572
47573// Try to match:
47574// (or (and (M, (sub 0, X)), (pandn M, X)))
47575// which is a special case of:
47576// (select M, (sub 0, X), X)
47577// Per:
47578// http://graphics.stanford.edu/~seander/bithacks.html#ConditionalNegate
47579// We know that, if fNegate is 0 or 1:
47580// (fNegate ? -v : v) == ((v ^ -fNegate) + fNegate)
47581//
47582// Here, we have a mask, M (all 1s or 0), and, similarly, we know that:
47583// ((M & 1) ? -X : X) == ((X ^ -(M & 1)) + (M & 1))
47584// ( M ? -X : X) == ((X ^ M ) + (M & 1))
47585// This lets us transform our vselect to:
47586// (add (xor X, M), (and M, 1))
47587// And further to:
47588// (sub (xor X, M), M)
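// Sanity check (illustrative): with M == all-ones, (X ^ M) - M == ~X + 1 == -X,
// and with M == 0 it is just X, matching (select M, (sub 0, X), X).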
47589 static SDValue combineLogicBlendIntoConditionalNegate(
47590 EVT VT, SDValue Mask, SDValue X, SDValue Y, const SDLoc &DL,
47591 SelectionDAG &DAG, const X86Subtarget &Subtarget) {
47592 using namespace SDPatternMatch;
47593 EVT MaskVT = Mask.getValueType();
47594 assert(MaskVT.isInteger() &&
47595 DAG.ComputeNumSignBits(Mask) == MaskVT.getScalarSizeInBits() &&
47596 "Mask must be zero/all-bits");
47597
47598 if (X.getValueType() != MaskVT || Y.getValueType() != MaskVT ||
47599 !DAG.getTargetLoweringInfo().isTypeLegal(MaskVT))
47600 return SDValue();
47601
47602 SDValue V;
47603 if (!sd_match(Y, m_Neg(m_AllOf(m_Specific(X), m_Value(V)))) &&
47604 !sd_match(X, m_Neg(m_AllOf(m_Specific(Y), m_Value(V)))))
47605 return SDValue();
47606
47607 SDValue SubOp1 = DAG.getNode(ISD::XOR, DL, MaskVT, V, Mask);
47608 SDValue SubOp2 = Mask;
47609
47610 // If the negate was on the false side of the select, then
47611 // the operands of the SUB need to be swapped. PR 27251.
47612 // This is because the pattern being matched above is
47613 // (vselect M, (sub (0, X), X) -> (sub (xor X, M), M)
47614 // but if the pattern matched was
47615 // (vselect M, X, (sub (0, X))), that is really negation of the pattern
47616 // above, -(vselect M, (sub 0, X), X), and therefore the replacement
47617 // pattern also needs to be a negation of the replacement pattern above.
47618 // And -(sub X, Y) is just sub (Y, X), so swapping the operands of the
47619 // sub accomplishes the negation of the replacement pattern.
47620 if (V == Y)
47621 std::swap(SubOp1, SubOp2);
47622
47623 SDValue Res = DAG.getNode(ISD::SUB, DL, MaskVT, SubOp1, SubOp2);
47624 return DAG.getBitcast(VT, Res);
47625}
47626
47627 static SDValue commuteSelect(SDNode *N, SelectionDAG &DAG, const SDLoc &DL,
47628 const X86Subtarget &Subtarget) {
47629 using namespace SDPatternMatch;
47630 if (!Subtarget.hasAVX512())
47631 return SDValue();
47632
47633 ISD::CondCode CC;
47634 SDValue Cond, X, Y, LHS, RHS;
47635 if (!sd_match(N, m_VSelect(m_AllOf(m_Value(Cond),
47636 m_OneUse(m_SetCC(m_Value(X), m_Value(Y),
47637 m_CondCode(CC)))),
47638 m_Value(LHS), m_Value(RHS))))
47639 return SDValue();
47640
47641 if (canCombineAsMaskOperation(LHS, Subtarget) ||
47642 !canCombineAsMaskOperation(RHS, Subtarget))
47643 return SDValue();
47644
47645 // Commute LHS and RHS to create opportunity to select mask instruction.
47646 // (vselect M, L, R) -> (vselect ~M, R, L)
47647 ISD::CondCode NewCC = ISD::getSetCCInverse(CC, X.getValueType());
47648 Cond = DAG.getSetCC(SDLoc(Cond), Cond.getValueType(), X, Y, NewCC);
47649 return DAG.getSelect(DL, LHS.getValueType(), Cond, RHS, LHS);
47650}
47651
47652/// Do target-specific dag combines on SELECT and VSELECT nodes.
47653 static SDValue combineSelect(SDNode *N, SelectionDAG &DAG,
47654 TargetLowering::DAGCombinerInfo &DCI,
47655 const X86Subtarget &Subtarget) {
47656 SDLoc DL(N);
47657 SDValue Cond = N->getOperand(0);
47658 SDValue LHS = N->getOperand(1);
47659 SDValue RHS = N->getOperand(2);
47660
47661 // Try simplification again because we use this function to optimize
47662 // BLENDV nodes that are not handled by the generic combiner.
47663 if (SDValue V = DAG.simplifySelect(Cond, LHS, RHS))
47664 return V;
47665
47666 // When avx512 is available the lhs operand of select instruction can be
47667 // folded with mask instruction, while the rhs operand can't. Commute the
47668 // lhs and rhs of the select instruction to create the opportunity of
47669 // folding.
47670 if (SDValue V = commuteSelect(N, DAG, DL, Subtarget))
47671 return V;
47672
47673 EVT VT = LHS.getValueType();
47674 EVT CondVT = Cond.getValueType();
47675 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
47676 bool CondConstantVector = ISD::isBuildVectorOfConstantSDNodes(Cond.getNode());
47677
47678 // Attempt to combine (select M, (sub 0, X), X) -> (sub (xor X, M), M).
47679 // Limit this to cases of non-constant masks that createShuffleMaskFromVSELECT
47680 // can't catch, plus vXi8 cases where we'd likely end up with BLENDV.
47681 if (CondVT.isVector() && CondVT.isInteger() &&
47682 CondVT.getScalarSizeInBits() == VT.getScalarSizeInBits() &&
47683 (!CondConstantVector || CondVT.getScalarType() == MVT::i8) &&
47684 DAG.ComputeNumSignBits(Cond) == CondVT.getScalarSizeInBits())
47685 if (SDValue V = combineLogicBlendIntoConditionalNegate(VT, Cond, RHS, LHS,
47686 DL, DAG, Subtarget))
47687 return V;
47688
47689 if (N->getOpcode() == ISD::VSELECT || N->getOpcode() == X86ISD::BLENDV) {
47690 SmallVector<int, 64> CondMask;
47691 if (createShuffleMaskFromVSELECT(CondMask, Cond,
47692 N->getOpcode() == X86ISD::BLENDV)) {
47693 // Convert vselects with constant condition into shuffles.
47694 if (DCI.isBeforeLegalizeOps())
47695 return DAG.getVectorShuffle(VT, DL, LHS, RHS, CondMask);
47696
47697 // fold vselect(cond, pshufb(x), pshufb(y)) -> or (pshufb(x), pshufb(y))
47698 // by forcing the unselected elements to zero.
47699 // TODO: Can we handle more shuffles with this?
47700 if (LHS.hasOneUse() && RHS.hasOneUse()) {
47701 SmallVector<SDValue, 1> LHSOps, RHSOps;
47702 SmallVector<int, 64> LHSMask, RHSMask, ByteMask;
47703 SDValue LHSShuf = peekThroughOneUseBitcasts(LHS);
47704 SDValue RHSShuf = peekThroughOneUseBitcasts(RHS);
47705 if (LHSShuf.getOpcode() == X86ISD::PSHUFB &&
47706 RHSShuf.getOpcode() == X86ISD::PSHUFB &&
47707 scaleShuffleMaskElts(VT.getSizeInBits() / 8, CondMask, ByteMask) &&
47708 getTargetShuffleMask(LHSShuf, true, LHSOps, LHSMask) &&
47709 getTargetShuffleMask(RHSShuf, true, RHSOps, RHSMask)) {
47710 assert(ByteMask.size() == LHSMask.size() &&
47711 ByteMask.size() == RHSMask.size() && "Shuffle mask mismatch");
47712 for (auto [I, M] : enumerate(ByteMask)) {
47713 // getConstVector sets negative shuffle mask values as undef, so
47714 // ensure we hardcode SM_SentinelZero values to zero (0x80).
47715 if (M < (int)ByteMask.size()) {
47716 LHSMask[I] = isUndefOrZero(LHSMask[I]) ? 0x80 : LHSMask[I];
47717 RHSMask[I] = 0x80;
47718 } else {
47719 LHSMask[I] = 0x80;
47720 RHSMask[I] = isUndefOrZero(RHSMask[I]) ? 0x80 : RHSMask[I];
47721 }
47722 }
47723 MVT ByteVT = LHSShuf.getSimpleValueType();
47724 LHS = DAG.getNode(X86ISD::PSHUFB, DL, ByteVT, LHSOps[0],
47725 getConstVector(LHSMask, ByteVT, DAG, DL, true));
47726 RHS = DAG.getNode(X86ISD::PSHUFB, DL, ByteVT, RHSOps[0],
47727 getConstVector(RHSMask, ByteVT, DAG, DL, true));
47728 return DAG.getBitcast(VT, DAG.getNode(ISD::OR, DL, ByteVT, LHS, RHS));
47729 }
47730 }
47731
47732 // Attempt to combine as shuffle.
47733 SDValue Op(N, 0);
47734 if (SDValue Res = combineX86ShufflesRecursively(Op, DAG, Subtarget))
47735 return Res;
47736 }
47737 }
47738
47739 // If we have SSE[12] support, try to form min/max nodes. SSE min/max
47740 // instructions match the semantics of the common C idiom x<y?x:y but not
47741 // x<=y?x:y, because of how they handle negative zero (which can be
47742 // ignored in unsafe-math mode).
47743 // We also try to create v2f32 min/max nodes, which we later widen to v4f32.
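// SSE MINPS/MAXPS return the second source operand when the inputs are
// unordered (NaN) and are order sensitive for +0.0 vs -0.0, which is why the
// cases below only form FMIN/FMAX when NaNs/signed zeros are provably absent
// or when swapping the operands restores the expected behaviour.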
47744 if ((Cond.getOpcode() == ISD::SETCC ||
47745 Cond.getOpcode() == ISD::STRICT_FSETCCS) &&
47746 VT.isFloatingPoint() && VT != MVT::f80 && VT != MVT::f128 &&
47747 !isSoftF16(VT, Subtarget) && (TLI.isTypeLegal(VT) || VT == MVT::v2f32) &&
47748 ((VT != MVT::v8f16 && VT != MVT::v16f16) || Subtarget.hasVLX()) &&
47749 (Subtarget.hasSSE2() ||
47750 (Subtarget.hasSSE1() && VT.getScalarType() == MVT::f32))) {
47751 bool IsStrict = Cond->isStrictFPOpcode();
47752 ISD::CondCode CC =
47753 cast<CondCodeSDNode>(Cond.getOperand(IsStrict ? 3 : 2))->get();
47754 SDValue Op0 = Cond.getOperand(IsStrict ? 1 : 0);
47755 SDValue Op1 = Cond.getOperand(IsStrict ? 2 : 1);
47756
47757 unsigned Opcode = 0;
47758 // Check for x CC y ? x : y.
47759 if (DAG.isEqualTo(LHS, Op0) && DAG.isEqualTo(RHS, Op1)) {
47760 switch (CC) {
47761 default: break;
47762 case ISD::SETULT:
47763 // Converting this to a min would handle NaNs incorrectly, and swapping
47764 // the operands would cause it to handle comparisons between positive
47765 // and negative zero incorrectly.
47766 if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS)) {
47767 if (!DAG.getTarget().Options.NoSignedZerosFPMath &&
47768 !(DAG.isKnownNeverZeroFloat(LHS) ||
47769 DAG.isKnownNeverZeroFloat(RHS)))
47770 break;
47771 std::swap(LHS, RHS);
47772 }
47773 Opcode = X86ISD::FMIN;
47774 break;
47775 case ISD::SETOLE:
47776 // Converting this to a min would handle comparisons between positive
47777 // and negative zero incorrectly.
47778 if (!DAG.getTarget().Options.NoSignedZerosFPMath &&
47779 !DAG.isKnownNeverZeroFloat(LHS) && !DAG.isKnownNeverZeroFloat(RHS))
47780 break;
47781 Opcode = X86ISD::FMIN;
47782 break;
47783 case ISD::SETULE:
47784 // Converting this to a min would handle both negative zeros and NaNs
47785 // incorrectly, but we can swap the operands to fix both.
47786 std::swap(LHS, RHS);
47787 [[fallthrough]];
47788 case ISD::SETOLT:
47789 case ISD::SETLT:
47790 case ISD::SETLE:
47791 Opcode = X86ISD::FMIN;
47792 break;
47793
47794 case ISD::SETOGE:
47795 // Converting this to a max would handle comparisons between positive
47796 // and negative zero incorrectly.
47797 if (!DAG.getTarget().Options.NoSignedZerosFPMath &&
47798 !DAG.isKnownNeverZeroFloat(LHS) && !DAG.isKnownNeverZeroFloat(RHS))
47799 break;
47800 Opcode = X86ISD::FMAX;
47801 break;
47802 case ISD::SETUGT:
47803 // Converting this to a max would handle NaNs incorrectly, and swapping
47804 // the operands would cause it to handle comparisons between positive
47805 // and negative zero incorrectly.
47806 if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS)) {
47807 if (!DAG.getTarget().Options.NoSignedZerosFPMath &&
47808 !(DAG.isKnownNeverZeroFloat(LHS) ||
47809 DAG.isKnownNeverZeroFloat(RHS)))
47810 break;
47811 std::swap(LHS, RHS);
47812 }
47813 Opcode = X86ISD::FMAX;
47814 break;
47815 case ISD::SETUGE:
47816 // Converting this to a max would handle both negative zeros and NaNs
47817 // incorrectly, but we can swap the operands to fix both.
47818 std::swap(LHS, RHS);
47819 [[fallthrough]];
47820 case ISD::SETOGT:
47821 case ISD::SETGT:
47822 case ISD::SETGE:
47823 Opcode = X86ISD::FMAX;
47824 break;
47825 }
47826 // Check for x CC y ? y : x -- a min/max with reversed arms.
47827 } else if (DAG.isEqualTo(LHS, Op1) && DAG.isEqualTo(RHS, Op0)) {
47828 switch (CC) {
47829 default: break;
47830 case ISD::SETOGE:
47831 // Converting this to a min would handle comparisons between positive
47832 // and negative zero incorrectly, and swapping the operands would
47833 // cause it to handle NaNs incorrectly.
47834 if (!DAG.getTarget().Options.NoSignedZerosFPMath &&
47835 !(DAG.isKnownNeverZeroFloat(LHS) ||
47836 DAG.isKnownNeverZeroFloat(RHS))) {
47837 if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS))
47838 break;
47839 std::swap(LHS, RHS);
47840 }
47841 Opcode = X86ISD::FMIN;
47842 break;
47843 case ISD::SETUGT:
47844 // Converting this to a min would handle NaNs incorrectly.
47845 if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS))
47846 break;
47847 Opcode = X86ISD::FMIN;
47848 break;
47849 case ISD::SETUGE:
47850 // Converting this to a min would handle both negative zeros and NaNs
47851 // incorrectly, but we can swap the operands to fix both.
47852 std::swap(LHS, RHS);
47853 [[fallthrough]];
47854 case ISD::SETOGT:
47855 case ISD::SETGT:
47856 case ISD::SETGE:
47857 Opcode = X86ISD::FMIN;
47858 break;
47859
47860 case ISD::SETULT:
47861 // Converting this to a max would handle NaNs incorrectly.
47862 if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS))
47863 break;
47864 Opcode = X86ISD::FMAX;
47865 break;
47866 case ISD::SETOLE:
47867 // Converting this to a max would handle comparisons between positive
47868 // and negative zero incorrectly, and swapping the operands would
47869 // cause it to handle NaNs incorrectly.
47870 if (!DAG.getTarget().Options.NoSignedZerosFPMath &&
47871 !DAG.isKnownNeverZeroFloat(LHS) &&
47872 !DAG.isKnownNeverZeroFloat(RHS)) {
47873 if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS))
47874 break;
47875 std::swap(LHS, RHS);
47876 }
47877 Opcode = X86ISD::FMAX;
47878 break;
47879 case ISD::SETULE:
47880 // Converting this to a max would handle both negative zeros and NaNs
47881 // incorrectly, but we can swap the operands to fix both.
47882 std::swap(LHS, RHS);
47883 [[fallthrough]];
47884 case ISD::SETOLT:
47885 case ISD::SETLT:
47886 case ISD::SETLE:
47887 Opcode = X86ISD::FMAX;
47888 break;
47889 }
47890 }
47891
47892 if (Opcode) {
47893 if (IsStrict) {
47894 SDValue Ret = DAG.getNode(Opcode == X86ISD::FMIN ? X86ISD::STRICT_FMIN
47895 : X86ISD::STRICT_FMAX,
47896 DL, {N->getValueType(0), MVT::Other},
47897 {Cond.getOperand(0), LHS, RHS});
47898 DAG.ReplaceAllUsesOfValueWith(Cond.getValue(1), Ret.getValue(1));
47899 return Ret;
47900 }
47901 return DAG.getNode(Opcode, DL, N->getValueType(0), LHS, RHS);
47902 }
47903 }
47904
47905 // Some mask scalar intrinsics rely on checking if only one bit is set
47906 // and implement it in C code like this:
47907 // A[0] = (U & 1) ? A[0] : W[0];
47908 // This creates some redundant instructions that break pattern matching.
47909 // fold (select (setcc (and X, 1), 0, seteq), Y, Z) -> (select (and X, 1), Z, Y)
47910 if (Subtarget.hasAVX512() && N->getOpcode() == ISD::SELECT &&
47911 Cond.getOpcode() == ISD::SETCC && (VT == MVT::f32 || VT == MVT::f64)) {
47912 ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get();
47913 SDValue AndNode = Cond.getOperand(0);
47914 if (AndNode.getOpcode() == ISD::AND && CC == ISD::SETEQ &&
47915 isNullConstant(Cond.getOperand(1)) &&
47916 isOneConstant(AndNode.getOperand(1))) {
47917 // LHS and RHS swapped due to
47918 // setcc outputting 1 when AND resulted in 0 and vice versa.
47919 AndNode = DAG.getZExtOrTrunc(AndNode, DL, MVT::i8);
47920 return DAG.getNode(ISD::SELECT, DL, VT, AndNode, RHS, LHS);
47921 }
47922 }
47923
47924 // v16i8 (select v16i1, v16i8, v16i8) does not have a proper
47925 // lowering on KNL. In this case we convert it to
47926 // v16i8 (select v16i8, v16i8, v16i8) and use AVX instruction.
47928 // The same situation applies to all vectors of i8 and i16 without BWI.
47928 // Make sure we extend these even before type legalization gets a chance to
47929 // split wide vectors.
47930 // Since SKX these selects have a proper lowering.
47931 if (Subtarget.hasAVX512() && !Subtarget.hasBWI() && CondVT.isVector() &&
47932 CondVT.getVectorElementType() == MVT::i1 &&
47933 (VT.getVectorElementType() == MVT::i8 ||
47934 VT.getVectorElementType() == MVT::i16)) {
47935 Cond = DAG.getNode(ISD::SIGN_EXTEND, DL, VT, Cond);
47936 return DAG.getNode(N->getOpcode(), DL, VT, Cond, LHS, RHS);
47937 }
47938
47939 // AVX512 - Extend select to merge with target shuffle.
47940 // select(mask, extract_subvector(shuffle(x)), y) -->
47941 // extract_subvector(select(widen(mask), shuffle(x), widen(y)))
47942 // TODO - support non target shuffles as well with canCombineAsMaskOperation.
47943 if (Subtarget.hasAVX512() && CondVT.isVector() &&
47944 CondVT.getVectorElementType() == MVT::i1) {
47945 auto SelectableOp = [&TLI](SDValue Op, SDValue Alt) {
47946 return Op.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
47947 isTargetShuffle(Op.getOperand(0).getOpcode()) &&
47948 isNullConstant(Op.getOperand(1)) &&
47949 TLI.isTypeLegal(Op.getOperand(0).getValueType()) &&
47950 Op.hasOneUse() && Op.getOperand(0).hasOneUse() &&
47951 (Op.getOperand(0).getOpcode() != X86ISD::VPERMV3 ||
47952 ISD::isBuildVectorAllZeros(Alt.getNode()));
47953 };
47954
47955 bool SelectableLHS = SelectableOp(LHS, RHS);
47956 bool SelectableRHS = SelectableOp(RHS, LHS);
47957 if (SelectableLHS || SelectableRHS) {
47958 EVT SrcVT = SelectableLHS ? LHS.getOperand(0).getValueType()
47959 : RHS.getOperand(0).getValueType();
47960 EVT SrcCondVT = SrcVT.changeVectorElementType(MVT::i1);
47961 LHS = insertSubVector(DAG.getUNDEF(SrcVT), LHS, 0, DAG, DL,
47962 VT.getSizeInBits());
47963 RHS = insertSubVector(DAG.getUNDEF(SrcVT), RHS, 0, DAG, DL,
47964 VT.getSizeInBits());
47965 Cond = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, SrcCondVT,
47966 DAG.getUNDEF(SrcCondVT), Cond,
47967 DAG.getVectorIdxConstant(0, DL));
47968 SDValue Res = DAG.getSelect(DL, SrcVT, Cond, LHS, RHS);
47969 return extractSubVector(Res, 0, DAG, DL, VT.getSizeInBits());
47970 }
47971 }
47972
47973 if (SDValue V = combineSelectOfTwoConstants(N, DAG, DL))
47974 return V;
47975
47976 if (N->getOpcode() == ISD::SELECT && Cond.getOpcode() == ISD::SETCC &&
47977 Cond.hasOneUse()) {
47978 EVT CondVT = Cond.getValueType();
47979 SDValue Cond0 = Cond.getOperand(0);
47980 SDValue Cond1 = Cond.getOperand(1);
47981 ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get();
47982
47983 // Canonicalize min/max:
47984 // (x > 0) ? x : 0 -> (x >= 0) ? x : 0
47985 // (x < -1) ? x : -1 -> (x <= -1) ? x : -1
47986 // This allows use of COND_S / COND_NS (see TranslateX86CC) which eliminates
47987 // the need for an extra compare against zero. e.g.
47988 // (a - b) > 0 ? (a - b) : 0 -> (a - b) >= 0 ? (a - b) : 0
47989 // subl %esi, %edi
47990 // testl %edi, %edi
47991 // movl $0, %eax
47992 // cmovgl %edi, %eax
47993 // =>
47994 // xorl %eax, %eax
47995 // subl %esi, %edi
47996 // cmovsl %eax, %edi
47997 //
47998 // We can also canonicalize
47999 // (x s> 1) ? x : 1 -> (x s>= 1) ? x : 1 -> (x s> 0) ? x : 1
48000 // (x u> 1) ? x : 1 -> (x u>= 1) ? x : 1 -> (x != 0) ? x : 1
48001 // This allows the use of a test instruction for the compare.
48002 if (LHS == Cond0 && RHS == Cond1) {
48003 if ((CC == ISD::SETGT && (isNullConstant(RHS) || isOneConstant(RHS))) ||
48004 (CC == ISD::SETLT && isAllOnesConstant(RHS))) {
48005 ISD::CondCode NewCC = CC == ISD::SETGT ? ISD::SETGE : ISD::SETLE;
48006 Cond = DAG.getSetCC(SDLoc(Cond), CondVT, Cond0, Cond1, NewCC);
48007 return DAG.getSelect(DL, VT, Cond, LHS, RHS);
48008 }
48009 if (CC == ISD::SETUGT && isOneConstant(RHS)) {
48010 ISD::CondCode NewCC = ISD::SETUGE;
48011 Cond = DAG.getSetCC(SDLoc(Cond), CondVT, Cond0, Cond1, NewCC);
48012 return DAG.getSelect(DL, VT, Cond, LHS, RHS);
48013 }
48014 }
48015
48016 // Similar to DAGCombine's select(or(CC0,CC1),X,Y) fold but for legal types.
48017 // fold eq + gt/lt nested selects into ge/le selects
48018 // select (cmpeq Cond0, Cond1), LHS, (select (cmpugt Cond0, Cond1), LHS, Y)
48019 // --> (select (cmpuge Cond0, Cond1), LHS, Y)
48020 // select (cmpslt Cond0, Cond1), LHS, (select (cmpeq Cond0, Cond1), LHS, Y)
48021 // --> (select (cmpsle Cond0, Cond1), LHS, Y)
48022 // .. etc ..
48023 if (RHS.getOpcode() == ISD::SELECT && RHS.getOperand(1) == LHS &&
48024 RHS.getOperand(0).getOpcode() == ISD::SETCC) {
48025 SDValue InnerSetCC = RHS.getOperand(0);
48026 ISD::CondCode InnerCC =
48027 cast<CondCodeSDNode>(InnerSetCC.getOperand(2))->get();
48028 if ((CC == ISD::SETEQ || InnerCC == ISD::SETEQ) &&
48029 Cond0 == InnerSetCC.getOperand(0) &&
48030 Cond1 == InnerSetCC.getOperand(1)) {
48031 ISD::CondCode NewCC;
48032 switch (CC == ISD::SETEQ ? InnerCC : CC) {
48033 // clang-format off
48034 case ISD::SETGT: NewCC = ISD::SETGE; break;
48035 case ISD::SETLT: NewCC = ISD::SETLE; break;
48036 case ISD::SETUGT: NewCC = ISD::SETUGE; break;
48037 case ISD::SETULT: NewCC = ISD::SETULE; break;
48038 default: NewCC = ISD::SETCC_INVALID; break;
48039 // clang-format on
48040 }
48041 if (NewCC != ISD::SETCC_INVALID) {
48042 Cond = DAG.getSetCC(DL, CondVT, Cond0, Cond1, NewCC);
48043 return DAG.getSelect(DL, VT, Cond, LHS, RHS.getOperand(2));
48044 }
48045 }
48046 }
48047 }
48048
48049 // Check if the first operand is all zeros and Cond type is vXi1.
48050 // If this is an avx512 target we can improve the use of zero masking by
48051 // swapping the operands and inverting the condition.
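// e.g. (vselect K, zero, X) becomes (vselect (not K), X, zero), which maps
// directly onto an AVX512 zero-masked operation on X.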
48052 if (N->getOpcode() == ISD::VSELECT && Cond.hasOneUse() &&
48053 Subtarget.hasAVX512() && CondVT.getVectorElementType() == MVT::i1 &&
48054 ISD::isBuildVectorAllZeros(LHS.getNode()) &&
48055 !ISD::isBuildVectorAllZeros(RHS.getNode())) {
48056 // Invert the cond to not(cond) : xor(op,allones)=not(op)
48057 SDValue CondNew = DAG.getNOT(DL, Cond, CondVT);
48058 // Vselect cond, op1, op2 = Vselect not(cond), op2, op1
48059 return DAG.getSelect(DL, VT, CondNew, RHS, LHS);
48060 }
48061
48062 // Attempt to convert a (vXi1 bitcast(iX Cond)) selection mask before it might
48063 // get split by legalization.
48064 if (N->getOpcode() == ISD::VSELECT && Cond.getOpcode() == ISD::BITCAST &&
48065 CondVT.getVectorElementType() == MVT::i1 &&
48066 TLI.isTypeLegal(VT.getScalarType())) {
48067 EVT ExtCondVT = VT.changeVectorElementTypeToInteger();
48068 if (SDValue ExtCond = combineToExtendBoolVectorInReg(
48069 ISD::SIGN_EXTEND, DL, ExtCondVT, Cond, DAG, DCI, Subtarget)) {
48070 ExtCond = DAG.getNode(ISD::TRUNCATE, DL, CondVT, ExtCond);
48071 return DAG.getSelect(DL, VT, ExtCond, LHS, RHS);
48072 }
48073 }
48074
48075 // Exploits AVX2 VSHLV/VSRLV instructions for efficient unsigned vector shifts
48076 // with out-of-bounds clamping.
48077
48078 // Unlike general shift instructions (SHL/SRL), whose results are undefined
48079 // for shift amounts that equal or exceed the element bitwidth, AVX2's
48080 // VSHLV/VSRLV yield zero for such out-of-bounds amounts. That matches the
48081 // zero operand of the selects below, so the compare and select can be
48082 // folded away entirely.
48083 if (N->getOpcode() == ISD::VSELECT) {
48084 using namespace llvm::SDPatternMatch;
48085 // fold select(icmp_ult(amt,BW),shl(x,amt),0) -> avx2 psllv(x,amt)
48086 // fold select(icmp_ult(amt,BW),srl(x,amt),0) -> avx2 psrlv(x,amt)
48087 if ((LHS.getOpcode() == ISD::SRL || LHS.getOpcode() == ISD::SHL) &&
48088 supportedVectorVarShift(VT, Subtarget, LHS.getOpcode()) &&
48089 ISD::isConstantSplatVectorAllZeros(RHS.getNode()) &&
48090 sd_match(Cond, m_SetCC(m_Specific(LHS.getOperand(1)),
48091 m_SpecificInt(VT.getScalarSizeInBits()),
48092 m_SpecificCondCode(ISD::SETULT)))) {
48093 return DAG.getNode(LHS.getOpcode() == ISD::SRL ? X86ISD::VSRLV
48094 : X86ISD::VSHLV,
48095 DL, VT, LHS.getOperand(0), LHS.getOperand(1));
48096 }
48097 // fold select(icmp_uge(amt,BW),0,shl(x,amt)) -> avx2 psllv(x,amt)
48098 // fold select(icmp_uge(amt,BW),0,srl(x,amt)) -> avx2 psrlv(x,amt)
48099 if ((RHS.getOpcode() == ISD::SRL || RHS.getOpcode() == ISD::SHL) &&
48100 supportedVectorVarShift(VT, Subtarget, RHS.getOpcode()) &&
48101 ISD::isConstantSplatVectorAllZeros(LHS.getNode()) &&
48102 sd_match(Cond, m_SetCC(m_Specific(RHS.getOperand(1)),
48103 m_SpecificInt(VT.getScalarSizeInBits()),
48104 m_SpecificCondCode(ISD::SETUGE)))) {
48105 return DAG.getNode(RHS.getOpcode() == ISD::SRL ? X86ISD::VSRLV
48106 : X86ISD::VSHLV,
48107 DL, VT, RHS.getOperand(0), RHS.getOperand(1));
48108 }
48109 }
48110
48111 // Early exit check
48112 if (!TLI.isTypeLegal(VT) || isSoftF16(VT, Subtarget))
48113 return SDValue();
48114
48115 if (SDValue V = combineVSelectToBLENDV(N, DAG, DL, DCI, Subtarget))
48116 return V;
48117
48118 if (SDValue V = narrowVectorSelect(N, DAG, DL, Subtarget))
48119 return V;
48120
48121 // select(~Cond, X, Y) -> select(Cond, Y, X)
48122 if (CondVT.getScalarType() != MVT::i1) {
48123 if (SDValue CondNot = IsNOT(Cond, DAG))
48124 return DAG.getNode(N->getOpcode(), DL, VT,
48125 DAG.getBitcast(CondVT, CondNot), RHS, LHS);
48126
48127 // select(pcmpeq(and(X,Pow2),0),A,B) -> select(pcmpeq(and(X,Pow2),Pow2),B,A)
48128 if (Cond.getOpcode() == X86ISD::PCMPEQ &&
48129 Cond.getOperand(0).getOpcode() == ISD::AND &&
48130 ISD::isBuildVectorAllZeros(Cond.getOperand(1).getNode()) &&
48131 isConstantPowerOf2(Cond.getOperand(0).getOperand(1),
48132 Cond.getScalarValueSizeInBits(),
48133 /*AllowUndefs=*/true) &&
48134 Cond.hasOneUse()) {
48135 Cond = DAG.getNode(X86ISD::PCMPEQ, DL, CondVT, Cond.getOperand(0),
48136 Cond.getOperand(0).getOperand(1));
48137 return DAG.getNode(N->getOpcode(), DL, VT, Cond, RHS, LHS);
48138 }
48139
48140 // pcmpgt(X, -1) -> pcmpgt(0, X) to help select/blendv just use the
48141 // signbit.
48142 if (Cond.getOpcode() == X86ISD::PCMPGT &&
48143 ISD::isBuildVectorAllOnes(Cond.getOperand(1).getNode()) &&
48144 Cond.hasOneUse()) {
48145 Cond = DAG.getNode(X86ISD::PCMPGT, DL, CondVT,
48146 DAG.getConstant(0, DL, CondVT), Cond.getOperand(0));
48147 return DAG.getNode(N->getOpcode(), DL, VT, Cond, RHS, LHS);
48148 }
48149 }
48150
48151 // Try to optimize vXi1 selects if both operands are either all constants or
48152 // bitcasts from scalar integer type. In that case we can convert the operands
48153 // to integer and use an integer select which will be converted to a CMOV.
48154 // We need to take a little bit of care to avoid creating an i64 type after
48155 // type legalization.
48156 if (N->getOpcode() == ISD::SELECT && VT.isVector() &&
48157 VT.getVectorElementType() == MVT::i1 &&
48158 (DCI.isBeforeLegalize() || (VT != MVT::v64i1 || Subtarget.is64Bit()))) {
48159 EVT IntVT = EVT::getIntegerVT(*DAG.getContext(), VT.getVectorNumElements());
48160 if (DCI.isBeforeLegalize() || TLI.isTypeLegal(IntVT)) {
48161 bool LHSIsConst = ISD::isBuildVectorOfConstantSDNodes(LHS.getNode());
48162 bool RHSIsConst = ISD::isBuildVectorOfConstantSDNodes(RHS.getNode());
48163
48164 if ((LHSIsConst || (LHS.getOpcode() == ISD::BITCAST &&
48165 LHS.getOperand(0).getValueType() == IntVT)) &&
48166 (RHSIsConst || (RHS.getOpcode() == ISD::BITCAST &&
48167 RHS.getOperand(0).getValueType() == IntVT))) {
48168 if (LHSIsConst)
48169 LHS = combinevXi1ConstantToInteger(LHS, DAG);
48170 else
48171 LHS = LHS.getOperand(0);
48172
48173 if (RHSIsConst)
48174 RHS = combinevXi1ConstantToInteger(RHS, DAG);
48175 else
48176 RHS = RHS.getOperand(0);
48177
48178 SDValue Select = DAG.getSelect(DL, IntVT, Cond, LHS, RHS);
48179 return DAG.getBitcast(VT, Select);
48180 }
48181 }
48182 }
48183
48184 // If this is "((X & C) == 0) ? Y : Z" and C is a constant mask vector of
48185 // single bits, then invert the predicate and swap the select operands.
48186 // This can lower using a vector shift bit-hack rather than mask and compare.
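// Shifting each lane left by (EltBitWidth - 1 - log2(C)) moves the tested
// mask bit into the lane's sign bit, so a signed "< 0" compare (or a BLENDV
// that only reads the sign bit) replaces the AND + equality compare.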
48187 if (DCI.isBeforeLegalize() && !Subtarget.hasAVX512() &&
48188 N->getOpcode() == ISD::VSELECT && Cond.getOpcode() == ISD::SETCC &&
48189 Cond.hasOneUse() && CondVT.getVectorElementType() == MVT::i1 &&
48190 Cond.getOperand(0).getOpcode() == ISD::AND &&
48191 isNullOrNullSplat(Cond.getOperand(1)) &&
48192 cast<CondCodeSDNode>(Cond.getOperand(2))->get() == ISD::SETEQ &&
48193 Cond.getOperand(0).getValueType() == VT) {
48194 // The 'and' mask must be composed of power-of-2 constants.
48195 SDValue And = Cond.getOperand(0);
48196 auto *C = isConstOrConstSplat(And.getOperand(1));
48197 if (C && C->getAPIntValue().isPowerOf2()) {
48198 // vselect (X & C == 0), LHS, RHS --> vselect (X & C != 0), RHS, LHS
48199 SDValue NotCond =
48200 DAG.getSetCC(DL, CondVT, And, Cond.getOperand(1), ISD::SETNE);
48201 return DAG.getSelect(DL, VT, NotCond, RHS, LHS);
48202 }
48203
48204 // If we have a non-splat but still powers-of-2 mask, AVX1 can use pmulld
48205 // and AVX2 can use vpsllv{dq}. 8-bit lacks a proper shift or multiply.
48206 // 16-bit lacks a proper blendv.
48207 unsigned EltBitWidth = VT.getScalarSizeInBits();
48208 bool CanShiftBlend =
48209 TLI.isTypeLegal(VT) && ((Subtarget.hasAVX() && EltBitWidth == 32) ||
48210 (Subtarget.hasAVX2() && EltBitWidth == 64) ||
48211 (Subtarget.hasXOP()));
48212 if (CanShiftBlend &&
48213 ISD::matchUnaryPredicate(And.getOperand(1), [](ConstantSDNode *C) {
48214 return C->getAPIntValue().isPowerOf2();
48215 })) {
48216 // Create a left-shift constant to get the mask bits over to the sign-bit.
48217 SDValue Mask = And.getOperand(1);
48218 SmallVector<int, 32> ShlVals;
48219 for (unsigned i = 0, e = VT.getVectorNumElements(); i != e; ++i) {
48220 auto *MaskVal = cast<ConstantSDNode>(Mask.getOperand(i));
48221 ShlVals.push_back(EltBitWidth - 1 -
48222 MaskVal->getAPIntValue().exactLogBase2());
48223 }
48224 // vsel ((X & C) == 0), LHS, RHS --> vsel ((shl X, C') < 0), RHS, LHS
48225 SDValue ShlAmt = getConstVector(ShlVals, VT.getSimpleVT(), DAG, DL);
48226 SDValue Shl = DAG.getNode(ISD::SHL, DL, VT, And.getOperand(0), ShlAmt);
48227 SDValue NewCond =
48228 DAG.getSetCC(DL, CondVT, Shl, Cond.getOperand(1), ISD::SETLT);
48229 return DAG.getSelect(DL, VT, NewCond, RHS, LHS);
48230 }
48231 }
48232
48233 return SDValue();
48234}
48235
48236/// Combine:
48237/// (brcond/cmov/setcc .., (cmp (atomic_load_add x, 1), 0), COND_S)
48238/// to:
48239/// (brcond/cmov/setcc .., (LADD x, 1), COND_LE)
48240/// i.e., reusing the EFLAGS produced by the LOCKed instruction.
48241/// Note that this is only legal for some op/cc combinations.
48242 static SDValue combineSetCCAtomicArith(SDValue Cmp, X86::CondCode &CC,
48243 SelectionDAG &DAG,
48244 const X86Subtarget &Subtarget) {
48245 // This combine only operates on CMP-like nodes.
48246 if (!(Cmp.getOpcode() == X86ISD::CMP ||
48247 (Cmp.getOpcode() == X86ISD::SUB && !Cmp->hasAnyUseOfValue(0))))
48248 return SDValue();
48249
48250 // Can't replace the cmp if it has more uses than the one we're looking at.
48251 // FIXME: We would like to be able to handle this, but would need to make sure
48252 // all uses were updated.
48253 if (!Cmp.hasOneUse())
48254 return SDValue();
48255
48256 // This only applies to variations of the common case:
48257 // (icmp slt x, 0) -> (icmp sle (add x, 1), 0)
48258 // (icmp sge x, 0) -> (icmp sgt (add x, 1), 0)
48259 // (icmp sle x, 0) -> (icmp slt (sub x, 1), 0)
48260 // (icmp sgt x, 0) -> (icmp sge (sub x, 1), 0)
48261 // Using the proper condcodes (see below), overflow is checked for.
48262
48263 // FIXME: We can generalize both constraints:
48264 // - XOR/OR/AND (if they were made to survive AtomicExpand)
48265 // - LHS != 1
48266 // if the result is compared.
48267
48268 SDValue CmpLHS = Cmp.getOperand(0);
48269 SDValue CmpRHS = Cmp.getOperand(1);
48270 EVT CmpVT = CmpLHS.getValueType();
48271
48272 if (!CmpLHS.hasOneUse())
48273 return SDValue();
48274
48275 unsigned Opc = CmpLHS.getOpcode();
48276 if (Opc != ISD::ATOMIC_LOAD_ADD && Opc != ISD::ATOMIC_LOAD_SUB)
48277 return SDValue();
48278
48279 SDValue OpRHS = CmpLHS.getOperand(2);
48280 auto *OpRHSC = dyn_cast<ConstantSDNode>(OpRHS);
48281 if (!OpRHSC)
48282 return SDValue();
48283
48284 APInt Addend = OpRHSC->getAPIntValue();
48285 if (Opc == ISD::ATOMIC_LOAD_SUB)
48286 Addend = -Addend;
48287
48288 auto *CmpRHSC = dyn_cast<ConstantSDNode>(CmpRHS);
48289 if (!CmpRHSC)
48290 return SDValue();
48291
48292 APInt Comparison = CmpRHSC->getAPIntValue();
48293 APInt NegAddend = -Addend;
48294
48295 // See if we can adjust the CC to make the comparison match the negated
48296 // addend.
48297 if (Comparison != NegAddend) {
48298 APInt IncComparison = Comparison + 1;
48299 if (IncComparison == NegAddend) {
48300 if (CC == X86::COND_A && !Comparison.isMaxValue()) {
48301 Comparison = IncComparison;
48302 CC = X86::COND_AE;
48303 } else if (CC == X86::COND_LE && !Comparison.isMaxSignedValue()) {
48304 Comparison = IncComparison;
48305 CC = X86::COND_L;
48306 }
48307 }
48308 APInt DecComparison = Comparison - 1;
48309 if (DecComparison == NegAddend) {
48310 if (CC == X86::COND_AE && !Comparison.isMinValue()) {
48311 Comparison = DecComparison;
48312 CC = X86::COND_A;
48313 } else if (CC == X86::COND_L && !Comparison.isMinSignedValue()) {
48314 Comparison = DecComparison;
48315 CC = X86::COND_LE;
48316 }
48317 }
48318 }
48319
48320 // If the addend is the negation of the comparison value, then we can do
48321 // a full comparison by emitting the atomic arithmetic as a locked sub.
48322 if (Comparison == NegAddend) {
48323 // The CC is fine, but we need to rewrite the LHS of the comparison as an
48324 // atomic sub.
48325 auto *AN = cast<AtomicSDNode>(CmpLHS.getNode());
48326 auto AtomicSub = DAG.getAtomic(
48327 ISD::ATOMIC_LOAD_SUB, SDLoc(CmpLHS), CmpVT,
48328 /*Chain*/ CmpLHS.getOperand(0), /*LHS*/ CmpLHS.getOperand(1),
48329 /*RHS*/ DAG.getConstant(NegAddend, SDLoc(CmpRHS), CmpVT),
48330 AN->getMemOperand());
48331 auto LockOp = lowerAtomicArithWithLOCK(AtomicSub, DAG, Subtarget);
48332 DAG.ReplaceAllUsesOfValueWith(CmpLHS.getValue(0), DAG.getUNDEF(CmpVT));
48333 DAG.ReplaceAllUsesOfValueWith(CmpLHS.getValue(1), LockOp.getValue(1));
48334 return LockOp;
48335 }
48336
48337 // We can handle comparisons with zero in a number of cases by manipulating
48338 // the CC used.
48339 if (!Comparison.isZero())
48340 return SDValue();
48341
48342 if (CC == X86::COND_S && Addend == 1)
48343 CC = X86::COND_LE;
48344 else if (CC == X86::COND_NS && Addend == 1)
48345 CC = X86::COND_G;
48346 else if (CC == X86::COND_G && Addend == -1)
48347 CC = X86::COND_GE;
48348 else if (CC == X86::COND_LE && Addend == -1)
48349 CC = X86::COND_L;
48350 else
48351 return SDValue();
48352
48353 SDValue LockOp = lowerAtomicArithWithLOCK(CmpLHS, DAG, Subtarget);
48354 DAG.ReplaceAllUsesOfValueWith(CmpLHS.getValue(0), DAG.getUNDEF(CmpVT));
48355 DAG.ReplaceAllUsesOfValueWith(CmpLHS.getValue(1), LockOp.getValue(1));
48356 return LockOp;
48357}
48358
48359// Check whether we're just testing the signbit, and whether we can simplify
48360// this by tracking where the signbit came from.
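// e.g. CMP(SRA(X, 31), 0) with COND_S only observes the sign bit of X, so it
// can be rewritten as TEST(X, 0x80000000) with COND_NE.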
48361 static SDValue checkSignTestSetCCCombine(SDValue Cmp, X86::CondCode &CC,
48362 SelectionDAG &DAG) {
48363 if (CC != X86::COND_S && CC != X86::COND_NS)
48364 return SDValue();
48365
48366 if (!Cmp.hasOneUse())
48367 return SDValue();
48368
48369 SDValue Src;
48370 if (Cmp.getOpcode() == X86ISD::CMP) {
48371 // CMP(X,0) -> signbit test
48372 if (!isNullConstant(Cmp.getOperand(1)))
48373 return SDValue();
48374 Src = Cmp.getOperand(0);
48375 // Peek through a SRA node as we just need the signbit.
48376 // TODO: Remove one use limit once sdiv-fix regressions are fixed.
48377 // TODO: Use SimplifyDemandedBits instead of just SRA?
48378 if (Src.getOpcode() != ISD::SRA || !Src.hasOneUse())
48379 return SDValue();
48380 Src = Src.getOperand(0);
48381 } else if (Cmp.getOpcode() == X86ISD::OR) {
48382 // OR(X,Y) -> see if only one operand contributes to the signbit.
48383 // TODO: XOR(X,Y) -> see if only one operand contributes to the signbit.
48384 if (DAG.SignBitIsZero(Cmp.getOperand(0)))
48385 Src = Cmp.getOperand(1);
48386 else if (DAG.SignBitIsZero(Cmp.getOperand(1)))
48387 Src = Cmp.getOperand(0);
48388 else
48389 return SDValue();
48390 } else {
48391 return SDValue();
48392 }
48393
48394 // Replace with a TEST on the MSB.
48395 SDLoc DL(Cmp);
48396 MVT SrcVT = Src.getSimpleValueType();
48397 APInt BitMask = APInt::getSignMask(SrcVT.getScalarSizeInBits());
48398
48399 // If Src came from a SHL (probably from an expanded SIGN_EXTEND_INREG), then
48400 // peek through and adjust the TEST bit.
48401 if (Src.getOpcode() == ISD::SHL) {
48402 if (std::optional<unsigned> ShiftAmt = DAG.getValidShiftAmount(Src)) {
48403 Src = Src.getOperand(0);
48404 BitMask.lshrInPlace(*ShiftAmt);
48405 }
48406 }
48407
48408 SDValue Mask = DAG.getNode(ISD::AND, DL, SrcVT, Src,
48409 DAG.getConstant(BitMask, DL, SrcVT));
48410 CC = CC == X86::COND_S ? X86::COND_NE : X86::COND_E;
48411 return DAG.getNode(X86ISD::CMP, DL, MVT::i32, Mask,
48412 DAG.getConstant(0, DL, SrcVT));
48413}
48414
48415// Check whether a boolean test is testing a boolean value generated by
48416// X86ISD::SETCC. If so, return the operand of that SETCC and proper condition
48417// code.
48418//
48419// Simplify the following patterns:
48420// (Op (CMP (SETCC Cond EFLAGS) 1) EQ) or
48421// (Op (CMP (SETCC Cond EFLAGS) 0) NEQ)
48422// to (Op EFLAGS Cond)
48423//
48424// (Op (CMP (SETCC Cond EFLAGS) 0) EQ) or
48425// (Op (CMP (SETCC Cond EFLAGS) 1) NEQ)
48426// to (Op EFLAGS !Cond)
48427//
48428// where Op could be BRCOND or CMOV.
48429//
48430 static SDValue checkBoolTestSetCCCombine(SDValue Cmp, X86::CondCode &CC) {
48431 // This combine only operates on CMP-like nodes.
48432 if (!(Cmp.getOpcode() == X86ISD::CMP ||
48433 (Cmp.getOpcode() == X86ISD::SUB && !Cmp->hasAnyUseOfValue(0))))
48434 return SDValue();
48435
48436 // Quit if not used as a boolean value.
48437 if (CC != X86::COND_E && CC != X86::COND_NE)
48438 return SDValue();
48439
48440 // Check CMP operands. One of them should be 0 or 1 and the other should be
48441 // a SetCC or extended from it.
48442 SDValue Op1 = Cmp.getOperand(0);
48443 SDValue Op2 = Cmp.getOperand(1);
48444
48445 SDValue SetCC;
48446 const ConstantSDNode* C = nullptr;
48447 bool needOppositeCond = (CC == X86::COND_E);
48448 bool checkAgainstTrue = false; // Is it a comparison against 1?
48449
48450 if ((C = dyn_cast<ConstantSDNode>(Op1)))
48451 SetCC = Op2;
48452 else if ((C = dyn_cast<ConstantSDNode>(Op2)))
48453 SetCC = Op1;
48454 else // Quit if neither operand is a constant.
48455 return SDValue();
48456
48457 if (C->getZExtValue() == 1) {
48458 needOppositeCond = !needOppositeCond;
48459 checkAgainstTrue = true;
48460 } else if (C->getZExtValue() != 0)
48461 // Quit if the constant is neither 0 nor 1.
48462 return SDValue();
48463
48464 bool truncatedToBoolWithAnd = false;
48465 // Skip (zext $x), (trunc $x), or (and $x, 1) node.
48466 while (SetCC.getOpcode() == ISD::ZERO_EXTEND ||
48467 SetCC.getOpcode() == ISD::TRUNCATE ||
48468 SetCC.getOpcode() == ISD::AND) {
48469 if (SetCC.getOpcode() == ISD::AND) {
48470 int OpIdx = -1;
48471 if (isOneConstant(SetCC.getOperand(0)))
48472 OpIdx = 1;
48473 if (isOneConstant(SetCC.getOperand(1)))
48474 OpIdx = 0;
48475 if (OpIdx < 0)
48476 break;
48477 SetCC = SetCC.getOperand(OpIdx);
48478 truncatedToBoolWithAnd = true;
48479 } else
48480 SetCC = SetCC.getOperand(0);
48481 }
48482
48483 switch (SetCC.getOpcode()) {
48484 case X86ISD::SETCC_CARRY:
48485 // Since SETCC_CARRY gives output based on R = CF ? ~0 : 0, it's unsafe to
48486 // simplify it if the result of SETCC_CARRY is not canonicalized to 0 or 1,
48487 // i.e. it's a comparison against true but the result of SETCC_CARRY is not
48488 // truncated to i1 using 'and'.
48489 if (checkAgainstTrue && !truncatedToBoolWithAnd)
48490 break;
48492 "Invalid use of SETCC_CARRY!");
48493 [[fallthrough]];
48494 case X86ISD::SETCC:
48495 // Set the condition code or opposite one if necessary.
48496 CC = X86::CondCode(SetCC.getConstantOperandVal(0));
48497 if (needOppositeCond)
48498 CC = X86::GetOppositeBranchCondition(CC);
48499 return SetCC.getOperand(1);
48500 case X86ISD::CMOV: {
48501 // Check whether the false/true values are canonical, i.e. 0 or 1.
48502 ConstantSDNode *FVal = dyn_cast<ConstantSDNode>(SetCC.getOperand(0));
48503 ConstantSDNode *TVal = dyn_cast<ConstantSDNode>(SetCC.getOperand(1));
48504 // Quit if true value is not a constant.
48505 if (!TVal)
48506 return SDValue();
48507 // Quit if false value is not a constant.
48508 if (!FVal) {
48509 SDValue Op = SetCC.getOperand(0);
48510 // Skip 'zext' or 'trunc' node.
48511 if (Op.getOpcode() == ISD::ZERO_EXTEND ||
48512 Op.getOpcode() == ISD::TRUNCATE)
48513 Op = Op.getOperand(0);
48514 // A special case for rdrand/rdseed, where 0 is set if false cond is
48515 // found.
48516 if ((Op.getOpcode() != X86ISD::RDRAND &&
48517 Op.getOpcode() != X86ISD::RDSEED) || Op.getResNo() != 0)
48518 return SDValue();
48519 }
48520 // Quit if false value is not the constant 0 or 1.
48521 bool FValIsFalse = true;
48522 if (FVal && FVal->getZExtValue() != 0) {
48523 if (FVal->getZExtValue() != 1)
48524 return SDValue();
48525 // If FVal is 1, opposite cond is needed.
48526 needOppositeCond = !needOppositeCond;
48527 FValIsFalse = false;
48528 }
48529 // Quit if TVal is not the constant opposite of FVal.
48530 if (FValIsFalse && TVal->getZExtValue() != 1)
48531 return SDValue();
48532 if (!FValIsFalse && TVal->getZExtValue() != 0)
48533 return SDValue();
48534 CC = X86::CondCode(SetCC.getConstantOperandVal(2));
48535 if (needOppositeCond)
48536 CC = X86::GetOppositeBranchCondition(CC);
48537 return SetCC.getOperand(3);
48538 }
48539 }
48540
48541 return SDValue();
48542}
48543
48544/// Check whether Cond is an AND/OR of SETCCs off of the same EFLAGS.
48545/// Match:
48546/// (X86or (X86setcc) (X86setcc))
48547/// (X86cmp (and (X86setcc) (X86setcc)), 0)
48548 static bool checkBoolTestAndOrSetCCCombine(SDValue Cond, X86::CondCode &CC0,
48549 X86::CondCode &CC1, SDValue &Flags,
48550 bool &isAnd) {
48551 if (Cond->getOpcode() == X86ISD::CMP) {
48552 if (!isNullConstant(Cond->getOperand(1)))
48553 return false;
48554
48555 Cond = Cond->getOperand(0);
48556 }
48557
48558 isAnd = false;
48559
48560 SDValue SetCC0, SetCC1;
48561 switch (Cond->getOpcode()) {
48562 default: return false;
48563 case ISD::AND:
48564 case X86ISD::AND:
48565 isAnd = true;
48566 [[fallthrough]];
48567 case ISD::OR:
48568 case X86ISD::OR:
48569 SetCC0 = Cond->getOperand(0);
48570 SetCC1 = Cond->getOperand(1);
48571 break;
48572 };
48573
48574 // Make sure we have SETCC nodes, using the same flags value.
48575 if (SetCC0.getOpcode() != X86ISD::SETCC ||
48576 SetCC1.getOpcode() != X86ISD::SETCC ||
48577 SetCC0->getOperand(1) != SetCC1->getOperand(1))
48578 return false;
48579
48580 CC0 = (X86::CondCode)SetCC0->getConstantOperandVal(0);
48581 CC1 = (X86::CondCode)SetCC1->getConstantOperandVal(0);
48582 Flags = SetCC0->getOperand(1);
48583 return true;
48584}
48585
48586// When legalizing carry, we create carries via add X, -1
48587// If that comes from an actual carry, via setcc, we use the
48588// carry directly.
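// Adding -1 to a 0/1 carry value sets CF exactly when the value was 1 (the
// add wraps), so the EFLAGS of the original SETCC/SETCC_CARRY can be reused
// instead of first materializing the carry in a register.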
48589 static SDValue combineCarryThroughADD(SDValue EFLAGS, SelectionDAG &DAG) {
48590 if (EFLAGS.getOpcode() == X86ISD::ADD) {
48591 if (isAllOnesConstant(EFLAGS.getOperand(1))) {
48592 bool FoundAndLSB = false;
48593 SDValue Carry = EFLAGS.getOperand(0);
48594 while (Carry.getOpcode() == ISD::TRUNCATE ||
48595 Carry.getOpcode() == ISD::ZERO_EXTEND ||
48596 (Carry.getOpcode() == ISD::AND &&
48597 isOneConstant(Carry.getOperand(1)))) {
48598 FoundAndLSB |= Carry.getOpcode() == ISD::AND;
48599 Carry = Carry.getOperand(0);
48600 }
48601 if (Carry.getOpcode() == X86ISD::SETCC ||
48602 Carry.getOpcode() == X86ISD::SETCC_CARRY) {
48603 // TODO: Merge this code with equivalent in combineAddOrSubToADCOrSBB?
48604 uint64_t CarryCC = Carry.getConstantOperandVal(0);
48605 SDValue CarryOp1 = Carry.getOperand(1);
48606 if (CarryCC == X86::COND_B)
48607 return CarryOp1;
48608 if (CarryCC == X86::COND_A) {
48609 // Try to convert COND_A into COND_B in an attempt to facilitate
48610 // materializing "setb reg".
48611 //
48612 // Do not flip "e > c", where "c" is a constant, because Cmp
48613 // instruction cannot take an immediate as its first operand.
48614 //
48615 if (CarryOp1.getOpcode() == X86ISD::SUB &&
48616 CarryOp1.getNode()->hasOneUse() &&
48617 CarryOp1.getValueType().isInteger() &&
48618 !isa<ConstantSDNode>(CarryOp1.getOperand(1))) {
48619 SDValue SubCommute =
48620 DAG.getNode(X86ISD::SUB, SDLoc(CarryOp1), CarryOp1->getVTList(),
48621 CarryOp1.getOperand(1), CarryOp1.getOperand(0));
48622 return SDValue(SubCommute.getNode(), CarryOp1.getResNo());
48623 }
48624 }
48625 // If this is a check of the z flag of an add with 1, switch to the
48626 // C flag.
48627 if (CarryCC == X86::COND_E &&
48628 CarryOp1.getOpcode() == X86ISD::ADD &&
48629 isOneConstant(CarryOp1.getOperand(1)))
48630 return CarryOp1;
48631 } else if (FoundAndLSB) {
48632 SDLoc DL(Carry);
48633 SDValue BitNo = DAG.getConstant(0, DL, Carry.getValueType());
48634 if (Carry.getOpcode() == ISD::SRL) {
48635 BitNo = Carry.getOperand(1);
48636 Carry = Carry.getOperand(0);
48637 }
48638 return getBT(Carry, BitNo, DL, DAG);
48639 }
48640 }
48641 }
48642
48643 return SDValue();
48644}
48645
48646/// If we are inverting an PTEST/TESTP operand, attempt to adjust the CC
48647/// to avoid the inversion.
48648 static SDValue combinePTESTCC(SDValue EFLAGS, X86::CondCode &CC,
48649 SelectionDAG &DAG,
48650 const X86Subtarget &Subtarget) {
48651 // TODO: Handle X86ISD::KTEST/X86ISD::KORTEST.
48652 if (EFLAGS.getOpcode() != X86ISD::PTEST &&
48653 EFLAGS.getOpcode() != X86ISD::TESTP)
48654 return SDValue();
48655
48656 // PTEST/TESTP sets EFLAGS as:
48657 // TESTZ: ZF = (Op0 & Op1) == 0
48658 // TESTC: CF = (~Op0 & Op1) == 0
48659 // TESTNZC: ZF == 0 && CF == 0
48660 MVT VT = EFLAGS.getSimpleValueType();
48661 SDValue Op0 = EFLAGS.getOperand(0);
48662 SDValue Op1 = EFLAGS.getOperand(1);
48663 MVT OpVT = Op0.getSimpleValueType();
48664 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
48665
48666 // TEST*(~X,Y) == TEST*(X,Y)
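// With Op0 == ~X, the TESTC condition (~Op0 & Op1) == 0 is exactly the TESTZ
// condition (X & Op1) == 0 and vice versa, hence the CF<->ZF condition-code
// swap below.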
48667 if (SDValue NotOp0 = IsNOT(Op0, DAG)) {
48668 X86::CondCode InvCC;
48669 switch (CC) {
48670 case X86::COND_B:
48671 // testc -> testz.
48672 InvCC = X86::COND_E;
48673 break;
48674 case X86::COND_AE:
48675 // !testc -> !testz.
48676 InvCC = X86::COND_NE;
48677 break;
48678 case X86::COND_E:
48679 // testz -> testc.
48680 InvCC = X86::COND_B;
48681 break;
48682 case X86::COND_NE:
48683 // !testz -> !testc.
48684 InvCC = X86::COND_AE;
48685 break;
48686 case X86::COND_A:
48687 case X86::COND_BE:
48688 // testnzc -> testnzc (no change).
48689 InvCC = CC;
48690 break;
48691 default:
48692 InvCC = X86::COND_INVALID;
48693 break;
48694 }
48695
48696 if (InvCC != X86::COND_INVALID) {
48697 CC = InvCC;
48698 return DAG.getNode(EFLAGS.getOpcode(), SDLoc(EFLAGS), VT,
48699 DAG.getBitcast(OpVT, NotOp0), Op1);
48700 }
48701 }
48702
48703 if (CC == X86::COND_B || CC == X86::COND_AE) {
48704 // TESTC(X,~X) == TESTC(X,-1)
48705 if (SDValue NotOp1 = IsNOT(Op1, DAG)) {
48706 if (peekThroughBitcasts(NotOp1) == peekThroughBitcasts(Op0)) {
48707 SDLoc DL(EFLAGS);
48708 return DAG.getNode(
48709 EFLAGS.getOpcode(), DL, VT, DAG.getBitcast(OpVT, NotOp1),
48710 DAG.getBitcast(OpVT,
48711 DAG.getAllOnesConstant(DL, NotOp1.getValueType())));
48712 }
48713 }
48714 // PTESTC(PCMPEQ(X,0),-1) == PTESTZ(X,X)
48715 if (EFLAGS.getOpcode() == X86ISD::PTEST &&
48716 ISD::isBuildVectorAllOnes(Op1.getNode())) {
48717 SDValue BC0 = peekThroughBitcasts(Op0);
48718 if (BC0.getOpcode() == X86ISD::PCMPEQ &&
48719 ISD::isBuildVectorAllZeros(BC0.getOperand(1).getNode())) {
48720 SDLoc DL(EFLAGS);
48721 CC = (CC == X86::COND_B ? X86::COND_E : X86::COND_NE);
48722 SDValue X = DAG.getBitcast(OpVT, BC0.getOperand(0));
48723 return DAG.getNode(EFLAGS.getOpcode(), DL, VT, X, X);
48724 }
48725 }
48726 }
48727
48728 if (CC == X86::COND_E || CC == X86::COND_NE) {
48729 // TESTZ(X,~Y) == TESTC(Y,X)
48730 if (SDValue NotOp1 = IsNOT(Op1, DAG)) {
48731 CC = (CC == X86::COND_E ? X86::COND_B : X86::COND_AE);
48732 return DAG.getNode(EFLAGS.getOpcode(), SDLoc(EFLAGS), VT,
48733 DAG.getBitcast(OpVT, NotOp1), Op0);
48734 }
48735
48736 if (Op0 == Op1) {
48737 SDValue BC = peekThroughBitcasts(Op0);
48738 EVT BCVT = BC.getValueType();
48739
48740 // TESTZ(AND(X,Y),AND(X,Y)) == TESTZ(X,Y)
48741 if (BC.getOpcode() == ISD::AND || BC.getOpcode() == X86ISD::FAND) {
48742 return DAG.getNode(EFLAGS.getOpcode(), SDLoc(EFLAGS), VT,
48743 DAG.getBitcast(OpVT, BC.getOperand(0)),
48744 DAG.getBitcast(OpVT, BC.getOperand(1)));
48745 }
48746
48747 // TESTZ(AND(~X,Y),AND(~X,Y)) == TESTC(X,Y)
48748 if (BC.getOpcode() == X86ISD::ANDNP || BC.getOpcode() == X86ISD::FANDN) {
48749 CC = (CC == X86::COND_E ? X86::COND_B : X86::COND_AE);
48750 return DAG.getNode(EFLAGS.getOpcode(), SDLoc(EFLAGS), VT,
48751 DAG.getBitcast(OpVT, BC.getOperand(0)),
48752 DAG.getBitcast(OpVT, BC.getOperand(1)));
48753 }
48754
48755 // If every element is an all-sign value, see if we can use TESTP/MOVMSK
48756 // to more efficiently extract the sign bits and compare that.
48757 // TODO: Handle TESTC with comparison inversion.
48758 // TODO: Can we remove SimplifyMultipleUseDemandedBits and rely on
48759 // TESTP/MOVMSK combines to make sure its never worse than PTEST?
48760 if (BCVT.isVector() && TLI.isTypeLegal(BCVT)) {
48761 unsigned EltBits = BCVT.getScalarSizeInBits();
48762 if (DAG.ComputeNumSignBits(BC) == EltBits) {
48763 assert(VT == MVT::i32 && "Expected i32 EFLAGS comparison result");
48764 APInt SignMask = APInt::getSignMask(EltBits);
48765 if (SDValue Res =
48766 TLI.SimplifyMultipleUseDemandedBits(BC, SignMask, DAG)) {
48767 // For vXi16 cases we need to use pmovmskb and extract every other
48768 // sign bit.
48769 SDLoc DL(EFLAGS);
48770 if ((EltBits == 32 || EltBits == 64) && Subtarget.hasAVX()) {
48771 MVT FloatSVT = MVT::getFloatingPointVT(EltBits);
48772 MVT FloatVT =
48773 MVT::getVectorVT(FloatSVT, OpVT.getSizeInBits() / EltBits);
48774 Res = DAG.getBitcast(FloatVT, Res);
48775 return DAG.getNode(X86ISD::TESTP, SDLoc(EFLAGS), VT, Res, Res);
48776 } else if (EltBits == 16) {
48777 MVT MovmskVT = BCVT.is128BitVector() ? MVT::v16i8 : MVT::v32i8;
48778 Res = DAG.getBitcast(MovmskVT, Res);
48779 Res = getPMOVMSKB(DL, Res, DAG, Subtarget);
48780 Res = DAG.getNode(ISD::AND, DL, MVT::i32, Res,
48781 DAG.getConstant(0xAAAAAAAA, DL, MVT::i32));
48782 } else {
48783 Res = getPMOVMSKB(DL, Res, DAG, Subtarget);
48784 }
48785 return DAG.getNode(X86ISD::CMP, DL, MVT::i32, Res,
48786 DAG.getConstant(0, DL, MVT::i32));
48787 }
48788 }
48789 }
48790 }
48791
48792 // TESTZ(-1,X) == TESTZ(X,X)
48793 if (ISD::isBuildVectorAllOnes(Op0.getNode()))
48794 return DAG.getNode(EFLAGS.getOpcode(), SDLoc(EFLAGS), VT, Op1, Op1);
48795
48796 // TESTZ(X,-1) == TESTZ(X,X)
48797 if (ISD::isBuildVectorAllOnes(Op1.getNode()))
48798 return DAG.getNode(EFLAGS.getOpcode(), SDLoc(EFLAGS), VT, Op0, Op0);
48799
48800 // TESTZ(OR(LO(X),HI(X)),OR(LO(Y),HI(Y))) -> TESTZ(X,Y)
48801 // TODO: Add COND_NE handling?
48802 if (CC == X86::COND_E && OpVT.is128BitVector() && Subtarget.hasAVX()) {
48803 SDValue Src0 = peekThroughBitcasts(Op0);
48804 SDValue Src1 = peekThroughBitcasts(Op1);
48805 if (Src0.getOpcode() == ISD::OR && Src1.getOpcode() == ISD::OR) {
48806 Src0 = getSplitVectorSrc(peekThroughBitcasts(Src0.getOperand(0)),
48807 peekThroughBitcasts(Src0.getOperand(1)), true);
48808 Src1 = getSplitVectorSrc(peekThroughBitcasts(Src1.getOperand(0)),
48809 peekThroughBitcasts(Src1.getOperand(1)), true);
48810 if (Src0 && Src1) {
48811 MVT OpVT2 = OpVT.getDoubleNumVectorElementsVT();
48812 return DAG.getNode(EFLAGS.getOpcode(), SDLoc(EFLAGS), VT,
48813 DAG.getBitcast(OpVT2, Src0),
48814 DAG.getBitcast(OpVT2, Src1));
48815 }
48816 }
48817 }
48818 }
48819
48820 return SDValue();
48821}
48822
48823// Attempt to simplify the MOVMSK input based on the comparison type.
48824 static SDValue combineSetCCMOVMSK(SDValue EFLAGS, X86::CondCode &CC,
48825 SelectionDAG &DAG,
48826 const X86Subtarget &Subtarget) {
48827 // Handle eq/ne against zero (any_of).
48828 // Handle eq/ne against -1 (all_of).
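// MOVMSK packs the per-element sign bits into the low bits of a GPR, so
// any_of is "MOVMSK != 0" and all_of is "MOVMSK == (1 << NumElts) - 1"; the
// IsAnyOf/IsAllOf flags below identify these two shapes.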
48829 if (!(CC == X86::COND_E || CC == X86::COND_NE))
48830 return SDValue();
48831 if (EFLAGS.getValueType() != MVT::i32)
48832 return SDValue();
48833 unsigned CmpOpcode = EFLAGS.getOpcode();
48834 if (CmpOpcode != X86ISD::CMP && CmpOpcode != X86ISD::SUB)
48835 return SDValue();
48836 auto *CmpConstant = dyn_cast<ConstantSDNode>(EFLAGS.getOperand(1));
48837 if (!CmpConstant)
48838 return SDValue();
48839 const APInt &CmpVal = CmpConstant->getAPIntValue();
48840
48841 SDValue CmpOp = EFLAGS.getOperand(0);
48842 unsigned CmpBits = CmpOp.getValueSizeInBits();
48843 assert(CmpBits == CmpVal.getBitWidth() && "Value size mismatch");
48844
48845 // Peek through any truncate.
48846 if (CmpOp.getOpcode() == ISD::TRUNCATE)
48847 CmpOp = CmpOp.getOperand(0);
48848
48849 // Bail if we don't find a MOVMSK.
48850 if (CmpOp.getOpcode() != X86ISD::MOVMSK)
48851 return SDValue();
48852
48853 SDValue Vec = CmpOp.getOperand(0);
48854 MVT VecVT = Vec.getSimpleValueType();
48855 assert((VecVT.is128BitVector() || VecVT.is256BitVector()) &&
48856 "Unexpected MOVMSK operand");
48857 unsigned NumElts = VecVT.getVectorNumElements();
48858 unsigned NumEltBits = VecVT.getScalarSizeInBits();
48859
48860 bool IsAnyOf = CmpOpcode == X86ISD::CMP && CmpVal.isZero();
48861 bool IsAllOf = (CmpOpcode == X86ISD::SUB || CmpOpcode == X86ISD::CMP) &&
48862 NumElts <= CmpBits && CmpVal.isMask(NumElts);
48863 if (!IsAnyOf && !IsAllOf)
48864 return SDValue();
48865
48866 // TODO: Check for more combining cases.
48867 // We check the number of uses of the CMP to decide whether to combine.
48868 // Currently only the "MOVMSK(CONCAT(..))" and "MOVMSK(PCMPEQ(..))" folds
48869 // below have test coverage for this one-use constraint.
48870 bool IsOneUse = CmpOp.getNode()->hasOneUse();
48871
48872 // See if we can peek through to a vector with a wider element type, if the
48873 // signbits extend down to all the sub-elements as well.
48874 // Calling MOVMSK with the wider type, avoiding the bitcast, helps expose
48875 // potential SimplifyDemandedBits/Elts cases.
48876 // If we looked through a truncate that discards bits, we can't do this
48877 // transform.
48878 // FIXME: We could do this transform for truncates that discarded bits by
48879 // inserting an AND mask between the new MOVMSK and the CMP.
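// e.g. a v16i8 MOVMSK of a bitcast from a v4i32 value whose lanes are all
// sign-splats can instead be a v4i32 MOVMSK, shrinking the all_of compare
// mask from 0xFFFF to 0xF.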
48880 if (Vec.getOpcode() == ISD::BITCAST && NumElts <= CmpBits) {
48881 SDValue BC = peekThroughBitcasts(Vec);
48882 MVT BCVT = BC.getSimpleValueType();
48883 unsigned BCNumElts = BCVT.getVectorNumElements();
48884 unsigned BCNumEltBits = BCVT.getScalarSizeInBits();
48885 if ((BCNumEltBits == 32 || BCNumEltBits == 64) &&
48886 BCNumEltBits > NumEltBits &&
48887 DAG.ComputeNumSignBits(BC) > (BCNumEltBits - NumEltBits)) {
48888 SDLoc DL(EFLAGS);
48889 APInt CmpMask = APInt::getLowBitsSet(32, IsAnyOf ? 0 : BCNumElts);
48890 return DAG.getNode(X86ISD::CMP, DL, MVT::i32,
48891 DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, BC),
48892 DAG.getConstant(CmpMask, DL, MVT::i32));
48893 }
48894 }
48895
48896 // MOVMSK(CONCAT(X,Y)) == 0 -> MOVMSK(OR(X,Y)).
48897 // MOVMSK(CONCAT(X,Y)) != 0 -> MOVMSK(OR(X,Y)).
48898 // MOVMSK(CONCAT(X,Y)) == -1 -> MOVMSK(AND(X,Y)).
48899 // MOVMSK(CONCAT(X,Y)) != -1 -> MOVMSK(AND(X,Y)).
48900 if (VecVT.is256BitVector() && NumElts <= CmpBits && IsOneUse) {
48901 SmallVector<SDValue> Ops;
48902 if (collectConcatOps(peekThroughBitcasts(Vec).getNode(), Ops, DAG) &&
48903 Ops.size() == 2) {
48904 SDLoc DL(EFLAGS);
48905 EVT SubVT = Ops[0].getValueType().changeTypeToInteger();
48906 APInt CmpMask = APInt::getLowBitsSet(32, IsAnyOf ? 0 : NumElts / 2);
48907 SDValue V = DAG.getNode(IsAnyOf ? ISD::OR : ISD::AND, DL, SubVT,
48908 DAG.getBitcast(SubVT, Ops[0]),
48909 DAG.getBitcast(SubVT, Ops[1]));
48910 V = DAG.getBitcast(VecVT.getHalfNumVectorElementsVT(), V);
48911 return DAG.getNode(X86ISD::CMP, DL, MVT::i32,
48912 DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, V),
48913 DAG.getConstant(CmpMask, DL, MVT::i32));
48914 }
48915 }
48916
48917 // MOVMSK(PCMPEQ(X,0)) == -1 -> PTESTZ(X,X).
48918 // MOVMSK(PCMPEQ(X,0)) != -1 -> !PTESTZ(X,X).
48919 // MOVMSK(PCMPEQ(X,Y)) == -1 -> PTESTZ(XOR(X,Y),XOR(X,Y)).
48920 // MOVMSK(PCMPEQ(X,Y)) != -1 -> !PTESTZ(XOR(X,Y),XOR(X,Y)).
48921 if (IsAllOf && Subtarget.hasSSE41() && IsOneUse) {
48922 MVT TestVT = VecVT.is128BitVector() ? MVT::v2i64 : MVT::v4i64;
48923 SDValue BC = peekThroughBitcasts(Vec);
48924 // Ensure MOVMSK was testing every signbit of BC.
48925 if (BC.getValueType().getVectorNumElements() <= NumElts) {
48926 if (BC.getOpcode() == X86ISD::PCMPEQ) {
48927 SDValue V = DAG.getNode(ISD::XOR, SDLoc(BC), BC.getValueType(),
48928 BC.getOperand(0), BC.getOperand(1));
48929 V = DAG.getBitcast(TestVT, V);
48930 return DAG.getNode(X86ISD::PTEST, SDLoc(EFLAGS), MVT::i32, V, V);
48931 }
48932 // Check for 256-bit split vector cases.
48933 if (BC.getOpcode() == ISD::AND &&
48934 BC.getOperand(0).getOpcode() == X86ISD::PCMPEQ &&
48935 BC.getOperand(1).getOpcode() == X86ISD::PCMPEQ) {
48936 SDValue LHS = BC.getOperand(0);
48937 SDValue RHS = BC.getOperand(1);
48938 LHS = DAG.getNode(ISD::XOR, SDLoc(LHS), LHS.getValueType(),
48939 LHS.getOperand(0), LHS.getOperand(1));
48940 RHS = DAG.getNode(ISD::XOR, SDLoc(RHS), RHS.getValueType(),
48941 RHS.getOperand(0), RHS.getOperand(1));
48942 LHS = DAG.getBitcast(TestVT, LHS);
48943 RHS = DAG.getBitcast(TestVT, RHS);
48944 SDValue V = DAG.getNode(ISD::OR, SDLoc(EFLAGS), TestVT, LHS, RHS);
48945 return DAG.getNode(X86ISD::PTEST, SDLoc(EFLAGS), MVT::i32, V, V);
48946 }
48947 }
48948 }
48949
48950 // See if we can avoid a PACKSS by calling MOVMSK on the sources.
48951 // For vXi16 cases we can use a v2Xi8 PMOVMSKB. We must mask out
48952 // sign bits prior to the comparison with zero unless we know that
48953 // the vXi16 splats the sign bit down to the lower i8 half.
48954 // TODO: Handle all_of patterns.
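// PACKSS saturates, so each packed i8 keeps the sign of its source i16;
// PMOVMSKB on the source bitcast to bytes, masked with 0xAAAA-style constants
// to keep only each i16's high byte, recovers the same sign information
// without the pack.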
48955 if (Vec.getOpcode() == X86ISD::PACKSS && VecVT == MVT::v16i8) {
48956 SDValue VecOp0 = Vec.getOperand(0);
48957 SDValue VecOp1 = Vec.getOperand(1);
48958 bool SignExt0 = DAG.ComputeNumSignBits(VecOp0) > 8;
48959 bool SignExt1 = DAG.ComputeNumSignBits(VecOp1) > 8;
48960 // PMOVMSKB(PACKSSBW(X, undef)) -> PMOVMSKB(BITCAST_v16i8(X)) & 0xAAAA.
48961 if (IsAnyOf && CmpBits == 8 && VecOp1.isUndef()) {
48962 SDLoc DL(EFLAGS);
48963 SDValue Result = DAG.getBitcast(MVT::v16i8, VecOp0);
48964 Result = DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, Result);
48965 Result = DAG.getZExtOrTrunc(Result, DL, MVT::i16);
48966 if (!SignExt0) {
48967 Result = DAG.getNode(ISD::AND, DL, MVT::i16, Result,
48968 DAG.getConstant(0xAAAA, DL, MVT::i16));
48969 }
48970 return DAG.getNode(X86ISD::CMP, DL, MVT::i32, Result,
48971 DAG.getConstant(0, DL, MVT::i16));
48972 }
48973 // PMOVMSKB(PACKSSBW(LO(X), HI(X)))
48974 // -> PMOVMSKB(BITCAST_v32i8(X)) & 0xAAAAAAAA.
48975 if (CmpBits >= 16 && Subtarget.hasInt256() &&
48976 (IsAnyOf || (SignExt0 && SignExt1))) {
48977 if (SDValue Src = getSplitVectorSrc(VecOp0, VecOp1, true)) {
48978 SDLoc DL(EFLAGS);
48979 SDValue Result = peekThroughBitcasts(Src);
48980 if (IsAllOf && Result.getOpcode() == X86ISD::PCMPEQ &&
48981 Result.getValueType().getVectorNumElements() <= NumElts) {
48982 SDValue V = DAG.getNode(ISD::XOR, DL, Result.getValueType(),
48983 Result.getOperand(0), Result.getOperand(1));
48984 V = DAG.getBitcast(MVT::v4i64, V);
48985 return DAG.getNode(X86ISD::PTEST, SDLoc(EFLAGS), MVT::i32, V, V);
48986 }
48987 Result = DAG.getBitcast(MVT::v32i8, Result);
48988 Result = DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, Result);
48989 unsigned CmpMask = IsAnyOf ? 0 : 0xFFFFFFFF;
48990 if (!SignExt0 || !SignExt1) {
48991 assert(IsAnyOf &&
48992 "Only perform v16i16 signmasks for any_of patterns");
48993 Result = DAG.getNode(ISD::AND, DL, MVT::i32, Result,
48994 DAG.getConstant(0xAAAAAAAA, DL, MVT::i32));
48995 }
48996 return DAG.getNode(X86ISD::CMP, DL, MVT::i32, Result,
48997 DAG.getConstant(CmpMask, DL, MVT::i32));
48998 }
48999 }
49000 }
49001
49002 // MOVMSK(SHUFFLE(X,u)) -> MOVMSK(X) iff every element is referenced.
49003 // Since we peek through a bitcast, we need to be careful if the base vector
49004 // type has smaller elements than the MOVMSK type. In that case, even if
49005 // all the elements are demanded by the shuffle mask, only the "high"
49006 // elements which have highbits that align with highbits in the MOVMSK vec
49007 // elements are actually demanded. A simplification of spurious operations
49008 // on the "low" elements takes place during other simplifications.
49009 //
49010 // For example:
49011 // MOVMSK64(BITCAST(SHUF32 X, (1,0,3,2))) cannot simply become MOVMSK64(X):
49012 // though all elements are demanded, the swap changes which bits become the sign bits.
49013 //
49014 // To address this, we check that we can scale the shuffle mask to MOVMSK
49015 // element width (this will ensure "high" elements match). It's slightly overly
49016 // conservative, but fine for an edge case fold.
49017 SmallVector<int, 32> ShuffleMask;
49018 SmallVector<SDValue, 2> ShuffleInputs;
49019 if (NumElts <= CmpBits &&
49020 getTargetShuffleInputs(peekThroughBitcasts(Vec), ShuffleInputs,
49021 ShuffleMask, DAG) &&
49022 ShuffleInputs.size() == 1 && isCompletePermute(ShuffleMask) &&
49023 ShuffleInputs[0].getValueSizeInBits() == VecVT.getSizeInBits() &&
49024 canScaleShuffleElements(ShuffleMask, NumElts)) {
49025 SDLoc DL(EFLAGS);
49026 SDValue Result = DAG.getBitcast(VecVT, ShuffleInputs[0]);
49027 Result = DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, Result);
49028 Result =
49029 DAG.getZExtOrTrunc(Result, DL, EFLAGS.getOperand(0).getValueType());
49030 return DAG.getNode(X86ISD::CMP, DL, MVT::i32, Result, EFLAGS.getOperand(1));
49031 }
49032
49033 // MOVMSKPS(V) !=/== 0 -> TESTPS(V,V)
49034 // MOVMSKPD(V) !=/== 0 -> TESTPD(V,V)
49035 // MOVMSKPS(V) !=/== -1 -> TESTPS(V,V)
49036 // MOVMSKPD(V) !=/== -1 -> TESTPD(V,V)
49037 // iff every element is referenced.
49038 if (NumElts <= CmpBits && Subtarget.hasAVX() &&
49039 !Subtarget.preferMovmskOverVTest() && IsOneUse &&
49040 (NumEltBits == 32 || NumEltBits == 64)) {
49041 SDLoc DL(EFLAGS);
49042 MVT FloatSVT = MVT::getFloatingPointVT(NumEltBits);
49043 MVT FloatVT = MVT::getVectorVT(FloatSVT, NumElts);
49044 MVT IntVT = FloatVT.changeVectorElementTypeToInteger();
49045 SDValue LHS = Vec;
49046 SDValue RHS = IsAnyOf ? Vec : DAG.getAllOnesConstant(DL, IntVT);
49047 CC = IsAnyOf ? CC : (CC == X86::COND_E ? X86::COND_B : X86::COND_AE);
49048 return DAG.getNode(X86ISD::TESTP, DL, MVT::i32,
49049 DAG.getBitcast(FloatVT, LHS),
49050 DAG.getBitcast(FloatVT, RHS));
49051 }
49052
49053 return SDValue();
49054}
49055
49056/// Optimize an EFLAGS definition used according to the condition code \p CC
49057/// into a simpler EFLAGS value, potentially returning a new \p CC and replacing
49058/// uses of chain values.
49059 static SDValue combineSetCCEFLAGS(SDValue EFLAGS, X86::CondCode &CC,
49060 SelectionDAG &DAG,
49061 const X86Subtarget &Subtarget) {
49062 if (CC == X86::COND_B)
49063 if (SDValue Flags = combineCarryThroughADD(EFLAGS, DAG))
49064 return Flags;
49065
49066 if (SDValue R = checkSignTestSetCCCombine(EFLAGS, CC, DAG))
49067 return R;
49068
49069 if (SDValue R = checkBoolTestSetCCCombine(EFLAGS, CC))
49070 return R;
49071
49072 if (SDValue R = combinePTESTCC(EFLAGS, CC, DAG, Subtarget))
49073 return R;
49074
49075 if (SDValue R = combineSetCCMOVMSK(EFLAGS, CC, DAG, Subtarget))
49076 return R;
49077
49078 return combineSetCCAtomicArith(EFLAGS, CC, DAG, Subtarget);
49079}
49080
49081/// Optimize X86ISD::CMOV [LHS, RHS, CONDCODE (e.g. X86::COND_NE), CONDVAL]
49082 static SDValue combineCMov(SDNode *N, SelectionDAG &DAG,
49083 TargetLowering::DAGCombinerInfo &DCI,
49084 const X86Subtarget &Subtarget) {
49085 SDLoc DL(N);
49086 EVT VT = N->getValueType(0);
49087 SDValue FalseOp = N->getOperand(0);
49088 SDValue TrueOp = N->getOperand(1);
49089 X86::CondCode CC = (X86::CondCode)N->getConstantOperandVal(2);
49090 SDValue Cond = N->getOperand(3);
49091
49092 // cmov X, X, ?, ? --> X
49093 if (TrueOp == FalseOp)
49094 return TrueOp;
49095
49096 // Try to simplify the EFLAGS and condition code operands.
49097 // We can't always do this as FCMOV only supports a subset of X86 cond.
49098 if (SDValue Flags = combineSetCCEFLAGS(Cond, CC, DAG, Subtarget)) {
49099 if (!(FalseOp.getValueType() == MVT::f80 ||
49100 (FalseOp.getValueType() == MVT::f64 && !Subtarget.hasSSE2()) ||
49101 (FalseOp.getValueType() == MVT::f32 && !Subtarget.hasSSE1())) ||
49102 !Subtarget.canUseCMOV() || hasFPCMov(CC)) {
49103 SDValue Ops[] = {FalseOp, TrueOp, DAG.getTargetConstant(CC, DL, MVT::i8),
49104 Flags};
49105 return DAG.getNode(X86ISD::CMOV, DL, VT, Ops);
49106 }
49107 }
49108
49109 // If this is a select between two integer constants, try to do some
49110 // optimizations. Note that the operands are ordered the opposite of SELECT
49111 // operands.
49112 if (ConstantSDNode *TrueC = dyn_cast<ConstantSDNode>(TrueOp)) {
49113 if (ConstantSDNode *FalseC = dyn_cast<ConstantSDNode>(FalseOp)) {
49114 // Canonicalize the TrueC/FalseC values so that TrueC (the true value) is
49115 // larger than FalseC (the false value).
49116 if (TrueC->getAPIntValue().ult(FalseC->getAPIntValue())) {
49117 CC = X86::GetOppositeBranchCondition(CC);
49118 std::swap(TrueC, FalseC);
49119 std::swap(TrueOp, FalseOp);
49120 }
49121
49122 // Optimize C ? 8 : 0 -> zext(setcc(C)) << 3. Likewise for any pow2/0.
49123 // This is efficient for any integer data type (including i8/i16) and
49124 // shift amount.
49125 if (FalseC->getAPIntValue() == 0 && TrueC->getAPIntValue().isPowerOf2()) {
49126 Cond = getSETCC(CC, Cond, DL, DAG);
49127
49128 // Zero extend the condition if needed.
49129 Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, TrueC->getValueType(0), Cond);
49130
49131 unsigned ShAmt = TrueC->getAPIntValue().logBase2();
49132 Cond = DAG.getNode(ISD::SHL, DL, Cond.getValueType(), Cond,
49133 DAG.getConstant(ShAmt, DL, MVT::i8));
49134 return Cond;
49135 }
49136
49137 // Optimize Cond ? cst+1 : cst -> zext(setcc(C))+cst. This is efficient
49138 // for any integer data type, including i8/i16.
49139 if (FalseC->getAPIntValue()+1 == TrueC->getAPIntValue()) {
49140 Cond = getSETCC(CC, Cond, DL, DAG);
49141
49142 // Zero extend the condition if needed.
49143 Cond = DAG.getNode(ISD::ZERO_EXTEND, DL,
49144 FalseC->getValueType(0), Cond);
49145 Cond = DAG.getNode(ISD::ADD, DL, Cond.getValueType(), Cond,
49146 SDValue(FalseC, 0));
49147 return Cond;
49148 }
49149
49150 // Optimize cases that will turn into an LEA instruction. This requires
49151 // an i32 or i64 and an efficient multiplier (1, 2, 3, 4, 5, 8, 9).
49152 if (VT == MVT::i32 || VT == MVT::i64) {
49153 APInt Diff = TrueC->getAPIntValue() - FalseC->getAPIntValue();
49154 assert(Diff.getBitWidth() == VT.getSizeInBits() &&
49155 "Implicit constant truncation");
49156
49157 bool isFastMultiplier = false;
49158 if (Diff.ult(10)) {
49159 switch (Diff.getZExtValue()) {
49160 default: break;
49161 case 1: // result = add base, cond
49162 case 2: // result = lea base( , cond*2)
49163 case 3: // result = lea base(cond, cond*2)
49164 case 4: // result = lea base( , cond*4)
49165 case 5: // result = lea base(cond, cond*4)
49166 case 8: // result = lea base( , cond*8)
49167 case 9: // result = lea base(cond, cond*8)
49168 isFastMultiplier = true;
49169 break;
49170 }
49171 }
49172
49173 if (isFastMultiplier) {
49174 Cond = getSETCC(CC, Cond, DL ,DAG);
49175 // Zero extend the condition if needed.
49176 Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, FalseC->getValueType(0),
49177 Cond);
49178 // Scale the condition by the difference.
49179 if (Diff != 1)
49180 Cond = DAG.getNode(ISD::MUL, DL, Cond.getValueType(), Cond,
49181 DAG.getConstant(Diff, DL, Cond.getValueType()));
49182
49183 // Add the base if non-zero.
49184 if (FalseC->getAPIntValue() != 0)
49185 Cond = DAG.getNode(ISD::ADD, DL, Cond.getValueType(), Cond,
49186 SDValue(FalseC, 0));
49187 return Cond;
49188 }
49189 }
49190 }
49191 }
49192
49193 // Handle these cases:
49194 // (select (x != c), e, c) -> (select (x != c), e, x),
49195 // (select (x == c), c, e) -> (select (x == c), x, e)
49196 // where the c is an integer constant, and the "select" is the combination
49197 // of CMOV and CMP.
49198 //
49199 // The rationale for this change is that the conditional-move from a constant
49200 // needs two instructions, however, conditional-move from a register needs
49201 // only one instruction.
49202 //
49203 // CAVEAT: By replacing a constant with a symbolic value, it may obscure
49204 // some instruction-combining opportunities. This opt needs to be
49205 // postponed as late as possible.
49206 //
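// For example, "(x == 42) ? 42 : y" is rewritten as "(x == 42) ? x : y", so the
// constant 42 no longer has to be materialized into a register for the CMOV.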
49207 if (!DCI.isBeforeLegalize() && !DCI.isBeforeLegalizeOps()) {
49208 // the DCI.xxxx conditions are provided to postpone the optimization as
49209 // late as possible.
49210
49211 ConstantSDNode *CmpAgainst = nullptr;
49212 if ((Cond.getOpcode() == X86ISD::CMP || Cond.getOpcode() == X86ISD::SUB) &&
49213 (CmpAgainst = dyn_cast<ConstantSDNode>(Cond.getOperand(1))) &&
49214 !isa<ConstantSDNode>(Cond.getOperand(0))) {
49215
49216 if (CC == X86::COND_NE &&
49217 CmpAgainst == dyn_cast<ConstantSDNode>(FalseOp)) {
49218 CC = X86::COND_E;
49219 std::swap(TrueOp, FalseOp);
49220 }
49221
49222 if (CC == X86::COND_E && CmpAgainst == dyn_cast<ConstantSDNode>(TrueOp)) {
49223 SDValue Ops[] = {FalseOp, Cond.getOperand(0),
49224 DAG.getTargetConstant(CC, DL, MVT::i8), Cond};
49225 return DAG.getNode(X86ISD::CMOV, DL, VT, Ops);
49226 }
49227 }
49228 }
49229
49230 // Transform:
49231 //
49232 // (cmov 1 T (uge T 2))
49233 //
49234 // to:
49235 //
49236 // (adc T 0 (sub T 1))
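// Both forms yield T, except that T == 0 produces 1: (sub T 1) sets the carry
// flag only when T == 0, and (adc T 0) adds that carry back into T.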
49237 if (CC == X86::COND_AE && isOneConstant(FalseOp) &&
49238 Cond.getOpcode() == X86ISD::SUB && Cond->hasOneUse()) {
49239 SDValue Cond0 = Cond.getOperand(0);
49240 if (Cond0.getOpcode() == ISD::TRUNCATE)
49241 Cond0 = Cond0.getOperand(0);
49242 auto *Sub1C = dyn_cast<ConstantSDNode>(Cond.getOperand(1));
49243 if (Cond0 == TrueOp && Sub1C && Sub1C->getZExtValue() == 2) {
49244 EVT CondVT = Cond->getValueType(0);
49245 // Subtract 1 and generate a carry.
49246 SDValue NewSub =
49247 DAG.getNode(X86ISD::SUB, DL, Cond->getVTList(), Cond.getOperand(0),
49248 DAG.getConstant(1, DL, CondVT));
49249 SDValue EFLAGS(NewSub.getNode(), 1);
49250 return DAG.getNode(X86ISD::ADC, DL, DAG.getVTList(VT, MVT::i32), TrueOp,
49251 DAG.getConstant(0, DL, VT), EFLAGS);
49252 }
49253 }
49254
49255 // Fold and/or of setcc's to double CMOV:
49256 // (CMOV F, T, ((cc1 | cc2) != 0)) -> (CMOV (CMOV F, T, cc1), T, cc2)
49257 // (CMOV F, T, ((cc1 & cc2) != 0)) -> (CMOV (CMOV T, F, !cc1), F, !cc2)
49258 //
49259 // This combine lets us generate:
49260 // cmovcc1 (jcc1 if we don't have CMOV)
49261 // cmovcc2 (same)
49262 // instead of:
49263 // setcc1
49264 // setcc2
49265 // and/or
49266 // cmovne (jne if we don't have CMOV)
49267 // When we can't use the CMOV instruction, it might increase branch
49268 // mispredicts.
49269 // When we can use CMOV, or when there is no mispredict, this improves
49270 // throughput and reduces register pressure.
49271 //
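// For example (illustrative), selecting on "cc1 || cc2" becomes a CMOV on cc1
// whose result feeds a second CMOV on cc2, rather than two SETCCs, an OR and a
// CMOVNE on the OR'd result.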
49272 if (CC == X86::COND_NE) {
49273 SDValue Flags;
49274 X86::CondCode CC0, CC1;
49275 bool isAndSetCC;
49276 if (checkBoolTestAndOrSetCCCombine(Cond, CC0, CC1, Flags, isAndSetCC)) {
49277 if (isAndSetCC) {
49278 std::swap(FalseOp, TrueOp);
49279 CC0 = X86::GetOppositeBranchCondition(CC0);
49280 CC1 = X86::GetOppositeBranchCondition(CC1);
49281 }
49282
49283 SDValue LOps[] = {FalseOp, TrueOp,
49284 DAG.getTargetConstant(CC0, DL, MVT::i8), Flags};
49285 SDValue LCMOV = DAG.getNode(X86ISD::CMOV, DL, VT, LOps);
49286 SDValue Ops[] = {LCMOV, TrueOp, DAG.getTargetConstant(CC1, DL, MVT::i8),
49287 Flags};
49288 SDValue CMOV = DAG.getNode(X86ISD::CMOV, DL, VT, Ops);
49289 return CMOV;
49290 }
49291 }
49292
49293 // Fold (CMOV C1, (ADD (CTTZ X), C2), (X != 0)) ->
49294 // (ADD (CMOV C1-C2, (CTTZ X), (X != 0)), C2)
49295 // Or (CMOV (ADD (CTTZ X), C2), C1, (X == 0)) ->
49296 // (ADD (CMOV (CTTZ X), C1-C2, (X == 0)), C2)
49297 // Or (CMOV (BSR ?, X), Y, (X == 0)) -> (BSR Y, X)
49298 // TODO: Or (CMOV (BSF ?, X), Y, (X == 0)) -> (BSF Y, X)
49299 if ((CC == X86::COND_NE || CC == X86::COND_E) &&
49300 Cond.getOpcode() == X86ISD::CMP && isNullConstant(Cond.getOperand(1))) {
49301 SDValue Add = TrueOp;
49302 SDValue Const = FalseOp;
49303 // Canonicalize the condition code for easier matching and output.
49304 if (CC == X86::COND_E)
49305 std::swap(Add, Const);
49306
49307 // TODO: ADD BSF support, but requires changes to the "REP BSF" CTTZ hack.
49308 if (Subtarget.hasBitScanPassThrough() && Add.getOpcode() == X86ISD::BSR &&
49309 Add.getResNo() == 0 && Add.hasOneUse() &&
49310 Add.getOperand(1) == Cond.getOperand(0)) {
49311 return DAG.getNode(Add.getOpcode(), DL, Add->getVTList(), Const,
49312 Add.getOperand(1));
49313 }
49314
49315 // We might have replaced the constant in the cmov with the LHS of the
49316 // compare. If so change it to the RHS of the compare.
49317 if (Const == Cond.getOperand(0))
49318 Const = Cond.getOperand(1);
49319
49320 // Ok, now make sure that Add is (add (cttz X), C2) and Const is a constant.
49321 if (isa<ConstantSDNode>(Const) && Add.getOpcode() == ISD::ADD &&
49322 Add.hasOneUse() && isa<ConstantSDNode>(Add.getOperand(1)) &&
49323 (Add.getOperand(0).getOpcode() == ISD::CTTZ_ZERO_UNDEF ||
49324 Add.getOperand(0).getOpcode() == ISD::CTTZ) &&
49325 Add.getOperand(0).getOperand(0) == Cond.getOperand(0)) {
49326 // This should constant fold.
49327 SDValue Diff = DAG.getNode(ISD::SUB, DL, VT, Const, Add.getOperand(1));
49328 SDValue CMov =
49329 DAG.getNode(X86ISD::CMOV, DL, VT, Diff, Add.getOperand(0),
49330 DAG.getTargetConstant(X86::COND_NE, DL, MVT::i8), Cond);
49331 return DAG.getNode(ISD::ADD, DL, VT, CMov, Add.getOperand(1));
49332 }
49333 }
49334
49335 return SDValue();
49336}
49337
49338/// Different mul shrinking modes.
49339enum class ShrinkMode { MULS8, MULU8, MULS16, MULU16 };
49340
49341static bool canReduceVMulWidth(SDNode *N, SelectionDAG &DAG, ShrinkMode &Mode) {
49342 EVT VT = N->getOperand(0).getValueType();
49343 if (VT.getScalarSizeInBits() != 32)
49344 return false;
49345
49346 assert(N->getNumOperands() == 2 && "NumOperands of Mul are 2");
49347 unsigned SignBits[2] = {1, 1};
49348 bool IsPositive[2] = {false, false};
49349 for (unsigned i = 0; i < 2; i++) {
49350 SDValue Opd = N->getOperand(i);
49351
49352 SignBits[i] = DAG.ComputeNumSignBits(Opd);
49353 IsPositive[i] = DAG.SignBitIsZero(Opd);
49354 }
49355
49356 bool AllPositive = IsPositive[0] && IsPositive[1];
49357 unsigned MinSignBits = std::min(SignBits[0], SignBits[1]);
49358 // When ranges are from -128 ~ 127, use MULS8 mode.
49359 if (MinSignBits >= 25)
49360 Mode = ShrinkMode::MULS8;
49361 // When ranges are from 0 ~ 255, use MULU8 mode.
49362 else if (AllPositive && MinSignBits >= 24)
49363 Mode = ShrinkMode::MULU8;
49364 // When ranges are from -32768 ~ 32767, use MULS16 mode.
49365 else if (MinSignBits >= 17)
49366 Mode = ShrinkMode::MULS16;
49367 // When ranges are from 0 ~ 65535, use MULU16 mode.
49368 else if (AllPositive && MinSignBits >= 16)
49369 Mode = ShrinkMode::MULU16;
49370 else
49371 return false;
49372 return true;
49373}
49374
49375/// When the operands of vector mul are extended from smaller size values,
49376/// like i8 and i16, the type of the mul may be shrunk to generate more
49377/// efficient code. Two typical patterns are handled:
49378/// Pattern1:
49379/// %2 = sext/zext <N x i8> %1 to <N x i32>
49380/// %4 = sext/zext <N x i8> %3 to <N x i32>
49381/// or %4 = build_vector <N x i32> %C1, ..., %CN (%C1..%CN are constants)
49382/// %5 = mul <N x i32> %2, %4
49383///
49384/// Pattern2:
49385/// %2 = zext/sext <N x i16> %1 to <N x i32>
49386/// %4 = zext/sext <N x i16> %3 to <N x i32>
49387/// or %4 = build_vector <N x i32> %C1, ..., %CN (%C1..%CN are constants)
49388/// %5 = mul <N x i32> %2, %4
49389///
49390/// There are four mul shrinking modes:
49391/// If %2 == sext32(trunc8(%2)), i.e., the scalar value range of %2 is
49392/// -128 to 127, and the scalar value range of %4 is also -128 to 127,
49393/// generate pmullw+sext32 for it (MULS8 mode).
49394/// If %2 == zext32(trunc8(%2)), i.e., the scalar value range of %2 is
49395/// 0 to 255, and the scalar value range of %4 is also 0 to 255,
49396/// generate pmullw+zext32 for it (MULU8 mode).
49397/// If %2 == sext32(trunc16(%2)), i.e., the scalar value range of %2 is
49398/// -32768 to 32767, and the scalar value range of %4 is also -32768 to 32767,
49399/// generate pmullw+pmulhw for it (MULS16 mode).
49400/// If %2 == zext32(trunc16(%2)), i.e., the scalar value range of %2 is
49401/// 0 to 65535, and the scalar value range of %4 is also 0 to 65535,
49402/// generate pmullw+pmulhuw for it (MULU16 mode).
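///
/// For example (illustrative): a <8 x i32> mul whose operands are both
/// zero-extended from <8 x i16> becomes a pmullw, a pmulhuw and two unpack
/// shuffles that interleave the low/high halves back into i32 lanes (MULU16).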
49403static SDValue reduceVMULWidth(SDNode *N, const SDLoc &DL, SelectionDAG &DAG,
49404 const X86Subtarget &Subtarget) {
49405 // Check for legality
49406 // pmullw/pmulhw are not available without SSE2.
49407 if (!Subtarget.hasSSE2())
49408 return SDValue();
49409
49410 // Check for profitability
49411 // pmulld is supported since SSE41. It is better to use pmulld
49412 // instead of pmullw+pmulhw, except for subtargets where pmulld is slower than
49413 // the expansion.
49414 bool OptForMinSize = DAG.getMachineFunction().getFunction().hasMinSize();
49415 if (Subtarget.hasSSE41() && (OptForMinSize || !Subtarget.isPMULLDSlow()))
49416 return SDValue();
49417
49418 ShrinkMode Mode;
49419 if (!canReduceVMulWidth(N, DAG, Mode))
49420 return SDValue();
49421
49422 SDValue N0 = N->getOperand(0);
49423 SDValue N1 = N->getOperand(1);
49424 EVT VT = N->getOperand(0).getValueType();
49425 unsigned NumElts = VT.getVectorNumElements();
49426 if ((NumElts % 2) != 0)
49427 return SDValue();
49428
49429 EVT ReducedVT = EVT::getVectorVT(*DAG.getContext(), MVT::i16, NumElts);
49430
49431 // Shrink the operands of mul.
49432 SDValue NewN0 = DAG.getNode(ISD::TRUNCATE, DL, ReducedVT, N0);
49433 SDValue NewN1 = DAG.getNode(ISD::TRUNCATE, DL, ReducedVT, N1);
49434
49435 // Generate the lower part of mul: pmullw. For MULU8/MULS8, only the
49436 // lower part is needed.
49437 SDValue MulLo = DAG.getNode(ISD::MUL, DL, ReducedVT, NewN0, NewN1);
49438 if (Mode == ShrinkMode::MULU8 || Mode == ShrinkMode::MULS8)
49439 return DAG.getNode((Mode == ShrinkMode::MULU8) ? ISD::ZERO_EXTEND
49440 : ISD::SIGN_EXTEND,
49441 DL, VT, MulLo);
49442
49443 EVT ResVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32, NumElts / 2);
49444 // Generate the higher part of mul: pmulhw/pmulhuw. For MULU16/MULS16,
49445 // the higher part is also needed.
49446 SDValue MulHi =
49447 DAG.getNode(Mode == ShrinkMode::MULS16 ? ISD::MULHS : ISD::MULHU, DL,
49448 ReducedVT, NewN0, NewN1);
49449
49450 // Repack the lower part and higher part result of mul into a wider
49451 // result.
49452 // Generate shuffle functioning as punpcklwd.
49453 SmallVector<int, 16> ShuffleMask(NumElts);
49454 for (unsigned i = 0, e = NumElts / 2; i < e; i++) {
49455 ShuffleMask[2 * i] = i;
49456 ShuffleMask[2 * i + 1] = i + NumElts;
49457 }
49458 SDValue ResLo =
49459 DAG.getVectorShuffle(ReducedVT, DL, MulLo, MulHi, ShuffleMask);
49460 ResLo = DAG.getBitcast(ResVT, ResLo);
49461 // Generate shuffle functioning as punpckhwd.
49462 for (unsigned i = 0, e = NumElts / 2; i < e; i++) {
49463 ShuffleMask[2 * i] = i + NumElts / 2;
49464 ShuffleMask[2 * i + 1] = i + NumElts * 3 / 2;
49465 }
49466 SDValue ResHi =
49467 DAG.getVectorShuffle(ReducedVT, DL, MulLo, MulHi, ShuffleMask);
49468 ResHi = DAG.getBitcast(ResVT, ResHi);
49469 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, ResLo, ResHi);
49470}
49471
49472static SDValue combineMulSpecial(uint64_t MulAmt, SDNode *N, SelectionDAG &DAG,
49473 EVT VT, const SDLoc &DL) {
49474
49475 auto combineMulShlAddOrSub = [&](int Mult, int Shift, bool isAdd) {
49476 SDValue Result = DAG.getNode(X86ISD::MUL_IMM, DL, VT, N->getOperand(0),
49477 DAG.getConstant(Mult, DL, VT));
49478 Result = DAG.getNode(ISD::SHL, DL, VT, Result,
49479 DAG.getConstant(Shift, DL, MVT::i8));
49480 Result = DAG.getNode(isAdd ? ISD::ADD : ISD::SUB, DL, VT, Result,
49481 N->getOperand(0));
49482 return Result;
49483 };
49484
49485 auto combineMulMulAddOrSub = [&](int Mul1, int Mul2, bool isAdd) {
49486 SDValue Result = DAG.getNode(X86ISD::MUL_IMM, DL, VT, N->getOperand(0),
49487 DAG.getConstant(Mul1, DL, VT));
49488 Result = DAG.getNode(X86ISD::MUL_IMM, DL, VT, Result,
49489 DAG.getConstant(Mul2, DL, VT));
49490 Result = DAG.getNode(isAdd ? ISD::ADD : ISD::SUB, DL, VT, Result,
49491 N->getOperand(0));
49492 return Result;
49493 };
49494
49495 switch (MulAmt) {
49496 default:
49497 break;
49498 case 11:
49499 // mul x, 11 => add ((shl (mul x, 5), 1), x)
49500 return combineMulShlAddOrSub(5, 1, /*isAdd*/ true);
49501 case 21:
49502 // mul x, 21 => add ((shl (mul x, 5), 2), x)
49503 return combineMulShlAddOrSub(5, 2, /*isAdd*/ true);
49504 case 41:
49505 // mul x, 41 => add ((shl (mul x, 5), 3), x)
49506 return combineMulShlAddOrSub(5, 3, /*isAdd*/ true);
49507 case 22:
49508 // mul x, 22 => add (add ((shl (mul x, 5), 2), x), x)
49509 return DAG.getNode(ISD::ADD, DL, VT, N->getOperand(0),
49510 combineMulShlAddOrSub(5, 2, /*isAdd*/ true));
49511 case 19:
49512 // mul x, 19 => add ((shl (mul x, 9), 1), x)
49513 return combineMulShlAddOrSub(9, 1, /*isAdd*/ true);
49514 case 37:
49515 // mul x, 37 => add ((shl (mul x, 9), 2), x)
49516 return combineMulShlAddOrSub(9, 2, /*isAdd*/ true);
49517 case 73:
49518 // mul x, 73 => add ((shl (mul x, 9), 3), x)
49519 return combineMulShlAddOrSub(9, 3, /*isAdd*/ true);
49520 case 13:
49521 // mul x, 13 => add ((shl (mul x, 3), 2), x)
49522 return combineMulShlAddOrSub(3, 2, /*isAdd*/ true);
49523 case 23:
49524 // mul x, 23 => sub ((shl (mul x, 3), 3), x)
49525 return combineMulShlAddOrSub(3, 3, /*isAdd*/ false);
49526 case 26:
49527 // mul x, 26 => add ((mul (mul x, 5), 5), x)
49528 return combineMulMulAddOrSub(5, 5, /*isAdd*/ true);
49529 case 28:
49530 // mul x, 28 => add ((mul (mul x, 9), 3), x)
49531 return combineMulMulAddOrSub(9, 3, /*isAdd*/ true);
49532 case 29:
49533 // mul x, 29 => add (add ((mul (mul x, 9), 3), x), x)
49534 return DAG.getNode(ISD::ADD, DL, VT, N->getOperand(0),
49535 combineMulMulAddOrSub(9, 3, /*isAdd*/ true));
49536 }
49537
49538 // Another trick. If this is a power of 2 plus 2/4/8, we can use a shift
49539 // followed by a single LEA.
49540 // First check if this is a sum of two powers of 2 because that's easy. Then
49541 // count how many zeros there are up to the first set bit.
49542 // TODO: We can do this even without LEA at a cost of two shifts and an add.
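// For example (illustrative): MulAmt == 20 == 16 + 4 gives ScaleShift == 2 and
// ShiftAmt == 4, producing (add (shl x, 4), (shl x, 2)); the scaled term is
// expected to fold into the LEA's index scale.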
49543 if (isPowerOf2_64(MulAmt & (MulAmt - 1))) {
49544 unsigned ScaleShift = llvm::countr_zero(MulAmt);
49545 if (ScaleShift >= 1 && ScaleShift < 4) {
49546 unsigned ShiftAmt = Log2_64((MulAmt & (MulAmt - 1)));
49547 SDValue Shift1 = DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0),
49548 DAG.getConstant(ShiftAmt, DL, MVT::i8));
49549 SDValue Shift2 = DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0),
49550 DAG.getConstant(ScaleShift, DL, MVT::i8));
49551 return DAG.getNode(ISD::ADD, DL, VT, Shift1, Shift2);
49552 }
49553 }
49554
49555 return SDValue();
49556}
49557
49558 // If the upper 17 bits of either element are zero and the other element's
49559 // upper bits are all zero/sign bits then we can use PMADDWD, which is always
49560 // at least as quick as PMULLD, except on KNL.
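// PMADDWD multiplies pairs of i16 elements and sums adjacent products into each
// i32 lane; when the upper 16-bit halves contribute nothing (zero, or sign bits
// that can be cleared below), each lane reduces to a single 16x16->32 product,
// which is exactly the i32 multiply we want.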
49561static SDValue combineMulToPMADDWD(SDNode *N, const SDLoc &DL,
49562 SelectionDAG &DAG,
49563 const X86Subtarget &Subtarget) {
49564 if (!Subtarget.hasSSE2())
49565 return SDValue();
49566
49567 if (Subtarget.isPMADDWDSlow())
49568 return SDValue();
49569
49570 EVT VT = N->getValueType(0);
49571
49572 // Only support vXi32 vectors.
49573 if (!VT.isVector() || VT.getVectorElementType() != MVT::i32)
49574 return SDValue();
49575
49576 // Make sure the type is legal or can split/widen to a legal type.
49577 // With AVX512 but without BWI, we would need to split v32i16.
49578 unsigned NumElts = VT.getVectorNumElements();
49579 if (NumElts == 1 || !isPowerOf2_32(NumElts))
49580 return SDValue();
49581
49582 // With AVX512 but without BWI, we would need to split v32i16.
49583 if (32 <= (2 * NumElts) && Subtarget.hasAVX512() && !Subtarget.hasBWI())
49584 return SDValue();
49585
49586 SDValue N0 = N->getOperand(0);
49587 SDValue N1 = N->getOperand(1);
49588
49589 // If we are zero/sign extending two steps without SSE4.1, it's better to
49590 // reduce the vmul width instead.
49591 if (!Subtarget.hasSSE41() &&
49592 (((N0.getOpcode() == ISD::ZERO_EXTEND &&
49593 N0.getOperand(0).getScalarValueSizeInBits() <= 8) &&
49594 (N1.getOpcode() == ISD::ZERO_EXTEND &&
49595 N1.getOperand(0).getScalarValueSizeInBits() <= 8)) ||
49596 ((N0.getOpcode() == ISD::SIGN_EXTEND &&
49597 N0.getOperand(0).getScalarValueSizeInBits() <= 8) &&
49598 (N1.getOpcode() == ISD::SIGN_EXTEND &&
49599 N1.getOperand(0).getScalarValueSizeInBits() <= 8))))
49600 return SDValue();
49601
49602 // If we are sign extending a wide vector without SSE4.1, it's better to reduce
49603 // the vmul width instead.
49604 if (!Subtarget.hasSSE41() &&
49605 (N0.getOpcode() == ISD::SIGN_EXTEND &&
49606 N0.getOperand(0).getValueSizeInBits() > 128) &&
49607 (N1.getOpcode() == ISD::SIGN_EXTEND &&
49608 N1.getOperand(0).getValueSizeInBits() > 128))
49609 return SDValue();
49610
49611 // Sign bits must extend down to the lowest i16.
49612 if (DAG.ComputeMaxSignificantBits(N1) > 16 ||
49613 DAG.ComputeMaxSignificantBits(N0) > 16)
49614 return SDValue();
49615
49616 // At least one of the elements must be zero in the upper 17 bits, or can be
49617 // safely made zero without altering the final result.
49618 auto GetZeroableOp = [&](SDValue Op) {
49619 APInt Mask17 = APInt::getHighBitsSet(32, 17);
49620 if (DAG.MaskedValueIsZero(Op, Mask17))
49621 return Op;
49622 // Mask off upper 16-bits of sign-extended constants.
49623 if (ISD::isBuildVectorOfConstantSDNodes(Op.getNode()))
49624 return DAG.getNode(ISD::AND, DL, VT, Op, DAG.getConstant(0xFFFF, DL, VT));
49625 if (Op.getOpcode() == ISD::SIGN_EXTEND && N->isOnlyUserOf(Op.getNode())) {
49626 SDValue Src = Op.getOperand(0);
49627 // Convert sext(vXi16) to zext(vXi16).
49628 if (Src.getScalarValueSizeInBits() == 16 && VT.getSizeInBits() <= 128)
49629 return DAG.getNode(ISD::ZERO_EXTEND, DL, VT, Src);
49630 // Convert sext(vXi8) to zext(vXi16 sext(vXi8)) on pre-SSE41 targets
49631 // which will expand the extension.
49632 if (Src.getScalarValueSizeInBits() < 16 && !Subtarget.hasSSE41()) {
49633 EVT ExtVT = VT.changeVectorElementType(MVT::i16);
49634 Src = DAG.getNode(ISD::SIGN_EXTEND, DL, ExtVT, Src);
49635 return DAG.getNode(ISD::ZERO_EXTEND, DL, VT, Src);
49636 }
49637 }
49638 // Convert SIGN_EXTEND_VECTOR_INREG to ZEXT_EXTEND_VECTOR_INREG.
49639 if (Op.getOpcode() == ISD::SIGN_EXTEND_VECTOR_INREG &&
49640 N->isOnlyUserOf(Op.getNode())) {
49641 SDValue Src = Op.getOperand(0);
49642 if (Src.getScalarValueSizeInBits() == 16)
49643 return DAG.getNode(ISD::ZERO_EXTEND_VECTOR_INREG, DL, VT, Src);
49644 }
49645 // Convert VSRAI(Op, 16) to VSRLI(Op, 16).
49646 if (Op.getOpcode() == X86ISD::VSRAI && Op.getConstantOperandVal(1) == 16 &&
49647 N->isOnlyUserOf(Op.getNode())) {
49648 return DAG.getNode(X86ISD::VSRLI, DL, VT, Op.getOperand(0),
49649 Op.getOperand(1));
49650 }
49651 return SDValue();
49652 };
49653 SDValue ZeroN0 = GetZeroableOp(N0);
49654 SDValue ZeroN1 = GetZeroableOp(N1);
49655 if (!ZeroN0 && !ZeroN1)
49656 return SDValue();
49657 N0 = ZeroN0 ? ZeroN0 : N0;
49658 N1 = ZeroN1 ? ZeroN1 : N1;
49659
49660 // Use SplitOpsAndApply to handle AVX splitting.
49661 auto PMADDWDBuilder = [](SelectionDAG &DAG, const SDLoc &DL,
49662 ArrayRef<SDValue> Ops) {
49663 MVT ResVT = MVT::getVectorVT(MVT::i32, Ops[0].getValueSizeInBits() / 32);
49664 MVT OpVT = MVT::getVectorVT(MVT::i16, Ops[0].getValueSizeInBits() / 16);
49665 return DAG.getNode(X86ISD::VPMADDWD, DL, ResVT,
49666 DAG.getBitcast(OpVT, Ops[0]),
49667 DAG.getBitcast(OpVT, Ops[1]));
49668 };
49669 return SplitOpsAndApply(DAG, Subtarget, DL, VT, {N0, N1}, PMADDWDBuilder);
49670}
49671
49672static SDValue combineMulToPMULDQ(SDNode *N, const SDLoc &DL, SelectionDAG &DAG,
49673 const X86Subtarget &Subtarget) {
49674 if (!Subtarget.hasSSE2())
49675 return SDValue();
49676
49677 EVT VT = N->getValueType(0);
49678
49679 // Only support vXi64 vectors.
49680 if (!VT.isVector() || VT.getVectorElementType() != MVT::i64 ||
49681 VT.getVectorNumElements() < 2 ||
49682 !isPowerOf2_32(VT.getVectorNumElements()))
49683 return SDValue();
49684
49685 SDValue N0 = N->getOperand(0);
49686 SDValue N1 = N->getOperand(1);
49687
49688 // PMULDQ returns the 64-bit result of the signed multiplication of the lower
49689 // 32-bits. We can lower with this if the sign bits stretch that far.
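// For example (illustrative): a <2 x i64> multiply whose operands both have at
// least 33 sign bits can be done with a single pmuldq on the low 32-bit halves.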
49690 if (Subtarget.hasSSE41() && DAG.ComputeNumSignBits(N0) > 32 &&
49691 DAG.ComputeNumSignBits(N1) > 32) {
49692 auto PMULDQBuilder = [](SelectionDAG &DAG, const SDLoc &DL,
49693 ArrayRef<SDValue> Ops) {
49694 return DAG.getNode(X86ISD::PMULDQ, DL, Ops[0].getValueType(), Ops);
49695 };
49696 return SplitOpsAndApply(DAG, Subtarget, DL, VT, {N0, N1}, PMULDQBuilder,
49697 /*CheckBWI*/ false);
49698 }
49699
49700 // If the upper bits are zero we can use a single pmuludq.
49701 APInt Mask = APInt::getHighBitsSet(64, 32);
49702 if (DAG.MaskedValueIsZero(N0, Mask) && DAG.MaskedValueIsZero(N1, Mask)) {
49703 auto PMULUDQBuilder = [](SelectionDAG &DAG, const SDLoc &DL,
49704 ArrayRef<SDValue> Ops) {
49705 return DAG.getNode(X86ISD::PMULUDQ, DL, Ops[0].getValueType(), Ops);
49706 };
49707 return SplitOpsAndApply(DAG, Subtarget, DL, VT, {N0, N1}, PMULUDQBuilder,
49708 /*CheckBWI*/ false);
49709 }
49710
49711 return SDValue();
49712}
49713
49714static SDValue combineMul(SDNode *N, SelectionDAG &DAG,
49715 TargetLowering::DAGCombinerInfo &DCI,
49716 const X86Subtarget &Subtarget) {
49717 EVT VT = N->getValueType(0);
49718 SDLoc DL(N);
49719
49720 if (SDValue V = combineMulToPMADDWD(N, DL, DAG, Subtarget))
49721 return V;
49722
49723 if (SDValue V = combineMulToPMULDQ(N, DL, DAG, Subtarget))
49724 return V;
49725
49726 if (DCI.isBeforeLegalize() && VT.isVector())
49727 return reduceVMULWidth(N, DL, DAG, Subtarget);
49728
49729 if (VT != MVT::i64 && VT != MVT::i32 &&
49730 (!VT.isVector() || !VT.isSimple() || !VT.isInteger()))
49731 return SDValue();
49732
49733 KnownBits Known1 = DAG.computeKnownBits(N->getOperand(1));
49734 if (!Known1.isConstant())
49735 return SDValue();
49736
49737 const APInt &C = Known1.getConstant();
49738 if (C.isZero())
49739 return DAG.getConstant(0, DL, VT);
49740
49741 if (C.isAllOnes())
49742 return DAG.getNegative(N->getOperand(0), DL, VT);
49743
49744 if (isPowerOf2_64(C.getZExtValue()))
49745 return SDValue();
49746
49747 // Optimize a single multiply with constant into two operations in order to
49748 // implement it with two cheaper instructions, e.g. LEA + SHL, LEA + LEA.
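// For example (illustrative): "mul x, 45" decomposes as (x * 9) * 5 (two LEAs),
// and "mul x, 40" as a shift by 3 combined with a multiply by 5 (SHL + LEA).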
49749 if (!MulConstantOptimization)
49750 return SDValue();
49751
49752 // An imul is usually smaller than the alternative sequence.
49753 if (DAG.getMachineFunction().getFunction().hasMinSize())
49754 return SDValue();
49755
49756 if (DCI.isBeforeLegalize() || DCI.isCalledByLegalizer())
49757 return SDValue();
49758
49759 int64_t SignMulAmt = C.getSExtValue();
49760 assert(SignMulAmt != INT64_MIN && "Int min should have been handled!");
49761 uint64_t AbsMulAmt = SignMulAmt < 0 ? -SignMulAmt : SignMulAmt;
49762
49763 SDValue NewMul = SDValue();
49764 if (VT == MVT::i64 || VT == MVT::i32) {
49765 if (AbsMulAmt == 3 || AbsMulAmt == 5 || AbsMulAmt == 9) {
49766 NewMul = DAG.getNode(X86ISD::MUL_IMM, DL, VT, N->getOperand(0),
49767 DAG.getConstant(AbsMulAmt, DL, VT));
49768 if (SignMulAmt < 0)
49769 NewMul = DAG.getNegative(NewMul, DL, VT);
49770
49771 return NewMul;
49772 }
49773
49774 uint64_t MulAmt1 = 0;
49775 uint64_t MulAmt2 = 0;
49776 if ((AbsMulAmt % 9) == 0) {
49777 MulAmt1 = 9;
49778 MulAmt2 = AbsMulAmt / 9;
49779 } else if ((AbsMulAmt % 5) == 0) {
49780 MulAmt1 = 5;
49781 MulAmt2 = AbsMulAmt / 5;
49782 } else if ((AbsMulAmt % 3) == 0) {
49783 MulAmt1 = 3;
49784 MulAmt2 = AbsMulAmt / 3;
49785 }
49786
49787 // For negative multiply amounts, only allow MulAmt2 to be a power of 2.
49788 if (MulAmt2 &&
49789 (isPowerOf2_64(MulAmt2) ||
49790 (SignMulAmt >= 0 && (MulAmt2 == 3 || MulAmt2 == 5 || MulAmt2 == 9)))) {
49791
49792 if (isPowerOf2_64(MulAmt2) && !(SignMulAmt >= 0 && N->hasOneUse() &&
49793 N->user_begin()->getOpcode() == ISD::ADD))
49794 // If the second multiplier is a power of 2, issue it first. We want the multiply
49795 // by 3, 5, or 9 to be folded into the addressing mode unless the lone
49796 // use is an add. Only do this for positive multiply amounts since the
49797 // negate would prevent it from being used as an address mode anyway.
49798 std::swap(MulAmt1, MulAmt2);
49799
49800 if (isPowerOf2_64(MulAmt1))
49801 NewMul = DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0),
49802 DAG.getConstant(Log2_64(MulAmt1), DL, MVT::i8));
49803 else
49804 NewMul = DAG.getNode(X86ISD::MUL_IMM, DL, VT, N->getOperand(0),
49805 DAG.getConstant(MulAmt1, DL, VT));
49806
49807 if (isPowerOf2_64(MulAmt2))
49808 NewMul = DAG.getNode(ISD::SHL, DL, VT, NewMul,
49809 DAG.getConstant(Log2_64(MulAmt2), DL, MVT::i8));
49810 else
49811 NewMul = DAG.getNode(X86ISD::MUL_IMM, DL, VT, NewMul,
49812 DAG.getConstant(MulAmt2, DL, VT));
49813
49814 // Negate the result.
49815 if (SignMulAmt < 0)
49816 NewMul = DAG.getNegative(NewMul, DL, VT);
49817 } else if (!Subtarget.slowLEA())
49818 NewMul = combineMulSpecial(C.getZExtValue(), N, DAG, VT, DL);
49819 }
49820 if (!NewMul) {
49821 EVT ShiftVT = VT.isVector() ? VT : MVT::i8;
49822 if (isPowerOf2_64(AbsMulAmt - 1)) {
49823 // (mul x, 2^N + 1) => (add (shl x, N), x)
49824 NewMul = DAG.getNode(
49825 ISD::ADD, DL, VT, N->getOperand(0),
49826 DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0),
49827 DAG.getConstant(Log2_64(AbsMulAmt - 1), DL, ShiftVT)));
49828 if (SignMulAmt < 0)
49829 NewMul = DAG.getNegative(NewMul, DL, VT);
49830 } else if (isPowerOf2_64(AbsMulAmt + 1)) {
49831 // (mul x, 2^N - 1) => (sub (shl x, N), x)
49832 NewMul =
49833 DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0),
49834 DAG.getConstant(Log2_64(AbsMulAmt + 1), DL, ShiftVT));
49835 // To negate, reverse the operands of the subtract.
49836 if (SignMulAmt < 0)
49837 NewMul = DAG.getNode(ISD::SUB, DL, VT, N->getOperand(0), NewMul);
49838 else
49839 NewMul = DAG.getNode(ISD::SUB, DL, VT, NewMul, N->getOperand(0));
49840 } else if (SignMulAmt >= 0 && isPowerOf2_64(AbsMulAmt - 2) &&
49841 (!VT.isVector() || Subtarget.fastImmVectorShift())) {
49842 // (mul x, 2^N + 2) => (add (shl x, N), (add x, x))
49843 NewMul =
49844 DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0),
49845 DAG.getConstant(Log2_64(AbsMulAmt - 2), DL, ShiftVT));
49846 NewMul = DAG.getNode(
49847 ISD::ADD, DL, VT, NewMul,
49848 DAG.getNode(ISD::ADD, DL, VT, N->getOperand(0), N->getOperand(0)));
49849 } else if (SignMulAmt >= 0 && isPowerOf2_64(AbsMulAmt + 2) &&
49850 (!VT.isVector() || Subtarget.fastImmVectorShift())) {
49851 // (mul x, 2^N - 2) => (sub (shl x, N), (add x, x))
49852 NewMul =
49853 DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0),
49854 DAG.getConstant(Log2_64(AbsMulAmt + 2), DL, ShiftVT));
49855 NewMul = DAG.getNode(
49856 ISD::SUB, DL, VT, NewMul,
49857 DAG.getNode(ISD::ADD, DL, VT, N->getOperand(0), N->getOperand(0)));
49858 } else if (SignMulAmt >= 0 && VT.isVector() &&
49859 Subtarget.fastImmVectorShift()) {
49860 uint64_t AbsMulAmtLowBit = AbsMulAmt & (-AbsMulAmt);
49861 uint64_t ShiftAmt1;
49862 std::optional<unsigned> Opc;
49863 if (isPowerOf2_64(AbsMulAmt - AbsMulAmtLowBit)) {
49864 ShiftAmt1 = AbsMulAmt - AbsMulAmtLowBit;
49865 Opc = ISD::ADD;
49866 } else if (isPowerOf2_64(AbsMulAmt + AbsMulAmtLowBit)) {
49867 ShiftAmt1 = AbsMulAmt + AbsMulAmtLowBit;
49868 Opc = ISD::SUB;
49869 }
49870
49871 if (Opc) {
49872 SDValue Shift1 =
49873 DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0),
49874 DAG.getConstant(Log2_64(ShiftAmt1), DL, ShiftVT));
49875 SDValue Shift2 =
49876 DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0),
49877 DAG.getConstant(Log2_64(AbsMulAmtLowBit), DL, ShiftVT));
49878 NewMul = DAG.getNode(*Opc, DL, VT, Shift1, Shift2);
49879 }
49880 }
49881 }
49882
49883 return NewMul;
49884}
49885
49886// Try to form a MULHU or MULHS node by looking for
49887// (srl (mul ext, ext), 16)
49888// TODO: This is X86 specific because we want to be able to handle wide types
49889// before type legalization. But we can only do it if the vector will be
49890// legalized via widening/splitting. Type legalization can't handle promotion
49891// of a MULHU/MULHS. There isn't a way to convey this to the generic DAG
49892// combiner.
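// For example (illustrative):
// (srl (mul (zext vXi16 A), (zext vXi16 B)), 16) -> (zext (mulhu A, B)),
// i.e. a single pmulhuw feeding the zero-extension.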
49893static SDValue combineShiftToPMULH(SDNode *N, SelectionDAG &DAG,
49894 const SDLoc &DL,
49895 const X86Subtarget &Subtarget) {
49896 using namespace SDPatternMatch;
49897 assert((N->getOpcode() == ISD::SRL || N->getOpcode() == ISD::SRA) &&
49898 "SRL or SRA node is required here!");
49899
49900 if (!Subtarget.hasSSE2())
49901 return SDValue();
49902
49903 // Input type should be at least vXi32.
49904 EVT VT = N->getValueType(0);
49905 if (!VT.isVector() || VT.getVectorElementType().getSizeInBits() < 32)
49906 return SDValue();
49907
49908 // The operation must be a multiply shifted right by 16.
49909 SDValue LHS, RHS;
49910 if (!sd_match(N->getOperand(1), m_SpecificInt(16)) ||
49911 !sd_match(N->getOperand(0), m_OneUse(m_Mul(m_Value(LHS), m_Value(RHS)))))
49912 return SDValue();
49913
49914 unsigned ExtOpc = LHS.getOpcode();
49915 if ((ExtOpc != ISD::SIGN_EXTEND && ExtOpc != ISD::ZERO_EXTEND) ||
49916 RHS.getOpcode() != ExtOpc)
49917 return SDValue();
49918
49919 // Peek through the extends.
49920 LHS = LHS.getOperand(0);
49921 RHS = RHS.getOperand(0);
49922
49923 // Ensure the input types match.
49924 EVT MulVT = LHS.getValueType();
49925 if (MulVT.getVectorElementType() != MVT::i16 || RHS.getValueType() != MulVT)
49926 return SDValue();
49927
49928 unsigned Opc = ExtOpc == ISD::SIGN_EXTEND ? ISD::MULHS : ISD::MULHU;
49929 SDValue Mulh = DAG.getNode(Opc, DL, MulVT, LHS, RHS);
49930
49931 ExtOpc = N->getOpcode() == ISD::SRA ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
49932 return DAG.getNode(ExtOpc, DL, VT, Mulh);
49933}
49934
49935static SDValue combineShiftLeft(SDNode *N, SelectionDAG &DAG,
49936 const X86Subtarget &Subtarget) {
49937 using namespace llvm::SDPatternMatch;
49938 SDValue N0 = N->getOperand(0);
49939 SDValue N1 = N->getOperand(1);
49940 ConstantSDNode *N1C = dyn_cast<ConstantSDNode>(N1);
49941 EVT VT = N0.getValueType();
49942 unsigned EltSizeInBits = VT.getScalarSizeInBits();
49943 SDLoc DL(N);
49944
49945 // Exploits AVX2 VSHLV/VSRLV instructions for efficient unsigned vector shifts
49946 // with out-of-bounds clamping.
49947 if (N0.getOpcode() == ISD::VSELECT &&
49948 supportedVectorVarShift(VT, Subtarget, ISD::SHL)) {
49949 SDValue Cond = N0.getOperand(0);
49950 SDValue N00 = N0.getOperand(1);
49951 SDValue N01 = N0.getOperand(2);
49952 // fold shl(select(icmp_ult(amt,BW),x,0),amt) -> avx2 psllv(x,amt)
49953 if (ISD::isConstantSplatVectorAllZeros(N01.getNode()) &&
49954 sd_match(Cond, m_SetCC(m_Specific(N1), m_SpecificInt(EltSizeInBits),
49955 m_SpecificCondCode(ISD::SETULT)))) {
49956 return DAG.getNode(X86ISD::VSHLV, DL, VT, N00, N1);
49957 }
49958 // fold shl(select(icmp_uge(amt,BW),0,x),amt) -> avx2 psllv(x,amt)
49959 if (ISD::isConstantSplatVectorAllZeros(N00.getNode()) &&
49960 sd_match(Cond, m_SetCC(m_Specific(N1), m_SpecificInt(EltSizeInBits),
49961 m_SpecificCondCode(ISD::SETUGE)))) {
49962 return DAG.getNode(X86ISD::VSHLV, DL, VT, N01, N1);
49963 }
49964 }
49965
49966 // fold (shl (and (setcc_c), c1), c2) -> (and setcc_c, (c1 << c2))
49967 // since the result of setcc_c is all zero's or all ones.
49968 if (VT.isInteger() && !VT.isVector() &&
49969 N1C && N0.getOpcode() == ISD::AND &&
49970 N0.getOperand(1).getOpcode() == ISD::Constant) {
49971 SDValue N00 = N0.getOperand(0);
49972 APInt Mask = N0.getConstantOperandAPInt(1);
49973 Mask <<= N1C->getAPIntValue();
49974 bool MaskOK = false;
49975 // We can handle cases concerning bit-widening nodes containing setcc_c if
49976 // we carefully interrogate the mask to make sure we are semantics
49977 // preserving.
49978 // The transform is not safe if the result of C1 << C2 exceeds the bitwidth
49979 // of the underlying setcc_c operation if the setcc_c was zero extended.
49980 // Consider the following example:
49981 // zext(setcc_c) -> i32 0x0000FFFF
49982 // c1 -> i32 0x0000FFFF
49983 // c2 -> i32 0x00000001
49984 // (shl (and (setcc_c), c1), c2) -> i32 0x0001FFFE
49985 // (and setcc_c, (c1 << c2)) -> i32 0x0000FFFE
49986 if (N00.getOpcode() == X86ISD::SETCC_CARRY) {
49987 MaskOK = true;
49988 } else if (N00.getOpcode() == ISD::SIGN_EXTEND &&
49989 N00.getOperand(0).getOpcode() == X86ISD::SETCC_CARRY) {
49990 MaskOK = true;
49991 } else if ((N00.getOpcode() == ISD::ZERO_EXTEND ||
49992 N00.getOpcode() == ISD::ANY_EXTEND) &&
49993 N00.getOperand(0).getOpcode() == X86ISD::SETCC_CARRY) {
49994 MaskOK = Mask.isIntN(N00.getOperand(0).getValueSizeInBits());
49995 }
49996 if (MaskOK && Mask != 0)
49997 return DAG.getNode(ISD::AND, DL, VT, N00, DAG.getConstant(Mask, DL, VT));
49998 }
49999
50000 return SDValue();
50001}
50002
50003static SDValue combineShiftRightArithmetic(SDNode *N, SelectionDAG &DAG,
50004 const X86Subtarget &Subtarget) {
50005 using namespace llvm::SDPatternMatch;
50006 SDValue N0 = N->getOperand(0);
50007 SDValue N1 = N->getOperand(1);
50008 EVT VT = N0.getValueType();
50009 unsigned Size = VT.getSizeInBits();
50010 SDLoc DL(N);
50011
50012 if (SDValue V = combineShiftToPMULH(N, DAG, DL, Subtarget))
50013 return V;
50014
50015 // fold sra(x,umin(amt,bw-1)) -> avx2 psrav(x,amt)
50016 if (supportedVectorVarShift(VT, Subtarget, ISD::SRA)) {
50017 SDValue ShrAmtVal;
50018 if (sd_match(N1, m_UMin(m_Value(ShrAmtVal),
50019 m_SpecificInt(VT.getScalarSizeInBits() - 1))))
50020 return DAG.getNode(X86ISD::VSRAV, DL, VT, N0, ShrAmtVal);
50021 }
50022
50023 // fold (SRA (SHL X, ShlConst), SraConst)
50024 // into (SHL (sext_in_reg X), ShlConst - SraConst)
50025 // or (sext_in_reg X)
50026 // or (SRA (sext_in_reg X), SraConst - ShlConst)
50027 // depending on relation between SraConst and ShlConst.
50028 // We only do this if (Size - ShlConst) is equal to 8, 16 or 32. That allows
50029 // us to do the sext_in_reg from corresponding bit.
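// For example (illustrative), for i32: (sra (shl X, 24), 27) becomes
// (sra (sext_in_reg X, i8), 3), and (sra (shl X, 24), 24) becomes just
// (sext_in_reg X, i8).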
50030
50031 // sexts in X86 are MOVs. The MOVs have the same code size
50032 // as the SHIFTs above (only a SHIFT by 1 has smaller code size).
50033 // However the MOVs have 2 advantages to a SHIFT:
50034 // 1. MOVs can write to a register that differs from source
50035 // 2. MOVs accept memory operands
50036
50037 if (VT.isVector() || N1.getOpcode() != ISD::Constant ||
50038 N0.getOpcode() != ISD::SHL || !N0.hasOneUse() ||
50039 N0.getOperand(1).getOpcode() != ISD::Constant)
50040 return SDValue();
50041
50042 SDValue N00 = N0.getOperand(0);
50043 SDValue N01 = N0.getOperand(1);
50044 APInt ShlConst = N01->getAsAPIntVal();
50045 APInt SraConst = N1->getAsAPIntVal();
50046 EVT CVT = N1.getValueType();
50047
50048 if (CVT != N01.getValueType())
50049 return SDValue();
50050 if (SraConst.isNegative())
50051 return SDValue();
50052
50053 for (MVT SVT : { MVT::i8, MVT::i16, MVT::i32 }) {
50054 unsigned ShiftSize = SVT.getSizeInBits();
50055 // Only deal with (Size - ShlConst) being equal to 8, 16 or 32.
50056 if (ShiftSize >= Size || ShlConst != Size - ShiftSize)
50057 continue;
50058 SDValue NN =
50059 DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, VT, N00, DAG.getValueType(SVT));
50060 if (SraConst.eq(ShlConst))
50061 return NN;
50062 if (SraConst.ult(ShlConst))
50063 return DAG.getNode(ISD::SHL, DL, VT, NN,
50064 DAG.getConstant(ShlConst - SraConst, DL, CVT));
50065 return DAG.getNode(ISD::SRA, DL, VT, NN,
50066 DAG.getConstant(SraConst - ShlConst, DL, CVT));
50067 }
50068 return SDValue();
50069}
50070
50071static SDValue combineShiftRightLogical(SDNode *N, SelectionDAG &DAG,
50072 TargetLowering::DAGCombinerInfo &DCI,
50073 const X86Subtarget &Subtarget) {
50074 using namespace llvm::SDPatternMatch;
50075 SDValue N0 = N->getOperand(0);
50076 SDValue N1 = N->getOperand(1);
50077 EVT VT = N0.getValueType();
50078 unsigned EltSizeInBits = VT.getScalarSizeInBits();
50079 SDLoc DL(N);
50080
50081 if (SDValue V = combineShiftToPMULH(N, DAG, DL, Subtarget))
50082 return V;
50083
50084 // Exploits AVX2 VSHLV/VSRLV instructions for efficient unsigned vector shifts
50085 // with out-of-bounds clamping.
50086 if (N0.getOpcode() == ISD::VSELECT &&
50087 supportedVectorVarShift(VT, Subtarget, ISD::SRL)) {
50088 SDValue Cond = N0.getOperand(0);
50089 SDValue N00 = N0.getOperand(1);
50090 SDValue N01 = N0.getOperand(2);
50091 // fold srl(select(icmp_ult(amt,BW),x,0),amt) -> avx2 psrlv(x,amt)
50092 if (ISD::isConstantSplatVectorAllZeros(N01.getNode()) &&
50093 sd_match(Cond, m_SetCC(m_Specific(N1), m_SpecificInt(EltSizeInBits),
50094 m_SpecificCondCode(ISD::SETULT)))) {
50095 return DAG.getNode(X86ISD::VSRLV, DL, VT, N00, N1);
50096 }
50097 // fold srl(select(icmp_uge(amt,BW),0,x),amt) -> avx2 psrlv(x,amt)
50098 if (ISD::isConstantSplatVectorAllZeros(N00.getNode()) &&
50099 sd_match(Cond, m_SetCC(m_Specific(N1), m_SpecificInt(EltSizeInBits),
50100 m_SpecificCondCode(ISD::SETUGE)))) {
50101 return DAG.getNode(X86ISD::VSRLV, DL, VT, N01, N1);
50102 }
50103 }
50104
50105 // Only do this on the last DAG combine as it can interfere with other
50106 // combines.
50107 if (!DCI.isAfterLegalizeDAG())
50108 return SDValue();
50109
50110 // Try to improve a sequence of srl (and X, C1), C2 by inverting the order.
50111 // TODO: This is a generic DAG combine that became an x86-only combine to
50112 // avoid shortcomings in other folds such as bswap, bit-test ('bt'), and
50113 // and-not ('andn').
50114 if (N0.getOpcode() != ISD::AND || !N0.hasOneUse())
50115 return SDValue();
50116
50117 auto *ShiftC = dyn_cast<ConstantSDNode>(N1);
50118 auto *AndC = dyn_cast<ConstantSDNode>(N0.getOperand(1));
50119 if (!ShiftC || !AndC)
50120 return SDValue();
50121
50122 // If we can shrink the constant mask below 8-bits or 32-bits, then this
50123 // transform should reduce code size. It may also enable secondary transforms
50124 // from improved known-bits analysis or instruction selection.
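// For example (illustrative): (srl (and X, 0xFE0), 5) becomes
// (and (srl X, 5), 0x7F), so the AND can use a sign-extended 8-bit immediate.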
50125 APInt MaskVal = AndC->getAPIntValue();
50126
50127 // If this can be matched by a zero extend, don't optimize.
50128 if (MaskVal.isMask()) {
50129 unsigned TO = MaskVal.countr_one();
50130 if (TO >= 8 && isPowerOf2_32(TO))
50131 return SDValue();
50132 }
50133
50134 APInt NewMaskVal = MaskVal.lshr(ShiftC->getAPIntValue());
50135 unsigned OldMaskSize = MaskVal.getSignificantBits();
50136 unsigned NewMaskSize = NewMaskVal.getSignificantBits();
50137 if ((OldMaskSize > 8 && NewMaskSize <= 8) ||
50138 (OldMaskSize > 32 && NewMaskSize <= 32)) {
50139 // srl (and X, AndC), ShiftC --> and (srl X, ShiftC), (AndC >> ShiftC)
50140 SDValue NewMask = DAG.getConstant(NewMaskVal, DL, VT);
50141 SDValue NewShift = DAG.getNode(ISD::SRL, DL, VT, N0.getOperand(0), N1);
50142 return DAG.getNode(ISD::AND, DL, VT, NewShift, NewMask);
50143 }
50144 return SDValue();
50145}
50146
50147static SDValue combineHorizOpWithShuffle(SDNode *N, SelectionDAG &DAG,
50148 const X86Subtarget &Subtarget) {
50149 unsigned Opcode = N->getOpcode();
50150 assert(isHorizOp(Opcode) && "Unexpected hadd/hsub/pack opcode");
50151
50152 SDLoc DL(N);
50153 EVT VT = N->getValueType(0);
50154 SDValue N0 = N->getOperand(0);
50155 SDValue N1 = N->getOperand(1);
50156 EVT SrcVT = N0.getValueType();
50157
50158 SDValue BC0 =
50159 N->isOnlyUserOf(N0.getNode()) ? peekThroughOneUseBitcasts(N0) : N0;
50160 SDValue BC1 =
50161 N->isOnlyUserOf(N1.getNode()) ? peekThroughOneUseBitcasts(N1) : N1;
50162
50163 // Attempt to fold HOP(LOSUBVECTOR(SHUFFLE(X)),HISUBVECTOR(SHUFFLE(X)))
50164 // to SHUFFLE(HOP(LOSUBVECTOR(X),HISUBVECTOR(X))), this is mainly for
50165 // truncation trees that help us avoid lane crossing shuffles.
50166 // TODO: There's a lot more we can do for PACK/HADD style shuffle combines.
50167 // TODO: We don't handle vXf64 shuffles yet.
50168 if (VT.is128BitVector() && SrcVT.getScalarSizeInBits() <= 32) {
50169 if (SDValue BCSrc = getSplitVectorSrc(BC0, BC1, false)) {
50170 SmallVector<SDValue> ShuffleOps;
50171 SmallVector<int> ShuffleMask, ScaledMask;
50172 SDValue Vec = peekThroughBitcasts(BCSrc);
50173 if (getTargetShuffleInputs(Vec, ShuffleOps, ShuffleMask, DAG)) {
50174 resolveTargetShuffleInputsAndMask(ShuffleOps, ShuffleMask);
50175 // To keep the HOP LHS/RHS coherency, we must be able to scale the unary
50176 // shuffle to a v4X64 width - we can probably relax this in the future.
50177 if (!isAnyZero(ShuffleMask) && ShuffleOps.size() == 1 &&
50178 ShuffleOps[0].getValueType().is256BitVector() &&
50179 scaleShuffleElements(ShuffleMask, 4, ScaledMask)) {
50180 SDValue Lo, Hi;
50181 MVT ShufVT = VT.isFloatingPoint() ? MVT::v4f32 : MVT::v4i32;
50182 std::tie(Lo, Hi) = DAG.SplitVector(ShuffleOps[0], DL);
50183 Lo = DAG.getBitcast(SrcVT, Lo);
50184 Hi = DAG.getBitcast(SrcVT, Hi);
50185 SDValue Res = DAG.getNode(Opcode, DL, VT, Lo, Hi);
50186 Res = DAG.getBitcast(ShufVT, Res);
50187 Res = DAG.getVectorShuffle(ShufVT, DL, Res, Res, ScaledMask);
50188 return DAG.getBitcast(VT, Res);
50189 }
50190 }
50191 }
50192 }
50193
50194 // Attempt to fold HOP(SHUFFLE(X,Y),SHUFFLE(Z,W)) -> SHUFFLE(HOP()).
50195 if (VT.is128BitVector() && SrcVT.getScalarSizeInBits() <= 32) {
50196 // If either/both ops are a shuffle that can scale to v2x64,
50197 // then see if we can perform this as a v4x32 post shuffle.
50198 SmallVector<SDValue> Ops0, Ops1;
50199 SmallVector<int> Mask0, Mask1, ScaledMask0, ScaledMask1;
50200 bool IsShuf0 =
50201 getTargetShuffleInputs(BC0, Ops0, Mask0, DAG) && !isAnyZero(Mask0) &&
50202 scaleShuffleElements(Mask0, 2, ScaledMask0) &&
50203 all_of(Ops0, [](SDValue Op) { return Op.getValueSizeInBits() == 128; });
50204 bool IsShuf1 =
50205 getTargetShuffleInputs(BC1, Ops1, Mask1, DAG) && !isAnyZero(Mask1) &&
50206 scaleShuffleElements(Mask1, 2, ScaledMask1) &&
50207 all_of(Ops1, [](SDValue Op) { return Op.getValueSizeInBits() == 128; });
50208 if (IsShuf0 || IsShuf1) {
50209 if (!IsShuf0) {
50210 Ops0.assign({BC0});
50211 ScaledMask0.assign({0, 1});
50212 }
50213 if (!IsShuf1) {
50214 Ops1.assign({BC1});
50215 ScaledMask1.assign({0, 1});
50216 }
50217
50218 SDValue LHS, RHS;
50219 int PostShuffle[4] = {-1, -1, -1, -1};
50220 auto FindShuffleOpAndIdx = [&](int M, int &Idx, ArrayRef<SDValue> Ops) {
50221 if (M < 0)
50222 return true;
50223 Idx = M % 2;
50224 SDValue Src = Ops[M / 2];
50225 if (!LHS || LHS == Src) {
50226 LHS = Src;
50227 return true;
50228 }
50229 if (!RHS || RHS == Src) {
50230 Idx += 2;
50231 RHS = Src;
50232 return true;
50233 }
50234 return false;
50235 };
50236 if (FindShuffleOpAndIdx(ScaledMask0[0], PostShuffle[0], Ops0) &&
50237 FindShuffleOpAndIdx(ScaledMask0[1], PostShuffle[1], Ops0) &&
50238 FindShuffleOpAndIdx(ScaledMask1[0], PostShuffle[2], Ops1) &&
50239 FindShuffleOpAndIdx(ScaledMask1[1], PostShuffle[3], Ops1)) {
50240 LHS = DAG.getBitcast(SrcVT, LHS);
50241 RHS = DAG.getBitcast(SrcVT, RHS ? RHS : LHS);
50242 MVT ShufVT = VT.isFloatingPoint() ? MVT::v4f32 : MVT::v4i32;
50243 SDValue Res = DAG.getNode(Opcode, DL, VT, LHS, RHS);
50244 Res = DAG.getBitcast(ShufVT, Res);
50245 Res = DAG.getVectorShuffle(ShufVT, DL, Res, Res, PostShuffle);
50246 return DAG.getBitcast(VT, Res);
50247 }
50248 }
50249 }
50250
50251 // Attempt to fold HOP(SHUFFLE(X,Y),SHUFFLE(X,Y)) -> SHUFFLE(HOP(X,Y)).
50252 if (VT.is256BitVector() && Subtarget.hasInt256()) {
50253 SmallVector<int> Mask0, Mask1;
50254 SmallVector<SDValue> Ops0, Ops1;
50255 SmallVector<int, 2> ScaledMask0, ScaledMask1;
50256 if (getTargetShuffleInputs(BC0, Ops0, Mask0, DAG) && !isAnyZero(Mask0) &&
50257 getTargetShuffleInputs(BC1, Ops1, Mask1, DAG) && !isAnyZero(Mask1) &&
50258 !Ops0.empty() && !Ops1.empty() &&
50259 all_of(Ops0,
50260 [](SDValue Op) { return Op.getValueType().is256BitVector(); }) &&
50261 all_of(Ops1,
50262 [](SDValue Op) { return Op.getValueType().is256BitVector(); }) &&
50263 scaleShuffleElements(Mask0, 2, ScaledMask0) &&
50264 scaleShuffleElements(Mask1, 2, ScaledMask1)) {
50265 SDValue Op00 = peekThroughBitcasts(Ops0.front());
50266 SDValue Op10 = peekThroughBitcasts(Ops1.front());
50267 SDValue Op01 = peekThroughBitcasts(Ops0.back());
50268 SDValue Op11 = peekThroughBitcasts(Ops1.back());
50269 if ((Op00 == Op11) && (Op01 == Op10)) {
50270 std::swap(Op10, Op11);
50271 ShuffleVectorSDNode::commuteMask(ScaledMask1);
50272 }
50273 if ((Op00 == Op10) && (Op01 == Op11)) {
50274 const int Map[4] = {0, 2, 1, 3};
50275 SmallVector<int, 4> ShuffleMask(
50276 {Map[ScaledMask0[0]], Map[ScaledMask1[0]], Map[ScaledMask0[1]],
50277 Map[ScaledMask1[1]]});
50278 MVT ShufVT = VT.isFloatingPoint() ? MVT::v4f64 : MVT::v4i64;
50279 SDValue Res = DAG.getNode(Opcode, DL, VT, DAG.getBitcast(SrcVT, Op00),
50280 DAG.getBitcast(SrcVT, Op01));
50281 Res = DAG.getBitcast(ShufVT, Res);
50282 Res = DAG.getVectorShuffle(ShufVT, DL, Res, Res, ShuffleMask);
50283 return DAG.getBitcast(VT, Res);
50284 }
50285 }
50286 }
50287
50288 return SDValue();
50289}
50290
50291static SDValue combineVectorPack(SDNode *N, SelectionDAG &DAG,
50292 TargetLowering::DAGCombinerInfo &DCI,
50293 const X86Subtarget &Subtarget) {
50294 unsigned Opcode = N->getOpcode();
50295 assert((X86ISD::PACKSS == Opcode || X86ISD::PACKUS == Opcode) &&
50296 "Unexpected pack opcode");
50297
50298 EVT VT = N->getValueType(0);
50299 SDValue N0 = N->getOperand(0);
50300 SDValue N1 = N->getOperand(1);
50301 unsigned NumDstElts = VT.getVectorNumElements();
50302 unsigned DstBitsPerElt = VT.getScalarSizeInBits();
50303 unsigned SrcBitsPerElt = 2 * DstBitsPerElt;
50304 assert(N0.getScalarValueSizeInBits() == SrcBitsPerElt &&
50305 N1.getScalarValueSizeInBits() == SrcBitsPerElt &&
50306 "Unexpected PACKSS/PACKUS input type");
50307
50308 bool IsSigned = (X86ISD::PACKSS == Opcode);
50309
50310 // Constant Folding.
50311 APInt UndefElts0, UndefElts1;
50312 SmallVector<APInt, 32> EltBits0, EltBits1;
50313 if ((N0.isUndef() || N->isOnlyUserOf(N0.getNode())) &&
50314 (N1.isUndef() || N->isOnlyUserOf(N1.getNode())) &&
50315 getTargetConstantBitsFromNode(N0, SrcBitsPerElt, UndefElts0, EltBits0,
50316 /*AllowWholeUndefs*/ true,
50317 /*AllowPartialUndefs*/ true) &&
50318 getTargetConstantBitsFromNode(N1, SrcBitsPerElt, UndefElts1, EltBits1,
50319 /*AllowWholeUndefs*/ true,
50320 /*AllowPartialUndefs*/ true)) {
50321 unsigned NumLanes = VT.getSizeInBits() / 128;
50322 unsigned NumSrcElts = NumDstElts / 2;
50323 unsigned NumDstEltsPerLane = NumDstElts / NumLanes;
50324 unsigned NumSrcEltsPerLane = NumSrcElts / NumLanes;
50325
50326 APInt Undefs(NumDstElts, 0);
50327 SmallVector<APInt, 32> Bits(NumDstElts, APInt::getZero(DstBitsPerElt));
50328 for (unsigned Lane = 0; Lane != NumLanes; ++Lane) {
50329 for (unsigned Elt = 0; Elt != NumDstEltsPerLane; ++Elt) {
50330 unsigned SrcIdx = Lane * NumSrcEltsPerLane + Elt % NumSrcEltsPerLane;
50331 auto &UndefElts = (Elt >= NumSrcEltsPerLane ? UndefElts1 : UndefElts0);
50332 auto &EltBits = (Elt >= NumSrcEltsPerLane ? EltBits1 : EltBits0);
50333
50334 if (UndefElts[SrcIdx]) {
50335 Undefs.setBit(Lane * NumDstEltsPerLane + Elt);
50336 continue;
50337 }
50338
50339 APInt &Val = EltBits[SrcIdx];
50340 if (IsSigned) {
50341 // PACKSS: Truncate signed value with signed saturation.
50342 // Source values less than dst minint are saturated to minint.
50343 // Source values greater than dst maxint are saturated to maxint.
50344 Val = Val.truncSSat(DstBitsPerElt);
50345 } else {
50346 // PACKUS: Truncate signed value with unsigned saturation.
50347 // Source values less than zero are saturated to zero.
50348 // Source values greater than dst maxuint are saturated to maxuint.
50349 // NOTE: This is different from APInt::truncUSat.
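// e.g. packing i16 -> i8: PACKUS maps -1 to 0 and 300 to 255, whereas PACKSS
// (above) maps 300 to 127 and -200 to -128.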
50350 if (Val.isIntN(DstBitsPerElt))
50351 Val = Val.trunc(DstBitsPerElt);
50352 else if (Val.isNegative())
50353 Val = APInt::getZero(DstBitsPerElt);
50354 else
50355 Val = APInt::getAllOnes(DstBitsPerElt);
50356 }
50357 Bits[Lane * NumDstEltsPerLane + Elt] = Val;
50358 }
50359 }
50360
50361 return getConstVector(Bits, Undefs, VT.getSimpleVT(), DAG, SDLoc(N));
50362 }
50363
50364 // Try to fold PACK(SHUFFLE(),SHUFFLE()) -> SHUFFLE(PACK()).
50365 if (SDValue V = combineHorizOpWithShuffle(N, DAG, Subtarget))
50366 return V;
50367
50368 // Try to fold PACKSS(NOT(X),NOT(Y)) -> NOT(PACKSS(X,Y)).
50369 // Currently limit this to allsignbits cases only.
50370 if (IsSigned &&
50371 (N0.isUndef() || DAG.ComputeNumSignBits(N0) == SrcBitsPerElt) &&
50372 (N1.isUndef() || DAG.ComputeNumSignBits(N1) == SrcBitsPerElt)) {
50373 SDValue Not0 = N0.isUndef() ? N0 : IsNOT(N0, DAG);
50374 SDValue Not1 = N1.isUndef() ? N1 : IsNOT(N1, DAG);
50375 if (Not0 && Not1) {
50376 SDLoc DL(N);
50377 MVT SrcVT = N0.getSimpleValueType();
50378 SDValue Pack =
50379 DAG.getNode(X86ISD::PACKSS, DL, VT, DAG.getBitcast(SrcVT, Not0),
50380 DAG.getBitcast(SrcVT, Not1));
50381 return DAG.getNOT(DL, Pack, VT);
50382 }
50383 }
50384
50385 // Try to combine a PACKUSWB/PACKSSWB implemented truncate with a regular
50386 // truncate to create a larger truncate.
50387 if (Subtarget.hasAVX512() &&
50388 N0.getOpcode() == ISD::TRUNCATE && N1.isUndef() && VT == MVT::v16i8 &&
50389 N0.getOperand(0).getValueType() == MVT::v8i32) {
50390 if ((IsSigned && DAG.ComputeNumSignBits(N0) > 8) ||
50391 (!IsSigned &&
50392 DAG.MaskedValueIsZero(N0, APInt::getHighBitsSet(16, 8)))) {
50393 if (Subtarget.hasVLX())
50394 return DAG.getNode(X86ISD::VTRUNC, SDLoc(N), VT, N0.getOperand(0));
50395
50396 // Widen input to v16i32 so we can truncate that.
50397 SDLoc dl(N);
50398 SDValue Concat = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v16i32,
50399 N0.getOperand(0), DAG.getUNDEF(MVT::v8i32));
50400 return DAG.getNode(ISD::TRUNCATE, SDLoc(N), VT, Concat);
50401 }
50402 }
50403
50404 // Try to fold PACK(EXTEND(X),EXTEND(Y)) -> CONCAT(X,Y) subvectors.
50405 if (VT.is128BitVector()) {
50406 unsigned ExtOpc = IsSigned ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
50407 SDValue Src0, Src1;
50408 if (N0.getOpcode() == ExtOpc &&
50409 N0.getOperand(0).getValueType().is64BitVector() &&
50410 N0.getOperand(0).getScalarValueSizeInBits() == DstBitsPerElt) {
50411 Src0 = N0.getOperand(0);
50412 }
50413 if (N1.getOpcode() == ExtOpc &&
50414 N1.getOperand(0).getValueType().is64BitVector() &&
50415 N1.getOperand(0).getScalarValueSizeInBits() == DstBitsPerElt) {
50416 Src1 = N1.getOperand(0);
50417 }
50418 if ((Src0 || N0.isUndef()) && (Src1 || N1.isUndef())) {
50419 assert((Src0 || Src1) && "Found PACK(UNDEF,UNDEF)");
50420 Src0 = Src0 ? Src0 : DAG.getUNDEF(Src1.getValueType());
50421 Src1 = Src1 ? Src1 : DAG.getUNDEF(Src0.getValueType());
50422 return DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(N), VT, Src0, Src1);
50423 }
50424
50425 // Try again with pack(*_extend_vector_inreg, undef).
50426 unsigned VecInRegOpc = IsSigned ? ISD::SIGN_EXTEND_VECTOR_INREG
50427 : ISD::ZERO_EXTEND_VECTOR_INREG;
50428 if (N0.getOpcode() == VecInRegOpc && N1.isUndef() &&
50429 N0.getOperand(0).getScalarValueSizeInBits() < DstBitsPerElt)
50430 return getEXTEND_VECTOR_INREG(ExtOpc, SDLoc(N), VT, N0.getOperand(0),
50431 DAG);
50432 }
50433
50434 // Attempt to combine as shuffle.
50435 SDValue Op(N, 0);
50436 if (SDValue Res = combineX86ShufflesRecursively(Op, DAG, Subtarget))
50437 return Res;
50438
50439 return SDValue();
50440}
50441
50442static SDValue combineVectorHADDSUB(SDNode *N, SelectionDAG &DAG,
50443 TargetLowering::DAGCombinerInfo &DCI,
50444 const X86Subtarget &Subtarget) {
50445 assert((X86ISD::HADD == N->getOpcode() || X86ISD::FHADD == N->getOpcode() ||
50446 X86ISD::HSUB == N->getOpcode() || X86ISD::FHSUB == N->getOpcode()) &&
50447 "Unexpected horizontal add/sub opcode");
50448
50449 if (!shouldUseHorizontalOp(true, DAG, Subtarget)) {
50450 MVT VT = N->getSimpleValueType(0);
50451 SDValue LHS = N->getOperand(0);
50452 SDValue RHS = N->getOperand(1);
50453
50454 // HOP(HOP'(X,X),HOP'(Y,Y)) -> HOP(PERMUTE(HOP'(X,Y)),PERMUTE(HOP'(X,Y)).
50455 if (LHS != RHS && LHS.getOpcode() == N->getOpcode() &&
50456 LHS.getOpcode() == RHS.getOpcode() &&
50457 LHS.getValueType() == RHS.getValueType() &&
50458 N->isOnlyUserOf(LHS.getNode()) && N->isOnlyUserOf(RHS.getNode())) {
50459 SDValue LHS0 = LHS.getOperand(0);
50460 SDValue LHS1 = LHS.getOperand(1);
50461 SDValue RHS0 = RHS.getOperand(0);
50462 SDValue RHS1 = RHS.getOperand(1);
50463 if ((LHS0 == LHS1 || LHS0.isUndef() || LHS1.isUndef()) &&
50464 (RHS0 == RHS1 || RHS0.isUndef() || RHS1.isUndef())) {
50465 SDLoc DL(N);
50466 SDValue Res = DAG.getNode(LHS.getOpcode(), DL, LHS.getValueType(),
50467 LHS0.isUndef() ? LHS1 : LHS0,
50468 RHS0.isUndef() ? RHS1 : RHS0);
50469 MVT ShufVT = MVT::getVectorVT(MVT::i32, VT.getSizeInBits() / 32);
50470 Res = DAG.getBitcast(ShufVT, Res);
50471 SDValue NewLHS =
50472 DAG.getNode(X86ISD::PSHUFD, DL, ShufVT, Res,
50473 getV4X86ShuffleImm8ForMask({0, 1, 0, 1}, DL, DAG));
50474 SDValue NewRHS =
50475 DAG.getNode(X86ISD::PSHUFD, DL, ShufVT, Res,
50476 getV4X86ShuffleImm8ForMask({2, 3, 2, 3}, DL, DAG));
50477 return DAG.getNode(N->getOpcode(), DL, VT, DAG.getBitcast(VT, NewLHS),
50478 DAG.getBitcast(VT, NewRHS));
50479 }
50480 }
50481 }
50482
50483 // Try to fold HOP(SHUFFLE(),SHUFFLE()) -> SHUFFLE(HOP()).
50484 if (SDValue V = combineHorizOpWithShuffle(N, DAG, Subtarget))
50485 return V;
50486
50487 return SDValue();
50488}
50489
50490static SDValue combineVectorShiftVar(SDNode *N, SelectionDAG &DAG,
50491 TargetLowering::DAGCombinerInfo &DCI,
50492 const X86Subtarget &Subtarget) {
50493 assert((X86ISD::VSHL == N->getOpcode() || X86ISD::VSRA == N->getOpcode() ||
50494 X86ISD::VSRL == N->getOpcode()) &&
50495 "Unexpected shift opcode");
50496 EVT VT = N->getValueType(0);
50497 SDValue N0 = N->getOperand(0);
50498 SDValue N1 = N->getOperand(1);
50499
50500 // Shift zero -> zero.
50501 if (ISD::isBuildVectorAllZeros(N0.getNode()))
50502 return DAG.getConstant(0, SDLoc(N), VT);
50503
50504 // Detect constant shift amounts.
50505 APInt UndefElts;
50506 SmallVector<APInt, 32> EltBits;
50507 if (getTargetConstantBitsFromNode(N1, 64, UndefElts, EltBits,
50508 /*AllowWholeUndefs*/ true,
50509 /*AllowPartialUndefs*/ false)) {
50510 unsigned X86Opc = getTargetVShiftUniformOpcode(N->getOpcode(), false);
50511 return getTargetVShiftByConstNode(X86Opc, SDLoc(N), VT.getSimpleVT(), N0,
50512 EltBits[0].getZExtValue(), DAG);
50513 }
50514
50515 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
50516 APInt DemandedElts = APInt::getAllOnes(VT.getVectorNumElements());
50517 if (TLI.SimplifyDemandedVectorElts(SDValue(N, 0), DemandedElts, DCI))
50518 return SDValue(N, 0);
50519
50520 return SDValue();
50521}
50522
50523static SDValue combineVectorShiftImm(SDNode *N, SelectionDAG &DAG,
50524 TargetLowering::DAGCombinerInfo &DCI,
50525 const X86Subtarget &Subtarget) {
50526 unsigned Opcode = N->getOpcode();
50527 assert((X86ISD::VSHLI == Opcode || X86ISD::VSRAI == Opcode ||
50528 X86ISD::VSRLI == Opcode) &&
50529 "Unexpected shift opcode");
50530 bool LogicalShift = X86ISD::VSHLI == Opcode || X86ISD::VSRLI == Opcode;
50531 EVT VT = N->getValueType(0);
50532 SDValue N0 = N->getOperand(0);
50533 SDValue N1 = N->getOperand(1);
50534 unsigned NumBitsPerElt = VT.getScalarSizeInBits();
50535 assert(VT == N0.getValueType() && (NumBitsPerElt % 8) == 0 &&
50536 "Unexpected value type");
50537 assert(N1.getValueType() == MVT::i8 && "Unexpected shift amount type");
50538
50539 // (shift undef, X) -> 0
50540 if (N0.isUndef())
50541 return DAG.getConstant(0, SDLoc(N), VT);
50542
50543 // Out of range logical bit shifts are guaranteed to be zero.
50544 // Out of range arithmetic bit shifts splat the sign bit.
50545 unsigned ShiftVal = N->getConstantOperandVal(1);
50546 if (ShiftVal >= NumBitsPerElt) {
50547 if (LogicalShift)
50548 return DAG.getConstant(0, SDLoc(N), VT);
50549 ShiftVal = NumBitsPerElt - 1;
50550 }
50551
50552 // (shift X, 0) -> X
50553 if (!ShiftVal)
50554 return N0;
50555
50556 // (shift 0, C) -> 0
50557 if (LogicalShift && ISD::isBuildVectorAllZeros(N0.getNode()))
50558 // N0 is all zeros or undef. We guarantee that the bits shifted into the
50559 // result are all zeros, not undef.
50560 return DAG.getConstant(0, SDLoc(N), VT);
50561
50562 // (VSRAI -1, C) -> -1
50563 if (!LogicalShift && ISD::isBuildVectorAllOnes(N0.getNode()))
50564 // N0 is all ones or undef. We guarantee that the bits shifted into the
50565 // result are all ones, not undef.
50566 return DAG.getAllOnesConstant(SDLoc(N), VT);
50567
50568 auto MergeShifts = [&](SDValue X, uint64_t Amt0, uint64_t Amt1) {
50569 unsigned NewShiftVal = Amt0 + Amt1;
50570 if (NewShiftVal >= NumBitsPerElt) {
50571 // Out of range logical bit shifts are guaranteed to be zero.
50572 // Out of range arithmetic bit shifts splat the sign bit.
50573 if (LogicalShift)
50574 return DAG.getConstant(0, SDLoc(N), VT);
50575 NewShiftVal = NumBitsPerElt - 1;
50576 }
50577 return DAG.getNode(Opcode, SDLoc(N), VT, N0.getOperand(0),
50578 DAG.getTargetConstant(NewShiftVal, SDLoc(N), MVT::i8));
50579 };
50580
50581 // (shift (shift X, C2), C1) -> (shift X, (C1 + C2))
50582 if (Opcode == N0.getOpcode())
50583 return MergeShifts(N0.getOperand(0), ShiftVal, N0.getConstantOperandVal(1));
50584
50585 // (shl (add X, X), C) -> (shl X, (C + 1))
50586 if (Opcode == X86ISD::VSHLI && N0.getOpcode() == ISD::ADD &&
50587 N0.getOperand(0) == N0.getOperand(1))
50588 return MergeShifts(N0.getOperand(0), ShiftVal, 1);
50589
50590 // We can decode 'whole byte' logical bit shifts as shuffles.
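// For example (illustrative): (VSRLI v2i64 X, 16) moves each element down by
// two whole bytes, which is expressible as a byte shuffle with zero fill.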
50591 if (LogicalShift && (ShiftVal % 8) == 0) {
50592 SDValue Op(N, 0);
50593 if (SDValue Res = combineX86ShufflesRecursively(Op, DAG, Subtarget))
50594 return Res;
50595 }
50596
50597 // Attempt to detect an expanded vXi64 SIGN_EXTEND_INREG vXi1 pattern, and
50598 // convert to a splatted v2Xi32 SIGN_EXTEND_INREG pattern:
50599 // psrad(pshufd(psllq(X,63),1,1,3,3),31) ->
50600 // pshufd(psrad(pslld(X,31),31),0,0,2,2).
50601 if (Opcode == X86ISD::VSRAI && NumBitsPerElt == 32 && ShiftVal == 31 &&
50602 N0.getOpcode() == X86ISD::PSHUFD &&
50603 N0.getConstantOperandVal(1) == getV4X86ShuffleImm({1, 1, 3, 3}) &&
50604 N0->hasOneUse()) {
50605 SDValue BC = peekThroughOneUseBitcasts(N0.getOperand(0));
50606 if (BC.getOpcode() == X86ISD::VSHLI &&
50607 BC.getScalarValueSizeInBits() == 64 &&
50608 BC.getConstantOperandVal(1) == 63) {
50609 SDLoc DL(N);
50610 SDValue Src = BC.getOperand(0);
50611 Src = DAG.getBitcast(VT, Src);
50612 Src = DAG.getNode(X86ISD::PSHUFD, DL, VT, Src,
50613 getV4X86ShuffleImm8ForMask({0, 0, 2, 2}, DL, DAG));
50614 Src = DAG.getNode(X86ISD::VSHLI, DL, VT, Src, N1);
50615 Src = DAG.getNode(X86ISD::VSRAI, DL, VT, Src, N1);
50616 return Src;
50617 }
50618 }
50619
50620 auto TryConstantFold = [&](SDValue V) {
50621 APInt UndefElts;
50622 SmallVector<APInt, 32> EltBits;
50623 if (!getTargetConstantBitsFromNode(V, NumBitsPerElt, UndefElts, EltBits,
50624 /*AllowWholeUndefs*/ true,
50625 /*AllowPartialUndefs*/ true))
50626 return SDValue();
50627 assert(EltBits.size() == VT.getVectorNumElements() &&
50628 "Unexpected shift value type");
50629 // Undef elements need to fold to 0. It's possible SimplifyDemandedBits
50630 // created an undef input due to no input bits being demanded, but the user
50631 // still expects 0 in the other bits.
50632 for (unsigned i = 0, e = EltBits.size(); i != e; ++i) {
50633 APInt &Elt = EltBits[i];
50634 if (UndefElts[i])
50635 Elt = 0;
50636 else if (X86ISD::VSHLI == Opcode)
50637 Elt <<= ShiftVal;
50638 else if (X86ISD::VSRAI == Opcode)
50639 Elt.ashrInPlace(ShiftVal);
50640 else
50641 Elt.lshrInPlace(ShiftVal);
50642 }
50643 // Reset undef elements since they were zeroed above.
50644 UndefElts = 0;
50645 return getConstVector(EltBits, UndefElts, VT.getSimpleVT(), DAG, SDLoc(N));
50646 };
50647
50648 // Constant Folding.
50649 if (N->isOnlyUserOf(N0.getNode())) {
50650 if (SDValue C = TryConstantFold(N0))
50651 return C;
50652
50653 // Fold (shift (logic X, C2), C1) -> (logic (shift X, C1), (shift C2, C1))
50654 // Don't break NOT patterns.
50655 SDValue BC = peekThroughOneUseBitcasts(N0);
50656 if (ISD::isBitwiseLogicOp(BC.getOpcode()) &&
50657 BC->isOnlyUserOf(BC.getOperand(1).getNode()) &&
50658 !ISD::isBuildVectorAllOnes(BC.getOperand(1).getNode())) {
50659 if (SDValue RHS = TryConstantFold(BC.getOperand(1))) {
50660 SDLoc DL(N);
50661 SDValue LHS = DAG.getNode(Opcode, DL, VT,
50662 DAG.getBitcast(VT, BC.getOperand(0)), N1);
50663 return DAG.getNode(BC.getOpcode(), DL, VT, LHS, RHS);
50664 }
50665 }
50666 }
50667
50668 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
50669 if (TLI.SimplifyDemandedBits(SDValue(N, 0), APInt::getAllOnes(NumBitsPerElt),
50670 DCI))
50671 return SDValue(N, 0);
50672
50673 return SDValue();
50674}
50675
50676static SDValue combineVectorInsert(SDNode *N, SelectionDAG &DAG,
50677 TargetLowering::DAGCombinerInfo &DCI,
50678 const X86Subtarget &Subtarget) {
50679 EVT VT = N->getValueType(0);
50680 unsigned Opcode = N->getOpcode();
50681 assert(((Opcode == X86ISD::PINSRB && VT == MVT::v16i8) ||
50682 (Opcode == X86ISD::PINSRW && VT == MVT::v8i16) ||
50683 Opcode == ISD::INSERT_VECTOR_ELT) &&
50684 "Unexpected vector insertion");
50685
50686 SDValue Vec = N->getOperand(0);
50687 SDValue Scl = N->getOperand(1);
50688 SDValue Idx = N->getOperand(2);
50689
50690 // Fold insert_vector_elt(undef, elt, 0) --> scalar_to_vector(elt).
50691 if (Opcode == ISD::INSERT_VECTOR_ELT && Vec.isUndef() && isNullConstant(Idx))
50692 return DAG.getNode(ISD::SCALAR_TO_VECTOR, SDLoc(N), VT, Scl);
50693
50694 if (Opcode == X86ISD::PINSRB || Opcode == X86ISD::PINSRW) {
50695 unsigned NumBitsPerElt = VT.getScalarSizeInBits();
50696 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
50697 if (TLI.SimplifyDemandedBits(SDValue(N, 0),
50698 APInt::getAllOnes(NumBitsPerElt), DCI))
50699 return SDValue(N, 0);
50700 }
50701
50702 // Attempt to combine insertion patterns to a shuffle.
50703 if (VT.isSimple() && DCI.isAfterLegalizeDAG()) {
50704 SDValue Op(N, 0);
50705 if (SDValue Res = combineX86ShufflesRecursively(Op, DAG, Subtarget))
50706 return Res;
50707 }
50708
50709 return SDValue();
50710}
50711
50712/// Recognize the distinctive (AND (setcc ...) (setcc ..)) where both setccs
50713/// reference the same FP CMP, and rewrite for CMPEQSS and friends. Likewise for
50714/// OR -> CMPNEQSS.
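///
/// For illustration, for a scalar f32 compare with SSE2 this turns
///   and (setcc e, (fcmp x, y)), (setcc np, (fcmp x, y))
/// into (conceptually)
///   trunc (and (bitcast i32 (cmpeqss x, y)), 1)
/// since CMPEQSS yields an all-ones or all-zeros value whose low bit is the
/// desired boolean, so the pair of EFLAGS tests collapses to a single mask
/// extraction.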
50715static SDValue combineCompareEqual(SDNode *N, SelectionDAG &DAG,
50716 TargetLowering::DAGCombinerInfo &DCI,
50717 const X86Subtarget &Subtarget) {
50718 unsigned opcode;
50719
50720 // SSE1 supports CMP{eq|ne}SS, and SSE2 added CMP{eq|ne}SD, but
50721 // we're requiring SSE2 for both.
50722 if (Subtarget.hasSSE2() && isAndOrOfSetCCs(SDValue(N, 0U), opcode)) {
50723 SDValue N0 = N->getOperand(0);
50724 SDValue N1 = N->getOperand(1);
50725 SDValue CMP0 = N0.getOperand(1);
50726 SDValue CMP1 = N1.getOperand(1);
50727 SDLoc DL(N);
50728
50729 // The SETCCs should both refer to the same CMP.
50730 if (CMP0.getOpcode() != X86ISD::FCMP || CMP0 != CMP1)
50731 return SDValue();
50732
50733 SDValue CMP00 = CMP0->getOperand(0);
50734 SDValue CMP01 = CMP0->getOperand(1);
50735 EVT VT = CMP00.getValueType();
50736
50737 if (VT == MVT::f32 || VT == MVT::f64 ||
50738 (VT == MVT::f16 && Subtarget.hasFP16())) {
50739 bool ExpectingFlags = false;
50740 // Check for any users that want flags:
50741 for (const SDNode *U : N->users()) {
50742 if (ExpectingFlags)
50743 break;
50744
50745 switch (U->getOpcode()) {
50746 default:
50747 case ISD::BR_CC:
50748 case ISD::BRCOND:
50749 case ISD::SELECT:
50750 ExpectingFlags = true;
50751 break;
50752 case ISD::CopyToReg:
50753 case ISD::SIGN_EXTEND:
50754 case ISD::ZERO_EXTEND:
50755 case ISD::ANY_EXTEND:
50756 break;
50757 }
50758 }
50759
50760 if (!ExpectingFlags) {
50761 enum X86::CondCode cc0 = (enum X86::CondCode)N0.getConstantOperandVal(0);
50762 enum X86::CondCode cc1 = (enum X86::CondCode)N1.getConstantOperandVal(0);
50763
50764 if (cc1 == X86::COND_E || cc1 == X86::COND_NE) {
50765 X86::CondCode tmp = cc0;
50766 cc0 = cc1;
50767 cc1 = tmp;
50768 }
50769
50770 if ((cc0 == X86::COND_E && cc1 == X86::COND_NP) ||
50771 (cc0 == X86::COND_NE && cc1 == X86::COND_P)) {
50772 // FIXME: need symbolic constants for these magic numbers.
50773 // See X86ATTInstPrinter.cpp:printSSECC().
50774 unsigned x86cc = (cc0 == X86::COND_E) ? 0 : 4;
50775 if (Subtarget.hasAVX512()) {
50776 SDValue FSetCC =
50777 DAG.getNode(X86ISD::FSETCCM, DL, MVT::v1i1, CMP00, CMP01,
50778 DAG.getTargetConstant(x86cc, DL, MVT::i8));
50779 // Need to fill with zeros to ensure the bitcast will produce zeroes
50780 // for the upper bits. An EXTRACT_ELEMENT here wouldn't guarantee that.
50781 SDValue Ins = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, MVT::v16i1,
50782 DAG.getConstant(0, DL, MVT::v16i1),
50783 FSetCC, DAG.getVectorIdxConstant(0, DL));
50784 return DAG.getZExtOrTrunc(DAG.getBitcast(MVT::i16, Ins), DL,
50785 N->getSimpleValueType(0));
50786 }
50787 SDValue OnesOrZeroesF =
50788 DAG.getNode(X86ISD::FSETCC, DL, CMP00.getValueType(), CMP00,
50789 CMP01, DAG.getTargetConstant(x86cc, DL, MVT::i8));
50790
50791 bool is64BitFP = (CMP00.getValueType() == MVT::f64);
50792 MVT IntVT = is64BitFP ? MVT::i64 : MVT::i32;
50793
50794 if (is64BitFP && !Subtarget.is64Bit()) {
50795 // On a 32-bit target, we cannot bitcast the 64-bit float to a
50796 // 64-bit integer, since that's not a legal type. Since
50797 // OnesOrZeroesF is all ones or all zeroes, we don't need all the
50798 // bits, but can do this little dance to extract the lowest 32 bits
50799 // and work with those going forward.
50800 SDValue Vector64 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL,
50801 MVT::v2f64, OnesOrZeroesF);
50802 SDValue Vector32 = DAG.getBitcast(MVT::v4f32, Vector64);
50803 OnesOrZeroesF =
50804 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, Vector32,
50805 DAG.getVectorIdxConstant(0, DL));
50806 IntVT = MVT::i32;
50807 }
50808
50809 SDValue OnesOrZeroesI = DAG.getBitcast(IntVT, OnesOrZeroesF);
50810 SDValue ANDed = DAG.getNode(ISD::AND, DL, IntVT, OnesOrZeroesI,
50811 DAG.getConstant(1, DL, IntVT));
50812 SDValue OneBitOfTruth = DAG.getNode(ISD::TRUNCATE, DL, MVT::i8,
50813 ANDed);
50814 return OneBitOfTruth;
50815 }
50816 }
50817 }
50818 }
50819 return SDValue();
50820}
50821
50822/// Try to fold: (and (xor X, -1), Y) -> (andnp X, Y).
50823static SDValue combineAndNotIntoANDNP(SDNode *N, const SDLoc &DL,
50824 SelectionDAG &DAG) {
50825 assert(N->getOpcode() == ISD::AND && "Unexpected opcode combine into ANDNP");
50826
50827 MVT VT = N->getSimpleValueType(0);
50828 if (!VT.is128BitVector() && !VT.is256BitVector() && !VT.is512BitVector())
50829 return SDValue();
50830
50831 SDValue X, Y;
50832 SDValue N0 = N->getOperand(0);
50833 SDValue N1 = N->getOperand(1);
50834
50835 if (SDValue Not = IsNOT(N0, DAG)) {
50836 X = Not;
50837 Y = N1;
50838 } else if (SDValue Not = IsNOT(N1, DAG)) {
50839 X = Not;
50840 Y = N0;
50841 } else
50842 return SDValue();
50843
50844 X = DAG.getBitcast(VT, X);
50845 Y = DAG.getBitcast(VT, Y);
50846 return DAG.getNode(X86ISD::ANDNP, DL, VT, X, Y);
50847}
50848
50849/// Try to fold:
50850/// and (vector_shuffle<Z,...,Z>
50851/// (insert_vector_elt undef, (xor X, -1), Z), undef), Y
50852/// ->
50853/// andnp (vector_shuffle<Z,...,Z>
50854/// (insert_vector_elt undef, X, Z), undef), Y
50855static SDValue combineAndShuffleNot(SDNode *N, SelectionDAG &DAG,
50856 const X86Subtarget &Subtarget) {
50857 assert(N->getOpcode() == ISD::AND && "Unexpected opcode combine into ANDNP");
50858
50859 EVT VT = N->getValueType(0);
50860 // Do not split 256 and 512 bit vectors with SSE2 as they overwrite original
50861 // value and require extra moves.
50862 if (!((VT.is128BitVector() && Subtarget.hasSSE2()) ||
50863 ((VT.is256BitVector() || VT.is512BitVector()) && Subtarget.hasAVX())))
50864 return SDValue();
50865
50866 auto GetNot = [&DAG](SDValue V) {
50867 auto *SVN = dyn_cast<ShuffleVectorSDNode>(peekThroughOneUseBitcasts(V));
50868 // TODO: SVN->hasOneUse() is a strong condition. It can be relaxed if all
50869 // end-users are ISD::AND including cases
50870 // (and(extract_vector_element(SVN), Y)).
50871 if (!SVN || !SVN->hasOneUse() || !SVN->isSplat() ||
50872 !SVN->getOperand(1).isUndef()) {
50873 return SDValue();
50874 }
50875 SDValue IVEN = SVN->getOperand(0);
50876 if (IVEN.getOpcode() != ISD::INSERT_VECTOR_ELT ||
50877 !IVEN.getOperand(0).isUndef() || !IVEN.hasOneUse())
50878 return SDValue();
50879 if (!isa<ConstantSDNode>(IVEN.getOperand(2)) ||
50880 IVEN.getConstantOperandAPInt(2) != SVN->getSplatIndex())
50881 return SDValue();
50882 SDValue Src = IVEN.getOperand(1);
50883 if (SDValue Not = IsNOT(Src, DAG)) {
50884 SDValue NotSrc = DAG.getBitcast(Src.getValueType(), Not);
50885 SDValue NotIVEN =
50886 DAG.getNode(ISD::INSERT_VECTOR_ELT, SDLoc(IVEN), IVEN.getValueType(),
50887 IVEN.getOperand(0), NotSrc, IVEN.getOperand(2));
50888 return DAG.getVectorShuffle(SVN->getValueType(0), SDLoc(SVN), NotIVEN,
50889 SVN->getOperand(1), SVN->getMask());
50890 }
50891 return SDValue();
50892 };
50893
50894 SDValue X, Y;
50895 SDValue N0 = N->getOperand(0);
50896 SDValue N1 = N->getOperand(1);
50897 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
50898
50899 if (SDValue Not = GetNot(N0)) {
50900 X = Not;
50901 Y = N1;
50902 } else if (SDValue Not = GetNot(N1)) {
50903 X = Not;
50904 Y = N0;
50905 } else
50906 return SDValue();
50907
50908 X = DAG.getBitcast(VT, X);
50909 Y = DAG.getBitcast(VT, Y);
50910 SDLoc DL(N);
50911
50912 // We do not split for SSE at all, but we need to split vectors for AVX1 and
50913 // AVX2.
50914 if (!Subtarget.useAVX512Regs() && VT.is512BitVector() &&
50915 TLI.isTypeLegal(VT.getHalfNumVectorElementsVT(*DAG.getContext()))) {
50916 SDValue LoX, HiX;
50917 std::tie(LoX, HiX) = splitVector(X, DAG, DL);
50918 SDValue LoY, HiY;
50919 std::tie(LoY, HiY) = splitVector(Y, DAG, DL);
50920 EVT SplitVT = LoX.getValueType();
50921 SDValue LoV = DAG.getNode(X86ISD::ANDNP, DL, SplitVT, {LoX, LoY});
50922 SDValue HiV = DAG.getNode(X86ISD::ANDNP, DL, SplitVT, {HiX, HiY});
50923 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, {LoV, HiV});
50924 }
50925
50926 if (TLI.isTypeLegal(VT))
50927 return DAG.getNode(X86ISD::ANDNP, DL, VT, {X, Y});
50928
50929 return SDValue();
50930}
50931
50932// Try to widen AND, OR and XOR nodes to VT in order to remove casts around
50933// logical operations, like in the example below.
50934// or (and (truncate x, truncate y)),
50935// (xor (truncate z, build_vector (constants)))
50936// Given a target type \p VT, we generate
50937// or (and x, y), (xor z, zext(build_vector (constants)))
50938// given x, y and z are of type \p VT. We can do so, if operands are either
50939// truncates from VT types, the second operand is a vector of constants, can
50940// be recursively promoted or is an existing extension we can extend further.
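//
// For illustration, with VT = v8i32 the input
//   or (and (trunc x), (trunc y)), (xor (trunc z), build_vector(c0..c7))
// is rebuilt as
//   or (and x, y), (xor z, zext(build_vector(c0..c7)))
// so the logic is performed directly in the wide type.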
50941static SDValue PromoteMaskArithmetic(SDValue N, const SDLoc &DL, EVT VT,
50942 SelectionDAG &DAG,
50943 const X86Subtarget &Subtarget,
50944 unsigned Depth) {
50945 // Limit recursion to avoid excessive compile times.
50946 if (Depth >= SelectionDAG::MaxRecursionDepth)
50947 return SDValue();
50948
50949 if (!ISD::isBitwiseLogicOp(N.getOpcode()))
50950 return SDValue();
50951
50952 SDValue N0 = N.getOperand(0);
50953 SDValue N1 = N.getOperand(1);
50954
50955 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
50956 if (!TLI.isOperationLegalOrPromote(N.getOpcode(), VT))
50957 return SDValue();
50958
50959 if (SDValue NN0 =
50960 PromoteMaskArithmetic(N0, DL, VT, DAG, Subtarget, Depth + 1))
50961 N0 = NN0;
50962 else {
50963 // The left side has to be a 'trunc'.
50964 bool LHSTrunc = N0.getOpcode() == ISD::TRUNCATE &&
50965 N0.getOperand(0).getValueType() == VT;
50966 if (LHSTrunc)
50967 N0 = N0.getOperand(0);
50968 else
50969 return SDValue();
50970 }
50971
50972 if (SDValue NN1 =
50973 PromoteMaskArithmetic(N1, DL, VT, DAG, Subtarget, Depth + 1))
50974 N1 = NN1;
50975 else {
50976 // The right side has to be a 'trunc', a (foldable) constant or an
50977 // existing extension we can extend further.
50978 bool RHSTrunc = N1.getOpcode() == ISD::TRUNCATE &&
50979 N1.getOperand(0).getValueType() == VT;
50980 if (RHSTrunc)
50981 N1 = N1.getOperand(0);
50982 else if (ISD::isExtVecInRegOpcode(N1.getOpcode()) && VT.is256BitVector() &&
50983 Subtarget.hasInt256() && N1.hasOneUse())
50984 N1 = DAG.getNode(N1.getOpcode(), DL, VT, N1.getOperand(0));
50985 else if (SDValue Cst =
50986 DAG.FoldConstantArithmetic(ISD::ZERO_EXTEND, DL, VT, {N1}))
50987 N1 = Cst;
50988 else
50989 return SDValue();
50990 }
50991
50992 return DAG.getNode(N.getOpcode(), DL, VT, N0, N1);
50993}
50994
50995// On AVX/AVX2 the type v8i1 is legalized to v8i16, which is an XMM sized
50996// register. In most cases we actually compare or select YMM-sized registers
50997// and mixing the two types creates horrible code. This method optimizes
50998// some of the transition sequences.
50999// Even with AVX-512 this is still useful for removing casts around logical
51000// operations on vXi1 mask types.
51001static SDValue PromoteMaskArithmetic(SDValue N, const SDLoc &DL,
51002 SelectionDAG &DAG,
51003 const X86Subtarget &Subtarget) {
51004 EVT VT = N.getValueType();
51005 assert(VT.isVector() && "Expected vector type");
51006 assert((N.getOpcode() == ISD::ANY_EXTEND ||
51007 N.getOpcode() == ISD::ZERO_EXTEND ||
51008 N.getOpcode() == ISD::SIGN_EXTEND) && "Invalid Node");
51009
51010 SDValue Narrow = N.getOperand(0);
51011 EVT NarrowVT = Narrow.getValueType();
51012
51013 // Generate the wide operation.
51014 SDValue Op = PromoteMaskArithmetic(Narrow, DL, VT, DAG, Subtarget, 0);
51015 if (!Op)
51016 return SDValue();
51017 switch (N.getOpcode()) {
51018 default: llvm_unreachable("Unexpected opcode");
51019 case ISD::ANY_EXTEND:
51020 return Op;
51021 case ISD::ZERO_EXTEND:
51022 return DAG.getZeroExtendInReg(Op, DL, NarrowVT);
51023 case ISD::SIGN_EXTEND:
51024 return DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, VT,
51025 Op, DAG.getValueType(NarrowVT));
51026 }
51027}
51028
51029static unsigned convertIntLogicToFPLogicOpcode(unsigned Opcode) {
51030 unsigned FPOpcode;
51031 switch (Opcode) {
51032 // clang-format off
51033 default: llvm_unreachable("Unexpected input node for FP logic conversion");
51034 case ISD::AND: FPOpcode = X86ISD::FAND; break;
51035 case ISD::OR: FPOpcode = X86ISD::FOR; break;
51036 case ISD::XOR: FPOpcode = X86ISD::FXOR; break;
51037 // clang-format on
51038 }
51039 return FPOpcode;
51040}
51041
51042/// If both input operands of a logic op are being cast from floating-point
51043/// types or FP compares, try to convert this into a floating-point logic node
51044/// to avoid unnecessary moves from SSE to integer registers.
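///
/// For example, with SSE2:
///   xor (bitcast i64 (f64 a)), (bitcast i64 (f64 b))
/// becomes
///   bitcast i64 (fxor f64 a, b)
/// keeping the values in SSE registers instead of bouncing through GPRs.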
51045static SDValue convertIntLogicToFPLogic(unsigned Opc, const SDLoc &DL, EVT VT,
51046 SDValue N0, SDValue N1,
51047 SelectionDAG &DAG,
51048 TargetLowering::DAGCombinerInfo &DCI,
51049 const X86Subtarget &Subtarget) {
51050 assert((Opc == ISD::OR || Opc == ISD::AND || Opc == ISD::XOR) &&
51051 "Unexpected bit opcode");
51052
51053 if (!((N0.getOpcode() == ISD::BITCAST && N1.getOpcode() == ISD::BITCAST) ||
51054 (N0.getOpcode() == ISD::SETCC && N1.getOpcode() == ISD::SETCC)))
51055 return SDValue();
51056
51057 SDValue N00 = N0.getOperand(0);
51058 SDValue N10 = N1.getOperand(0);
51059 EVT N00Type = N00.getValueType();
51060 EVT N10Type = N10.getValueType();
51061
51062 // Ensure that both types are the same and are legal scalar fp types.
51063 if (N00Type != N10Type || !((Subtarget.hasSSE1() && N00Type == MVT::f32) ||
51064 (Subtarget.hasSSE2() && N00Type == MVT::f64) ||
51065 (Subtarget.hasFP16() && N00Type == MVT::f16)))
51066 return SDValue();
51067
51068 if (N0.getOpcode() == ISD::BITCAST && !DCI.isBeforeLegalizeOps()) {
51069 unsigned FPOpcode = convertIntLogicToFPLogicOpcode(Opc);
51070 SDValue FPLogic = DAG.getNode(FPOpcode, DL, N00Type, N00, N10);
51071 return DAG.getBitcast(VT, FPLogic);
51072 }
51073
51074 if (VT != MVT::i1 || N0.getOpcode() != ISD::SETCC || !N0.hasOneUse() ||
51075 !N1.hasOneUse())
51076 return SDValue();
51077
51078 ISD::CondCode CC0 = cast<CondCodeSDNode>(N0.getOperand(2))->get();
51079 ISD::CondCode CC1 = cast<CondCodeSDNode>(N1.getOperand(2))->get();
51080
51081 // The vector ISA for FP predicates is incomplete before AVX, so converting
51082 // COMIS* to CMPS* may not be a win before AVX.
51083 if (!Subtarget.hasAVX() &&
51084 !(cheapX86FSETCC_SSE(CC0) && cheapX86FSETCC_SSE(CC1)))
51085 return SDValue();
51086
51087 // Convert scalar FP compares and logic to vector compares (COMIS* to CMPS*)
51088 // and vector logic:
51089 // logic (setcc N00, N01), (setcc N10, N11) -->
51090 // extelt (logic (setcc (s2v N00), (s2v N01)), setcc (s2v N10), (s2v N11))), 0
51091 unsigned NumElts = 128 / N00Type.getSizeInBits();
51092 EVT VecVT = EVT::getVectorVT(*DAG.getContext(), N00Type, NumElts);
51093 EVT BoolVecVT = EVT::getVectorVT(*DAG.getContext(), MVT::i1, NumElts);
51094 SDValue ZeroIndex = DAG.getVectorIdxConstant(0, DL);
51095 SDValue N01 = N0.getOperand(1);
51096 SDValue N11 = N1.getOperand(1);
51097 SDValue Vec00 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecVT, N00);
51098 SDValue Vec01 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecVT, N01);
51099 SDValue Vec10 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecVT, N10);
51100 SDValue Vec11 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecVT, N11);
51101 SDValue Setcc0 = DAG.getSetCC(DL, BoolVecVT, Vec00, Vec01, CC0);
51102 SDValue Setcc1 = DAG.getSetCC(DL, BoolVecVT, Vec10, Vec11, CC1);
51103 SDValue Logic = DAG.getNode(Opc, DL, BoolVecVT, Setcc0, Setcc1);
51104 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, Logic, ZeroIndex);
51105}
51106
51107// Attempt to fold BITOP(MOVMSK(X),MOVMSK(Y)) -> MOVMSK(BITOP(X,Y))
51108// to reduce XMM->GPR traffic.
51109static SDValue combineBitOpWithMOVMSK(unsigned Opc, const SDLoc &DL, SDValue N0,
51110 SDValue N1, SelectionDAG &DAG) {
51111 assert((Opc == ISD::OR || Opc == ISD::AND || Opc == ISD::XOR) &&
51112 "Unexpected bit opcode");
51113
51114 // Both operands must be single use MOVMSK.
51115 if (N0.getOpcode() != X86ISD::MOVMSK || !N0.hasOneUse() ||
51116 N1.getOpcode() != X86ISD::MOVMSK || !N1.hasOneUse())
51117 return SDValue();
51118
51119 SDValue Vec0 = N0.getOperand(0);
51120 SDValue Vec1 = N1.getOperand(0);
51121 EVT VecVT0 = Vec0.getValueType();
51122 EVT VecVT1 = Vec1.getValueType();
51123
51124 // Both MOVMSK operands must be from vectors of the same size and same element
51125 // size, but it's OK for an fp/int difference.
51126 if (VecVT0.getSizeInBits() != VecVT1.getSizeInBits() ||
51127 VecVT0.getScalarSizeInBits() != VecVT1.getScalarSizeInBits())
51128 return SDValue();
51129
51130 unsigned VecOpc =
51131 VecVT0.isFloatingPoint() ? convertIntLogicToFPLogicOpcode(Opc) : Opc;
51132 SDValue Result =
51133 DAG.getNode(VecOpc, DL, VecVT0, Vec0, DAG.getBitcast(VecVT0, Vec1));
51134 return DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, Result);
51135}
51136
51137// Attempt to fold BITOP(SHIFT(X,Z),SHIFT(Y,Z)) -> SHIFT(BITOP(X,Y),Z).
51138// NOTE: This is a very limited case of what SimplifyUsingDistributiveLaws
51139// handles in InstCombine.
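//
// For example:
//   or (vsrli x, 5), (vsrli y, 5) --> vsrli (or x, y), 5
// replacing two shifts with one when both shift amounts match.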
51140static SDValue combineBitOpWithShift(unsigned Opc, const SDLoc &DL, EVT VT,
51141 SDValue N0, SDValue N1,
51142 SelectionDAG &DAG) {
51143 assert((Opc == ISD::OR || Opc == ISD::AND || Opc == ISD::XOR) &&
51144 "Unexpected bit opcode");
51145
51146 // Both operands must be single use.
51147 if (!N0.hasOneUse() || !N1.hasOneUse())
51148 return SDValue();
51149
51150 // Search for matching shifts.
51151 SDValue BC0 = peekThroughOneUseBitcasts(N0);
51152 SDValue BC1 = peekThroughOneUseBitcasts(N1);
51153
51154 unsigned BCOpc = BC0.getOpcode();
51155 EVT BCVT = BC0.getValueType();
51156 if (BCOpc != BC1->getOpcode() || BCVT != BC1.getValueType())
51157 return SDValue();
51158
51159 switch (BCOpc) {
51160 case X86ISD::VSHLI:
51161 case X86ISD::VSRLI:
51162 case X86ISD::VSRAI: {
51163 if (BC0.getOperand(1) != BC1.getOperand(1))
51164 return SDValue();
51165 SDValue BitOp =
51166 DAG.getNode(Opc, DL, BCVT, BC0.getOperand(0), BC1.getOperand(0));
51167 SDValue Shift = DAG.getNode(BCOpc, DL, BCVT, BitOp, BC0.getOperand(1));
51168 return DAG.getBitcast(VT, Shift);
51169 }
51170 }
51171
51172 return SDValue();
51173}
51174
51175// Attempt to fold:
51176// BITOP(PACKSS(X,Z),PACKSS(Y,W)) --> PACKSS(BITOP(X,Y),BITOP(Z,W)).
51177// TODO: Add PACKUS handling.
51178static SDValue combineBitOpWithPACK(unsigned Opc, const SDLoc &DL, EVT VT,
51179 SDValue N0, SDValue N1, SelectionDAG &DAG) {
51180 assert((Opc == ISD::OR || Opc == ISD::AND || Opc == ISD::XOR) &&
51181 "Unexpected bit opcode");
51182
51183 // Both operands must be single use.
51184 if (!N0.hasOneUse() || !N1.hasOneUse())
51185 return SDValue();
51186
51187 // Search for matching packs.
51188 N0 = peekThroughOneUseBitcasts(N0);
51189 N1 = peekThroughOneUseBitcasts(N1);
51190
51191 if (N0.getOpcode() != X86ISD::PACKSS || N1.getOpcode() != X86ISD::PACKSS)
51192 return SDValue();
51193
51194 MVT DstVT = N0.getSimpleValueType();
51195 if (DstVT != N1.getSimpleValueType())
51196 return SDValue();
51197
51198 MVT SrcVT = N0.getOperand(0).getSimpleValueType();
51199 unsigned NumSrcBits = SrcVT.getScalarSizeInBits();
51200
51201 // Limit to allsignbits packing.
51202 if (DAG.ComputeNumSignBits(N0.getOperand(0)) != NumSrcBits ||
51203 DAG.ComputeNumSignBits(N0.getOperand(1)) != NumSrcBits ||
51204 DAG.ComputeNumSignBits(N1.getOperand(0)) != NumSrcBits ||
51205 DAG.ComputeNumSignBits(N1.getOperand(1)) != NumSrcBits)
51206 return SDValue();
51207
51208 SDValue LHS = DAG.getNode(Opc, DL, SrcVT, N0.getOperand(0), N1.getOperand(0));
51209 SDValue RHS = DAG.getNode(Opc, DL, SrcVT, N0.getOperand(1), N1.getOperand(1));
51210 return DAG.getBitcast(VT, DAG.getNode(X86ISD::PACKSS, DL, DstVT, LHS, RHS));
51211}
51212
51213/// If this is a zero/all-bits result that is bitwise-anded with a low-bits
51214/// mask (Mask == 1 for the x86 lowering of a SETCC + ZEXT), replace the 'and'
51215/// with a shift-right to eliminate loading the vector constant mask value.
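///
/// For example, if every element of the other operand is known to be all-ones
/// or all-zeros (EltBitWidth sign bits) and the mask is 1, then
///   and x, 1 --> vsrli x, EltBitWidth - 1
/// which avoids materializing the constant mask vector.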
51216static SDValue combineAndMaskToShift(SDNode *N, const SDLoc &DL,
51217 SelectionDAG &DAG,
51218 const X86Subtarget &Subtarget) {
51219 SDValue Op0 = peekThroughBitcasts(N->getOperand(0));
51220 SDValue Op1 = peekThroughBitcasts(N->getOperand(1));
51221 EVT VT = Op0.getValueType();
51222 if (VT != Op1.getValueType() || !VT.isSimple() || !VT.isInteger())
51223 return SDValue();
51224
51225 // Try to convert an "is positive" signbit masking operation into arithmetic
51226 // shift and "andn". This saves a materialization of a -1 vector constant.
51227 // The "is negative" variant should be handled more generally because it only
51228 // requires "and" rather than "andn":
51229 // and (pcmpgt X, -1), Y --> pandn (vsrai X, BitWidth - 1), Y
51230 //
51231 // This is limited to the original type to avoid producing even more bitcasts.
51232 // If the bitcasts can't be eliminated, then it is unlikely that this fold
51233 // will be profitable.
51234 if (N->getValueType(0) == VT &&
51235 supportedVectorShiftWithImm(VT, Subtarget, ISD::SRA)) {
51236 SDValue X, Y;
51237 if (Op1.getOpcode() == X86ISD::PCMPGT &&
51238 isAllOnesOrAllOnesSplat(Op1.getOperand(1)) && Op1.hasOneUse()) {
51239 X = Op1.getOperand(0);
51240 Y = Op0;
51241 } else if (Op0.getOpcode() == X86ISD::PCMPGT &&
51242 isAllOnesOrAllOnesSplat(Op0.getOperand(1)) && Op0.hasOneUse()) {
51243 X = Op0.getOperand(0);
51244 Y = Op1;
51245 }
51246 if (X && Y) {
51247 SDValue Sra =
51248 getTargetVShiftByConstNode(X86ISD::VSRAI, DL, VT.getSimpleVT(), X,
51249 VT.getScalarSizeInBits() - 1, DAG);
51250 return DAG.getNode(X86ISD::ANDNP, DL, VT, Sra, Y);
51251 }
51252 }
51253
51254 APInt SplatVal;
51255 if (!X86::isConstantSplat(Op1, SplatVal, false) || !SplatVal.isMask())
51256 return SDValue();
51257
51258 // Don't prevent creation of ANDN.
51259 if (isBitwiseNot(Op0))
51260 return SDValue();
51261
51262 if (!supportedVectorShiftWithImm(VT, Subtarget, ISD::SRL))
51263 return SDValue();
51264
51265 unsigned EltBitWidth = VT.getScalarSizeInBits();
51266 if (EltBitWidth != DAG.ComputeNumSignBits(Op0))
51267 return SDValue();
51268
51269 unsigned ShiftVal = SplatVal.countr_one();
51270 SDValue ShAmt = DAG.getTargetConstant(EltBitWidth - ShiftVal, DL, MVT::i8);
51271 SDValue Shift = DAG.getNode(X86ISD::VSRLI, DL, VT, Op0, ShAmt);
51272 return DAG.getBitcast(N->getValueType(0), Shift);
51273}
51274
51275// Get the index node from the lowered DAG of a GEP IR instruction with one
51276// indexing dimension.
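// For a load lowered from a GEP such as 'gep @arr, 0, %idx', the address is
// expected to look like (add (shl %idx, log2(eltsize)), @arr); this helper
// returns the %idx operand of that shift.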
51277static SDValue getIndexFromUnindexedLoad(LoadSDNode *Ld) {
51278 if (Ld->isIndexed())
51279 return SDValue();
51280
51281 SDValue Base = Ld->getBasePtr();
51282 if (Base.getOpcode() != ISD::ADD)
51283 return SDValue();
51284
51285 SDValue ShiftedIndex = Base.getOperand(0);
51286 if (ShiftedIndex.getOpcode() != ISD::SHL)
51287 return SDValue();
51288
51289 return ShiftedIndex.getOperand(0);
51290}
51291
51292static bool hasBZHI(const X86Subtarget &Subtarget, MVT VT) {
51293 return Subtarget.hasBMI2() &&
51294 (VT == MVT::i32 || (VT == MVT::i64 && Subtarget.is64Bit()));
51295}
51296
51297/// Folds (and X, (or Y, ~Z)) --> (and X, ~(and ~Y, Z))
51298/// This undoes the inverse fold performed in InstCombine
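/// By De Morgan's law, (or Y, ~Z) == ~(and ~Y, Z), so the value is unchanged;
/// the rewritten form exposes (and ~Y, Z), which maps onto ANDN when the
/// target has it (hence the hasAndNot() check below).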
51299static SDValue combineAndNotOrIntoAndNotAnd(SDNode *N, const SDLoc &DL,
51300 SelectionDAG &DAG) {
51301 using namespace llvm::SDPatternMatch;
51302 MVT VT = N->getSimpleValueType(0);
51303 if (!DAG.getTargetLoweringInfo().hasAndNot(SDValue(N, 0)))
51304 return SDValue();
51305
51306 SDValue X, Y, Z;
51307 if (sd_match(N, m_And(m_Value(X),
51308 m_OneUse(m_Or(m_Value(Y), m_Not(m_Value(Z))))))) {
51309 // Don't fold if Y or Z are constants to prevent infinite loops.
51310 if (!DAG.isConstantIntBuildVectorOrConstantInt(Y) &&
51311 !DAG.isConstantIntBuildVectorOrConstantInt(Z))
51312 return DAG.getNode(
51313 ISD::AND, DL, VT, X,
51314 DAG.getNOT(
51315 DL, DAG.getNode(ISD::AND, DL, VT, DAG.getNOT(DL, Y, VT), Z), VT));
51316 }
51317
51318 return SDValue();
51319}
51320
51321// This function recognizes cases where the X86 bzhi instruction can replace an
51322// 'and-load' sequence.
51323// In the case of loading an integer value from an array of constants defined
51324// as follows:
51325//
51326// int array[SIZE] = {0x0, 0x1, 0x3, 0x7, 0xF ..., 2^(SIZE-1) - 1}
51327//
51328// and then applying a bitwise AND of the loaded value with another input,
51329// this is equivalent to performing bzhi (zero high bits) on that input, with
51330// the same index used for the load.
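//
// For example, with a 32-bit element type and array[i] == (1 << i) - 1:
//   array[idx] & x  ==  x & ((1 << idx) - 1)  ==  bzhi(x, idx)
// The code below emits the equivalent (x & (~0u >> (32 - idx))) form, which
// the BMI2 patterns then select as BZHI.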
51331static SDValue combineAndLoadToBZHI(SDNode *Node, SelectionDAG &DAG,
51332 const X86Subtarget &Subtarget) {
51333 MVT VT = Node->getSimpleValueType(0);
51334 SDLoc dl(Node);
51335
51336 // Check if subtarget has BZHI instruction for the node's type
51337 if (!hasBZHI(Subtarget, VT))
51338 return SDValue();
51339
51340 // Try matching the pattern for both operands.
51341 for (unsigned i = 0; i < 2; i++) {
51342 // continue if the operand is not a load instruction
51343 auto *Ld = dyn_cast<LoadSDNode>(Node->getOperand(i));
51344 if (!Ld)
51345 continue;
51346 const Value *MemOp = Ld->getMemOperand()->getValue();
51347 if (!MemOp)
51348 continue;
51349 // Get the Node which indexes into the array.
51351 SDValue Index = getIndexFromUnindexedLoad(Ld);
51352 if (!Index)
51352 continue;
51353
51354 if (auto *GEP = dyn_cast<GetElementPtrInst>(MemOp)) {
51355 if (auto *GV = dyn_cast<GlobalVariable>(GEP->getOperand(0))) {
51356 if (GV->isConstant() && GV->hasDefinitiveInitializer()) {
51357 Constant *Init = GV->getInitializer();
51358 Type *Ty = Init->getType();
51359 if (!Ty->isArrayTy() ||
51360 !Ty->getArrayElementType()->isIntegerTy() ||
51361 Ty->getArrayElementType()->getScalarSizeInBits() !=
51362 VT.getSizeInBits() ||
51363 Ty->getArrayNumElements() >
51364 Ty->getArrayElementType()->getScalarSizeInBits())
51365 continue;
51366
51367 // Check if the array's constant elements are suitable to our case.
51368 uint64_t ArrayElementCount = Init->getType()->getArrayNumElements();
51369 bool ConstantsMatch = true;
51370 for (uint64_t j = 0; j < ArrayElementCount; j++) {
51371 auto *Elem = cast<ConstantInt>(Init->getAggregateElement(j));
51372 if (Elem->getZExtValue() != (((uint64_t)1 << j) - 1)) {
51373 ConstantsMatch = false;
51374 break;
51375 }
51376 }
51377 if (!ConstantsMatch)
51378 continue;
51379
51380 // Do the transformation (For 32-bit type):
51381 // -> (and (load arr[idx]), inp)
51382 // <- (and (srl 0xFFFFFFFF, (sub 32, idx)))
51383 // that will be replaced with one bzhi instruction.
51384 SDValue Inp = Node->getOperand(i == 0 ? 1 : 0);
51385 SDValue SizeC = DAG.getConstant(VT.getSizeInBits(), dl, MVT::i32);
51386
51387 Index = DAG.getZExtOrTrunc(Index, dl, MVT::i32);
51388 SDValue Sub = DAG.getNode(ISD::SUB, dl, MVT::i32, SizeC, Index);
51389 Sub = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, Sub);
51390
51391 SDValue AllOnes = DAG.getAllOnesConstant(dl, VT);
51392 SDValue LShr = DAG.getNode(ISD::SRL, dl, VT, AllOnes, Sub);
51393 return DAG.getNode(ISD::AND, dl, VT, Inp, LShr);
51394 }
51395 }
51396 }
51397 }
51398 return SDValue();
51399}
51400
51401// Look for (and (bitcast (vXi1 (concat_vectors (vYi1 setcc), undef,))), C)
51402// Where C is a mask containing the same number of bits as the setcc and
51403// where the setcc will freely zero the upper bits of the k-register. We can
51404// replace the undef in the concat with 0s and remove the AND. This mainly
51405// helps with v2i1/v4i1 setcc being cast to scalar.
51406static SDValue combineScalarAndWithMaskSetcc(SDNode *N, SelectionDAG &DAG,
51407 const X86Subtarget &Subtarget) {
51408 assert(N->getOpcode() == ISD::AND && "Unexpected opcode!");
51409
51410 EVT VT = N->getValueType(0);
51411
51412 // Make sure this is an AND with constant. We will check the value of the
51413 // constant later.
51414 auto *C1 = dyn_cast<ConstantSDNode>(N->getOperand(1));
51415 if (!C1)
51416 return SDValue();
51417
51418 // This is implied by the ConstantSDNode.
51419 assert(!VT.isVector() && "Expected scalar VT!");
51420
51421 SDValue Src = N->getOperand(0);
51422 if (!Src.hasOneUse())
51423 return SDValue();
51424
51425 // (Optionally) peek through any_extend().
51426 if (Src.getOpcode() == ISD::ANY_EXTEND) {
51427 if (!Src.getOperand(0).hasOneUse())
51428 return SDValue();
51429 Src = Src.getOperand(0);
51430 }
51431
51432 if (Src.getOpcode() != ISD::BITCAST || !Src.getOperand(0).hasOneUse())
51433 return SDValue();
51434
51435 Src = Src.getOperand(0);
51436 EVT SrcVT = Src.getValueType();
51437
51438 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
51439 if (!SrcVT.isVector() || SrcVT.getVectorElementType() != MVT::i1 ||
51440 !TLI.isTypeLegal(SrcVT))
51441 return SDValue();
51442
51443 if (Src.getOpcode() != ISD::CONCAT_VECTORS)
51444 return SDValue();
51445
51446 // We only care about the first subvector of the concat, we expect the
51447 // other subvectors to be ignored due to the AND if we make the change.
51448 SDValue SubVec = Src.getOperand(0);
51449 EVT SubVecVT = SubVec.getValueType();
51450
51451 // The RHS of the AND should be a mask with as many bits as SubVec.
51452 if (!TLI.isTypeLegal(SubVecVT) ||
51453 !C1->getAPIntValue().isMask(SubVecVT.getVectorNumElements()))
51454 return SDValue();
51455
51456 // First subvector should be a setcc with a legal result type or a
51457 // AND containing at least one setcc with a legal result type.
51458 auto IsLegalSetCC = [&](SDValue V) {
51459 if (V.getOpcode() != ISD::SETCC)
51460 return false;
51461 EVT SetccVT = V.getOperand(0).getValueType();
51462 if (!TLI.isTypeLegal(SetccVT) ||
51463 !(Subtarget.hasVLX() || SetccVT.is512BitVector()))
51464 return false;
51465 if (!(Subtarget.hasBWI() || SetccVT.getScalarSizeInBits() >= 32))
51466 return false;
51467 return true;
51468 };
51469 if (!(IsLegalSetCC(SubVec) || (SubVec.getOpcode() == ISD::AND &&
51470 (IsLegalSetCC(SubVec.getOperand(0)) ||
51471 IsLegalSetCC(SubVec.getOperand(1))))))
51472 return SDValue();
51473
51474 // We passed all the checks. Rebuild the concat_vectors with zeroes
51475 // and cast it back to VT.
51476 SDLoc dl(N);
51477 SmallVector<SDValue, 4> Ops(Src.getNumOperands(),
51478 DAG.getConstant(0, dl, SubVecVT));
51479 Ops[0] = SubVec;
51480 SDValue Concat = DAG.getNode(ISD::CONCAT_VECTORS, dl, SrcVT,
51481 Ops);
51482 EVT IntVT = EVT::getIntegerVT(*DAG.getContext(), SrcVT.getSizeInBits());
51483 return DAG.getZExtOrTrunc(DAG.getBitcast(IntVT, Concat), dl, VT);
51484}
51485
51486static SDValue getBMIMatchingOp(unsigned Opc, SelectionDAG &DAG,
51487 SDValue OpMustEq, SDValue Op, unsigned Depth) {
51488 // We don't want to go crazy with the recursion here. This isn't a super
51489 // important optimization.
51490 static constexpr unsigned kMaxDepth = 2;
51491
51492 // Only do this re-ordering if op has one use.
51493 if (!Op.hasOneUse())
51494 return SDValue();
51495
51496 SDLoc DL(Op);
51497 // If we hit another associative op, recurse further.
51498 if (Op.getOpcode() == Opc) {
51499 // Done recursing.
51500 if (Depth++ >= kMaxDepth)
51501 return SDValue();
51502
51503 for (unsigned OpIdx = 0; OpIdx < 2; ++OpIdx)
51504 if (SDValue R =
51505 getBMIMatchingOp(Opc, DAG, OpMustEq, Op.getOperand(OpIdx), Depth))
51506 return DAG.getNode(Op.getOpcode(), DL, Op.getValueType(), R,
51507 Op.getOperand(1 - OpIdx));
51508
51509 } else if (Op.getOpcode() == ISD::SUB) {
51510 if (Opc == ISD::AND) {
51511 // BLSI: (and x, (sub 0, x))
51512 if (isNullConstant(Op.getOperand(0)) && Op.getOperand(1) == OpMustEq)
51513 return DAG.getNode(Opc, DL, Op.getValueType(), OpMustEq, Op);
51514 }
51515 // Opc must be ISD::AND or ISD::XOR
51516 // BLSR: (and x, (sub x, 1))
51517 // BLSMSK: (xor x, (sub x, 1))
51518 if (isOneConstant(Op.getOperand(1)) && Op.getOperand(0) == OpMustEq)
51519 return DAG.getNode(Opc, DL, Op.getValueType(), OpMustEq, Op);
51520
51521 } else if (Op.getOpcode() == ISD::ADD) {
51522 // Opc must be ISD::AND or ISD::XOR
51523 // BLSR: (and x, (add x, -1))
51524 // BLSMSK: (xor x, (add x, -1))
51525 if (isAllOnesConstant(Op.getOperand(1)) && Op.getOperand(0) == OpMustEq)
51526 return DAG.getNode(Opc, DL, Op.getValueType(), OpMustEq, Op);
51527 }
51528 return SDValue();
51529}
51530
51531static SDValue combineBMILogicOp(SDNode *N, SelectionDAG &DAG,
51532 const X86Subtarget &Subtarget) {
51533 EVT VT = N->getValueType(0);
51534 // Make sure this node is a candidate for BMI instructions.
51535 if (!Subtarget.hasBMI() || !VT.isScalarInteger() ||
51536 (VT != MVT::i32 && VT != MVT::i64))
51537 return SDValue();
51538
51539 assert(N->getOpcode() == ISD::AND || N->getOpcode() == ISD::XOR);
51540
51541 // Try and match LHS and RHS.
51542 for (unsigned OpIdx = 0; OpIdx < 2; ++OpIdx)
51543 if (SDValue OpMatch =
51544 getBMIMatchingOp(N->getOpcode(), DAG, N->getOperand(OpIdx),
51545 N->getOperand(1 - OpIdx), 0))
51546 return OpMatch;
51547 return SDValue();
51548}
51549
51550/// Fold AND(Y, XOR(X, NEG(X))) -> ANDN(Y, BLSMSK(X)) if BMI is available.
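/// This holds because XOR(X, NEG(X)) keeps exactly the bits strictly above the
/// lowest set bit of X, while BLSMSK(X) == XOR(X, X - 1) keeps the lowest set
/// bit and everything below it, so XOR(X, NEG(X)) == NOT(BLSMSK(X)).
/// E.g. for X = 0b01100 (low 5 bits shown): X ^ -X = 0b11000 and
/// BLSMSK(X) = 0b00111.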
51551static SDValue combineAndXorSubWithBMI(SDNode *And, const SDLoc &DL,
51552 SelectionDAG &DAG,
51553 const X86Subtarget &Subtarget) {
51554 using namespace llvm::SDPatternMatch;
51555
51556 EVT VT = And->getValueType(0);
51557 // Make sure this node is a candidate for BMI instructions.
51558 if (!Subtarget.hasBMI() || (VT != MVT::i32 && VT != MVT::i64))
51559 return SDValue();
51560
51561 SDValue X;
51562 SDValue Y;
51563 if (!sd_match(And,
51564 m_And(m_OneUse(m_Xor(m_Value(X), m_Neg(m_Deferred(X)))),
51565 m_Value(Y))))
51566 return SDValue();
51567
51568 SDValue BLSMSK =
51569 DAG.getNode(ISD::XOR, DL, VT, X,
51570 DAG.getNode(ISD::SUB, DL, VT, X, DAG.getConstant(1, DL, VT)));
51571 SDValue AndN = DAG.getNode(ISD::AND, DL, VT, Y, DAG.getNOT(DL, BLSMSK, VT));
51572 return AndN;
51573}
51574
51574
51575static SDValue combineX86SubCmpForFlags(SDNode *N, SDValue Flag,
51576 SelectionDAG &DAG,
51577 TargetLowering::DAGCombinerInfo &DCI,
51578 const X86Subtarget &ST) {
51579 // cmp(setcc(cc, X), 0)
51580 // brcond ne
51581 // ->
51582 // X
51583 // brcond cc
51584
51585 // sub(setcc(cc, X), 1)
51586 // brcond ne
51587 // ->
51588 // X
51589 // brcond ~cc
51590 //
51591 // if only flag has users
51592
51593 SDValue SetCC = N->getOperand(0);
51594
51595 if (SetCC.getOpcode() != X86ISD::SETCC || !Flag.hasOneUse())
51596 return SDValue();
51597
51598 // Check the only user of flag is `brcond ne`.
51599 SDNode *BrCond = *Flag->user_begin();
51600 if (BrCond->getOpcode() != X86ISD::BRCOND)
51601 return SDValue();
51602 unsigned CondNo = 2;
51603 if (static_cast<X86::CondCode>(BrCond->getConstantOperandVal(CondNo)) !=
51604 X86::COND_NE)
51605 return SDValue();
51606
51607 SDValue X = SetCC.getOperand(1);
51608 // sub has two results while X only has one. DAG combine assumes the value
51609 // type matches.
51610 if (N->getOpcode() == X86ISD::SUB)
51611 X = DAG.getMergeValues({N->getOperand(0), X}, SDLoc(N));
51612
51613 SDValue CCN = SetCC.getOperand(0);
51614 X86::CondCode CC =
51615 static_cast<X86::CondCode>(CCN->getAsAPIntVal().getSExtValue());
51616 X86::CondCode OppositeCC = X86::GetOppositeBranchCondition(CC);
51617 // Update CC for the consumer of the flag.
51618 // The old CC is `ne`. Hence, when comparing the result with 0, we are
51619 // checking if the second condition evaluates to true. When comparing the
51620 // result with 1, we are checking if the second condition evaluates to false.
51621 SmallVector<SDValue> Ops(BrCond->op_values());
51622 if (isNullConstant(N->getOperand(1)))
51623 Ops[CondNo] = CCN;
51624 else if (isOneConstant(N->getOperand(1)))
51625 Ops[CondNo] = DAG.getTargetConstant(OppositeCC, SDLoc(BrCond), MVT::i8);
51626 else
51627 llvm_unreachable("expect constant 0 or 1");
51628
51629 SDValue NewBrCond =
51630 DAG.getNode(X86ISD::BRCOND, SDLoc(BrCond), BrCond->getValueType(0), Ops);
51631 // Avoid self-assign error b/c CC1 can be `e/ne`.
51632 if (BrCond != NewBrCond.getNode())
51633 DCI.CombineTo(BrCond, NewBrCond);
51634 return X;
51635}
51636
51637static SDValue combineAndOrForCcmpCtest(SDNode *N, SelectionDAG &DAG,
51638 TargetLowering::DAGCombinerInfo &DCI,
51639 const X86Subtarget &ST) {
51640 // and/or(setcc(cc0, flag0), setcc(cc1, sub (X, Y)))
51641 // ->
51642 // setcc(cc1, ccmp(X, Y, ~cflags/cflags, cc0/~cc0, flag0))
51643
51644 // and/or(setcc(cc0, flag0), setcc(cc1, cmp (X, 0)))
51645 // ->
51646 // setcc(cc1, ctest(X, X, ~cflags/cflags, cc0/~cc0, flag0))
51647 //
51648 // where cflags is determined by cc1.
51649
51650 if (!ST.hasCCMP())
51651 return SDValue();
51652
51653 SDValue SetCC0 = N->getOperand(0);
51654 SDValue SetCC1 = N->getOperand(1);
51655 if (SetCC0.getOpcode() != X86ISD::SETCC ||
51656 SetCC1.getOpcode() != X86ISD::SETCC)
51657 return SDValue();
51658
51659 auto GetCombineToOpc = [&](SDValue V) -> unsigned {
51660 SDValue Op = V.getOperand(1);
51661 unsigned Opc = Op.getOpcode();
51662 if (Opc == X86ISD::SUB)
51663 return X86ISD::CCMP;
51664 if (Opc == X86ISD::CMP && isNullConstant(Op.getOperand(1)))
51665 return X86ISD::CTEST;
51666 return 0U;
51667 };
51668
51669 unsigned NewOpc = 0;
51670
51671 // AND/OR is commutable. Canonicalize the operands to make SETCC with SUB/CMP
51672 // appear on the right.
51673 if (!(NewOpc = GetCombineToOpc(SetCC1))) {
51674 std::swap(SetCC0, SetCC1);
51675 if (!(NewOpc = GetCombineToOpc(SetCC1)))
51676 return SDValue();
51677 }
51678
51679 X86::CondCode CC0 =
51680 static_cast<X86::CondCode>(SetCC0.getConstantOperandVal(0));
51681 // CCMP/CTEST is not conditional when the source condition is COND_P/COND_NP.
51682 if (CC0 == X86::COND_P || CC0 == X86::COND_NP)
51683 return SDValue();
51684
51685 bool IsOR = N->getOpcode() == ISD::OR;
51686
51687 // CMP/TEST is executed and updates the EFLAGS normally only when SrcCC
51688 // evaluates to true. So we need to inverse CC0 as SrcCC when the logic
51689 // operator is OR. Similar for CC1.
51690 SDValue SrcCC =
51691 IsOR ? DAG.getTargetConstant(X86::GetOppositeBranchCondition(CC0),
51692 SDLoc(SetCC0.getOperand(0)), MVT::i8)
51693 : SetCC0.getOperand(0);
51694 SDValue CC1N = SetCC1.getOperand(0);
51695 X86::CondCode CC1 =
51696 static_cast<X86::CondCode>(CC1N->getAsAPIntVal().getSExtValue());
51697 X86::CondCode OppositeCC1 = X86::GetOppositeBranchCondition(CC1);
51698 X86::CondCode CFlagsCC = IsOR ? CC1 : OppositeCC1;
51699 SDLoc DL(N);
51700 SDValue CFlags = DAG.getTargetConstant(
51701 X86::getCCMPCondFlagsFromCondCode(CFlagsCC), DL, MVT::i8);
51702 SDValue Sub = SetCC1.getOperand(1);
51703
51704 // Replace any uses of the old flag produced by SUB/CMP with the new one
51705 // produced by CCMP/CTEST.
51706 SDValue CCMP = (NewOpc == X86ISD::CCMP)
51707 ? DAG.getNode(X86ISD::CCMP, DL, MVT::i32,
51708 {Sub.getOperand(0), Sub.getOperand(1),
51709 CFlags, SrcCC, SetCC0.getOperand(1)})
51710 : DAG.getNode(X86ISD::CTEST, DL, MVT::i32,
51711 {Sub.getOperand(0), Sub.getOperand(0),
51712 CFlags, SrcCC, SetCC0.getOperand(1)});
51713
51714 return DAG.getNode(X86ISD::SETCC, DL, MVT::i8, {CC1N, CCMP});
51715}
51716
51717static SDValue combineAnd(SDNode *N, SelectionDAG &DAG,
51718 TargetLowering::DAGCombinerInfo &DCI,
51719 const X86Subtarget &Subtarget) {
51720 using namespace SDPatternMatch;
51721
51722 SDValue N0 = N->getOperand(0);
51723 SDValue N1 = N->getOperand(1);
51724 EVT VT = N->getValueType(0);
51725 SDLoc dl(N);
51726 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
51727
51728 // If this is SSE1 only convert to FAND to avoid scalarization.
51729 if (Subtarget.hasSSE1() && !Subtarget.hasSSE2() && VT == MVT::v4i32) {
51730 return DAG.getBitcast(MVT::v4i32,
51731 DAG.getNode(X86ISD::FAND, dl, MVT::v4f32,
51732 DAG.getBitcast(MVT::v4f32, N0),
51733 DAG.getBitcast(MVT::v4f32, N1)));
51734 }
51735
51736 // Use a 32-bit and+zext if upper bits known zero.
51737 if (VT == MVT::i64 && Subtarget.is64Bit() && !isa<ConstantSDNode>(N1)) {
51738 APInt HiMask = APInt::getHighBitsSet(64, 32);
51739 if (DAG.MaskedValueIsZero(N1, HiMask) ||
51740 DAG.MaskedValueIsZero(N0, HiMask)) {
51741 SDValue LHS = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, N0);
51742 SDValue RHS = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, N1);
51743 return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i64,
51744 DAG.getNode(ISD::AND, dl, MVT::i32, LHS, RHS));
51745 }
51746 }
51747
51748 // Match all-of bool scalar reductions into a bitcast/movmsk + cmp.
51749 // TODO: Support multiple SrcOps.
51750 if (VT == MVT::i1) {
51751 SmallVector<SDValue, 2> SrcOps;
51752 SmallVector<APInt, 2> SrcPartials;
51753 if (matchScalarReduction(SDValue(N, 0), ISD::AND, SrcOps, &SrcPartials) &&
51754 SrcOps.size() == 1) {
51755 unsigned NumElts = SrcOps[0].getValueType().getVectorNumElements();
51756 EVT MaskVT = EVT::getIntegerVT(*DAG.getContext(), NumElts);
51757 SDValue Mask = combineBitcastvxi1(DAG, MaskVT, SrcOps[0], dl, Subtarget);
51758 if (!Mask && TLI.isTypeLegal(SrcOps[0].getValueType()))
51759 Mask = DAG.getBitcast(MaskVT, SrcOps[0]);
51760 if (Mask) {
51761 assert(SrcPartials[0].getBitWidth() == NumElts &&
51762 "Unexpected partial reduction mask");
51763 SDValue PartialBits = DAG.getConstant(SrcPartials[0], dl, MaskVT);
51764 Mask = DAG.getNode(ISD::AND, dl, MaskVT, Mask, PartialBits);
51765 return DAG.getSetCC(dl, MVT::i1, Mask, PartialBits, ISD::SETEQ);
51766 }
51767 }
51768 }
51769
51770 // InstCombine converts:
51771 // `(-x << C0) & C1`
51772 // to
51773 // `(x * (Pow2_Ceil(C1) - (1 << C0))) & C1`
51774 // This saves an IR instruction but on x86 the neg/shift version is preferable
51775 // so undo the transform.
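// For example, with C0 = 2 and C1 = 0xF0 InstCombine produces
//   (x * 252) & 0xF0    (252 == Pow2_Ceil(0xF0) - (1 << 2) == 256 - 4)
// and since 252 == -4 (mod 256), the masked result equals (-x << 2) & 0xF0,
// which is what the code below reconstructs from the multiply constant.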
51776
51777 if (N0.getOpcode() == ISD::MUL && N0.hasOneUse()) {
51778 // TODO: We don't actually need a splat for this, we just need the checks to
51779 // hold for each element.
51780 ConstantSDNode *N1C = isConstOrConstSplat(N1, /*AllowUndefs*/ true,
51781 /*AllowTruncation*/ false);
51782 ConstantSDNode *N01C =
51783 isConstOrConstSplat(N0.getOperand(1), /*AllowUndefs*/ true,
51784 /*AllowTruncation*/ false);
51785 if (N1C && N01C) {
51786 const APInt &MulC = N01C->getAPIntValue();
51787 const APInt &AndC = N1C->getAPIntValue();
51788 APInt MulCLowBit = MulC & (-MulC);
51789 if (MulC.uge(AndC) && !MulC.isPowerOf2() &&
51790 (MulCLowBit + MulC).isPowerOf2()) {
51791 SDValue Neg = DAG.getNegative(N0.getOperand(0), dl, VT);
51792 int32_t MulCLowBitLog = MulCLowBit.exactLogBase2();
51793 assert(MulCLowBitLog != -1 &&
51794 "Isolated lowbit is somehow not a power of 2!");
51795 SDValue Shift = DAG.getNode(ISD::SHL, dl, VT, Neg,
51796 DAG.getConstant(MulCLowBitLog, dl, VT));
51797 return DAG.getNode(ISD::AND, dl, VT, Shift, N1);
51798 }
51799 }
51800 }
51801
51802 if (SDValue SetCC = combineAndOrForCcmpCtest(N, DAG, DCI, Subtarget))
51803 return SetCC;
51804
51805 if (SDValue V = combineScalarAndWithMaskSetcc(N, DAG, Subtarget))
51806 return V;
51807
51808 if (SDValue R = combineBitOpWithMOVMSK(N->getOpcode(), dl, N0, N1, DAG))
51809 return R;
51810
51811 if (SDValue R = combineBitOpWithShift(N->getOpcode(), dl, VT, N0, N1, DAG))
51812 return R;
51813
51814 if (SDValue R = combineBitOpWithPACK(N->getOpcode(), dl, VT, N0, N1, DAG))
51815 return R;
51816
51817 if (SDValue FPLogic = convertIntLogicToFPLogic(N->getOpcode(), dl, VT, N0, N1,
51818 DAG, DCI, Subtarget))
51819 return FPLogic;
51820
51821 if (SDValue R = combineAndShuffleNot(N, DAG, Subtarget))
51822 return R;
51823
51824 if (DCI.isBeforeLegalizeOps())
51825 return SDValue();
51826
51827 if (SDValue R = combineCompareEqual(N, DAG, DCI, Subtarget))
51828 return R;
51829
51830 if (SDValue R = combineAndNotIntoANDNP(N, dl, DAG))
51831 return R;
51832
51833 if (SDValue ShiftRight = combineAndMaskToShift(N, dl, DAG, Subtarget))
51834 return ShiftRight;
51835
51836 if (SDValue R = combineAndLoadToBZHI(N, DAG, Subtarget))
51837 return R;
51838
51839 if (SDValue R = combineAndNotOrIntoAndNotAnd(N, dl, DAG))
51840 return R;
51841
51842 // fold (and (mul x, c1), c2) -> (mul x, (and c1, c2))
51843 // iff c2 is an all/no-bits mask - i.e. a select-with-zero mask.
51844 // TODO: Handle PMULDQ/PMULUDQ/VPMADDWD/VPMADDUBSW?
51845 if (VT.isVector() && getTargetConstantFromNode(N1)) {
51846 unsigned Opc0 = N0.getOpcode();
51847 if ((Opc0 == ISD::MUL || Opc0 == ISD::MULHU || Opc0 == ISD::MULHS) &&
51848 getTargetConstantFromNode(N0.getOperand(1)) &&
51849 DAG.ComputeNumSignBits(N1) == VT.getScalarSizeInBits() &&
51850 N0->hasOneUse() && N0.getOperand(1)->hasOneUse()) {
51851 SDValue MaskMul = DAG.getNode(ISD::AND, dl, VT, N0.getOperand(1), N1);
51852 return DAG.getNode(Opc0, dl, VT, N0.getOperand(0), MaskMul);
51853 }
51854 }
51855
51856 // On AVX512 targets, attempt to reverse foldVSelectToSignBitSplatMask.
51857 // to make use of predicated selects.
51858 // AND(X,SEXT(SETCC())) -> SELECT(SETCC(),X,0)
51859 if (DCI.isAfterLegalizeDAG() && VT.isVector()) {
51860 SDValue X, Y;
51861 EVT CondVT = VT.changeVectorElementType(MVT::i1);
51862 if (TLI.isTypeLegal(VT) && TLI.isTypeLegal(CondVT) &&
51863 (VT.is512BitVector() || Subtarget.hasVLX()) &&
51864 (VT.getScalarSizeInBits() >= 32 || Subtarget.hasBWI()) &&
51865 sd_match(N, m_And(m_Value(X),
51866 m_OneUse(m_SExt(m_AllOf(
51867 m_Value(Y), m_SpecificVT(CondVT),
51868 m_SetCC(m_Value(), m_Value(), m_Value()))))))) {
51869 return DAG.getSelect(dl, VT, Y, X,
51870 getZeroVector(VT.getSimpleVT(), Subtarget, DAG, dl));
51871 }
51872 }
51873
51874 // Fold AND(SRL(X,Y),1) -> SETCC(BT(X,Y), COND_B) iff Y is not a constant
51875 // avoids slow variable shift (moving shift amount to ECX etc.)
51876 if (isOneConstant(N1) && N0->hasOneUse()) {
51877 SDValue Src = N0;
51878 while ((Src.getOpcode() == ISD::ZERO_EXTEND ||
51879 Src.getOpcode() == ISD::TRUNCATE) &&
51880 Src.getOperand(0)->hasOneUse())
51881 Src = Src.getOperand(0);
51882 bool ContainsNOT = false;
51883 X86::CondCode X86CC = X86::COND_B;
51884 // Peek through AND(NOT(SRL(X,Y)),1).
51885 if (isBitwiseNot(Src)) {
51886 Src = Src.getOperand(0);
51887 X86CC = X86::COND_AE;
51888 ContainsNOT = true;
51889 }
51890 if (Src.getOpcode() == ISD::SRL &&
51891 !isa<ConstantSDNode>(Src.getOperand(1))) {
51892 SDValue BitNo = Src.getOperand(1);
51893 Src = Src.getOperand(0);
51894 // Peek through AND(SRL(NOT(X),Y),1).
51895 if (isBitwiseNot(Src)) {
51896 Src = Src.getOperand(0);
51897 X86CC = X86CC == X86::COND_AE ? X86::COND_B : X86::COND_AE;
51898 ContainsNOT = true;
51899 }
51900 // If we have BMI2 then SHRX should be faster for i32/i64 cases.
51901 if (!(Subtarget.hasBMI2() && !ContainsNOT && VT.getSizeInBits() >= 32))
51902 if (SDValue BT = getBT(Src, BitNo, dl, DAG))
51903 return DAG.getZExtOrTrunc(getSETCC(X86CC, BT, dl, DAG), dl, VT);
51904 }
51905 }
51906
51907 if (VT.isVector() && (VT.getScalarSizeInBits() % 8) == 0) {
51908 // Attempt to recursively combine a bitmask AND with shuffles.
51909 SDValue Op(N, 0);
51910 if (SDValue Res = combineX86ShufflesRecursively(Op, DAG, Subtarget))
51911 return Res;
51912
51913 // If either operand is a constant mask, then only the elements that aren't
51914 // zero are actually demanded by the other operand.
51915 auto GetDemandedMasks = [&](SDValue Op) {
51916 APInt UndefElts;
51917 SmallVector<APInt> EltBits;
51918 int NumElts = VT.getVectorNumElements();
51919 int EltSizeInBits = VT.getScalarSizeInBits();
51920 APInt DemandedBits = APInt::getAllOnes(EltSizeInBits);
51921 APInt DemandedElts = APInt::getAllOnes(NumElts);
51922 if (getTargetConstantBitsFromNode(Op, EltSizeInBits, UndefElts,
51923 EltBits)) {
51924 DemandedBits.clearAllBits();
51925 DemandedElts.clearAllBits();
51926 for (int I = 0; I != NumElts; ++I) {
51927 if (UndefElts[I]) {
51928 // We can't assume an undef src element gives an undef dst - the
51929 // other src might be zero.
51930 DemandedBits.setAllBits();
51931 DemandedElts.setBit(I);
51932 } else if (!EltBits[I].isZero()) {
51933 DemandedBits |= EltBits[I];
51934 DemandedElts.setBit(I);
51935 }
51936 }
51937 }
51938 return std::make_pair(DemandedBits, DemandedElts);
51939 };
51940 APInt Bits0, Elts0;
51941 APInt Bits1, Elts1;
51942 std::tie(Bits0, Elts0) = GetDemandedMasks(N1);
51943 std::tie(Bits1, Elts1) = GetDemandedMasks(N0);
51944
51945 if (TLI.SimplifyDemandedVectorElts(N0, Elts0, DCI) ||
51946 TLI.SimplifyDemandedVectorElts(N1, Elts1, DCI) ||
51947 TLI.SimplifyDemandedBits(N0, Bits0, Elts0, DCI) ||
51948 TLI.SimplifyDemandedBits(N1, Bits1, Elts1, DCI)) {
51949 if (N->getOpcode() != ISD::DELETED_NODE)
51950 DCI.AddToWorklist(N);
51951 return SDValue(N, 0);
51952 }
51953
51954 SDValue NewN0 = TLI.SimplifyMultipleUseDemandedBits(N0, Bits0, Elts0, DAG);
51955 SDValue NewN1 = TLI.SimplifyMultipleUseDemandedBits(N1, Bits1, Elts1, DAG);
51956 if (NewN0 || NewN1)
51957 return DAG.getNode(ISD::AND, dl, VT, NewN0 ? NewN0 : N0,
51958 NewN1 ? NewN1 : N1);
51959 }
51960
51961 // Attempt to combine a scalar bitmask AND with an extracted shuffle.
51962 if ((VT.getScalarSizeInBits() % 8) == 0 &&
51963 N0.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
51964 isa<ConstantSDNode>(N0.getOperand(1)) && N0->hasOneUse()) {
51965 SDValue BitMask = N1;
51966 SDValue SrcVec = N0.getOperand(0);
51967 EVT SrcVecVT = SrcVec.getValueType();
51968
51969 // Check that the constant bitmask masks whole bytes.
51970 APInt UndefElts;
51971 SmallVector<APInt, 64> EltBits;
51972 if (VT == SrcVecVT.getScalarType() && N0->isOnlyUserOf(SrcVec.getNode()) &&
51973 getTargetConstantBitsFromNode(BitMask, 8, UndefElts, EltBits) &&
51974 llvm::all_of(EltBits, [](const APInt &M) {
51975 return M.isZero() || M.isAllOnes();
51976 })) {
51977 unsigned NumElts = SrcVecVT.getVectorNumElements();
51978 unsigned Scale = SrcVecVT.getScalarSizeInBits() / 8;
51979 unsigned Idx = N0.getConstantOperandVal(1);
51980
51981 // Create a root shuffle mask from the byte mask and the extracted index.
51982 SmallVector<int, 16> ShuffleMask(NumElts * Scale, SM_SentinelUndef);
51983 for (unsigned i = 0; i != Scale; ++i) {
51984 if (UndefElts[i])
51985 continue;
51986 int VecIdx = Scale * Idx + i;
51987 ShuffleMask[VecIdx] = EltBits[i].isZero() ? SM_SentinelZero : VecIdx;
51988 }
51989
51990 if (SDValue Shuffle = combineX86ShuffleChain(
51991 {SrcVec}, 0, SrcVec.getOpcode(), SrcVec.getSimpleValueType(),
51992 ShuffleMask, {}, /*Depth=*/1, X86::MaxShuffleCombineDepth,
51993 /*AllowVariableCrossLaneMask=*/true,
51994 /*AllowVariablePerLaneMask=*/true,
51995 /*IsMaskedShuffle=*/false, DAG, SDLoc(SrcVec), Subtarget))
51996 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, Shuffle,
51997 N0.getOperand(1));
51998 }
51999 }
52000
52001 if (SDValue R = combineBMILogicOp(N, DAG, Subtarget))
52002 return R;
52003
52004 if (SDValue R = combineAndXorSubWithBMI(N, dl, DAG, Subtarget))
52005 return R;
52006
52007 return SDValue();
52008}
52009
52010// Canonicalize OR(AND(X,C),AND(Y,~C)) -> OR(AND(X,C),ANDNP(C,Y))
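// Since AND(Y,~C) == ANDNP(C,Y), this drops the inverted constant ~C and lets
// targets with VPTERNLOG or XOP select a single bit-select instruction (the
// 0xCA ternary immediate below encodes mask ? X : Y).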
52011static SDValue canonicalizeBitSelect(SDNode *N, const SDLoc &DL,
52012 SelectionDAG &DAG,
52013 const X86Subtarget &Subtarget) {
52014 assert(N->getOpcode() == ISD::OR && "Unexpected Opcode");
52015
52016 MVT VT = N->getSimpleValueType(0);
52017 unsigned EltSizeInBits = VT.getScalarSizeInBits();
52018 if (!VT.isVector() || (EltSizeInBits % 8) != 0)
52019 return SDValue();
52020
52021 SDValue N0 = peekThroughBitcasts(N->getOperand(0));
52022 SDValue N1 = peekThroughBitcasts(N->getOperand(1));
52023 if (N0.getOpcode() != ISD::AND || N1.getOpcode() != ISD::AND)
52024 return SDValue();
52025
52026 // On XOP we'll lower to PCMOV so accept one use. With AVX512, we can use
52027 // VPTERNLOG. Otherwise only do this if either mask has multiple uses already.
52028 if (!(Subtarget.hasXOP() || useVPTERNLOG(Subtarget, VT) ||
52029 !N0.getOperand(1).hasOneUse() || !N1.getOperand(1).hasOneUse()))
52030 return SDValue();
52031
52032 // Attempt to extract constant byte masks.
52033 APInt UndefElts0, UndefElts1;
52034 SmallVector<APInt, 32> EltBits0, EltBits1;
52035 if (!getTargetConstantBitsFromNode(N0.getOperand(1), 8, UndefElts0, EltBits0,
52036 /*AllowWholeUndefs*/ false,
52037 /*AllowPartialUndefs*/ false))
52038 return SDValue();
52039 if (!getTargetConstantBitsFromNode(N1.getOperand(1), 8, UndefElts1, EltBits1,
52040 /*AllowWholeUndefs*/ false,
52041 /*AllowPartialUndefs*/ false))
52042 return SDValue();
52043
52044 for (unsigned i = 0, e = EltBits0.size(); i != e; ++i) {
52045 // TODO - add UNDEF elts support.
52046 if (UndefElts0[i] || UndefElts1[i])
52047 return SDValue();
52048 if (EltBits0[i] != ~EltBits1[i])
52049 return SDValue();
52050 }
52051
52052 if (useVPTERNLOG(Subtarget, VT)) {
52053 // Emit a VPTERNLOG node directly - 0xCA is the imm code for A?B:C.
52054 // VPTERNLOG is only available as vXi32/64-bit types.
52055 MVT OpSVT = EltSizeInBits <= 32 ? MVT::i32 : MVT::i64;
52056 MVT OpVT =
52057 MVT::getVectorVT(OpSVT, VT.getSizeInBits() / OpSVT.getSizeInBits());
52058 SDValue A = DAG.getBitcast(OpVT, N0.getOperand(1));
52059 SDValue B = DAG.getBitcast(OpVT, N0.getOperand(0));
52060 SDValue C = DAG.getBitcast(OpVT, N1.getOperand(0));
52061 SDValue Imm = DAG.getTargetConstant(0xCA, DL, MVT::i8);
52062 SDValue Res = getAVX512Node(X86ISD::VPTERNLOG, DL, OpVT, {A, B, C, Imm},
52063 DAG, Subtarget);
52064 return DAG.getBitcast(VT, Res);
52065 }
52066
52067 SDValue X = N->getOperand(0);
52068 SDValue Y =
52069 DAG.getNode(X86ISD::ANDNP, DL, VT, DAG.getBitcast(VT, N0.getOperand(1)),
52070 DAG.getBitcast(VT, N1.getOperand(0)));
52071 return DAG.getNode(ISD::OR, DL, VT, X, Y);
52072}
52073
52074// Try to match OR(ANDNP(MASK,X),AND(MASK,Y)) logic pattern.
52075// TODO: Try to match OR(AND(~MASK,X),AND(MASK,Y)) logic pattern.
52076// Waiting for ANDNP combine allows other combines to happen that prevent
52077// matching.
52078static bool matchLogicBlend(SDNode *N, SDValue &X, SDValue &Y, SDValue &Mask) {
52079 using namespace SDPatternMatch;
52080 return sd_match(N, m_Or(m_BinOp(X86ISD::ANDNP, m_Value(Mask), m_Value(X)),
52081 m_And(m_Deferred(Mask), m_Value(Y))));
52082}
52083
52084// Try to fold:
52085// (or (and (m, y), (pandn m, x)))
52086// into:
52087// (vselect m, x, y)
52088// As a special case, try to fold:
52089// (or (and (m, (sub 0, x)), (pandn m, x)))
52090// into:
52091// (sub (xor X, M), M)
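// The special case holds because every lane of M is all-ones or all-zeros:
//   M == 0:  (x ^ 0) - 0 == x
//   M == -1: (x ^ -1) - (-1) == ~x + 1 == -x
// i.e. the blend of x and -x collapses to a conditional negate.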
52092static SDValue combineLogicBlendIntoPBLENDV(SDNode *N, const SDLoc &DL,
52093 SelectionDAG &DAG,
52094 const X86Subtarget &Subtarget) {
52095 assert(N->getOpcode() == ISD::OR && "Unexpected Opcode");
52096
52097 EVT VT = N->getValueType(0);
52098 if (!((VT.is128BitVector() && Subtarget.hasSSE2()) ||
52099 (VT.is256BitVector() && Subtarget.hasInt256())))
52100 return SDValue();
52101
52102 SDValue X, Y, Mask;
52103 if (!matchLogicBlend(N, X, Y, Mask))
52104 return SDValue();
52105
52106 // Validate that X, Y, and Mask are bitcasts, and see through them.
52107 Mask = peekThroughBitcasts(Mask);
52108 X = peekThroughBitcasts(X);
52109 Y = peekThroughBitcasts(Y);
52110
52111 EVT MaskVT = Mask.getValueType();
52112 unsigned EltBits = MaskVT.getScalarSizeInBits();
52113
52114 // TODO: Attempt to handle floating point cases as well?
52115 if (!MaskVT.isInteger() || DAG.ComputeNumSignBits(Mask) != EltBits)
52116 return SDValue();
52117
52118 // Attempt to combine to conditional negate: (sub (xor X, M), M)
52119 if (SDValue Res = combineLogicBlendIntoConditionalNegate(VT, Mask, X, Y, DL,
52120 DAG, Subtarget))
52121 return Res;
52122
52123 // PBLENDVB is only available on SSE 4.1.
52124 if (!Subtarget.hasSSE41())
52125 return SDValue();
52126
52127 // If we have VPTERNLOG we should prefer that since PBLENDVB is multiple uops.
52128 if (Subtarget.hasVLX())
52129 return SDValue();
52130
52131 MVT BlendVT = VT.is256BitVector() ? MVT::v32i8 : MVT::v16i8;
52132
52133 X = DAG.getBitcast(BlendVT, X);
52134 Y = DAG.getBitcast(BlendVT, Y);
52135 Mask = DAG.getBitcast(BlendVT, Mask);
52136 Mask = DAG.getSelect(DL, BlendVT, Mask, Y, X);
52137 return DAG.getBitcast(VT, Mask);
52138}
52139
52140// Helper function for combineOrCmpEqZeroToCtlzSrl
52141// Transforms:
52142// seteq(cmp x, 0)
52143// into:
52144// srl(ctlz x), log2(bitsize(x))
52145// Input pattern is checked by caller.
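// For a 32-bit x, ctlz(x) lies in [0, 32] and equals 32 only when x == 0, so
// (ctlz(x) >> 5) is exactly the i1 value of (x == 0), zero-extended.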
52146static SDValue lowerX86CmpEqZeroToCtlzSrl(SDValue Op, SelectionDAG &DAG) {
52147 SDValue Cmp = Op.getOperand(1);
52148 EVT VT = Cmp.getOperand(0).getValueType();
52149 unsigned Log2b = Log2_32(VT.getSizeInBits());
52150 SDLoc dl(Op);
52151 SDValue Clz = DAG.getNode(ISD::CTLZ, dl, VT, Cmp->getOperand(0));
52152 // The result of the shift is true or false, and on X86, the 32-bit
52153 // encoding of shr and lzcnt is more desirable.
52154 SDValue Trunc = DAG.getZExtOrTrunc(Clz, dl, MVT::i32);
52155 SDValue Scc = DAG.getNode(ISD::SRL, dl, MVT::i32, Trunc,
52156 DAG.getConstant(Log2b, dl, MVT::i8));
52157 return Scc;
52158}
52159
52160// Try to transform:
52161// zext(or(setcc(eq, (cmp x, 0)), setcc(eq, (cmp y, 0))))
52162// into:
52163// srl(or(ctlz(x), ctlz(y)), log2(bitsize(x)))
52164// Will also attempt to match more generic cases, eg:
52165// zext(or(or(setcc(eq, cmp 0), setcc(eq, cmp 0)), setcc(eq, cmp 0)))
52166// Only applies if the target supports the FastLZCNT feature.
52167static SDValue combineOrCmpEqZeroToCtlzSrl(SDNode *N, SelectionDAG &DAG,
52168 TargetLowering::DAGCombinerInfo &DCI,
52169 const X86Subtarget &Subtarget) {
52170 if (DCI.isBeforeLegalize() || !Subtarget.getTargetLowering()->isCtlzFast())
52171 return SDValue();
52172
52173 auto isORCandidate = [](SDValue N) {
52174 return (N->getOpcode() == ISD::OR && N->hasOneUse());
52175 };
52176
52177 // Check that the zero extend is extending to 32 bits or more. The code generated by
52178 // srl(ctlz) for 16-bit or less variants of the pattern would require extra
52179 // instructions to clear the upper bits.
52180 if (!N->hasOneUse() || !N->getSimpleValueType(0).bitsGE(MVT::i32) ||
52181 !isORCandidate(N->getOperand(0)))
52182 return SDValue();
52183
52184 // Check the node matches: setcc(eq, cmp 0)
52185 auto isSetCCCandidate = [](SDValue N) {
52186 return N->getOpcode() == X86ISD::SETCC && N->hasOneUse() &&
52187 X86::CondCode(N->getConstantOperandVal(0)) == X86::COND_E &&
52188 N->getOperand(1).getOpcode() == X86ISD::CMP &&
52189 isNullConstant(N->getOperand(1).getOperand(1)) &&
52190 N->getOperand(1).getValueType().bitsGE(MVT::i32);
52191 };
52192
52193 SDNode *OR = N->getOperand(0).getNode();
52194 SDValue LHS = OR->getOperand(0);
52195 SDValue RHS = OR->getOperand(1);
52196
52197 // Save nodes matching or(or, setcc(eq, cmp 0)).
52198 SmallVector<SDNode *, 2> ORNodes;
52199 while (((isORCandidate(LHS) && isSetCCCandidate(RHS)) ||
52200 (isORCandidate(RHS) && isSetCCCandidate(LHS)))) {
52201 ORNodes.push_back(OR);
52202 OR = (LHS->getOpcode() == ISD::OR) ? LHS.getNode() : RHS.getNode();
52203 LHS = OR->getOperand(0);
52204 RHS = OR->getOperand(1);
52205 }
52206
52207 // The last OR node should match or(setcc(eq, cmp 0), setcc(eq, cmp 0)).
52208 if (!(isSetCCCandidate(LHS) && isSetCCCandidate(RHS)) ||
52209 !isORCandidate(SDValue(OR, 0)))
52210 return SDValue();
52211
52212 // We have an or(setcc(eq, cmp 0), setcc(eq, cmp 0)) pattern; try to lower it
52213 // to
52214 // or(srl(ctlz), srl(ctlz)).
52215 // The dag combiner can then fold it into:
52216 // srl(or(ctlz, ctlz)).
52217 SDValue NewLHS = lowerX86CmpEqZeroToCtlzSrl(LHS, DAG);
52218 SDValue Ret, NewRHS;
52219 if (NewLHS && (NewRHS = lowerX86CmpEqZeroToCtlzSrl(RHS, DAG)))
52220 Ret = DAG.getNode(ISD::OR, SDLoc(OR), MVT::i32, NewLHS, NewRHS);
52221
52222 if (!Ret)
52223 return SDValue();
52224
52225 // Try to lower nodes matching the or(or, setcc(eq, cmp 0)) pattern.
52226 while (!ORNodes.empty()) {
52227 OR = ORNodes.pop_back_val();
52228 LHS = OR->getOperand(0);
52229 RHS = OR->getOperand(1);
52230 // Swap rhs with lhs to match or(setcc(eq, cmp, 0), or).
52231 if (RHS->getOpcode() == ISD::OR)
52232 std::swap(LHS, RHS);
52233 NewRHS = lowerX86CmpEqZeroToCtlzSrl(RHS, DAG);
52234 if (!NewRHS)
52235 return SDValue();
52236 Ret = DAG.getNode(ISD::OR, SDLoc(OR), MVT::i32, Ret, NewRHS);
52237 }
52238
52239 return DAG.getNode(ISD::ZERO_EXTEND, SDLoc(N), N->getValueType(0), Ret);
52240}
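// Illustrative outcome (registers and scheduling are only a sketch): for
// "(x == 0) | (y == 0)" on a FastLZCNT target the combine aims for roughly
//   lzcnt eax, x
//   lzcnt ecx, y
//   or    eax, ecx
//   shr   eax, 5
// instead of two cmp/sete pairs joined by an or.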
52241
52242/// If this is an add or subtract where one operand is produced by a cmp+setcc,
52243/// then try to convert it to an ADC or SBB. This replaces TEST+SET+{ADD/SUB}
52244/// with CMP+{ADC, SBB}.
52245/// Also try (ADD/SUB)+(AND(SRL,1)) bit extraction pattern with BT+{ADC, SBB}.
52246static SDValue combineAddOrSubToADCOrSBB(bool IsSub, const SDLoc &DL, EVT VT,
52247 SDValue X, SDValue Y,
52248 SelectionDAG &DAG,
52249 bool ZeroSecondOpOnly = false) {
52250 if (!DAG.getTargetLoweringInfo().isTypeLegal(VT))
52251 return SDValue();
52252
52253 // Look through a one-use zext.
52254 if (Y.getOpcode() == ISD::ZERO_EXTEND && Y.hasOneUse())
52255 Y = Y.getOperand(0);
52256
52257 X86::CondCode CC;
52258 SDValue EFLAGS;
52259 if (Y.getOpcode() == X86ISD::SETCC && Y.hasOneUse()) {
52260 CC = (X86::CondCode)Y.getConstantOperandVal(0);
52261 EFLAGS = Y.getOperand(1);
52262 } else if (Y.getOpcode() == ISD::AND && isOneConstant(Y.getOperand(1)) &&
52263 Y.hasOneUse()) {
52264 EFLAGS = LowerAndToBT(Y, ISD::SETNE, DL, DAG, CC);
52265 }
52266
52267 if (!EFLAGS)
52268 return SDValue();
52269
52270 // If X is -1 or 0, then we have an opportunity to avoid constants required in
52271 // the general case below.
52272 auto *ConstantX = dyn_cast<ConstantSDNode>(X);
52273 if (ConstantX && !ZeroSecondOpOnly) {
52274 if ((!IsSub && CC == X86::COND_AE && ConstantX->isAllOnes()) ||
52275 (IsSub && CC == X86::COND_B && ConstantX->isZero())) {
52276 // This is a complicated way to get -1 or 0 from the carry flag:
52277 // -1 + SETAE --> -1 + (!CF) --> CF ? -1 : 0 --> SBB %eax, %eax
52278 // 0 - SETB --> 0 - (CF) --> CF ? -1 : 0 --> SBB %eax, %eax
52279 return DAG.getNode(X86ISD::SETCC_CARRY, DL, VT,
52280 DAG.getTargetConstant(X86::COND_B, DL, MVT::i8),
52281 EFLAGS);
52282 }
52283
52284 if ((!IsSub && CC == X86::COND_BE && ConstantX->isAllOnes()) ||
52285 (IsSub && CC == X86::COND_A && ConstantX->isZero())) {
52286 if (EFLAGS.getOpcode() == X86ISD::SUB && EFLAGS.hasOneUse() &&
52287 EFLAGS.getValueType().isInteger() &&
52288 !isa<ConstantSDNode>(EFLAGS.getOperand(1))) {
52289 // Swap the operands of a SUB, and we have the same pattern as above.
52290 // -1 + SETBE (SUB A, B) --> -1 + SETAE (SUB B, A) --> SUB + SBB
52291 // 0 - SETA (SUB A, B) --> 0 - SETB (SUB B, A) --> SUB + SBB
52292 SDValue NewSub = DAG.getNode(
52293 X86ISD::SUB, SDLoc(EFLAGS), EFLAGS.getNode()->getVTList(),
52294 EFLAGS.getOperand(1), EFLAGS.getOperand(0));
52295 SDValue NewEFLAGS = SDValue(NewSub.getNode(), EFLAGS.getResNo());
52296 return DAG.getNode(X86ISD::SETCC_CARRY, DL, VT,
52297 DAG.getTargetConstant(X86::COND_B, DL, MVT::i8),
52298 NewEFLAGS);
52299 }
52300 }
52301 }
52302
52303 if (CC == X86::COND_B) {
52304 // X + SETB Z --> adc X, 0
52305 // X - SETB Z --> sbb X, 0
52306 return DAG.getNode(IsSub ? X86ISD::SBB : X86ISD::ADC, DL,
52307 DAG.getVTList(VT, MVT::i32), X,
52308 DAG.getConstant(0, DL, VT), EFLAGS);
52309 }
52310
52311 if (ZeroSecondOpOnly)
52312 return SDValue();
52313
52314 if (CC == X86::COND_A) {
52315 // Try to convert COND_A into COND_B in an attempt to facilitate
52316 // materializing "setb reg".
52317 //
52318 // Do not flip "e > c", where "c" is a constant, because the Cmp instruction
52319 // cannot take an immediate as its first operand.
52320 //
52321 if (EFLAGS.getOpcode() == X86ISD::SUB && EFLAGS.getNode()->hasOneUse() &&
52322 EFLAGS.getValueType().isInteger() &&
52323 !isa<ConstantSDNode>(EFLAGS.getOperand(1))) {
52324 SDValue NewSub =
52325 DAG.getNode(X86ISD::SUB, SDLoc(EFLAGS), EFLAGS.getNode()->getVTList(),
52326 EFLAGS.getOperand(1), EFLAGS.getOperand(0));
52327 SDValue NewEFLAGS = NewSub.getValue(EFLAGS.getResNo());
52328 return DAG.getNode(IsSub ? X86ISD::SBB : X86ISD::ADC, DL,
52329 DAG.getVTList(VT, MVT::i32), X,
52330 DAG.getConstant(0, DL, VT), NewEFLAGS);
52331 }
52332 }
52333
52334 if (CC == X86::COND_AE) {
52335 // X + SETAE --> sbb X, -1
52336 // X - SETAE --> adc X, -1
52337 return DAG.getNode(IsSub ? X86ISD::ADC : X86ISD::SBB, DL,
52338 DAG.getVTList(VT, MVT::i32), X,
52339 DAG.getAllOnesConstant(DL, VT), EFLAGS);
52340 }
52341
52342 if (CC == X86::COND_BE) {
52343 // X + SETBE --> sbb X, -1
52344 // X - SETBE --> adc X, -1
52345 // Try to convert COND_BE into COND_AE in an attempt to facilitate
52346 // materializing "setae reg".
52347 //
52348 // Do not flip "e <= c", where "c" is a constant, because the Cmp instruction
52349 // cannot take an immediate as its first operand.
52350 //
52351 if (EFLAGS.getOpcode() == X86ISD::SUB && EFLAGS.getNode()->hasOneUse() &&
52352 EFLAGS.getValueType().isInteger() &&
52353 !isa<ConstantSDNode>(EFLAGS.getOperand(1))) {
52354 SDValue NewSub =
52355 DAG.getNode(X86ISD::SUB, SDLoc(EFLAGS), EFLAGS.getNode()->getVTList(),
52356 EFLAGS.getOperand(1), EFLAGS.getOperand(0));
52357 SDValue NewEFLAGS = NewSub.getValue(EFLAGS.getResNo());
52358 return DAG.getNode(IsSub ? X86ISD::ADC : X86ISD::SBB, DL,
52359 DAG.getVTList(VT, MVT::i32), X,
52360 DAG.getAllOnesConstant(DL, VT), NewEFLAGS);
52361 }
52362 }
52363
52364 if (CC != X86::COND_E && CC != X86::COND_NE)
52365 return SDValue();
52366
52367 if (EFLAGS.getOpcode() != X86ISD::CMP || !EFLAGS.hasOneUse() ||
52368 !X86::isZeroNode(EFLAGS.getOperand(1)) ||
52369 !EFLAGS.getOperand(0).getValueType().isInteger())
52370 return SDValue();
52371
52372 SDValue Z = EFLAGS.getOperand(0);
52373 EVT ZVT = Z.getValueType();
52374
52375 // If X is -1 or 0, then we have an opportunity to avoid constants required in
52376 // the general case below.
52377 if (ConstantX) {
52378 // 'neg' sets the carry flag when Z != 0, so create 0 or -1 using 'sbb' with
52379 // fake operands:
52380 // 0 - (Z != 0) --> sbb %eax, %eax, (neg Z)
52381 // -1 + (Z == 0) --> sbb %eax, %eax, (neg Z)
52382 if ((IsSub && CC == X86::COND_NE && ConstantX->isZero()) ||
52383 (!IsSub && CC == X86::COND_E && ConstantX->isAllOnes())) {
52384 SDValue Zero = DAG.getConstant(0, DL, ZVT);
52385 SDVTList X86SubVTs = DAG.getVTList(ZVT, MVT::i32);
52386 SDValue Neg = DAG.getNode(X86ISD::SUB, DL, X86SubVTs, Zero, Z);
52387 return DAG.getNode(X86ISD::SETCC_CARRY, DL, VT,
52388 DAG.getTargetConstant(X86::COND_B, DL, MVT::i8),
52389 SDValue(Neg.getNode(), 1));
52390 }
52391
52392 // cmp with 1 sets the carry flag when Z == 0, so create 0 or -1 using 'sbb'
52393 // with fake operands:
52394 // 0 - (Z == 0) --> sbb %eax, %eax, (cmp Z, 1)
52395 // -1 + (Z != 0) --> sbb %eax, %eax, (cmp Z, 1)
52396 if ((IsSub && CC == X86::COND_E && ConstantX->isZero()) ||
52397 (!IsSub && CC == X86::COND_NE && ConstantX->isAllOnes())) {
52398 SDValue One = DAG.getConstant(1, DL, ZVT);
52399 SDVTList X86SubVTs = DAG.getVTList(ZVT, MVT::i32);
52400 SDValue Cmp1 = DAG.getNode(X86ISD::SUB, DL, X86SubVTs, Z, One);
52401 return DAG.getNode(X86ISD::SETCC_CARRY, DL, VT,
52402 DAG.getTargetConstant(X86::COND_B, DL, MVT::i8),
52403 Cmp1.getValue(1));
52404 }
52405 }
52406
52407 // (cmp Z, 1) sets the carry flag if Z is 0.
52408 SDValue One = DAG.getConstant(1, DL, ZVT);
52409 SDVTList X86SubVTs = DAG.getVTList(ZVT, MVT::i32);
52410 SDValue Cmp1 = DAG.getNode(X86ISD::SUB, DL, X86SubVTs, Z, One);
52411
52412 // Add the flags type for ADC/SBB nodes.
52413 SDVTList VTs = DAG.getVTList(VT, MVT::i32);
52414
52415 // X - (Z != 0) --> sub X, (zext(setne Z, 0)) --> adc X, -1, (cmp Z, 1)
52416 // X + (Z != 0) --> add X, (zext(setne Z, 0)) --> sbb X, -1, (cmp Z, 1)
52417 if (CC == X86::COND_NE)
52418 return DAG.getNode(IsSub ? X86ISD::ADC : X86ISD::SBB, DL, VTs, X,
52419 DAG.getAllOnesConstant(DL, VT), Cmp1.getValue(1));
52420
52421 // X - (Z == 0) --> sub X, (zext(sete Z, 0)) --> sbb X, 0, (cmp Z, 1)
52422 // X + (Z == 0) --> add X, (zext(sete Z, 0)) --> adc X, 0, (cmp Z, 1)
52423 return DAG.getNode(IsSub ? X86ISD::SBB : X86ISD::ADC, DL, VTs, X,
52424 DAG.getConstant(0, DL, VT), Cmp1.getValue(1));
52425}
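// Illustrative example of the simplest case handled above (X + SETB Z):
// for "r = x + (a < b)" with unsigned a and b, the compare already leaves
// the predicate in CF, so this can lower to roughly "cmp a, b; adc x, 0"
// instead of cmp/setb/add.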
52426
52427/// If this is an add or subtract where one operand is produced by a cmp+setcc,
52428/// then try to convert it to an ADC or SBB. This replaces TEST+SET+{ADD/SUB}
52429/// with CMP+{ADC, SBB}.
52430static SDValue combineAddOrSubToADCOrSBB(SDNode *N, const SDLoc &DL,
52431 SelectionDAG &DAG) {
52432 bool IsSub = N->getOpcode() == ISD::SUB;
52433 SDValue X = N->getOperand(0);
52434 SDValue Y = N->getOperand(1);
52435 EVT VT = N->getValueType(0);
52436
52437 if (SDValue ADCOrSBB = combineAddOrSubToADCOrSBB(IsSub, DL, VT, X, Y, DAG))
52438 return ADCOrSBB;
52439
52440 // Commute and try again (negate the result for subtracts).
52441 if (SDValue ADCOrSBB = combineAddOrSubToADCOrSBB(IsSub, DL, VT, Y, X, DAG)) {
52442 if (IsSub)
52443 ADCOrSBB = DAG.getNegative(ADCOrSBB, DL, VT);
52444 return ADCOrSBB;
52445 }
52446
52447 return SDValue();
52448}
52449
52450static SDValue combineOrXorWithSETCC(unsigned Opc, const SDLoc &DL, EVT VT,
52451 SDValue N0, SDValue N1,
52452 SelectionDAG &DAG) {
52453 assert((Opc == ISD::XOR || Opc == ISD::OR) && "Unexpected opcode");
52454
52455 // Delegate to combineAddOrSubToADCOrSBB if we have:
52456 //
52457 // (xor/or (zero_extend (setcc)) imm)
52458 //
52459 // where imm is odd if and only if we have xor, in which case the XOR/OR are
52460 // equivalent to a SUB/ADD, respectively.
52461 if (N0.getOpcode() == ISD::ZERO_EXTEND &&
52462 N0.getOperand(0).getOpcode() == X86ISD::SETCC && N0.hasOneUse()) {
52463 if (auto *N1C = dyn_cast<ConstantSDNode>(N1)) {
52464 bool IsSub = Opc == ISD::XOR;
52465 bool N1COdd = N1C->getZExtValue() & 1;
52466 if (IsSub ? N1COdd : !N1COdd)
52467 if (SDValue R = combineAddOrSubToADCOrSBB(IsSub, DL, VT, N1, N0, DAG))
52468 return R;
52469 }
52470 }
52471
52472 // not(pcmpeq(and(X,CstPow2),0)) -> pcmpeq(and(X,CstPow2),CstPow2)
52473 if (Opc == ISD::XOR && N0.getOpcode() == X86ISD::PCMPEQ &&
52474 N0.getOperand(0).getOpcode() == ISD::AND &&
52475 ISD::isBuildVectorAllZeros(N0.getOperand(1).getNode()) &&
52476 ISD::isBuildVectorAllOnes(N1.getNode()) &&
52477 isConstantPowerOf2(N0.getOperand(0).getOperand(1),
52478 VT.getScalarSizeInBits(), /*AllowUndefs=*/true)) {
52479 return DAG.getNode(X86ISD::PCMPEQ, DL, VT, N0.getOperand(0),
52480 N0.getOperand(0).getOperand(1));
52481 }
52482
52483 return SDValue();
52484}
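// Illustrative example of the zext(setcc) case above: with b = zext(setcc)
// in {0,1}, "xor b, 1" equals "sub 1, b" (odd constant), and "or b, 2"
// equals "add b, 2" (even constant, so the low bit cannot carry); both forms
// can then reuse the ADC/SBB lowering in combineAddOrSubToADCOrSBB.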
52485
52486static SDValue combineOr(SDNode *N, SelectionDAG &DAG,
52487 TargetLowering::DAGCombinerInfo &DCI,
52488 const X86Subtarget &Subtarget) {
52489 SDValue N0 = N->getOperand(0);
52490 SDValue N1 = N->getOperand(1);
52491 EVT VT = N->getValueType(0);
52492 SDLoc dl(N);
52493 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
52494
52495 // If this is SSE1-only, convert to FOR to avoid scalarization.
52496 if (Subtarget.hasSSE1() && !Subtarget.hasSSE2() && VT == MVT::v4i32) {
52497 return DAG.getBitcast(MVT::v4i32,
52498 DAG.getNode(X86ISD::FOR, dl, MVT::v4f32,
52499 DAG.getBitcast(MVT::v4f32, N0),
52500 DAG.getBitcast(MVT::v4f32, N1)));
52501 }
52502
52503 // Match any-of bool scalar reductions into a bitcast/movmsk + cmp.
52504 // TODO: Support multiple SrcOps.
52505 if (VT == MVT::i1) {
52506 SmallVector<SDValue, 2> SrcOps;
52507 SmallVector<APInt, 2> SrcPartials;
52508 if (matchScalarReduction(SDValue(N, 0), ISD::OR, SrcOps, &SrcPartials) &&
52509 SrcOps.size() == 1) {
52510 unsigned NumElts = SrcOps[0].getValueType().getVectorNumElements();
52511 EVT MaskVT = EVT::getIntegerVT(*DAG.getContext(), NumElts);
52512 SDValue Mask = combineBitcastvxi1(DAG, MaskVT, SrcOps[0], dl, Subtarget);
52513 if (!Mask && TLI.isTypeLegal(SrcOps[0].getValueType()))
52514 Mask = DAG.getBitcast(MaskVT, SrcOps[0]);
52515 if (Mask) {
52516 assert(SrcPartials[0].getBitWidth() == NumElts &&
52517 "Unexpected partial reduction mask");
52518 SDValue ZeroBits = DAG.getConstant(0, dl, MaskVT);
52519 SDValue PartialBits = DAG.getConstant(SrcPartials[0], dl, MaskVT);
52520 Mask = DAG.getNode(ISD::AND, dl, MaskVT, Mask, PartialBits);
52521 return DAG.getSetCC(dl, MVT::i1, Mask, ZeroBits, ISD::SETNE);
52522 }
52523 }
52524 }
52525
52526 if (SDValue SetCC = combineAndOrForCcmpCtest(N, DAG, DCI, Subtarget))
52527 return SetCC;
52528
52529 if (SDValue R = combineBitOpWithMOVMSK(N->getOpcode(), dl, N0, N1, DAG))
52530 return R;
52531
52532 if (SDValue R = combineBitOpWithShift(N->getOpcode(), dl, VT, N0, N1, DAG))
52533 return R;
52534
52535 if (SDValue R = combineBitOpWithPACK(N->getOpcode(), dl, VT, N0, N1, DAG))
52536 return R;
52537
52538 if (SDValue FPLogic = convertIntLogicToFPLogic(N->getOpcode(), dl, VT, N0, N1,
52539 DAG, DCI, Subtarget))
52540 return FPLogic;
52541
52542 if (DCI.isBeforeLegalizeOps())
52543 return SDValue();
52544
52545 if (SDValue R = combineCompareEqual(N, DAG, DCI, Subtarget))
52546 return R;
52547
52548 if (SDValue R = canonicalizeBitSelect(N, dl, DAG, Subtarget))
52549 return R;
52550
52551 if (SDValue R = combineLogicBlendIntoPBLENDV(N, dl, DAG, Subtarget))
52552 return R;
52553
52554 // Combine `(x86isd::setcc_carry) | C` and `(0 - SetCC) | C`
52555 // into `(zext (not SetCC)) * (C + 1) - 1` if we can get a LEA out of it.
52556 if ((VT == MVT::i32 || VT == MVT::i64) && N0.hasOneUse()) {
52557 if (auto *CN = dyn_cast<ConstantSDNode>(N1)) {
52558 uint64_t Val = CN->getZExtValue();
52559 if (Val == 1 || Val == 2 || Val == 3 || Val == 4 || Val == 7 ||
52560 Val == 8) {
52561 SDValue NotCond;
52562 if (N0.getOpcode() == X86ISD::SETCC_CARRY &&
52563 N0.getOperand(1).hasOneUse()) {
52564 X86::CondCode CC = (X86::CondCode)N0.getConstantOperandVal(0);
52565 X86::CondCode NewCC = X86::GetOppositeBranchCondition(CC);
52566 NotCond = getSETCC(NewCC, N0.getOperand(1), SDLoc(N0), DAG);
52567 } else if (N0.getOpcode() == ISD::SUB &&
52568 isNullConstant(N0.getOperand(0))) {
52569 SDValue Cond = N0.getOperand(1);
52570 if (Cond.getOpcode() == ISD::ZERO_EXTEND && Cond.hasOneUse())
52571 Cond = Cond.getOperand(0);
52572 if (Cond.getOpcode() == X86ISD::SETCC && Cond.hasOneUse()) {
52573 X86::CondCode OldCC = (X86::CondCode)Cond.getConstantOperandVal(0);
52574 X86::CondCode NewCC = X86::GetOppositeBranchCondition(OldCC);
52575 NotCond = getSETCC(NewCC, Cond.getOperand(1), SDLoc(Cond), DAG);
52576 }
52577 }
52578
52579 if (NotCond) {
52580 SDValue R = DAG.getZExtOrTrunc(NotCond, dl, VT);
52581 R = DAG.getNode(ISD::MUL, dl, VT, R, DAG.getConstant(Val + 1, dl, VT));
52582 R = DAG.getNode(ISD::SUB, dl, VT, R, DAG.getConstant(1, dl, VT));
52583 return R;
52584 }
52585 }
52586 }
52587 }
52588
52589 // Combine OR(X,KSHIFTL(Y,Elts/2)) -> CONCAT_VECTORS(X,Y) == KUNPCK(X,Y).
52590 // Combine OR(KSHIFTL(X,Elts/2),Y) -> CONCAT_VECTORS(Y,X) == KUNPCK(Y,X).
52591 // iff the upper elements of the non-shifted arg are zero.
52592 // KUNPCK requires 16+ bool vector elements.
52593 if (N0.getOpcode() == X86ISD::KSHIFTL || N1.getOpcode() == X86ISD::KSHIFTL) {
52594 unsigned NumElts = VT.getVectorNumElements();
52595 unsigned HalfElts = NumElts / 2;
52596 APInt UpperElts = APInt::getHighBitsSet(NumElts, HalfElts);
52597 if (NumElts >= 16 && N1.getOpcode() == X86ISD::KSHIFTL &&
52598 N1.getConstantOperandAPInt(1) == HalfElts &&
52599 DAG.MaskedVectorIsZero(N0, UpperElts)) {
52600 return DAG.getNode(
52601 ISD::CONCAT_VECTORS, dl, VT,
52602 extractSubVector(N0, 0, DAG, dl, HalfElts),
52603 extractSubVector(N1.getOperand(0), 0, DAG, dl, HalfElts));
52604 }
52605 if (NumElts >= 16 && N0.getOpcode() == X86ISD::KSHIFTL &&
52606 N0.getConstantOperandAPInt(1) == HalfElts &&
52607 DAG.MaskedVectorIsZero(N1, UpperElts)) {
52608 return DAG.getNode(
52609 ISD::CONCAT_VECTORS, dl, VT,
52610 extractSubVector(N1, 0, DAG, dl, HalfElts),
52611 extractSubVector(N0.getOperand(0), 0, DAG, dl, HalfElts));
52612 }
52613 }
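  // Illustrative example of the fold above: for v32i1, OR(X, KSHIFTL(Y, 16))
  // with the upper 16 elements of X known to be zero becomes
  // CONCAT_VECTORS(X[0..15], Y[0..15]), which can then be selected as a
  // single KUNPCK of the two 16-bit mask registers.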
52614
52615 if (VT.isVector() && (VT.getScalarSizeInBits() % 8) == 0) {
52616 // Attempt to recursively combine an OR of shuffles.
52617 SDValue Op(N, 0);
52618 if (SDValue Res = combineX86ShufflesRecursively(Op, DAG, Subtarget))
52619 return Res;
52620
52621 // If either operand is a constant mask, then only the elements that aren't
52622 // allones are actually demanded by the other operand.
52623 auto SimplifyUndemandedElts = [&](SDValue Op, SDValue OtherOp) {
52624 APInt UndefElts;
52625 SmallVector<APInt> EltBits;
52626 int NumElts = VT.getVectorNumElements();
52627 int EltSizeInBits = VT.getScalarSizeInBits();
52628 if (!getTargetConstantBitsFromNode(Op, EltSizeInBits, UndefElts, EltBits))
52629 return false;
52630
52631 APInt DemandedElts = APInt::getZero(NumElts);
52632 for (int I = 0; I != NumElts; ++I)
52633 if (!EltBits[I].isAllOnes())
52634 DemandedElts.setBit(I);
52635
52636 return TLI.SimplifyDemandedVectorElts(OtherOp, DemandedElts, DCI);
52637 };
52638 if (SimplifyUndemandedElts(N0, N1) || SimplifyUndemandedElts(N1, N0)) {
52639 if (N->getOpcode() != ISD::DELETED_NODE)
52640 DCI.AddToWorklist(N);
52641 return SDValue(N, 0);
52642 }
52643 }
52644
52645 if (SDValue R = combineOrXorWithSETCC(N->getOpcode(), dl, VT, N0, N1, DAG))
52646 return R;
52647
52648 return SDValue();
52649}
52650
52651/// Try to turn tests against the signbit in the form of:
52652/// XOR(TRUNCATE(SRL(X, size(X)-1)), 1)
52653/// into:
52654/// SETGT(X, -1)
52656 SelectionDAG &DAG) {
52657 // This is only worth doing if the output type is i8 or i1.
52658 EVT ResultType = N->getValueType(0);
52659 if (ResultType != MVT::i8 && ResultType != MVT::i1)
52660 return SDValue();
52661
52662 SDValue N0 = N->getOperand(0);
52663 SDValue N1 = N->getOperand(1);
52664
52665 // We should be performing an xor against a truncated shift.
52666 if (N0.getOpcode() != ISD::TRUNCATE || !N0.hasOneUse())
52667 return SDValue();
52668
52669 // Make sure we are performing an xor against one.
52670 if (!isOneConstant(N1))
52671 return SDValue();
52672
52673 // SetCC on x86 zero extends so only act on this if it's a logical shift.
52674 SDValue Shift = N0.getOperand(0);
52675 if (Shift.getOpcode() != ISD::SRL || !Shift.hasOneUse())
52676 return SDValue();
52677
52678 // Make sure we are truncating from one of i16, i32 or i64.
52679 EVT ShiftTy = Shift.getValueType();
52680 if (ShiftTy != MVT::i16 && ShiftTy != MVT::i32 && ShiftTy != MVT::i64)
52681 return SDValue();
52682
52683 // Make sure the shift amount extracts the sign bit.
52684 if (!isa<ConstantSDNode>(Shift.getOperand(1)) ||
52685 Shift.getConstantOperandAPInt(1) != (ShiftTy.getSizeInBits() - 1))
52686 return SDValue();
52687
52688 // Create a greater-than comparison against -1.
52689 // N.B. Using SETGE against 0 works, but we want a canonical-looking
52690 // comparison; using SETGT matches up with what TranslateX86CC expects.
52691 SDValue ShiftOp = Shift.getOperand(0);
52692 EVT ShiftOpTy = ShiftOp.getValueType();
52693 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
52694 EVT SetCCResultType = TLI.getSetCCResultType(DAG.getDataLayout(),
52695 *DAG.getContext(), ResultType);
52696 SDValue Cond =
52697 DAG.getSetCC(DL, SetCCResultType, ShiftOp,
52698 DAG.getAllOnesConstant(DL, ShiftOpTy), ISD::SETGT);
52699 if (SetCCResultType != ResultType)
52700 Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, ResultType, Cond);
52701 return Cond;
52702}
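// Worked example (i32 x, i8 result): "trunc(x >> 31) ^ 1" is 1 exactly when
// the sign bit of x is clear, which is the same predicate as "x > -1":
//   x = -5 -> (x >> 31) == 1 -> ^ 1 == 0, and setgt(-5, -1) == 0
//   x =  5 -> (x >> 31) == 0 -> ^ 1 == 1, and setgt( 5, -1) == 1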
52703
52704/// Turn vector tests of the signbit in the form of:
52705/// xor (sra X, elt_size(X)-1), -1
52706/// into:
52707/// pcmpgt X, -1
52708///
52709/// This should be called before type legalization because the pattern may not
52710/// persist after that.
52712 const X86Subtarget &Subtarget) {
52713 EVT VT = N->getValueType(0);
52714 if (!VT.isSimple())
52715 return SDValue();
52716
52717 switch (VT.getSimpleVT().SimpleTy) {
52718 // clang-format off
52719 default: return SDValue();
52720 case MVT::v16i8:
52721 case MVT::v8i16:
52722 case MVT::v4i32:
52723 case MVT::v2i64: if (!Subtarget.hasSSE2()) return SDValue(); break;
52724 case MVT::v32i8:
52725 case MVT::v16i16:
52726 case MVT::v8i32:
52727 case MVT::v4i64: if (!Subtarget.hasAVX2()) return SDValue(); break;
52728 // clang-format on
52729 }
52730
52731 // There must be a shift right algebraic before the xor, and the xor must be a
52732 // 'not' operation.
52733 SDValue Shift = N->getOperand(0);
52734 SDValue Ones = N->getOperand(1);
52735 if (Shift.getOpcode() != ISD::SRA || !Shift.hasOneUse() ||
52736 !ISD::isBuildVectorAllOnes(Ones.getNode()))
52737 return SDValue();
52738
52739 // The shift should be smearing the sign bit across each vector element.
52740 auto *ShiftAmt =
52741 isConstOrConstSplat(Shift.getOperand(1), /*AllowUndefs*/ true);
52742 if (!ShiftAmt ||
52743 ShiftAmt->getAPIntValue() != (Shift.getScalarValueSizeInBits() - 1))
52744 return SDValue();
52745
52746 // Create a greater-than comparison against -1. We don't use the more obvious
52747 // greater-than-or-equal-to-zero because SSE/AVX don't have that instruction.
52748 return DAG.getSetCC(SDLoc(N), VT, Shift.getOperand(0), Ones, ISD::SETGT);
52749}
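// Worked example for one v4i32 lane: "xor(sra(x, 31), -1)" is all-ones
// exactly when x is non-negative, and "pcmpgt(x, -1)" produces the same mask:
//   x = -7 -> sra == 0xFFFFFFFF -> xor -1 == 0x00000000; pcmpgt(-7, -1) == 0
//   x =  7 -> sra == 0x00000000 -> xor -1 == 0xFFFFFFFF; pcmpgt( 7, -1) == ~0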
52750
52751/// Detect patterns of truncation with unsigned saturation:
52752///
52753/// 1. (truncate (umin (x, unsigned_max_of_dest_type)) to dest_type).
52754/// Return the source value x to be truncated or SDValue() if the pattern was
52755/// not matched.
52756///
52757/// 2. (truncate (smin (smax (x, C1), C2)) to dest_type),
52758/// where C1 >= 0 and C2 is unsigned max of destination type.
52759///
52760/// (truncate (smax (smin (x, C2), C1)) to dest_type)
52761/// where C1 >= 0, C2 is unsigned max of destination type and C1 <= C2.
52762///
52763/// These two patterns are equivalent to:
52764/// (truncate (umin (smax(x, C1), unsigned_max_of_dest_type)) to dest_type)
52765/// So return the smax(x, C1) value to be truncated or SDValue() if the
52766/// pattern was not matched.
52767static SDValue detectUSatPattern(SDValue In, EVT VT, SelectionDAG &DAG,
52768 const SDLoc &DL) {
52769 using namespace llvm::SDPatternMatch;
52770 EVT InVT = In.getValueType();
52771
52772 // Saturation with truncation. We truncate from InVT to VT.
52773 assert(InVT.getScalarSizeInBits() > VT.getScalarSizeInBits() &&
52774 "Unexpected types for truncate operation");
52775
52776 APInt C1, C2;
52777 SDValue UMin, SMin, SMax;
52778
52779 // C2 should be equal to UINT32_MAX / UINT16_MAX / UINT8_MAX according to
52780 // the element size of the destination type.
52781 if (sd_match(In, m_UMin(m_Value(UMin), m_ConstInt(C2))) &&
52782 C2.isMask(VT.getScalarSizeInBits()))
52783 return UMin;
52784
52785 if (sd_match(In, m_SMin(m_Value(SMin), m_ConstInt(C2))) &&
52786 sd_match(SMin, m_SMax(m_Value(SMax), m_ConstInt(C1))) &&
52787 C1.isNonNegative() && C2.isMask(VT.getScalarSizeInBits()))
52788 return SMin;
52789
52790 if (sd_match(In, m_SMax(m_Value(SMax), m_ConstInt(C1))) &&
52791 sd_match(SMax, m_SMin(m_Value(SMin), m_ConstInt(C2))) &&
52792 C1.isNonNegative() && C2.isMask(VT.getScalarSizeInBits()) && C2.uge(C1))
52793 return DAG.getNode(ISD::SMAX, DL, InVT, SMin, In.getOperand(1));
52794
52795 return SDValue();
52796}
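// Illustrative example: truncating v8i32 to v8i16 through an unsigned clamp,
//   (trunc (umin x, 65535))
// matches pattern 1 above (65535 is the unsigned max of i16), so x is
// returned and the caller can emit a saturating pack or truncate such as
// PACKUSDW or VPMOVUSDW.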
52797
52798/// Detect patterns of truncation with signed saturation:
52799/// (truncate (smin ((smax (x, signed_min_of_dest_type)),
52800/// signed_max_of_dest_type)) to dest_type)
52801/// or:
52802/// (truncate (smax ((smin (x, signed_max_of_dest_type)),
52803/// signed_min_of_dest_type)) to dest_type).
52804/// With MatchPackUS, the smax/smin range is [0, unsigned_max_of_dest_type].
52805/// Return the source value to be truncated or SDValue() if the pattern was not
52806/// matched.
52807static SDValue detectSSatPattern(SDValue In, EVT VT, bool MatchPackUS = false) {
52808 using namespace llvm::SDPatternMatch;
52809 unsigned NumDstBits = VT.getScalarSizeInBits();
52810 unsigned NumSrcBits = In.getScalarValueSizeInBits();
52811 assert(NumSrcBits > NumDstBits && "Unexpected types for truncate operation");
52812
52813 APInt SignedMax, SignedMin;
52814 if (MatchPackUS) {
52815 SignedMax = APInt::getAllOnes(NumDstBits).zext(NumSrcBits);
52816 SignedMin = APInt::getZero(NumSrcBits);
52817 } else {
52818 SignedMax = APInt::getSignedMaxValue(NumDstBits).sext(NumSrcBits);
52819 SignedMin = APInt::getSignedMinValue(NumDstBits).sext(NumSrcBits);
52820 }
52821
52822 SDValue SMin, SMax;
52823 if (sd_match(In, m_SMin(m_Value(SMin), m_SpecificInt(SignedMax))) &&
52824 sd_match(SMin, m_SMax(m_Value(SMax), m_SpecificInt(SignedMin))))
52825 return SMax;
52826
52827 if (sd_match(In, m_SMax(m_Value(SMax), m_SpecificInt(SignedMin))) &&
52828 sd_match(SMax, m_SMin(m_Value(SMin), m_SpecificInt(SignedMax))))
52829 return SMin;
52830
52831 return SDValue();
52832}
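// Illustrative example: truncating vXi16 to vXi8 through a signed clamp,
//   (trunc (smin (smax x, -128), 127))
// matches the first form above, so x is returned and the caller can emit a
// signed saturating pack such as PACKSSWB.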
52833
52834static SDValue combineTruncateWithSat(SDValue In, EVT VT, const SDLoc &DL,
52835 SelectionDAG &DAG,
52836 const X86Subtarget &Subtarget) {
52837 if (!Subtarget.hasSSE2() || !VT.isVector())
52838 return SDValue();
52839
52840 EVT SVT = VT.getVectorElementType();
52841 EVT InVT = In.getValueType();
52842 EVT InSVT = InVT.getVectorElementType();
52843
52844 // If we're clamping a signed 32-bit vector to 0-255 and the 32-bit vector is
52845 // split across two registers, we can use a packusdw+perm to clamp to 0-65535
52846 // and concatenate at the same time. Then we can use a final vpmovuswb to
52847 // clip to 0-255.
52848 if (Subtarget.hasBWI() && !Subtarget.useAVX512Regs() &&
52849 InVT == MVT::v16i32 && VT == MVT::v16i8) {
52850 if (SDValue USatVal = detectSSatPattern(In, VT, true)) {
52851 // Emit a VPACKUSDW+VPERMQ followed by a VPMOVUSWB.
52852 SDValue Mid = truncateVectorWithPACK(X86ISD::PACKUS, MVT::v16i16, USatVal,
52853 DL, DAG, Subtarget);
52854 assert(Mid && "Failed to pack!");
52855 return DAG.getNode(X86ISD::VTRUNCUS, DL, VT, Mid);
52856 }
52857 }
52858
52859 // vXi32 truncate instructions are available with AVX512F.
52860 // vXi16 truncate instructions are only available with AVX512BW.
52861 // For 256-bit or smaller vectors, we require VLX.
52862 // FIXME: We could widen truncates to 512 to remove the VLX restriction.
52863 // If the result type is 256 bits or larger and we have disabled 512-bit
52864 // registers, we should go ahead and use the pack instructions if possible.
52865 bool PreferAVX512 = ((Subtarget.hasAVX512() && InSVT == MVT::i32) ||
52866 (Subtarget.hasBWI() && InSVT == MVT::i16)) &&
52867 (InVT.getSizeInBits() > 128) &&
52868 (Subtarget.hasVLX() || InVT.getSizeInBits() > 256) &&
52869 !(!Subtarget.useAVX512Regs() && VT.getSizeInBits() >= 256);
52870
52871 if (!PreferAVX512 && VT.getVectorNumElements() > 1 &&
52872 isPowerOf2_32(VT.getVectorNumElements()) &&
52873 (SVT == MVT::i8 || SVT == MVT::i16) &&
52874 (InSVT == MVT::i16 || InSVT == MVT::i32)) {
52875 if (SDValue USatVal = detectSSatPattern(In, VT, true)) {
52876 // vXi32 -> vXi8 must be performed as PACKUSWB(PACKSSDW,PACKSSDW).
52877 if (SVT == MVT::i8 && InSVT == MVT::i32) {
52878 EVT MidVT = VT.changeVectorElementType(MVT::i16);
52879 SDValue Mid = truncateVectorWithPACK(X86ISD::PACKSS, MidVT, USatVal, DL,
52880 DAG, Subtarget);
52881 assert(Mid && "Failed to pack!");
52882 SDValue V = truncateVectorWithPACK(X86ISD::PACKUS, VT, Mid, DL, DAG,
52883 Subtarget);
52884 assert(V && "Failed to pack!");
52885 return V;
52886 } else if (SVT == MVT::i8 || Subtarget.hasSSE41())
52887 return truncateVectorWithPACK(X86ISD::PACKUS, VT, USatVal, DL, DAG,
52888 Subtarget);
52889 }
52890 if (SDValue SSatVal = detectSSatPattern(In, VT))
52891 return truncateVectorWithPACK(X86ISD::PACKSS, VT, SSatVal, DL, DAG,
52892 Subtarget);
52893 }
52894
52895 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
52896 if (TLI.isTypeLegal(InVT) && InVT.isVector() && SVT != MVT::i1 &&
52897 Subtarget.hasAVX512() && (InSVT != MVT::i16 || Subtarget.hasBWI()) &&
52898 (SVT == MVT::i32 || SVT == MVT::i16 || SVT == MVT::i8)) {
52899 unsigned TruncOpc = 0;
52900 SDValue SatVal;
52901 if (SDValue SSatVal = detectSSatPattern(In, VT)) {
52902 SatVal = SSatVal;
52903 TruncOpc = X86ISD::VTRUNCS;
52904 } else if (SDValue USatVal = detectUSatPattern(In, VT, DAG, DL)) {
52905 SatVal = USatVal;
52906 TruncOpc = X86ISD::VTRUNCUS;
52907 }
52908 if (SatVal) {
52909 unsigned ResElts = VT.getVectorNumElements();
52910 // If the input type is less than 512 bits and we don't have VLX, we need
52911 // to widen to 512 bits.
52912 if (!Subtarget.hasVLX() && !InVT.is512BitVector()) {
52913 unsigned NumConcats = 512 / InVT.getSizeInBits();
52914 ResElts *= NumConcats;
52915 SmallVector<SDValue, 4> ConcatOps(NumConcats, DAG.getUNDEF(InVT));
52916 ConcatOps[0] = SatVal;
52917 InVT = EVT::getVectorVT(*DAG.getContext(), InSVT,
52918 NumConcats * InVT.getVectorNumElements());
52919 SatVal = DAG.getNode(ISD::CONCAT_VECTORS, DL, InVT, ConcatOps);
52920 }
52921 // Widen the result if it's narrower than 128 bits.
52922 if (ResElts * SVT.getSizeInBits() < 128)
52923 ResElts = 128 / SVT.getSizeInBits();
52924 EVT TruncVT = EVT::getVectorVT(*DAG.getContext(), SVT, ResElts);
52925 SDValue Res = DAG.getNode(TruncOpc, DL, TruncVT, SatVal);
52926 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Res,
52927 DAG.getVectorIdxConstant(0, DL));
52928 }
52929 }
52930
52931 return SDValue();
52932}
52933
52934static SDValue combineConstantPoolLoads(SDNode *N, const SDLoc &dl,
52935 SelectionDAG &DAG,
52936 TargetLowering::DAGCombinerInfo &DCI,
52937 const X86Subtarget &Subtarget) {
52938 auto *Ld = cast<LoadSDNode>(N);
52939 EVT RegVT = Ld->getValueType(0);
52940 SDValue Ptr = Ld->getBasePtr();
52941 SDValue Chain = Ld->getChain();
52942 ISD::LoadExtType Ext = Ld->getExtensionType();
52943
52944 if (Ext != ISD::NON_EXTLOAD || !Subtarget.hasAVX() || !Ld->isSimple())
52945 return SDValue();
52946
52947 if (!(RegVT.is128BitVector() || RegVT.is256BitVector()))
52948 return SDValue();
52949
52950 const Constant *LdC = getTargetConstantFromBasePtr(Ptr);
52951 if (!LdC)
52952 return SDValue();
52953
52954 auto MatchingBits = [](const APInt &Undefs, const APInt &UserUndefs,
52955 ArrayRef<APInt> Bits, ArrayRef<APInt> UserBits) {
52956 for (unsigned I = 0, E = Undefs.getBitWidth(); I != E; ++I) {
52957 if (Undefs[I])
52958 continue;
52959 if (UserUndefs[I] || Bits[I] != UserBits[I])
52960 return false;
52961 }
52962 return true;
52963 };
52964
52965 // Look through all other loads/broadcasts in the chain for another constant
52966 // pool entry.
52967 for (SDNode *User : Chain->users()) {
52968 auto *UserLd = dyn_cast<MemSDNode>(User);
52969 if (User != N && UserLd &&
52970 (User->getOpcode() == X86ISD::SUBV_BROADCAST_LOAD ||
52971 User->getOpcode() == X86ISD::VBROADCAST_LOAD ||
52972 ISD::isNormalLoad(User)) &&
52973 UserLd->getChain() == Chain && User->hasAnyUseOfValue(0) &&
52974 User->getValueSizeInBits(0).getFixedValue() >
52975 RegVT.getFixedSizeInBits()) {
52976 EVT UserVT = User->getValueType(0);
52977 SDValue UserPtr = UserLd->getBasePtr();
52978 const Constant *UserC = getTargetConstantFromBasePtr(UserPtr);
52979
52980 // See if we are loading a constant that matches in the lower
52981 // bits of a longer constant (but from a different constant pool ptr).
52982 if (UserC && UserPtr != Ptr) {
52983 unsigned LdSize = LdC->getType()->getPrimitiveSizeInBits();
52984 unsigned UserSize = UserC->getType()->getPrimitiveSizeInBits();
52985 if (LdSize < UserSize || !ISD::isNormalLoad(User)) {
52986 APInt Undefs, UserUndefs;
52987 SmallVector<APInt> Bits, UserBits;
52988 unsigned NumBits = std::min(RegVT.getScalarSizeInBits(),
52989 UserVT.getScalarSizeInBits());
52990 if (getTargetConstantBitsFromNode(SDValue(N, 0), NumBits, Undefs,
52991 Bits) &&
52992 getTargetConstantBitsFromNode(SDValue(User, 0), NumBits,
52993 UserUndefs, UserBits)) {
52994 if (MatchingBits(Undefs, UserUndefs, Bits, UserBits)) {
52996 SDValue Extract = extractSubVector(SDValue(User, 0), 0, DAG, dl,
52997 RegVT.getSizeInBits());
52998 Extract = DAG.getBitcast(RegVT, Extract);
52999 return DCI.CombineTo(N, Extract, SDValue(User, 1));
53000 }
53001 }
53002 }
53003 }
53004 }
53005 }
53006
53007 return SDValue();
53008}
53009
53010static SDValue combineLoad(SDNode *N, SelectionDAG &DAG,
53011 TargetLowering::DAGCombinerInfo &DCI,
53012 const X86Subtarget &Subtarget) {
53013 auto *Ld = cast<LoadSDNode>(N);
53014 EVT RegVT = Ld->getValueType(0);
53015 EVT MemVT = Ld->getMemoryVT();
53016 SDLoc dl(Ld);
53017 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
53018
53019 // For chips with slow 32-byte unaligned loads, break the 32-byte operation
53020 // into two 16-byte operations. Also split non-temporal aligned loads on
53021 // pre-AVX2 targets as 32-byte loads will lower to regular temporal loads.
53022 ISD::LoadExtType Ext = Ld->getExtensionType();
53023 unsigned Fast;
53024 if (RegVT.is256BitVector() && !DCI.isBeforeLegalizeOps() &&
53025 Ext == ISD::NON_EXTLOAD &&
53026 ((Ld->isNonTemporal() && !Subtarget.hasInt256() &&
53027 Ld->getAlign() >= Align(16)) ||
53028 (TLI.allowsMemoryAccess(*DAG.getContext(), DAG.getDataLayout(), RegVT,
53029 *Ld->getMemOperand(), &Fast) &&
53030 !Fast))) {
53031 unsigned NumElems = RegVT.getVectorNumElements();
53032 if (NumElems < 2)
53033 return SDValue();
53034
53035 unsigned HalfOffset = 16;
53036 SDValue Ptr1 = Ld->getBasePtr();
53037 SDValue Ptr2 =
53038 DAG.getMemBasePlusOffset(Ptr1, TypeSize::getFixed(HalfOffset), dl);
53039 EVT HalfVT = EVT::getVectorVT(*DAG.getContext(), MemVT.getScalarType(),
53040 NumElems / 2);
53041 SDValue Load1 =
53042 DAG.getLoad(HalfVT, dl, Ld->getChain(), Ptr1, Ld->getPointerInfo(),
53043 Ld->getBaseAlign(), Ld->getMemOperand()->getFlags());
53044 SDValue Load2 =
53045 DAG.getLoad(HalfVT, dl, Ld->getChain(), Ptr2,
53046 Ld->getPointerInfo().getWithOffset(HalfOffset),
53047 Ld->getBaseAlign(), Ld->getMemOperand()->getFlags());
53048 SDValue TF = DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
53049 Load1.getValue(1), Load2.getValue(1));
53050
53051 SDValue NewVec = DAG.getNode(ISD::CONCAT_VECTORS, dl, RegVT, Load1, Load2);
53052 return DCI.CombineTo(N, NewVec, TF, true);
53053 }
53054
53055 // Bool vector load - attempt to cast to an integer, as we have good
53056 // (vXiY *ext(vXi1 bitcast(iX))) handling.
53057 if (Ext == ISD::NON_EXTLOAD && !Subtarget.hasAVX512() && RegVT.isVector() &&
53058 RegVT.getScalarType() == MVT::i1 && DCI.isBeforeLegalize()) {
53059 unsigned NumElts = RegVT.getVectorNumElements();
53060 EVT IntVT = EVT::getIntegerVT(*DAG.getContext(), NumElts);
53061 if (TLI.isTypeLegal(IntVT)) {
53062 SDValue IntLoad = DAG.getLoad(IntVT, dl, Ld->getChain(), Ld->getBasePtr(),
53063 Ld->getPointerInfo(), Ld->getBaseAlign(),
53064 Ld->getMemOperand()->getFlags());
53065 SDValue BoolVec = DAG.getBitcast(RegVT, IntLoad);
53066 return DCI.CombineTo(N, BoolVec, IntLoad.getValue(1), true);
53067 }
53068 }
53069
53070 // If we also broadcast this vector to a wider type, then just extract the
53071 // lowest subvector.
53072 if (Ext == ISD::NON_EXTLOAD && Subtarget.hasAVX() && Ld->isSimple() &&
53073 (RegVT.is128BitVector() || RegVT.is256BitVector())) {
53074 SDValue Ptr = Ld->getBasePtr();
53075 SDValue Chain = Ld->getChain();
53076 for (SDNode *User : Chain->users()) {
53077 auto *UserLd = dyn_cast<MemSDNode>(User);
53078 if (User != N && UserLd &&
53079 User->getOpcode() == X86ISD::SUBV_BROADCAST_LOAD &&
53080 UserLd->getChain() == Chain && UserLd->getBasePtr() == Ptr &&
53081 UserLd->getMemoryVT().getSizeInBits() == MemVT.getSizeInBits() &&
53082 User->hasAnyUseOfValue(0) &&
53083 User->getValueSizeInBits(0).getFixedValue() >
53084 RegVT.getFixedSizeInBits()) {
53086 SDValue Extract = extractSubVector(SDValue(User, 0), 0, DAG, dl,
53087 RegVT.getSizeInBits());
53088 Extract = DAG.getBitcast(RegVT, Extract);
53089 return DCI.CombineTo(N, Extract, SDValue(User, 1));
53090 }
53091 }
53092 }
53093
53094 if (SDValue V = combineConstantPoolLoads(Ld, dl, DAG, DCI, Subtarget))
53095 return V;
53096
53097 // Cast ptr32 and ptr64 pointers to the default address space before a load.
53098 unsigned AddrSpace = Ld->getAddressSpace();
53099 if (AddrSpace == X86AS::PTR64 || AddrSpace == X86AS::PTR32_SPTR ||
53100 AddrSpace == X86AS::PTR32_UPTR) {
53101 MVT PtrVT = TLI.getPointerTy(DAG.getDataLayout());
53102 if (PtrVT != Ld->getBasePtr().getSimpleValueType()) {
53103 SDValue Cast =
53104 DAG.getAddrSpaceCast(dl, PtrVT, Ld->getBasePtr(), AddrSpace, 0);
53105 return DAG.getExtLoad(Ext, dl, RegVT, Ld->getChain(), Cast,
53106 Ld->getPointerInfo(), MemVT, Ld->getBaseAlign(),
53107 Ld->getMemOperand()->getFlags());
53108 }
53109 }
53110
53111 return SDValue();
53112}
53113
53114/// If V is a build vector of boolean constants and exactly one of those
53115/// constants is true, return the operand index of that true element.
53116/// Otherwise, return -1.
53117static int getOneTrueElt(SDValue V) {
53118 // This needs to be a build vector of booleans.
53119 // TODO: Checking for the i1 type matches the IR definition for the mask,
53120 // but the mask check could be loosened to i8 or other types. That might
53121 // also require checking more than 'allOnesValue'; eg, the x86 HW
53122 // instructions only require that the MSB is set for each mask element.
53123 // The ISD::MSTORE comments/definition do not specify how the mask operand
53124 // is formatted.
53125 auto *BV = dyn_cast<BuildVectorSDNode>(V);
53126 if (!BV || BV->getValueType(0).getVectorElementType() != MVT::i1)
53127 return -1;
53128
53129 int TrueIndex = -1;
53130 unsigned NumElts = BV->getValueType(0).getVectorNumElements();
53131 for (unsigned i = 0; i < NumElts; ++i) {
53132 const SDValue &Op = BV->getOperand(i);
53133 if (Op.isUndef())
53134 continue;
53135 auto *ConstNode = dyn_cast<ConstantSDNode>(Op);
53136 if (!ConstNode)
53137 return -1;
53138 if (ConstNode->getAPIntValue().countr_one() >= 1) {
53139 // If we already found a one, this is too many.
53140 if (TrueIndex >= 0)
53141 return -1;
53142 TrueIndex = i;
53143 }
53144 }
53145 return TrueIndex;
53146}
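// Illustrative examples: a v4i1 mask <0,0,1,0> returns index 2;
// <1,0,1,0> returns -1 (more than one true element); <0,undef,0,0> also
// returns -1 because no true element is found.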
53147
53148/// Given a masked memory load/store operation, return true if it has one mask
53149/// bit set. If it has one mask bit set, then also return the memory address of
53150/// the scalar element to load/store, the vector index to insert/extract that
53151/// scalar element, and the alignment for the scalar memory access.
53152static bool getParamsForOneTrueMaskedElt(MaskedLoadStoreSDNode *MaskedOp,
53153 SelectionDAG &DAG, SDValue &Addr,
53154 SDValue &Index, Align &Alignment,
53155 unsigned &Offset) {
53156 int TrueMaskElt = getOneTrueElt(MaskedOp->getMask());
53157 if (TrueMaskElt < 0)
53158 return false;
53159
53160 // Get the address of the one scalar element that is specified by the mask
53161 // using the appropriate offset from the base pointer.
53162 EVT EltVT = MaskedOp->getMemoryVT().getVectorElementType();
53163 Offset = 0;
53164 Addr = MaskedOp->getBasePtr();
53165 if (TrueMaskElt != 0) {
53166 Offset = TrueMaskElt * EltVT.getStoreSize();
53167 Addr = DAG.getMemBasePlusOffset(Addr, TypeSize::getFixed(Offset),
53168 SDLoc(MaskedOp));
53169 }
53170
53171 Index = DAG.getIntPtrConstant(TrueMaskElt, SDLoc(MaskedOp));
53172 Alignment = commonAlignment(MaskedOp->getBaseAlign(), EltVT.getStoreSize());
53173 return true;
53174}
53175
53176/// If exactly one element of the mask is set for a non-extending masked load,
53177/// it is a scalar load and vector insert.
53178/// Note: It is expected that the degenerate cases of an all-zeros or all-ones
53179/// mask have already been optimized in IR, so we don't bother with those here.
53180static SDValue
53181reduceMaskedLoadToScalarLoad(MaskedLoadSDNode *ML, SelectionDAG &DAG,
53182 TargetLowering::DAGCombinerInfo &DCI,
53183 const X86Subtarget &Subtarget) {
53184 assert(ML->isUnindexed() && "Unexpected indexed masked load!");
53185 // TODO: This is not x86-specific, so it could be lifted to DAGCombiner.
53186 // However, some target hooks may need to be added to know when the transform
53187 // is profitable. Endianness would also have to be considered.
53188
53189 SDValue Addr, VecIndex;
53190 Align Alignment;
53191 unsigned Offset;
53192 if (!getParamsForOneTrueMaskedElt(ML, DAG, Addr, VecIndex, Alignment, Offset))
53193 return SDValue();
53194
53195 // Load the one scalar element that is specified by the mask using the
53196 // appropriate offset from the base pointer.
53197 SDLoc DL(ML);
53198 EVT VT = ML->getValueType(0);
53199 EVT EltVT = VT.getVectorElementType();
53200
53201 EVT CastVT = VT;
53202 if (EltVT == MVT::i64 && !Subtarget.is64Bit()) {
53203 EltVT = MVT::f64;
53204 CastVT = VT.changeVectorElementType(EltVT);
53205 }
53206
53207 SDValue Load =
53208 DAG.getLoad(EltVT, DL, ML->getChain(), Addr,
53209 ML->getPointerInfo().getWithOffset(Offset),
53210 Alignment, ML->getMemOperand()->getFlags());
53211
53212 SDValue PassThru = DAG.getBitcast(CastVT, ML->getPassThru());
53213
53214 // Insert the loaded element into the appropriate place in the vector.
53215 SDValue Insert =
53216 DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, CastVT, PassThru, Load, VecIndex);
53217 Insert = DAG.getBitcast(VT, Insert);
53218 return DCI.CombineTo(ML, Insert, Load.getValue(1), true);
53219}
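// Illustrative example: a v4f32 masked load whose mask is <0,0,1,0> becomes a
// scalar f32 load from base + 8 bytes followed by an insert into the
// pass-through vector at index 2.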
53220
53221static SDValue
53222combineMaskedLoadConstantMask(MaskedLoadSDNode *ML, SelectionDAG &DAG,
53223 TargetLowering::DAGCombinerInfo &DCI) {
53224 assert(ML->isUnindexed() && "Unexpected indexed masked load!");
53225 if (!ISD::isBuildVectorOfConstantSDNodes(ML->getMask().getNode()))
53226 return SDValue();
53227
53228 SDLoc DL(ML);
53229 EVT VT = ML->getValueType(0);
53230
53231 // If we are loading the first and last elements of a vector, it is safe and
53232 // always faster to load the whole vector. Replace the masked load with a
53233 // vector load and select.
53234 unsigned NumElts = VT.getVectorNumElements();
53235 BuildVectorSDNode *MaskBV = cast<BuildVectorSDNode>(ML->getMask());
53236 bool LoadFirstElt = !isNullConstant(MaskBV->getOperand(0));
53237 bool LoadLastElt = !isNullConstant(MaskBV->getOperand(NumElts - 1));
53238 if (LoadFirstElt && LoadLastElt) {
53239 SDValue VecLd = DAG.getLoad(VT, DL, ML->getChain(), ML->getBasePtr(),
53240 ML->getMemOperand());
53241 SDValue Blend = DAG.getSelect(DL, VT, ML->getMask(), VecLd,
53242 ML->getPassThru());
53243 return DCI.CombineTo(ML, Blend, VecLd.getValue(1), true);
53244 }
53245
53246 // Convert a masked load with a constant mask into a masked load and a select.
53247 // This allows the select operation to use a faster kind of select instruction
53248 // (for example, vblendvps -> vblendps).
53249
53250 // Don't try this if the pass-through operand is already undefined. That would
53251 // cause an infinite loop because that's what we're about to create.
53252 if (ML->getPassThru().isUndef())
53253 return SDValue();
53254
53255 if (ISD::isBuildVectorAllZeros(ML->getPassThru().getNode()))
53256 return SDValue();
53257
53258 // The new masked load has an undef pass-through operand. The select uses the
53259 // original pass-through operand.
53260 SDValue NewML = DAG.getMaskedLoad(
53261 VT, DL, ML->getChain(), ML->getBasePtr(), ML->getOffset(), ML->getMask(),
53262 DAG.getUNDEF(VT), ML->getMemoryVT(), ML->getMemOperand(),
53263 ML->getAddressingMode(), ML->getExtensionType());
53264 SDValue Blend = DAG.getSelect(DL, VT, ML->getMask(), NewML,
53265 ML->getPassThru());
53266
53267 return DCI.CombineTo(ML, Blend, NewML.getValue(1), true);
53268}
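// Illustrative examples: if both the first and last mask bits are set, the
// masked load is widened to a plain vector load plus a select. Otherwise,
// with a constant mask such as <0,1,1,1,1,1,1,0>, the masked load keeps its
// mask but gets an undef pass-through, and the original pass-through is
// applied by a separate select, which can use the cheaper blendps instead of
// blendvps.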
53269
53270static SDValue combineMaskedLoad(SDNode *N, SelectionDAG &DAG,
53271 TargetLowering::DAGCombinerInfo &DCI,
53272 const X86Subtarget &Subtarget) {
53273 auto *Mld = cast<MaskedLoadSDNode>(N);
53274
53275 // TODO: Expanding load with constant mask may be optimized as well.
53276 if (Mld->isExpandingLoad())
53277 return SDValue();
53278
53279 if (Mld->getExtensionType() == ISD::NON_EXTLOAD) {
53280 if (SDValue ScalarLoad =
53281 reduceMaskedLoadToScalarLoad(Mld, DAG, DCI, Subtarget))
53282 return ScalarLoad;
53283
53284 // TODO: Do some AVX512 subsets benefit from this transform?
53285 if (!Subtarget.hasAVX512())
53286 if (SDValue Blend = combineMaskedLoadConstantMask(Mld, DAG, DCI))
53287 return Blend;
53288 }
53289
53290 // If the mask value has been legalized to a non-boolean vector, try to
53291 // simplify ops leading up to it. We only demand the MSB of each lane.
53292 SDValue Mask = Mld->getMask();
53293 if (Mask.getScalarValueSizeInBits() != 1) {
53294 EVT VT = Mld->getValueType(0);
53295 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
53296 APInt DemandedBits(APInt::getSignMask(VT.getScalarSizeInBits()));
53297 if (TLI.SimplifyDemandedBits(Mask, DemandedBits, DCI)) {
53298 if (N->getOpcode() != ISD::DELETED_NODE)
53299 DCI.AddToWorklist(N);
53300 return SDValue(N, 0);
53301 }
53302 if (SDValue NewMask =
53303 TLI.SimplifyMultipleUseDemandedBits(Mask, DemandedBits, DAG))
53304 return DAG.getMaskedLoad(
53305 VT, SDLoc(N), Mld->getChain(), Mld->getBasePtr(), Mld->getOffset(),
53306 NewMask, Mld->getPassThru(), Mld->getMemoryVT(), Mld->getMemOperand(),
53307 Mld->getAddressingMode(), Mld->getExtensionType());
53308 }
53309
53310 return SDValue();
53311}
53312
53313/// If exactly one element of the mask is set for a non-truncating masked store,
53314/// it is a vector extract and scalar store.
53315/// Note: It is expected that the degenerate cases of an all-zeros or all-ones
53316/// mask have already been optimized in IR, so we don't bother with those here.
53317static SDValue reduceMaskedStoreToScalarStore(MaskedStoreSDNode *MS,
53318 SelectionDAG &DAG,
53319 const X86Subtarget &Subtarget) {
53320 // TODO: This is not x86-specific, so it could be lifted to DAGCombiner.
53321 // However, some target hooks may need to be added to know when the transform
53322 // is profitable. Endianness would also have to be considered.
53323
53324 SDValue Addr, VecIndex;
53325 Align Alignment;
53326 unsigned Offset;
53327 if (!getParamsForOneTrueMaskedElt(MS, DAG, Addr, VecIndex, Alignment, Offset))
53328 return SDValue();
53329
53330 // Extract the one scalar element that is actually being stored.
53331 SDLoc DL(MS);
53332 SDValue Value = MS->getValue();
53333 EVT VT = Value.getValueType();
53334 EVT EltVT = VT.getVectorElementType();
53335 if (EltVT == MVT::i64 && !Subtarget.is64Bit()) {
53336 EltVT = MVT::f64;
53337 EVT CastVT = VT.changeVectorElementType(EltVT);
53338 Value = DAG.getBitcast(CastVT, Value);
53339 }
53340 SDValue Extract =
53341 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, EltVT, Value, VecIndex);
53342
53343 // Store that element at the appropriate offset from the base pointer.
53344 return DAG.getStore(MS->getChain(), DL, Extract, Addr,
53345 MS->getPointerInfo().getWithOffset(Offset),
53346 Alignment, MS->getMemOperand()->getFlags());
53347}
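// Illustrative example: a v4i32 masked store whose mask is <0,1,0,0> becomes
// an extract of element 1 followed by a plain i32 store at base + 4 bytes.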
53348
53349static SDValue combineMaskedStore(SDNode *N, SelectionDAG &DAG,
53350 TargetLowering::DAGCombinerInfo &DCI,
53351 const X86Subtarget &Subtarget) {
53352 MaskedStoreSDNode *Mst = cast<MaskedStoreSDNode>(N);
53353 if (Mst->isCompressingStore())
53354 return SDValue();
53355
53356 EVT VT = Mst->getValue().getValueType();
53357 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
53358
53359 if (Mst->isTruncatingStore())
53360 return SDValue();
53361
53362 if (SDValue ScalarStore = reduceMaskedStoreToScalarStore(Mst, DAG, Subtarget))
53363 return ScalarStore;
53364
53365 // If the mask value has been legalized to a non-boolean vector, try to
53366 // simplify ops leading up to it. We only demand the MSB of each lane.
53367 SDValue Mask = Mst->getMask();
53368 if (Mask.getScalarValueSizeInBits() != 1) {
53369 APInt DemandedBits(APInt::getSignMask(VT.getScalarSizeInBits()));
53370 if (TLI.SimplifyDemandedBits(Mask, DemandedBits, DCI)) {
53371 if (N->getOpcode() != ISD::DELETED_NODE)
53372 DCI.AddToWorklist(N);
53373 return SDValue(N, 0);
53374 }
53375 if (SDValue NewMask =
53376 TLI.SimplifyMultipleUseDemandedBits(Mask, DemandedBits, DAG))
53377 return DAG.getMaskedStore(Mst->getChain(), SDLoc(N), Mst->getValue(),
53378 Mst->getBasePtr(), Mst->getOffset(), NewMask,
53379 Mst->getMemoryVT(), Mst->getMemOperand(),
53380 Mst->getAddressingMode());
53381 }
53382
53383 SDValue Value = Mst->getValue();
53384 if (Value.getOpcode() == ISD::TRUNCATE && Value.getNode()->hasOneUse() &&
53385 TLI.isTruncStoreLegal(Value.getOperand(0).getValueType(),
53386 Mst->getMemoryVT())) {
53387 return DAG.getMaskedStore(Mst->getChain(), SDLoc(N), Value.getOperand(0),
53388 Mst->getBasePtr(), Mst->getOffset(), Mask,
53389 Mst->getMemoryVT(), Mst->getMemOperand(),
53390 Mst->getAddressingMode(), true);
53391 }
53392
53393 return SDValue();
53394}
53395
53396static SDValue combineStore(SDNode *N, SelectionDAG &DAG,
53397 TargetLowering::DAGCombinerInfo &DCI,
53398 const X86Subtarget &Subtarget) {
53399 StoreSDNode *St = cast<StoreSDNode>(N);
53400 EVT StVT = St->getMemoryVT();
53401 SDLoc dl(St);
53402 SDValue StoredVal = St->getValue();
53403 EVT VT = StoredVal.getValueType();
53404 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
53405
53406 // Convert a store of vXi1 into a store of iX and a bitcast.
53407 if (!Subtarget.hasAVX512() && VT == StVT && VT.isVector() &&
53408 VT.getVectorElementType() == MVT::i1) {
53409
53410 EVT NewVT = EVT::getIntegerVT(*DAG.getContext(), VT.getVectorNumElements());
53411 StoredVal = DAG.getBitcast(NewVT, StoredVal);
53412
53413 return DAG.getStore(St->getChain(), dl, StoredVal, St->getBasePtr(),
53414 St->getPointerInfo(), St->getBaseAlign(),
53415 St->getMemOperand()->getFlags());
53416 }
53417
53418 // If this is a store of a scalar_to_vector to v1i1, just use a scalar store.
53419 // This will avoid a copy to k-register.
53420 if (VT == MVT::v1i1 && VT == StVT && Subtarget.hasAVX512() &&
53421 StoredVal.getOpcode() == ISD::SCALAR_TO_VECTOR &&
53422 StoredVal.getOperand(0).getValueType() == MVT::i8) {
53423 SDValue Val = StoredVal.getOperand(0);
53424 // We must store zeros to the unused bits.
53425 Val = DAG.getZeroExtendInReg(Val, dl, MVT::i1);
53426 return DAG.getStore(St->getChain(), dl, Val, St->getBasePtr(),
53427 St->getPointerInfo(), St->getBaseAlign(),
53428 St->getMemOperand()->getFlags());
53429 }
53430
53431 // Widen v2i1/v4i1 stores to v8i1.
53432 if ((VT == MVT::v1i1 || VT == MVT::v2i1 || VT == MVT::v4i1) && VT == StVT &&
53433 Subtarget.hasAVX512()) {
53434 unsigned NumConcats = 8 / VT.getVectorNumElements();
53435 // We must store zeros to the unused bits.
53436 SmallVector<SDValue, 4> Ops(NumConcats, DAG.getConstant(0, dl, VT));
53437 Ops[0] = StoredVal;
53438 StoredVal = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v8i1, Ops);
53439 return DAG.getStore(St->getChain(), dl, StoredVal, St->getBasePtr(),
53440 St->getPointerInfo(), St->getBaseAlign(),
53441 St->getMemOperand()->getFlags());
53442 }
53443
53444 // Turn vXi1 stores of constants into a scalar store.
53445 if ((VT == MVT::v8i1 || VT == MVT::v16i1 || VT == MVT::v32i1 ||
53446 VT == MVT::v64i1) && VT == StVT && TLI.isTypeLegal(VT) &&
53447 ISD::isBuildVectorOfConstantSDNodes(StoredVal.getNode())) {
53448 // If it's a v64i1 store without 64-bit support, we need two stores.
53449 if (!DCI.isBeforeLegalize() && VT == MVT::v64i1 && !Subtarget.is64Bit()) {
53450 SDValue Lo = DAG.getBuildVector(MVT::v32i1, dl,
53451 StoredVal->ops().slice(0, 32));
53452 Lo = combinevXi1ConstantToInteger(Lo, DAG);
53453 SDValue Hi = DAG.getBuildVector(MVT::v32i1, dl,
53454 StoredVal->ops().slice(32, 32));
53455 Hi = combinevXi1ConstantToInteger(Hi, DAG);
53456
53457 SDValue Ptr0 = St->getBasePtr();
53458 SDValue Ptr1 = DAG.getMemBasePlusOffset(Ptr0, TypeSize::getFixed(4), dl);
53459
53460 SDValue Ch0 =
53461 DAG.getStore(St->getChain(), dl, Lo, Ptr0, St->getPointerInfo(),
53462 St->getBaseAlign(), St->getMemOperand()->getFlags());
53463 SDValue Ch1 = DAG.getStore(
53464 St->getChain(), dl, Hi, Ptr1, St->getPointerInfo().getWithOffset(4),
53465 St->getBaseAlign(), St->getMemOperand()->getFlags());
53466 return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Ch0, Ch1);
53467 }
53468
53469 StoredVal = combinevXi1ConstantToInteger(StoredVal, DAG);
53470 return DAG.getStore(St->getChain(), dl, StoredVal, St->getBasePtr(),
53471 St->getPointerInfo(), St->getBaseAlign(),
53472 St->getMemOperand()->getFlags());
53473 }
53474
53475 // Convert scalar fabs/fneg load-store to integer equivalents.
53476 if ((VT == MVT::f16 || VT == MVT::bf16 || VT == MVT::f32 || VT == MVT::f64) &&
53477 (StoredVal.getOpcode() == ISD::FABS ||
53478 StoredVal.getOpcode() == ISD::FNEG) &&
53479 ISD::isNormalLoad(StoredVal.getOperand(0).getNode()) &&
53480 StoredVal.hasOneUse() && StoredVal.getOperand(0).hasOneUse()) {
53481 MVT IntVT = VT.getSimpleVT().changeTypeToInteger();
53482 if (TLI.isTypeLegal(IntVT)) {
53483 APInt SignMask = APInt::getSignMask(VT.getSizeInBits());
53484 unsigned SignOp = ISD::XOR;
53485 if (StoredVal.getOpcode() == ISD::FABS) {
53486 SignMask = ~SignMask;
53487 SignOp = ISD::AND;
53488 }
53489 SDValue LogicOp = DAG.getNode(
53490 SignOp, dl, IntVT, DAG.getBitcast(IntVT, StoredVal.getOperand(0)),
53491 DAG.getConstant(SignMask, dl, IntVT));
53492 return DAG.getStore(St->getChain(), dl, LogicOp, St->getBasePtr(),
53493 St->getPointerInfo(), St->getBaseAlign(),
53494 St->getMemOperand()->getFlags());
53495 }
53496 }
53497
53498 // If we are saving a 32-byte vector and 32-byte stores are slow, such as on
53499 // Sandy Bridge, perform two 16-byte stores.
53500 unsigned Fast;
53501 if (VT.is256BitVector() && StVT == VT &&
53502 TLI.allowsMemoryAccess(*DAG.getContext(), DAG.getDataLayout(), VT,
53503 *St->getMemOperand(), &Fast) &&
53504 !Fast) {
53505 unsigned NumElems = VT.getVectorNumElements();
53506 if (NumElems < 2)
53507 return SDValue();
53508
53509 return splitVectorStore(St, DAG);
53510 }
53511
53512 // Split under-aligned vector non-temporal stores.
53513 if (St->isNonTemporal() && StVT == VT &&
53514 St->getAlign().value() < VT.getStoreSize()) {
53515 // ZMM/YMM nt-stores - either it can be stored as a series of shorter
53516 // vectors or the legalizer can scalarize it to use MOVNTI.
53517 if (VT.is256BitVector() || VT.is512BitVector()) {
53518 unsigned NumElems = VT.getVectorNumElements();
53519 if (NumElems < 2)
53520 return SDValue();
53521 return splitVectorStore(St, DAG);
53522 }
53523
53524 // XMM nt-stores - scalarize this to f64 nt-stores on SSE4A, else i32/i64
53525 // to use MOVNTI.
53526 if (VT.is128BitVector() && Subtarget.hasSSE2()) {
53527 MVT NTVT = Subtarget.hasSSE4A()
53528 ? MVT::v2f64
53529 : (TLI.isTypeLegal(MVT::i64) ? MVT::v2i64 : MVT::v4i32);
53530 return scalarizeVectorStore(St, NTVT, DAG);
53531 }
53532 }
53533
53534 // Try to optimize v16i16->v16i8 truncating stores when BWI is not
53535 // supported but AVX512F is, by extending to v16i32 and truncating.
53536 if (!St->isTruncatingStore() && VT == MVT::v16i8 && !Subtarget.hasBWI() &&
53537 St->getValue().getOpcode() == ISD::TRUNCATE &&
53538 St->getValue().getOperand(0).getValueType() == MVT::v16i16 &&
53539 TLI.isTruncStoreLegal(MVT::v16i32, MVT::v16i8) &&
53540 St->getValue().hasOneUse() && !DCI.isBeforeLegalizeOps()) {
53541 SDValue Ext = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::v16i32,
53542 St->getValue().getOperand(0));
53543 return DAG.getTruncStore(St->getChain(), dl, Ext, St->getBasePtr(),
53544 MVT::v16i8, St->getMemOperand());
53545 }
53546
53547 // Try to fold a VTRUNCUS or VTRUNCS into a truncating store.
53548 if (!St->isTruncatingStore() &&
53549 (StoredVal.getOpcode() == X86ISD::VTRUNCUS ||
53550 StoredVal.getOpcode() == X86ISD::VTRUNCS) &&
53551 StoredVal.hasOneUse() &&
53552 TLI.isTruncStoreLegal(StoredVal.getOperand(0).getValueType(), VT)) {
53553 bool IsSigned = StoredVal.getOpcode() == X86ISD::VTRUNCS;
53554 return EmitTruncSStore(IsSigned, St->getChain(),
53555 dl, StoredVal.getOperand(0), St->getBasePtr(),
53556 VT, St->getMemOperand(), DAG);
53557 }
53558
53559 // Try to fold an extract_element(VTRUNC) pattern into a truncating store.
53560 if (!St->isTruncatingStore()) {
53561 auto IsExtractedElement = [](SDValue V) {
53562 if (V.getOpcode() == ISD::TRUNCATE && V.hasOneUse())
53563 V = V.getOperand(0);
53564 unsigned Opc = V.getOpcode();
53565 if ((Opc == ISD::EXTRACT_VECTOR_ELT || Opc == X86ISD::PEXTRW) &&
53566 isNullConstant(V.getOperand(1)) && V.hasOneUse() &&
53567 V.getOperand(0).hasOneUse())
53568 return V.getOperand(0);
53569 return SDValue();
53570 };
53571 if (SDValue Extract = IsExtractedElement(StoredVal)) {
53572 SDValue Trunc = peekThroughOneUseBitcasts(Extract);
53573 if (Trunc.getOpcode() == X86ISD::VTRUNC) {
53574 SDValue Src = Trunc.getOperand(0);
53575 MVT DstVT = Trunc.getSimpleValueType();
53576 MVT SrcVT = Src.getSimpleValueType();
53577 unsigned NumSrcElts = SrcVT.getVectorNumElements();
53578 unsigned NumTruncBits = DstVT.getScalarSizeInBits() * NumSrcElts;
53579 MVT TruncVT = MVT::getVectorVT(DstVT.getScalarType(), NumSrcElts);
53580 if (NumTruncBits == VT.getSizeInBits() &&
53581 TLI.isTruncStoreLegal(SrcVT, TruncVT)) {
53582 return DAG.getTruncStore(St->getChain(), dl, Src, St->getBasePtr(),
53583 TruncVT, St->getMemOperand());
53584 }
53585 }
53586 }
53587 }
53588
53589 // Optimize trunc store (of multiple scalars) to shuffle and store.
53590 // First, pack all of the elements in one place. Next, store to memory
53591 // in fewer chunks.
53592 if (St->isTruncatingStore() && VT.isVector()) {
53593 if (TLI.isTruncStoreLegal(VT, StVT)) {
53594 if (SDValue Val = detectSSatPattern(St->getValue(), St->getMemoryVT()))
53595 return EmitTruncSStore(true /* Signed saturation */, St->getChain(),
53596 dl, Val, St->getBasePtr(),
53597 St->getMemoryVT(), St->getMemOperand(), DAG);
53598 if (SDValue Val = detectUSatPattern(St->getValue(), St->getMemoryVT(),
53599 DAG, dl))
53600 return EmitTruncSStore(false /* Unsigned saturation */, St->getChain(),
53601 dl, Val, St->getBasePtr(),
53602 St->getMemoryVT(), St->getMemOperand(), DAG);
53603 }
53604
53605 return SDValue();
53606 }
53607
53608 // Cast ptr32 and ptr64 pointers to the default address space before a store.
53609 unsigned AddrSpace = St->getAddressSpace();
53610 if (AddrSpace == X86AS::PTR64 || AddrSpace == X86AS::PTR32_SPTR ||
53611 AddrSpace == X86AS::PTR32_UPTR) {
53612 MVT PtrVT = TLI.getPointerTy(DAG.getDataLayout());
53613 if (PtrVT != St->getBasePtr().getSimpleValueType()) {
53614 SDValue Cast =
53615 DAG.getAddrSpaceCast(dl, PtrVT, St->getBasePtr(), AddrSpace, 0);
53616 return DAG.getTruncStore(
53617 St->getChain(), dl, StoredVal, Cast, St->getPointerInfo(), StVT,
53618 St->getBaseAlign(), St->getMemOperand()->getFlags(), St->getAAInfo());
53619 }
53620 }
53621
53622 // Convert store(cmov(load(p), x, CC), p) to cstore(x, p, CC)
53623 // store(cmov(x, load(p), CC), p) to cstore(x, p, InvertCC)
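  // With the CF (conditional-faulting) facility the store itself is
  // predicated, so the load+cmov+store round trip collapses into a single
  // conditional store.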
53624 if ((VT == MVT::i16 || VT == MVT::i32 || VT == MVT::i64) &&
53625 Subtarget.hasCF() && St->isSimple()) {
53626 SDValue Cmov;
53627 if (StoredVal.getOpcode() == X86ISD::CMOV)
53628 Cmov = StoredVal;
53629 else if (StoredVal.getOpcode() == ISD::TRUNCATE &&
53630 StoredVal.getOperand(0).getOpcode() == X86ISD::CMOV)
53631 Cmov = StoredVal.getOperand(0);
53632 else
53633 return SDValue();
53634
53635 auto *Ld = dyn_cast<LoadSDNode>(St->getChain());
53636 if (!Ld || !Ld->isSimple() || Ld->getBasePtr() != St->getBasePtr())
53637 return SDValue();
53638
53639 bool InvertCC = false;
53640 SDValue V = SDValue(Ld, 0);
53641 if (V == Cmov.getOperand(1))
53642 InvertCC = true;
53643 else if (V != Cmov.getOperand(0))
53644 return SDValue();
53645
53646 SDVTList Tys = DAG.getVTList(MVT::Other);
53647 SDValue CC = Cmov.getOperand(2);
53648 SDValue Src = DAG.getAnyExtOrTrunc(Cmov.getOperand(!InvertCC), dl, VT);
53649 if (InvertCC)
53650 CC = DAG.getTargetConstant(
53651 X86::GetOppositeBranchCondition(
53652 (X86::CondCode)Cmov.getConstantOperandVal(2)),
53653 dl, MVT::i8);
53654 SDValue Ops[] = {St->getChain(), Src, St->getBasePtr(), CC,
53655 Cmov.getOperand(3)};
53656 return DAG.getMemIntrinsicNode(X86ISD::CSTORE, dl, Tys, Ops, VT,
53657 St->getMemOperand());
53658 }
53659
53660 // Turn load->store of MMX types into GPR load/stores. This avoids clobbering
53661 // the FP state in cases where an emms may be missing.
53662 // A preferable solution to the general problem is to figure out the right
53663 // places to insert EMMS. This qualifies as a quick hack.
53664
53665 // Similarly, turn load->store of i64 into double load/stores in 32-bit mode.
53666 if (VT.getSizeInBits() != 64)
53667 return SDValue();
53668
53669 const Function &F = DAG.getMachineFunction().getFunction();
53670 bool NoImplicitFloatOps = F.hasFnAttribute(Attribute::NoImplicitFloat);
53671 bool F64IsLegal =
53672 !Subtarget.useSoftFloat() && !NoImplicitFloatOps && Subtarget.hasSSE2();
53673
53674 if (!F64IsLegal || Subtarget.is64Bit())
53675 return SDValue();
53676
53677 if (VT == MVT::i64 && isa<LoadSDNode>(St->getValue()) &&
53678 cast<LoadSDNode>(St->getValue())->isSimple() &&
53679 St->getChain().hasOneUse() && St->isSimple()) {
53680 auto *Ld = cast<LoadSDNode>(St->getValue());
53681
53682 if (!ISD::isNormalLoad(Ld))
53683 return SDValue();
53684
53685 // Avoid the transformation if there are multiple uses of the loaded value.
53686 if (!Ld->hasNUsesOfValue(1, 0))
53687 return SDValue();
53688
53689 SDLoc LdDL(Ld);
53690 SDLoc StDL(N);
53691
53692 // Remove any range metadata as we're converting to f64 load/store.
53693 Ld->getMemOperand()->clearRanges();
53694
53695 // Lower to a single movq load/store pair.
53696 SDValue NewLd = DAG.getLoad(MVT::f64, LdDL, Ld->getChain(),
53697 Ld->getBasePtr(), Ld->getMemOperand());
53698
53699 // Make sure new load is placed in same chain order.
53700 DAG.makeEquivalentMemoryOrdering(Ld, NewLd);
53701 return DAG.getStore(St->getChain(), StDL, NewLd, St->getBasePtr(),
53702 St->getMemOperand());
53703 }
53704
53705 // This is similar to the above case, but here we handle a scalar 64-bit
53706 // integer store that is extracted from a vector on a 32-bit target.
53707 // If we have SSE2, then we can treat it like a floating-point double
53708 // to get past legalization. The execution dependencies fixup pass will
53709 // choose the optimal machine instruction for the store if this really is
53710 // an integer or v2f32 rather than an f64.
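  // e.g. (store i64 (extract_vector_elt (v2i64 X), 0)) on a 32-bit target
  // becomes (store f64 (extract_vector_elt (bitcast X to v2f64), 0)), which
  // typically selects to a single 64-bit vector store instead of two i32
  // stores.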
53711 if (VT == MVT::i64 &&
53712 St->getValue().getOpcode() == ISD::EXTRACT_VECTOR_ELT) {
53713 SDValue OldExtract = St->getOperand(1);
53714 SDValue ExtOp0 = OldExtract.getOperand(0);
53715 unsigned VecSize = ExtOp0.getValueSizeInBits();
53716 EVT VecVT = EVT::getVectorVT(*DAG.getContext(), MVT::f64, VecSize / 64);
53717 SDValue BitCast = DAG.getBitcast(VecVT, ExtOp0);
53718 SDValue NewExtract = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64,
53719 BitCast, OldExtract.getOperand(1));
53720 return DAG.getStore(St->getChain(), dl, NewExtract, St->getBasePtr(),
53721 St->getPointerInfo(), St->getBaseAlign(),
53722 St->getMemOperand()->getFlags());
53723 }
53724
53725 return SDValue();
53726}
53727
53728static SDValue combineVEXTRACT_STORE(SDNode *N, SelectionDAG &DAG,
53729 TargetLowering::DAGCombinerInfo &DCI,
53730 const X86Subtarget &Subtarget) {
53731 auto *St = cast<MemIntrinsicSDNode>(N);
53732
53733 SDValue StoredVal = N->getOperand(1);
53734 MVT VT = StoredVal.getSimpleValueType();
53735 EVT MemVT = St->getMemoryVT();
53736
53737 // Figure out which elements we demand.
53738 unsigned StElts = MemVT.getSizeInBits() / VT.getScalarSizeInBits();
53739 APInt DemandedElts = APInt::getLowBitsSet(VT.getVectorNumElements(), StElts);
53740
53741 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
53742 if (TLI.SimplifyDemandedVectorElts(StoredVal, DemandedElts, DCI)) {
53743 if (N->getOpcode() != ISD::DELETED_NODE)
53744 DCI.AddToWorklist(N);
53745 return SDValue(N, 0);
53746 }
53747
53748 return SDValue();
53749}
53750
53751/// Return 'true' if this vector operation is "horizontal"
53752/// and return the operands for the horizontal operation in LHS and RHS. A
53753/// horizontal operation performs the binary operation on successive elements
53754/// of its first operand, then on successive elements of its second operand,
53755/// returning the resulting values in a vector. For example, if
53756/// A = < float a0, float a1, float a2, float a3 >
53757/// and
53758/// B = < float b0, float b1, float b2, float b3 >
53759/// then the result of doing a horizontal operation on A and B is
53760/// A horizontal-op B = < a0 op a1, a2 op a3, b0 op b1, b2 op b3 >.
53761/// In short, LHS and RHS are inspected to see if LHS op RHS is of the form
53762/// A horizontal-op B, for some already available A and B, and if so then LHS is
53763/// set to A, RHS to B, and the routine returns 'true'.
53764static bool isHorizontalBinOp(unsigned HOpcode, SDValue &LHS, SDValue &RHS,
53765 SelectionDAG &DAG, const X86Subtarget &Subtarget,
53766 bool IsCommutative,
53767 SmallVectorImpl<int> &PostShuffleMask,
53768 bool ForceHorizOp) {
53769 // If either operand is undef, bail out. The binop should be simplified.
53770 if (LHS.isUndef() || RHS.isUndef())
53771 return false;
53772
53773 // Look for the following pattern:
53774 // A = < float a0, float a1, float a2, float a3 >
53775 // B = < float b0, float b1, float b2, float b3 >
53776 // and
53777 // LHS = VECTOR_SHUFFLE A, B, <0, 2, 4, 6>
53778 // RHS = VECTOR_SHUFFLE A, B, <1, 3, 5, 7>
53779 // then LHS op RHS = < a0 op a1, a2 op a3, b0 op b1, b2 op b3 >
53780 // which is A horizontal-op B.
53781
53782 MVT VT = LHS.getSimpleValueType();
53783 assert((VT.is128BitVector() || VT.is256BitVector()) &&
53784 "Unsupported vector type for horizontal add/sub");
53785 unsigned NumElts = VT.getVectorNumElements();
53786
53787 auto GetShuffle = [&](SDValue Op, SDValue &N0, SDValue &N1,
53788 SmallVectorImpl<int> &ShuffleMask) {
53789 bool UseSubVector = false;
53790 if (Op.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
53791 Op.getOperand(0).getValueType().is256BitVector() &&
53792 llvm::isNullConstant(Op.getOperand(1))) {
53793 Op = Op.getOperand(0);
53794 UseSubVector = true;
53795 }
53796 SmallVector<SDValue, 2> SrcOps;
53797 SmallVector<int, 16> SrcMask, ScaledMask;
53798 SDValue BC = peekThroughBitcasts(Op);
53799 if (getTargetShuffleInputs(BC, SrcOps, SrcMask, DAG) &&
53800 !isAnyZero(SrcMask) && all_of(SrcOps, [BC](SDValue Op) {
53801 return Op.getValueSizeInBits() == BC.getValueSizeInBits();
53802 })) {
53803 resolveTargetShuffleInputsAndMask(SrcOps, SrcMask);
53804 if (!UseSubVector && SrcOps.size() <= 2 &&
53805 scaleShuffleElements(SrcMask, NumElts, ScaledMask)) {
53806 N0 = !SrcOps.empty() ? SrcOps[0] : SDValue();
53807 N1 = SrcOps.size() > 1 ? SrcOps[1] : SDValue();
53808 ShuffleMask.assign(ScaledMask.begin(), ScaledMask.end());
53809 }
53810 if (UseSubVector && SrcOps.size() == 1 &&
53811 scaleShuffleElements(SrcMask, 2 * NumElts, ScaledMask)) {
53812 std::tie(N0, N1) = DAG.SplitVector(SrcOps[0], SDLoc(Op));
53813 ArrayRef<int> Mask = ArrayRef<int>(ScaledMask).slice(0, NumElts);
53814 ShuffleMask.assign(Mask.begin(), Mask.end());
53815 }
53816 }
53817 };
53818
53819 // View LHS in the form
53820 // LHS = VECTOR_SHUFFLE A, B, LMask
53821 // If LHS is not a shuffle, then pretend it is the identity shuffle:
53822 // LHS = VECTOR_SHUFFLE LHS, undef, <0, 1, ..., N-1>
53823 // NOTE: A default initialized SDValue represents an UNDEF of type VT.
53824 SDValue A, B;
53825 SmallVector<int, 16> LMask;
53826 GetShuffle(LHS, A, B, LMask);
53827
53828 // Likewise, view RHS in the form
53829 // RHS = VECTOR_SHUFFLE C, D, RMask
53830 SDValue C, D;
53831 SmallVector<int, 16> RMask;
53832 GetShuffle(RHS, C, D, RMask);
53833
53834 // At least one of the operands should be a vector shuffle.
53835 unsigned NumShuffles = (LMask.empty() ? 0 : 1) + (RMask.empty() ? 0 : 1);
53836 if (NumShuffles == 0)
53837 return false;
53838
53839 if (LMask.empty()) {
53840 A = LHS;
53841 for (unsigned i = 0; i != NumElts; ++i)
53842 LMask.push_back(i);
53843 }
53844
53845 if (RMask.empty()) {
53846 C = RHS;
53847 for (unsigned i = 0; i != NumElts; ++i)
53848 RMask.push_back(i);
53849 }
53850
53851 // If we have a unary mask, ensure the other op is set to null.
53852 if (isUndefOrInRange(LMask, 0, NumElts))
53853 B = SDValue();
53854 else if (isUndefOrInRange(LMask, NumElts, NumElts * 2))
53855 A = SDValue();
53856
53857 if (isUndefOrInRange(RMask, 0, NumElts))
53858 D = SDValue();
53859 else if (isUndefOrInRange(RMask, NumElts, NumElts * 2))
53860 C = SDValue();
53861
53862 // If A and B occur in reverse order in RHS, then canonicalize by commuting
53863 // RHS operands and shuffle mask.
53864 if (A != C) {
53865 std::swap(C, D);
53866 ShuffleVectorSDNode::commuteMask(RMask);
53867 }
53868 // Check that the shuffles are both shuffling the same vectors.
53869 if (!(A == C && B == D))
53870 return false;
53871
53872 PostShuffleMask.clear();
53873 PostShuffleMask.append(NumElts, SM_SentinelUndef);
53874
53875 // LHS and RHS are now:
53876 // LHS = shuffle A, B, LMask
53877 // RHS = shuffle A, B, RMask
53878 // Check that the masks correspond to performing a horizontal operation.
53879 // AVX defines horizontal add/sub to operate independently on 128-bit lanes,
53880 // so we just repeat the inner loop if this is a 256-bit op.
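  // e.g. for v8f32 the first 128-bit chunk must pair elements 0..3 of the
  // sources and the second chunk elements 4..7, matching how VHADDPS operates
  // within each lane.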
53881 unsigned Num128BitChunks = VT.getSizeInBits() / 128;
53882 unsigned NumEltsPer128BitChunk = NumElts / Num128BitChunks;
53883 unsigned NumEltsPer64BitChunk = NumEltsPer128BitChunk / 2;
53884 assert((NumEltsPer128BitChunk % 2 == 0) &&
53885 "Vector type should have an even number of elements in each lane");
53886 for (unsigned j = 0; j != NumElts; j += NumEltsPer128BitChunk) {
53887 for (unsigned i = 0; i != NumEltsPer128BitChunk; ++i) {
53888 // Ignore undefined components.
53889 int LIdx = LMask[i + j], RIdx = RMask[i + j];
53890 if (LIdx < 0 || RIdx < 0 ||
53891 (!A.getNode() && (LIdx < (int)NumElts || RIdx < (int)NumElts)) ||
53892 (!B.getNode() && (LIdx >= (int)NumElts || RIdx >= (int)NumElts)))
53893 continue;
53894
53895 // Check that successive odd/even elements are being operated on. If not,
53896 // this is not a horizontal operation.
53897 if (!((RIdx & 1) == 1 && (LIdx + 1) == RIdx) &&
53898 !((LIdx & 1) == 1 && (RIdx + 1) == LIdx && IsCommutative))
53899 return false;
53900
53901 // Compute the post-shuffle mask index based on where the element
53902 // is stored in the HOP result, and where it needs to be moved to.
53903 int Base = LIdx & ~1u;
53904 int Index = ((Base % NumEltsPer128BitChunk) / 2) +
53905 ((Base % NumElts) & ~(NumEltsPer128BitChunk - 1));
53906
53907 // The low half of the 128-bit result must choose from A.
53908 // The high half of the 128-bit result must choose from B,
53909 // unless B is undef. In that case, we are always choosing from A.
53910 if ((B && Base >= (int)NumElts) || (!B && i >= NumEltsPer64BitChunk))
53911 Index += NumEltsPer64BitChunk;
53912 PostShuffleMask[i + j] = Index;
53913 }
53914 }
53915
53916 SDValue NewLHS = A.getNode() ? A : B; // If A is 'UNDEF', use B for it.
53917 SDValue NewRHS = B.getNode() ? B : A; // If B is 'UNDEF', use A for it.
53918
53919 bool IsIdentityPostShuffle =
53920 isSequentialOrUndefInRange(PostShuffleMask, 0, NumElts, 0);
53921 if (IsIdentityPostShuffle)
53922 PostShuffleMask.clear();
53923
53924 // Avoid 128-bit multi lane shuffles if pre-AVX2 and FP (integer will split).
53925 if (!IsIdentityPostShuffle && !Subtarget.hasAVX2() && VT.isFloatingPoint() &&
53926 isMultiLaneShuffleMask(128, VT.getScalarSizeInBits(), PostShuffleMask))
53927 return false;
53928
53929 // If the source nodes are already used in HorizOps then always accept this.
53930 // Shuffle folding should merge these back together.
53931 auto FoundHorizUser = [&](SDNode *User) {
53932 return User->getOpcode() == HOpcode && User->getValueType(0) == VT;
53933 };
53934 ForceHorizOp =
53935 ForceHorizOp || (llvm::any_of(NewLHS->users(), FoundHorizUser) &&
53936 llvm::any_of(NewRHS->users(), FoundHorizUser));
53937
53938 // Assume a SingleSource HOP if we only shuffle one input and don't need to
53939 // shuffle the result.
53940 if (!ForceHorizOp &&
53941 !shouldUseHorizontalOp(NewLHS == NewRHS &&
53942 (NumShuffles < 2 || !IsIdentityPostShuffle),
53943 DAG, Subtarget))
53944 return false;
53945
53946 LHS = DAG.getBitcast(VT, NewLHS);
53947 RHS = DAG.getBitcast(VT, NewRHS);
53948 return true;
53949}
53950
53951// Try to synthesize horizontal (f)hadd/hsub from (f)adds/subs of shuffles.
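// e.g. (fadd (shuffle X, Y, <0,2,4,6>), (shuffle X, Y, <1,3,5,7>)) on v4f32
// becomes (X86ISD::FHADD X, Y), possibly followed by a post-shuffle computed
// by isHorizontalBinOp.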
53952static SDValue combineToHorizontalAddSub(SDNode *N, SelectionDAG &DAG,
53953 const X86Subtarget &Subtarget) {
53954 EVT VT = N->getValueType(0);
53955 unsigned Opcode = N->getOpcode();
53956 bool IsAdd = (Opcode == ISD::FADD) || (Opcode == ISD::ADD);
53957 SmallVector<int, 8> PostShuffleMask;
53958
53959 auto MergableHorizOp = [N](unsigned HorizOpcode) {
53960 return N->hasOneUse() &&
53961 N->user_begin()->getOpcode() == ISD::VECTOR_SHUFFLE &&
53962 (N->user_begin()->getOperand(0).getOpcode() == HorizOpcode ||
53963 N->user_begin()->getOperand(1).getOpcode() == HorizOpcode);
53964 };
53965
53966 switch (Opcode) {
53967 case ISD::FADD:
53968 case ISD::FSUB:
53969 if ((Subtarget.hasSSE3() && (VT == MVT::v4f32 || VT == MVT::v2f64)) ||
53970 (Subtarget.hasAVX() && (VT == MVT::v8f32 || VT == MVT::v4f64))) {
53971 SDValue LHS = N->getOperand(0);
53972 SDValue RHS = N->getOperand(1);
53973 auto HorizOpcode = IsAdd ? X86ISD::FHADD : X86ISD::FHSUB;
53974 if (isHorizontalBinOp(HorizOpcode, LHS, RHS, DAG, Subtarget, IsAdd,
53975 PostShuffleMask, MergableHorizOp(HorizOpcode))) {
53976 SDValue HorizBinOp = DAG.getNode(HorizOpcode, SDLoc(N), VT, LHS, RHS);
53977 if (!PostShuffleMask.empty())
53978 HorizBinOp = DAG.getVectorShuffle(VT, SDLoc(HorizBinOp), HorizBinOp,
53979 DAG.getUNDEF(VT), PostShuffleMask);
53980 return HorizBinOp;
53981 }
53982 }
53983 break;
53984 case ISD::ADD:
53985 case ISD::SUB:
53986 if (Subtarget.hasSSSE3() && (VT == MVT::v8i16 || VT == MVT::v4i32 ||
53987 VT == MVT::v16i16 || VT == MVT::v8i32)) {
53988 SDValue LHS = N->getOperand(0);
53989 SDValue RHS = N->getOperand(1);
53990 auto HorizOpcode = IsAdd ? X86ISD::HADD : X86ISD::HSUB;
53991 if (isHorizontalBinOp(HorizOpcode, LHS, RHS, DAG, Subtarget, IsAdd,
53992 PostShuffleMask, MergableHorizOp(HorizOpcode))) {
53993 auto HOpBuilder = [HorizOpcode](SelectionDAG &DAG, const SDLoc &DL,
53994 ArrayRef<SDValue> Ops) {
53995 return DAG.getNode(HorizOpcode, DL, Ops[0].getValueType(), Ops);
53996 };
53997 SDValue HorizBinOp = SplitOpsAndApply(DAG, Subtarget, SDLoc(N), VT,
53998 {LHS, RHS}, HOpBuilder);
53999 if (!PostShuffleMask.empty())
54000 HorizBinOp = DAG.getVectorShuffle(VT, SDLoc(HorizBinOp), HorizBinOp,
54001 DAG.getUNDEF(VT), PostShuffleMask);
54002 return HorizBinOp;
54003 }
54004 }
54005 break;
54006 }
54007
54008 return SDValue();
54009}
54010
54011// Try to combine the following nodes
54012// t29: i64 = X86ISD::Wrapper TargetConstantPool:i64
54013// <i32 -2147483648[float -0.000000e+00]> 0
54014// t27: v16i32[v16f32],ch = X86ISD::VBROADCAST_LOAD
54015// <(load 4 from constant-pool)> t0, t29
54016// [t30: v16i32 = bitcast t27]
54017// t6: v16i32 = xor t7, t27[t30]
54018// t11: v16f32 = bitcast t6
54019// t21: v16f32 = X86ISD::VFMULC[X86ISD::VCFMULC] t11, t8
54020// into X86ISD::VFCMULC[X86ISD::VFMULC] if possible:
54021// t22: v16f32 = bitcast t7
54022// t23: v16f32 = X86ISD::VFCMULC[X86ISD::VFMULC] t8, t22
54023// t24: v32f16 = bitcast t23
54024static SDValue combineFMulcFCMulc(SDNode *N, SelectionDAG &DAG,
54025 const X86Subtarget &Subtarget) {
54026 EVT VT = N->getValueType(0);
54027 SDValue LHS = N->getOperand(0);
54028 SDValue RHS = N->getOperand(1);
54029 int CombineOpcode =
54030 N->getOpcode() == X86ISD::VFCMULC ? X86ISD::VFMULC : X86ISD::VFCMULC;
54031 auto combineConjugation = [&](SDValue &r) {
54032 if (LHS->getOpcode() == ISD::BITCAST) {
54033 SDValue XOR = LHS.getOperand(0);
54034 if (XOR->getOpcode() == ISD::XOR) {
54035 KnownBits XORRHS = DAG.computeKnownBits(XOR.getOperand(1));
54036 if (XORRHS.isConstant()) {
54037 APInt ConjugationInt32 = APInt(32, 0x80000000);
54038 APInt ConjugationInt64 = APInt(64, 0x8000000080000000ULL);
54039 if ((XORRHS.getBitWidth() == 32 &&
54040 XORRHS.getConstant() == ConjugationInt32) ||
54041 (XORRHS.getBitWidth() == 64 &&
54042 XORRHS.getConstant() == ConjugationInt64)) {
54043 SelectionDAG::FlagInserter FlagsInserter(DAG, N);
54044 SDValue I2F = DAG.getBitcast(VT, LHS.getOperand(0).getOperand(0));
54045 SDValue FCMulC = DAG.getNode(CombineOpcode, SDLoc(N), VT, RHS, I2F);
54046 r = DAG.getBitcast(VT, FCMulC);
54047 return true;
54048 }
54049 }
54050 }
54051 }
54052 return false;
54053 };
54054 SDValue Res;
54055 if (combineConjugation(Res))
54056 return Res;
54057 std::swap(LHS, RHS);
54058 if (combineConjugation(Res))
54059 return Res;
54060 return Res;
54061}
54062
54063// Try to combine the following nodes:
54064// FADD(A, FMA(B, C, 0)) and FADD(A, FMUL(B, C)) to FMA(B, C, A)
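// Here FMUL/FMA denote the complex-half forms (X86ISD::VF(C)MULC /
// VF(C)MADDC), which treat each pair of f16 elements as one complex value.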
54065static SDValue combineFaddCFmul(SDNode *N, SelectionDAG &DAG,
54066 const X86Subtarget &Subtarget) {
54067 auto AllowContract = [&DAG](const SDNodeFlags &Flags) {
54068 return DAG.getTarget().Options.AllowFPOpFusion == FPOpFusion::Fast ||
54069 Flags.hasAllowContract();
54070 };
54071
54072 auto HasNoSignedZero = [&DAG](const SDNodeFlags &Flags) {
54073 return DAG.getTarget().Options.NoSignedZerosFPMath ||
54074 Flags.hasNoSignedZeros();
54075 };
54076 auto IsVectorAllNegativeZero = [&DAG](SDValue Op) {
54077 APInt AI = APInt(32, 0x80008000);
54078 KnownBits Bits = DAG.computeKnownBits(Op);
54079 return Bits.getBitWidth() == 32 && Bits.isConstant() &&
54080 Bits.getConstant() == AI;
54081 };
54082
54083 if (N->getOpcode() != ISD::FADD || !Subtarget.hasFP16() ||
54084 !AllowContract(N->getFlags()))
54085 return SDValue();
54086
54087 EVT VT = N->getValueType(0);
54088 if (VT != MVT::v8f16 && VT != MVT::v16f16 && VT != MVT::v32f16)
54089 return SDValue();
54090
54091 SDValue LHS = N->getOperand(0);
54092 SDValue RHS = N->getOperand(1);
54093 bool IsConj;
54094 SDValue FAddOp1, MulOp0, MulOp1;
54095 auto GetCFmulFrom = [&MulOp0, &MulOp1, &IsConj, &AllowContract,
54096 &IsVectorAllNegativeZero,
54097 &HasNoSignedZero](SDValue N) -> bool {
54098 if (!N.hasOneUse() || N.getOpcode() != ISD::BITCAST)
54099 return false;
54100 SDValue Op0 = N.getOperand(0);
54101 unsigned Opcode = Op0.getOpcode();
54102 if (Op0.hasOneUse() && AllowContract(Op0->getFlags())) {
54103 if ((Opcode == X86ISD::VFMULC || Opcode == X86ISD::VFCMULC)) {
54104 MulOp0 = Op0.getOperand(0);
54105 MulOp1 = Op0.getOperand(1);
54106 IsConj = Opcode == X86ISD::VFCMULC;
54107 return true;
54108 }
54109 if ((Opcode == X86ISD::VFMADDC || Opcode == X86ISD::VFCMADDC) &&
54110 ((ISD::isBuildVectorAllZeros(Op0->getOperand(2).getNode()) &&
54111 HasNoSignedZero(Op0->getFlags())) ||
54112 IsVectorAllNegativeZero(Op0->getOperand(2)))) {
54113 MulOp0 = Op0.getOperand(0);
54114 MulOp1 = Op0.getOperand(1);
54115 IsConj = Opcode == X86ISD::VFCMADDC;
54116 return true;
54117 }
54118 }
54119 return false;
54120 };
54121
54122 if (GetCFmulFrom(LHS))
54123 FAddOp1 = RHS;
54124 else if (GetCFmulFrom(RHS))
54125 FAddOp1 = LHS;
54126 else
54127 return SDValue();
54128
54129 MVT CVT = MVT::getVectorVT(MVT::f32, VT.getVectorNumElements() / 2);
54130 FAddOp1 = DAG.getBitcast(CVT, FAddOp1);
54131 unsigned NewOp = IsConj ? X86ISD::VFCMADDC : X86ISD::VFMADDC;
54132 // FIXME: How do we handle when fast math flags of FADD are different from
54133 // CFMUL's?
54134 SDValue CFmul =
54135 DAG.getNode(NewOp, SDLoc(N), CVT, MulOp0, MulOp1, FAddOp1, N->getFlags());
54136 return DAG.getBitcast(VT, CFmul);
54137}
54138
54139/// Do target-specific dag combines on floating-point adds/subs.
54140static SDValue combineFaddFsub(SDNode *N, SelectionDAG &DAG,
54141 const X86Subtarget &Subtarget) {
54142 if (SDValue HOp = combineToHorizontalAddSub(N, DAG, Subtarget))
54143 return HOp;
54144
54145 if (SDValue COp = combineFaddCFmul(N, DAG, Subtarget))
54146 return COp;
54147
54148 return SDValue();
54149}
54150
54151static SDValue combineLRINT_LLRINT(SDNode *N, SelectionDAG &DAG,
54152 const X86Subtarget &Subtarget) {
54153 EVT VT = N->getValueType(0);
54154 SDValue Src = N->getOperand(0);
54155 EVT SrcVT = Src.getValueType();
54156 SDLoc DL(N);
54157
54158 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
54159
54160 // Let legalize expand this if it isn't a legal type yet.
54161 if (!TLI.isTypeLegal(VT))
54162 return SDValue();
54163
54164 if ((SrcVT.getScalarType() == MVT::f16 && !Subtarget.hasFP16()) ||
54165 (SrcVT.getScalarType() == MVT::f32 && !Subtarget.hasDQI()))
54166 return SDValue();
54167
54168 if (SrcVT == MVT::v2f16) {
54169 SrcVT = MVT::v4f16;
54170 Src = DAG.getNode(ISD::CONCAT_VECTORS, DL, SrcVT, Src,
54171 DAG.getUNDEF(MVT::v2f16));
54172 }
54173
54174 if (SrcVT == MVT::v4f16) {
54175 SrcVT = MVT::v8f16;
54176 Src = DAG.getNode(ISD::CONCAT_VECTORS, DL, SrcVT, Src,
54177 DAG.getUNDEF(MVT::v4f16));
54178 } else if (SrcVT == MVT::v2f32) {
54179 SrcVT = MVT::v4f32;
54180 Src = DAG.getNode(ISD::CONCAT_VECTORS, DL, SrcVT, Src,
54181 DAG.getUNDEF(MVT::v2f32));
54182 } else {
54183 return SDValue();
54184 }
54185
54186 return DAG.getNode(X86ISD::CVTP2SI, DL, VT, Src);
54187}
54188
54189// Attempt to fold some (truncate (srl (add/or/xor X, C1), C2)) patterns to
54190// (add/or/xor (truncate (srl X, C2)), C1'). C1' will be smaller than C1 so we
54191// are able to avoid generating code with MOVABS and large constants in certain
54192// cases.
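// e.g. (i32 (trunc (srl (or X, 0x8000000000000000), 40))) becomes
// (or (i32 (trunc (srl X, 40))), 0x800000), so the 64-bit immediate never
// needs to be materialized with MOVABS.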
54193static SDValue combinei64TruncSrlConstant(SDValue N, EVT VT, SelectionDAG &DAG,
54194 const SDLoc &DL) {
54195 assert(N.getOpcode() == ISD::SRL && "Unknown shift opcode");
54196 std::optional<unsigned> ValidSrlConst = DAG.getValidShiftAmount(N);
54197 if (!ValidSrlConst)
54198 return SDValue();
54199 unsigned SrlConstVal = *ValidSrlConst;
54200
54201 SDValue Op = N.getOperand(0);
54202 unsigned Opcode = Op.getOpcode();
54203 assert(VT == MVT::i32 && Op.getValueType() == MVT::i64 &&
54204 "Illegal truncation types");
54205
54206 if ((Opcode != ISD::ADD && Opcode != ISD::OR && Opcode != ISD::XOR) ||
54207 !isa<ConstantSDNode>(Op.getOperand(1)))
54208 return SDValue();
54209 const APInt &OpConst = Op.getConstantOperandAPInt(1);
54210
54211 if (SrlConstVal <= 32 ||
54212 (Opcode == ISD::ADD && OpConst.countr_zero() < SrlConstVal))
54213 return SDValue();
54214
54215 SDValue OpLhsSrl =
54216 DAG.getNode(ISD::SRL, DL, MVT::i64, Op.getOperand(0), N.getOperand(1));
54217 SDValue Trunc = DAG.getNode(ISD::TRUNCATE, DL, VT, OpLhsSrl);
54218
54219 APInt NewOpConstVal = OpConst.lshr(SrlConstVal).trunc(VT.getSizeInBits());
54220 SDValue NewOpConst = DAG.getConstant(NewOpConstVal, DL, VT);
54221 SDValue NewOpNode = DAG.getNode(Opcode, DL, VT, Trunc, NewOpConst);
54222
54223 if (Opcode == ISD::ADD) {
54224 EVT CleanUpVT = EVT::getIntegerVT(*DAG.getContext(), 64 - SrlConstVal);
54225 return DAG.getZeroExtendInReg(NewOpNode, DL, CleanUpVT);
54226 }
54227 return NewOpNode;
54228}
54229
54230/// Attempt to pre-truncate inputs to arithmetic ops if it will simplify
54231/// the codegen.
54232/// e.g. TRUNC( BINOP( X, Y ) ) --> BINOP( TRUNC( X ), TRUNC( Y ) )
54233/// TODO: This overlaps with the generic combiner's visitTRUNCATE. Remove
54234/// anything that is guaranteed to be transformed by DAGCombiner.
54235static SDValue combineTruncatedArithmetic(SDNode *N, SelectionDAG &DAG,
54236 const X86Subtarget &Subtarget,
54237 const SDLoc &DL) {
54238 assert(N->getOpcode() == ISD::TRUNCATE && "Wrong opcode");
54239 SDValue Src = N->getOperand(0);
54240 unsigned SrcOpcode = Src.getOpcode();
54241 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
54242
54243 EVT VT = N->getValueType(0);
54244 EVT SrcVT = Src.getValueType();
54245
54246 auto IsFreeTruncation = [VT](SDValue Op) {
54247 unsigned TruncSizeInBits = VT.getScalarSizeInBits();
54248
54249 // See if this has been extended from a smaller/equal size to
54250 // the truncation size, allowing a truncation to combine with the extend.
54251 unsigned Opcode = Op.getOpcode();
54252 if ((Opcode == ISD::ANY_EXTEND || Opcode == ISD::SIGN_EXTEND ||
54253 Opcode == ISD::ZERO_EXTEND) &&
54254 Op.getOperand(0).getScalarValueSizeInBits() <= TruncSizeInBits)
54255 return true;
54256
54257 // See if this is a single use constant which can be constant folded.
54258 // NOTE: We don't peek through bitcasts here because there is currently
54259 // no support for constant folding truncate+bitcast+vector_of_constants. So
54260 // we'll just end up with a truncate on both operands which will
54261 // get turned back into (truncate (binop)) causing an infinite loop.
54262 return ISD::isBuildVectorOfConstantSDNodes(Op.getNode());
54263 };
54264
54265 auto TruncateArithmetic = [&](SDValue N0, SDValue N1) {
54266 SDValue Trunc0 = DAG.getNode(ISD::TRUNCATE, DL, VT, N0);
54267 SDValue Trunc1 = DAG.getNode(ISD::TRUNCATE, DL, VT, N1);
54268 return DAG.getNode(SrcOpcode, DL, VT, Trunc0, Trunc1);
54269 };
54270
54271 // Don't combine if the operation has other uses.
54272 if (!Src.hasOneUse())
54273 return SDValue();
54274
54275 if (VT == MVT::i32 && SrcVT == MVT::i64 && SrcOpcode == ISD::SRL)
54276 return combinei64TruncSrlConstant(Src, VT, DAG, DL);
54277
54278 if (!VT.isVector())
54279 return SDValue();
54280
54281 // In most cases it's only worth pre-truncating if we're only facing the cost
54282 // of one truncation.
54283 // i.e. if one of the inputs will constant fold or the input is repeated.
54284 switch (SrcOpcode) {
54285 case ISD::MUL:
54286 // X86 is rubbish at scalar and vector i64 multiplies (until AVX512DQ) - it's
54287 // better to truncate if we have the chance.
54288 if (SrcVT.getScalarType() == MVT::i64 &&
54289 TLI.isOperationLegal(SrcOpcode, VT) &&
54290 !TLI.isOperationLegal(SrcOpcode, SrcVT))
54291 return TruncateArithmetic(Src.getOperand(0), Src.getOperand(1));
54292 [[fallthrough]];
54293 case ISD::AND:
54294 case ISD::XOR:
54295 case ISD::OR:
54296 case ISD::ADD:
54297 case ISD::SUB: {
54298 SDValue Op0 = Src.getOperand(0);
54299 SDValue Op1 = Src.getOperand(1);
54300 if (TLI.isOperationLegal(SrcOpcode, VT) &&
54301 (Op0 == Op1 || IsFreeTruncation(Op0) || IsFreeTruncation(Op1)))
54302 return TruncateArithmetic(Op0, Op1);
54303 break;
54304 }
54305 }
54306
54307 return SDValue();
54308}
54309
54310// Try to form a MULHU or MULHS node by looking for
54311// (trunc (srl (mul ext, ext), >= 16))
54312// TODO: This is X86 specific because we want to be able to handle wide types
54313// before type legalization. But we can only do it if the vector will be
54314// legalized via widening/splitting. Type legalization can't handle promotion
54315// of a MULHU/MULHS. There isn't a way to convey this to the generic DAG
54316// combiner.
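// e.g. (v8i16 (trunc (srl (mul (zext v8i16 X to v8i32),
//                               (zext v8i16 Y to v8i32)), 16)))
// becomes (mulhu X, Y), which selects to PMULHUW.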
54317static SDValue combinePMULH(SDValue Src, EVT VT, const SDLoc &DL,
54318 SelectionDAG &DAG, const X86Subtarget &Subtarget) {
54319 using namespace llvm::SDPatternMatch;
54320
54321 if (!Subtarget.hasSSE2())
54322 return SDValue();
54323
54324 // Only handle vXi16 types that are at least 128-bits unless they will be
54325 // widened.
54326 if (!VT.isVector() || VT.getVectorElementType() != MVT::i16)
54327 return SDValue();
54328
54329 // Input type should be at least vXi32.
54330 EVT InVT = Src.getValueType();
54331 if (InVT.getVectorElementType().getSizeInBits() < 32)
54332 return SDValue();
54333
54334 // First instruction should be a right shift by 16 of a multiply.
54335 SDValue LHS, RHS;
54336 APInt ShiftAmt;
54337 if (!sd_match(Src,
54338 m_Srl(m_Mul(m_Value(LHS), m_Value(RHS)), m_ConstInt(ShiftAmt))))
54339 return SDValue();
54340
54341 if (ShiftAmt.ult(16) || ShiftAmt.uge(InVT.getScalarSizeInBits()))
54342 return SDValue();
54343
54344 uint64_t AdditionalShift = ShiftAmt.getZExtValue() - 16;
54345
54346 // Count leading sign/zero bits on both inputs - if there are enough then
54347 // truncation back to vXi16 will be cheap - either as a pack/shuffle
54348 // sequence or using AVX512 truncations. If the inputs are sext/zext then the
54349 // truncations may actually be free by peeking through to the ext source.
54350 auto IsSext = [&DAG](SDValue V) {
54351 return DAG.ComputeMaxSignificantBits(V) <= 16;
54352 };
54353 auto IsZext = [&DAG](SDValue V) {
54354 return DAG.computeKnownBits(V).countMaxActiveBits() <= 16;
54355 };
54356
54357 bool IsSigned = IsSext(LHS) && IsSext(RHS);
54358 bool IsUnsigned = IsZext(LHS) && IsZext(RHS);
54359 if (!IsSigned && !IsUnsigned)
54360 return SDValue();
54361
54362 // Check if both inputs are extensions, which will be removed by truncation.
54363 auto isOpTruncateFree = [](SDValue Op) {
54364 if (Op.getOpcode() == ISD::SIGN_EXTEND ||
54365 Op.getOpcode() == ISD::ZERO_EXTEND)
54366 return Op.getOperand(0).getScalarValueSizeInBits() <= 16;
54367 return ISD::isBuildVectorOfConstantSDNodes(Op.getNode());
54368 };
54369 bool IsTruncateFree = isOpTruncateFree(LHS) && isOpTruncateFree(RHS);
54370
54371 // For AVX2+ targets, with the upper bits known zero, we can perform MULHU on
54372 // the (bitcasted) inputs directly, and then cheaply pack/truncate the result
54373 // (upper elts will be zero). Don't attempt this with just AVX512F as MULHU
54374 // will have to split anyway.
54375 unsigned InSizeInBits = InVT.getSizeInBits();
54376 if (IsUnsigned && !IsTruncateFree && Subtarget.hasInt256() &&
54377 !(Subtarget.hasAVX512() && !Subtarget.hasBWI() && VT.is256BitVector()) &&
54378 (InSizeInBits % 16) == 0) {
54379 EVT BCVT = EVT::getVectorVT(*DAG.getContext(), MVT::i16,
54380 InVT.getSizeInBits() / 16);
54381 SDValue Res = DAG.getNode(ISD::MULHU, DL, BCVT, DAG.getBitcast(BCVT, LHS),
54382 DAG.getBitcast(BCVT, RHS));
54383 Res = DAG.getNode(ISD::TRUNCATE, DL, VT, DAG.getBitcast(InVT, Res));
54384 return DAG.getNode(ISD::SRL, DL, VT, Res,
54385 DAG.getShiftAmountConstant(AdditionalShift, VT, DL));
54386 }
54387
54388 // Truncate back to source type.
54389 LHS = DAG.getNode(ISD::TRUNCATE, DL, VT, LHS);
54390 RHS = DAG.getNode(ISD::TRUNCATE, DL, VT, RHS);
54391
54392 unsigned Opc = IsSigned ? ISD::MULHS : ISD::MULHU;
54393 SDValue Res = DAG.getNode(Opc, DL, VT, LHS, RHS);
54394 return DAG.getNode(ISD::SRL, DL, VT, Res,
54395 DAG.getShiftAmountConstant(AdditionalShift, VT, DL));
54396}
54397
54398// Attempt to match PMADDUBSW, which multiplies corresponding unsigned bytes
54399// from one vector with signed bytes from another vector, adds together
54400// adjacent pairs of 16-bit products, and saturates the result before
54401// truncating to 16-bits.
54402//
54403// Which looks something like this:
54404// (i16 (ssat (add (mul (zext (even elts (i8 A))), (sext (even elts (i8 B)))),
54405// (mul (zext (odd elts (i8 A)), (sext (odd elts (i8 B))))))))
54406static SDValue detectPMADDUBSW(SDValue In, EVT VT, SelectionDAG &DAG,
54407 const X86Subtarget &Subtarget,
54408 const SDLoc &DL) {
54409 if (!VT.isVector() || !Subtarget.hasSSSE3())
54410 return SDValue();
54411
54412 unsigned NumElems = VT.getVectorNumElements();
54413 EVT ScalarVT = VT.getVectorElementType();
54414 if (ScalarVT != MVT::i16 || NumElems < 8 || !isPowerOf2_32(NumElems))
54415 return SDValue();
54416
54417 SDValue SSatVal = detectSSatPattern(In, VT);
54418 if (!SSatVal || SSatVal.getOpcode() != ISD::ADD)
54419 return SDValue();
54420
54421 // Ok this is a signed saturation of an ADD. See if this ADD is adding pairs
54422 // of multiplies from even/odd elements.
54423 SDValue N0 = SSatVal.getOperand(0);
54424 SDValue N1 = SSatVal.getOperand(1);
54425
54426 if (N0.getOpcode() != ISD::MUL || N1.getOpcode() != ISD::MUL)
54427 return SDValue();
54428
54429 SDValue N00 = N0.getOperand(0);
54430 SDValue N01 = N0.getOperand(1);
54431 SDValue N10 = N1.getOperand(0);
54432 SDValue N11 = N1.getOperand(1);
54433
54434 // TODO: Handle constant vectors and use knownbits/computenumsignbits?
54435 // Canonicalize zero_extend to LHS.
54436 if (N01.getOpcode() == ISD::ZERO_EXTEND)
54437 std::swap(N00, N01);
54438 if (N11.getOpcode() == ISD::ZERO_EXTEND)
54439 std::swap(N10, N11);
54440
54441 // Ensure we have a zero_extend and a sign_extend.
54442 if (N00.getOpcode() != ISD::ZERO_EXTEND ||
54443 N01.getOpcode() != ISD::SIGN_EXTEND ||
54444 N10.getOpcode() != ISD::ZERO_EXTEND ||
54445 N11.getOpcode() != ISD::SIGN_EXTEND)
54446 return SDValue();
54447
54448 // Peek through the extends.
54449 N00 = N00.getOperand(0);
54450 N01 = N01.getOperand(0);
54451 N10 = N10.getOperand(0);
54452 N11 = N11.getOperand(0);
54453
54454 // Ensure the extend is from vXi8.
54455 if (N00.getValueType().getVectorElementType() != MVT::i8 ||
54456 N01.getValueType().getVectorElementType() != MVT::i8 ||
54457 N10.getValueType().getVectorElementType() != MVT::i8 ||
54458 N11.getValueType().getVectorElementType() != MVT::i8)
54459 return SDValue();
54460
54461 // All inputs should be build_vectors.
54462 if (N00.getOpcode() != ISD::BUILD_VECTOR ||
54463 N01.getOpcode() != ISD::BUILD_VECTOR ||
54464 N10.getOpcode() != ISD::BUILD_VECTOR ||
54465 N11.getOpcode() != ISD::BUILD_VECTOR)
54466 return SDValue();
54467
54468 // N00/N10 are zero extended. N01/N11 are sign extended.
54469
54470 // For each element, we need to ensure we have an odd element from one vector
54471 // multiplied by the odd element of another vector and the even element from
54472 // one of the same vectors being multiplied by the even element from the
54473 // other vector. So we need to make sure for each element i, this operator
54474 // is being performed:
54475 // A[2 * i] * B[2 * i] + A[2 * i + 1] * B[2 * i + 1]
54476 SDValue ZExtIn, SExtIn;
54477 for (unsigned i = 0; i != NumElems; ++i) {
54478 SDValue N00Elt = N00.getOperand(i);
54479 SDValue N01Elt = N01.getOperand(i);
54480 SDValue N10Elt = N10.getOperand(i);
54481 SDValue N11Elt = N11.getOperand(i);
54482 // TODO: Be more tolerant to undefs.
54483 if (N00Elt.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
54484 N01Elt.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
54485 N10Elt.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
54486 N11Elt.getOpcode() != ISD::EXTRACT_VECTOR_ELT)
54487 return SDValue();
54488 auto *ConstN00Elt = dyn_cast<ConstantSDNode>(N00Elt.getOperand(1));
54489 auto *ConstN01Elt = dyn_cast<ConstantSDNode>(N01Elt.getOperand(1));
54490 auto *ConstN10Elt = dyn_cast<ConstantSDNode>(N10Elt.getOperand(1));
54491 auto *ConstN11Elt = dyn_cast<ConstantSDNode>(N11Elt.getOperand(1));
54492 if (!ConstN00Elt || !ConstN01Elt || !ConstN10Elt || !ConstN11Elt)
54493 return SDValue();
54494 unsigned IdxN00 = ConstN00Elt->getZExtValue();
54495 unsigned IdxN01 = ConstN01Elt->getZExtValue();
54496 unsigned IdxN10 = ConstN10Elt->getZExtValue();
54497 unsigned IdxN11 = ConstN11Elt->getZExtValue();
54498 // Add is commutative so indices can be reordered.
54499 if (IdxN00 > IdxN10) {
54500 std::swap(IdxN00, IdxN10);
54501 std::swap(IdxN01, IdxN11);
54502 }
54503 // N0 indices must be the even element. N1 indices must be the next odd element.
54504 if (IdxN00 != 2 * i || IdxN10 != 2 * i + 1 ||
54505 IdxN01 != 2 * i || IdxN11 != 2 * i + 1)
54506 return SDValue();
54507 SDValue N00In = N00Elt.getOperand(0);
54508 SDValue N01In = N01Elt.getOperand(0);
54509 SDValue N10In = N10Elt.getOperand(0);
54510 SDValue N11In = N11Elt.getOperand(0);
54511 // First time we find an input capture it.
54512 if (!ZExtIn) {
54513 ZExtIn = N00In;
54514 SExtIn = N01In;
54515 }
54516 if (ZExtIn != N00In || SExtIn != N01In ||
54517 ZExtIn != N10In || SExtIn != N11In)
54518 return SDValue();
54519 }
54520
54521 auto ExtractVec = [&DAG, &DL, NumElems](SDValue &Ext) {
54522 EVT ExtVT = Ext.getValueType();
54523 if (ExtVT.getVectorNumElements() != NumElems * 2) {
54524 MVT NVT = MVT::getVectorVT(MVT::i8, NumElems * 2);
54525 Ext = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, NVT, Ext,
54526 DAG.getVectorIdxConstant(0, DL));
54527 }
54528 };
54529 ExtractVec(ZExtIn);
54530 ExtractVec(SExtIn);
54531
54532 auto PMADDBuilder = [](SelectionDAG &DAG, const SDLoc &DL,
54533 ArrayRef<SDValue> Ops) {
54534 // Shrink by adding truncate nodes and let DAGCombine fold with the
54535 // sources.
54536 EVT InVT = Ops[0].getValueType();
54537 assert(InVT.getScalarType() == MVT::i8 &&
54538 "Unexpected scalar element type");
54539 assert(InVT == Ops[1].getValueType() && "Operands' types mismatch");
54540 EVT ResVT = EVT::getVectorVT(*DAG.getContext(), MVT::i16,
54541 InVT.getVectorNumElements() / 2);
54542 return DAG.getNode(X86ISD::VPMADDUBSW, DL, ResVT, Ops[0], Ops[1]);
54543 };
54544 return SplitOpsAndApply(DAG, Subtarget, DL, VT, { ZExtIn, SExtIn },
54545 PMADDBuilder);
54546}
54547
54548static SDValue combineTruncate(SDNode *N, SelectionDAG &DAG,
54549 const X86Subtarget &Subtarget) {
54550 EVT VT = N->getValueType(0);
54551 SDValue Src = N->getOperand(0);
54552 SDLoc DL(N);
54553
54554 // Attempt to pre-truncate inputs to arithmetic ops instead.
54555 if (SDValue V = combineTruncatedArithmetic(N, DAG, Subtarget, DL))
54556 return V;
54557
54558 // Try to detect PMADD
54559 if (SDValue PMAdd = detectPMADDUBSW(Src, VT, DAG, Subtarget, DL))
54560 return PMAdd;
54561
54562 // Try to combine truncation with signed/unsigned saturation.
54563 if (SDValue Val = combineTruncateWithSat(Src, VT, DL, DAG, Subtarget))
54564 return Val;
54565
54566 // Try to combine PMULHUW/PMULHW for vXi16.
54567 if (SDValue V = combinePMULH(Src, VT, DL, DAG, Subtarget))
54568 return V;
54569
54570 // The bitcast source is a direct MMX result.
54571 // Detect bitcasts from x86mmx to i32.
54572 if (Src.getOpcode() == ISD::BITCAST && VT == MVT::i32) {
54573 SDValue BCSrc = Src.getOperand(0);
54574 if (BCSrc.getValueType() == MVT::x86mmx)
54575 return DAG.getNode(X86ISD::MMX_MOVD2W, DL, MVT::i32, BCSrc);
54576 }
54577
54578 return SDValue();
54579}
54580
54581static SDValue combineVTRUNC(SDNode *N, SelectionDAG &DAG,
54582 TargetLowering::DAGCombinerInfo &DCI) {
54583 EVT VT = N->getValueType(0);
54584 SDValue In = N->getOperand(0);
54585 SDLoc DL(N);
54586
54587 if (SDValue SSatVal = detectSSatPattern(In, VT))
54588 return DAG.getNode(X86ISD::VTRUNCS, DL, VT, SSatVal);
54589 if (SDValue USatVal = detectUSatPattern(In, VT, DAG, DL))
54590 return DAG.getNode(X86ISD::VTRUNCUS, DL, VT, USatVal);
54591
54592 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
54593 APInt DemandedMask(APInt::getAllOnes(VT.getScalarSizeInBits()));
54594 if (TLI.SimplifyDemandedBits(SDValue(N, 0), DemandedMask, DCI))
54595 return SDValue(N, 0);
54596
54597 return SDValue();
54598}
54599
54600/// Returns the negated value if the node \p N flips sign of FP value.
54601///
54602/// FP-negation node may have different forms: FNEG(x), FXOR (x, 0x80000000)
54603/// or FSUB(0, x)
54604/// AVX512F does not have FXOR, so FNEG is lowered as
54605/// (bitcast (xor (bitcast x), (bitcast ConstantFP(0x80000000)))).
54606 /// In this case we go through all bitcasts.
54607/// This also recognizes splat of a negated value and returns the splat of that
54608/// value.
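/// For example, (v4f32 (bitcast (xor (bitcast X to v4i32),
/// (splat 0x80000000)))) is recognized as the negation of X.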
54609static SDValue isFNEG(SelectionDAG &DAG, SDNode *N, unsigned Depth = 0) {
54610 if (N->getOpcode() == ISD::FNEG)
54611 return N->getOperand(0);
54612
54613 // Don't recurse exponentially.
54614 if (Depth > SelectionDAG::MaxRecursionDepth)
54615 return SDValue();
54616
54617 unsigned ScalarSize = N->getValueType(0).getScalarSizeInBits();
54618
54619 SDValue Op = peekThroughBitcasts(SDValue(N, 0));
54620 EVT VT = Op->getValueType(0);
54621
54622 // Make sure the element size doesn't change.
54623 if (VT.getScalarSizeInBits() != ScalarSize)
54624 return SDValue();
54625
54626 unsigned Opc = Op.getOpcode();
54627 switch (Opc) {
54628 case ISD::VECTOR_SHUFFLE: {
54629 // For a VECTOR_SHUFFLE(VEC1, VEC2), if VEC2 is undef, then the negate
54630 // of this is VECTOR_SHUFFLE(-VEC1, UNDEF). The mask can be anything here.
54631 if (!Op.getOperand(1).isUndef())
54632 return SDValue();
54633 if (SDValue NegOp0 = isFNEG(DAG, Op.getOperand(0).getNode(), Depth + 1))
54634 if (NegOp0.getValueType() == VT) // FIXME: Can we do better?
54635 return DAG.getVectorShuffle(VT, SDLoc(Op), NegOp0, DAG.getUNDEF(VT),
54636 cast<ShuffleVectorSDNode>(Op)->getMask());
54637 break;
54638 }
54639 case ISD::INSERT_VECTOR_ELT: {
54640 // Negate of INSERT_VECTOR_ELT(UNDEF, V, INDEX) is INSERT_VECTOR_ELT(UNDEF,
54641 // -V, INDEX).
54642 SDValue InsVector = Op.getOperand(0);
54643 SDValue InsVal = Op.getOperand(1);
54644 if (!InsVector.isUndef())
54645 return SDValue();
54646 if (SDValue NegInsVal = isFNEG(DAG, InsVal.getNode(), Depth + 1))
54647 if (NegInsVal.getValueType() == VT.getVectorElementType()) // FIXME
54648 return DAG.getNode(ISD::INSERT_VECTOR_ELT, SDLoc(Op), VT, InsVector,
54649 NegInsVal, Op.getOperand(2));
54650 break;
54651 }
54652 case ISD::FSUB:
54653 case ISD::XOR:
54654 case X86ISD::FXOR: {
54655 SDValue Op1 = Op.getOperand(1);
54656 SDValue Op0 = Op.getOperand(0);
54657
54658 // For XOR and FXOR, we want to check if constant
54659 // bits of Op1 are sign bit masks. For FSUB, we
54660 // have to check if constant bits of Op0 are sign
54661 // bit masks and hence we swap the operands.
54662 if (Opc == ISD::FSUB)
54663 std::swap(Op0, Op1);
54664
54665 APInt UndefElts;
54666 SmallVector<APInt, 16> EltBits;
54667 // Extract constant bits and see if they are all
54668 // sign bit masks. Ignore the undef elements.
54669 if (getTargetConstantBitsFromNode(Op1, ScalarSize, UndefElts, EltBits,
54670 /* AllowWholeUndefs */ true,
54671 /* AllowPartialUndefs */ false)) {
54672 for (unsigned I = 0, E = EltBits.size(); I < E; I++)
54673 if (!UndefElts[I] && !EltBits[I].isSignMask())
54674 return SDValue();
54675
54676 // Only allow bitcast from correctly-sized constant.
54677 Op0 = peekThroughBitcasts(Op0);
54678 if (Op0.getScalarValueSizeInBits() == ScalarSize)
54679 return Op0;
54680 }
54681 break;
54682 } // case
54683 } // switch
54684
54685 return SDValue();
54686}
54687
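// Map an FMA-family opcode onto the variant with the requested operands
// negated. For example, negating the multiplicands of ISD::FMA yields
// X86ISD::FNMADD, and additionally negating the accumulator yields
// X86ISD::FNMSUB.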
54688static unsigned negateFMAOpcode(unsigned Opcode, bool NegMul, bool NegAcc,
54689 bool NegRes) {
54690 if (NegMul) {
54691 switch (Opcode) {
54692 // clang-format off
54693 default: llvm_unreachable("Unexpected opcode");
54694 case ISD::FMA: Opcode = X86ISD::FNMADD; break;
54695 case ISD::STRICT_FMA: Opcode = X86ISD::STRICT_FNMADD; break;
54696 case X86ISD::FMADD_RND: Opcode = X86ISD::FNMADD_RND; break;
54697 case X86ISD::FMSUB: Opcode = X86ISD::FNMSUB; break;
54698 case X86ISD::STRICT_FMSUB: Opcode = X86ISD::STRICT_FNMSUB; break;
54699 case X86ISD::FMSUB_RND: Opcode = X86ISD::FNMSUB_RND; break;
54700 case X86ISD::FNMADD: Opcode = ISD::FMA; break;
54701 case X86ISD::STRICT_FNMADD: Opcode = ISD::STRICT_FMA; break;
54702 case X86ISD::FNMADD_RND: Opcode = X86ISD::FMADD_RND; break;
54703 case X86ISD::FNMSUB: Opcode = X86ISD::FMSUB; break;
54704 case X86ISD::STRICT_FNMSUB: Opcode = X86ISD::STRICT_FMSUB; break;
54705 case X86ISD::FNMSUB_RND: Opcode = X86ISD::FMSUB_RND; break;
54706 // clang-format on
54707 }
54708 }
54709
54710 if (NegAcc) {
54711 switch (Opcode) {
54712 // clang-format off
54713 default: llvm_unreachable("Unexpected opcode");
54714 case ISD::FMA: Opcode = X86ISD::FMSUB; break;
54715 case ISD::STRICT_FMA: Opcode = X86ISD::STRICT_FMSUB; break;
54716 case X86ISD::FMADD_RND: Opcode = X86ISD::FMSUB_RND; break;
54717 case X86ISD::FMSUB: Opcode = ISD::FMA; break;
54718 case X86ISD::STRICT_FMSUB: Opcode = ISD::STRICT_FMA; break;
54719 case X86ISD::FMSUB_RND: Opcode = X86ISD::FMADD_RND; break;
54720 case X86ISD::FNMADD: Opcode = X86ISD::FNMSUB; break;
54721 case X86ISD::STRICT_FNMADD: Opcode = X86ISD::STRICT_FNMSUB; break;
54722 case X86ISD::FNMADD_RND: Opcode = X86ISD::FNMSUB_RND; break;
54723 case X86ISD::FNMSUB: Opcode = X86ISD::FNMADD; break;
54724 case X86ISD::STRICT_FNMSUB: Opcode = X86ISD::STRICT_FNMADD; break;
54725 case X86ISD::FNMSUB_RND: Opcode = X86ISD::FNMADD_RND; break;
54726 case X86ISD::FMADDSUB: Opcode = X86ISD::FMSUBADD; break;
54727 case X86ISD::FMADDSUB_RND: Opcode = X86ISD::FMSUBADD_RND; break;
54728 case X86ISD::FMSUBADD: Opcode = X86ISD::FMADDSUB; break;
54729 case X86ISD::FMSUBADD_RND: Opcode = X86ISD::FMADDSUB_RND; break;
54730 // clang-format on
54731 }
54732 }
54733
54734 if (NegRes) {
54735 switch (Opcode) {
54736 // For accuracy reasons, we never combine fneg and fma under strict FP.
54737 // clang-format off
54738 default: llvm_unreachable("Unexpected opcode");
54739 case ISD::FMA: Opcode = X86ISD::FNMSUB; break;
54740 case X86ISD::FMADD_RND: Opcode = X86ISD::FNMSUB_RND; break;
54741 case X86ISD::FMSUB: Opcode = X86ISD::FNMADD; break;
54742 case X86ISD::FMSUB_RND: Opcode = X86ISD::FNMADD_RND; break;
54743 case X86ISD::FNMADD: Opcode = X86ISD::FMSUB; break;
54744 case X86ISD::FNMADD_RND: Opcode = X86ISD::FMSUB_RND; break;
54745 case X86ISD::FNMSUB: Opcode = ISD::FMA; break;
54746 case X86ISD::FNMSUB_RND: Opcode = X86ISD::FMADD_RND; break;
54747 // clang-format on
54748 }
54749 }
54750
54751 return Opcode;
54752}
54753
54754/// Do target-specific dag combines on floating point negations.
54755static SDValue combineFneg(SDNode *N, SelectionDAG &DAG,
54756 TargetLowering::DAGCombinerInfo &DCI,
54757 const X86Subtarget &Subtarget) {
54758 EVT OrigVT = N->getValueType(0);
54759 SDValue Arg = isFNEG(DAG, N);
54760 if (!Arg)
54761 return SDValue();
54762
54763 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
54764 EVT VT = Arg.getValueType();
54765 EVT SVT = VT.getScalarType();
54766 SDLoc DL(N);
54767
54768 // Let legalize expand this if it isn't a legal type yet.
54769 if (!TLI.isTypeLegal(VT))
54770 return SDValue();
54771
54772 // If we're negating a FMUL node on a target with FMA, then we can avoid the
54773 // use of a constant by performing (-0 - A*B) instead.
54774 // FIXME: Check rounding control flags as well once it becomes available.
54775 if (Arg.getOpcode() == ISD::FMUL && (SVT == MVT::f32 || SVT == MVT::f64) &&
54776 Arg->getFlags().hasNoSignedZeros() && Subtarget.hasAnyFMA()) {
54777 SDValue Zero = DAG.getConstantFP(0.0, DL, VT);
54778 SDValue NewNode = DAG.getNode(X86ISD::FNMSUB, DL, VT, Arg.getOperand(0),
54779 Arg.getOperand(1), Zero);
54780 return DAG.getBitcast(OrigVT, NewNode);
54781 }
54782
54783 bool CodeSize = DAG.getMachineFunction().getFunction().hasOptSize();
54784 bool LegalOperations = !DCI.isBeforeLegalizeOps();
54785 if (SDValue NegArg =
54786 TLI.getNegatedExpression(Arg, DAG, LegalOperations, CodeSize))
54787 return DAG.getBitcast(OrigVT, NegArg);
54788
54789 return SDValue();
54790}
54791
54792SDValue X86TargetLowering::getNegatedExpression(SDValue Op, SelectionDAG &DAG,
54793 bool LegalOperations,
54794 bool ForCodeSize,
54795 NegatibleCost &Cost,
54796 unsigned Depth) const {
54797 // fneg patterns are removable even if they have multiple uses.
54798 if (SDValue Arg = isFNEG(DAG, Op.getNode(), Depth)) {
54799 Cost = NegatibleCost::Cheaper;
54800 return DAG.getBitcast(Op.getValueType(), Arg);
54801 }
54802
54803 EVT VT = Op.getValueType();
54804 EVT SVT = VT.getScalarType();
54805 unsigned Opc = Op.getOpcode();
54806 SDNodeFlags Flags = Op.getNode()->getFlags();
54807 switch (Opc) {
54808 case ISD::FMA:
54809 case X86ISD::FMSUB:
54810 case X86ISD::FNMADD:
54811 case X86ISD::FNMSUB:
54812 case X86ISD::FMADD_RND:
54813 case X86ISD::FMSUB_RND:
54814 case X86ISD::FNMADD_RND:
54815 case X86ISD::FNMSUB_RND: {
54816 if (!Op.hasOneUse() || !Subtarget.hasAnyFMA() || !isTypeLegal(VT) ||
54817 !(SVT == MVT::f32 || SVT == MVT::f64) ||
54818 !isOperationLegal(ISD::FMA, VT))
54819 break;
54820
54821 // Don't fold (fneg (fma (fneg x), y, (fneg z))) to (fma x, y, z)
54822 // if it may have signed zeros.
54823 if (!Flags.hasNoSignedZeros())
54824 break;
54825
54826 // Because getCheaperNegatedExpression can delete nodes we need a handle to
54827 // keep temporary nodes alive.
54828 std::list<HandleSDNode> Handles;
54829
54830 // This is always negatible for free but we might be able to remove some
54831 // extra operand negations as well.
54832 SmallVector<SDValue, 4> NewOps(Op.getNumOperands(), SDValue());
54833 for (int i = 0; i != 3; ++i) {
54834 NewOps[i] = getCheaperNegatedExpression(
54835 Op.getOperand(i), DAG, LegalOperations, ForCodeSize, Depth + 1);
54836 if (!!NewOps[i])
54837 Handles.emplace_back(NewOps[i]);
54838 }
54839
54840 bool NegA = !!NewOps[0];
54841 bool NegB = !!NewOps[1];
54842 bool NegC = !!NewOps[2];
54843 unsigned NewOpc = negateFMAOpcode(Opc, NegA != NegB, NegC, true);
54844
54845 Cost = (NegA || NegB || NegC) ? NegatibleCost::Cheaper
54846 : NegatibleCost::Neutral;
54847
54848 // Fill in the non-negated ops with the original values.
54849 for (int i = 0, e = Op.getNumOperands(); i != e; ++i)
54850 if (!NewOps[i])
54851 NewOps[i] = Op.getOperand(i);
54852 return DAG.getNode(NewOpc, SDLoc(Op), VT, NewOps);
54853 }
54854 case X86ISD::FRCP:
54855 if (SDValue NegOp0 =
54856 getNegatedExpression(Op.getOperand(0), DAG, LegalOperations,
54857 ForCodeSize, Cost, Depth + 1))
54858 return DAG.getNode(Opc, SDLoc(Op), VT, NegOp0);
54859 break;
54860 }
54861
54862 return TargetLowering::getNegatedExpression(Op, DAG, LegalOperations,
54863 ForCodeSize, Cost, Depth);
54864}
54865
54866static SDValue lowerX86FPLogicOp(SDNode *N, SelectionDAG &DAG,
54867 const X86Subtarget &Subtarget) {
54868 MVT VT = N->getSimpleValueType(0);
54869 // If we have integer vector types available, use the integer opcodes.
54870 if (!VT.isVector() || !Subtarget.hasSSE2())
54871 return SDValue();
54872
54873 SDLoc dl(N);
54875 SDValue Op0 = DAG.getBitcast(IntVT, N->getOperand(0));
54876 SDValue Op1 = DAG.getBitcast(IntVT, N->getOperand(1));
54877 unsigned IntOpcode;
54878 switch (N->getOpcode()) {
54879 // clang-format off
54880 default: llvm_unreachable("Unexpected FP logic op");
54881 case X86ISD::FOR: IntOpcode = ISD::OR; break;
54882 case X86ISD::FXOR: IntOpcode = ISD::XOR; break;
54883 case X86ISD::FAND: IntOpcode = ISD::AND; break;
54884 case X86ISD::FANDN: IntOpcode = X86ISD::ANDNP; break;
54885 // clang-format on
54886 }
54887 SDValue IntOp = DAG.getNode(IntOpcode, dl, IntVT, Op0, Op1);
54888 return DAG.getBitcast(VT, IntOp);
54889}
54890
54891/// Fold a xor(setcc cond, val), 1 --> setcc (inverted(cond), val)
54892static SDValue foldXor1SetCC(SDNode *N, const SDLoc &DL, SelectionDAG &DAG) {
54893 if (N->getOpcode() != ISD::XOR)
54894 return SDValue();
54895
54896 SDValue LHS = N->getOperand(0);
54897 if (!isOneConstant(N->getOperand(1)) || LHS->getOpcode() != X86ISD::SETCC)
54898 return SDValue();
54899
54900 X86::CondCode NewCC = X86::GetOppositeBranchCondition(
54901 X86::CondCode(LHS->getConstantOperandVal(0)));
54902 return getSETCC(NewCC, LHS->getOperand(1), DL, DAG);
54903}
54904
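// Fold (xor (ctlz_zero_undef X), BitWidth - 1) and the equivalent
// (sub BitWidth - 1, (ctlz_zero_undef X)) into X86ISD::BSR, since BSR already
// returns BitWidth - 1 - ctlz(X) (the index of the highest set bit).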
54905static SDValue combineXorSubCTLZ(SDNode *N, const SDLoc &DL, SelectionDAG &DAG,
54906 const X86Subtarget &Subtarget) {
54907 assert((N->getOpcode() == ISD::XOR || N->getOpcode() == ISD::SUB) &&
54908 "Invalid opcode for combing with CTLZ");
54909 if (Subtarget.hasFastLZCNT())
54910 return SDValue();
54911
54912 EVT VT = N->getValueType(0);
54913 if (VT != MVT::i8 && VT != MVT::i16 && VT != MVT::i32 &&
54914 (VT != MVT::i64 || !Subtarget.is64Bit()))
54915 return SDValue();
54916
54917 SDValue N0 = N->getOperand(0);
54918 SDValue N1 = N->getOperand(1);
54919
54920 if (N0.getOpcode() != ISD::CTLZ_ZERO_UNDEF &&
54921 N1.getOpcode() != ISD::CTLZ_ZERO_UNDEF)
54922 return SDValue();
54923
54924 SDValue OpCTLZ;
54925 SDValue OpSizeTM1;
54926
54927 if (N1.getOpcode() == ISD::CTLZ_ZERO_UNDEF) {
54928 OpCTLZ = N1;
54929 OpSizeTM1 = N0;
54930 } else if (N->getOpcode() == ISD::SUB) {
54931 return SDValue();
54932 } else {
54933 OpCTLZ = N0;
54934 OpSizeTM1 = N1;
54935 }
54936
54937 if (!OpCTLZ.hasOneUse())
54938 return SDValue();
54939 auto *C = dyn_cast<ConstantSDNode>(OpSizeTM1);
54940 if (!C)
54941 return SDValue();
54942
54943 if (C->getZExtValue() != uint64_t(OpCTLZ.getValueSizeInBits() - 1))
54944 return SDValue();
54945 EVT OpVT = VT;
54946 SDValue Op = OpCTLZ.getOperand(0);
54947 if (VT == MVT::i8) {
54948 // Zero extend to i32 since there is no i8 BSR.
54949 OpVT = MVT::i32;
54950 Op = DAG.getNode(ISD::ZERO_EXTEND, DL, OpVT, Op);
54951 }
54952
54953 SDVTList VTs = DAG.getVTList(OpVT, MVT::i32);
54954 Op = DAG.getNode(X86ISD::BSR, DL, VTs, DAG.getUNDEF(OpVT), Op);
54955 if (VT == MVT::i8)
54956 Op = DAG.getNode(ISD::TRUNCATE, DL, MVT::i8, Op);
54957
54958 return Op;
54959}
54960
54961static SDValue combineXor(SDNode *N, SelectionDAG &DAG,
54962 TargetLowering::DAGCombinerInfo &DCI,
54963 const X86Subtarget &Subtarget) {
54964 SDValue N0 = N->getOperand(0);
54965 SDValue N1 = N->getOperand(1);
54966 EVT VT = N->getValueType(0);
54967 SDLoc DL(N);
54968
54969 // If this is SSE1 only convert to FXOR to avoid scalarization.
54970 if (Subtarget.hasSSE1() && !Subtarget.hasSSE2() && VT == MVT::v4i32) {
54971 return DAG.getBitcast(MVT::v4i32,
54972 DAG.getNode(X86ISD::FXOR, DL, MVT::v4f32,
54973 DAG.getBitcast(MVT::v4f32, N0),
54974 DAG.getBitcast(MVT::v4f32, N1)));
54975 }
54976
54977 if (SDValue Cmp = foldVectorXorShiftIntoCmp(N, DAG, Subtarget))
54978 return Cmp;
54979
54980 if (SDValue R = combineBitOpWithMOVMSK(N->getOpcode(), DL, N0, N1, DAG))
54981 return R;
54982
54983 if (SDValue R = combineBitOpWithShift(N->getOpcode(), DL, VT, N0, N1, DAG))
54984 return R;
54985
54986 if (SDValue R = combineBitOpWithPACK(N->getOpcode(), DL, VT, N0, N1, DAG))
54987 return R;
54988
54989 if (SDValue FPLogic = convertIntLogicToFPLogic(N->getOpcode(), DL, VT, N0, N1,
54990 DAG, DCI, Subtarget))
54991 return FPLogic;
54992
54993 if (SDValue R = combineXorSubCTLZ(N, DL, DAG, Subtarget))
54994 return R;
54995
54996 if (DCI.isBeforeLegalizeOps())
54997 return SDValue();
54998
54999 if (SDValue SetCC = foldXor1SetCC(N, DL, DAG))
55000 return SetCC;
55001
55002 if (SDValue R = combineOrXorWithSETCC(N->getOpcode(), DL, VT, N0, N1, DAG))
55003 return R;
55004
55005 if (SDValue RV = foldXorTruncShiftIntoCmp(N, DL, DAG))
55006 return RV;
55007
55008 // Fold not(iX bitcast(vXi1)) -> (iX bitcast(not(vec))) for legal boolvecs.
55009 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
55010 if (llvm::isAllOnesConstant(N1) && N0.getOpcode() == ISD::BITCAST &&
55011 N0.getOperand(0).getValueType().isVector() &&
55012 N0.getOperand(0).getValueType().getVectorElementType() == MVT::i1 &&
55013 TLI.isTypeLegal(N0.getOperand(0).getValueType()) && N0.hasOneUse()) {
55014 return DAG.getBitcast(
55015 VT, DAG.getNOT(DL, N0.getOperand(0), N0.getOperand(0).getValueType()));
55016 }
55017
55018 // Handle AVX512 mask widening.
55019 // Fold not(insert_subvector(undef,sub)) -> insert_subvector(undef,not(sub))
55020 if (ISD::isBuildVectorAllOnes(N1.getNode()) && VT.isVector() &&
55021 VT.getVectorElementType() == MVT::i1 &&
55022 N0.getOpcode() == ISD::INSERT_SUBVECTOR && N0.getOperand(0).isUndef() &&
55023 TLI.isTypeLegal(N0.getOperand(1).getValueType())) {
55024 return DAG.getNode(
55025 ISD::INSERT_SUBVECTOR, DL, VT, N0.getOperand(0),
55026 DAG.getNOT(DL, N0.getOperand(1), N0.getOperand(1).getValueType()),
55027 N0.getOperand(2));
55028 }
55029
55030 // Fold xor(zext(xor(x,c1)),c2) -> xor(zext(x),xor(zext(c1),c2))
55031 // Fold xor(truncate(xor(x,c1)),c2) -> xor(truncate(x),xor(truncate(c1),c2))
55032 // TODO: Under what circumstances could this be performed in DAGCombine?
55033 if ((N0.getOpcode() == ISD::TRUNCATE || N0.getOpcode() == ISD::ZERO_EXTEND) &&
55034 N0.getOperand(0).getOpcode() == N->getOpcode()) {
55035 SDValue TruncExtSrc = N0.getOperand(0);
55036 auto *N1C = dyn_cast<ConstantSDNode>(N1);
55037 auto *N001C = dyn_cast<ConstantSDNode>(TruncExtSrc.getOperand(1));
55038 if (N1C && !N1C->isOpaque() && N001C && !N001C->isOpaque()) {
55039 SDValue LHS = DAG.getZExtOrTrunc(TruncExtSrc.getOperand(0), DL, VT);
55040 SDValue RHS = DAG.getZExtOrTrunc(TruncExtSrc.getOperand(1), DL, VT);
55041 return DAG.getNode(ISD::XOR, DL, VT, LHS,
55042 DAG.getNode(ISD::XOR, DL, VT, RHS, N1));
55043 }
55044 }
55045
55046 if (SDValue R = combineBMILogicOp(N, DAG, Subtarget))
55047 return R;
55048
55049 return combineFneg(N, DAG, DCI, Subtarget);
55050}
55051
55052static SDValue combineBITREVERSE(SDNode *N, SelectionDAG &DAG,
55053 TargetLowering::DAGCombinerInfo &DCI,
55054 const X86Subtarget &Subtarget) {
55055 SDValue N0 = N->getOperand(0);
55056 EVT VT = N->getValueType(0);
55057
55058 // Convert a (iX bitreverse(bitcast(vXi1 X))) -> (iX bitcast(shuffle(X)))
55059 if (VT.isInteger() && N0.getOpcode() == ISD::BITCAST && N0.hasOneUse()) {
55060 SDValue Src = N0.getOperand(0);
55061 EVT SrcVT = Src.getValueType();
55062 if (SrcVT.isVector() && SrcVT.getScalarType() == MVT::i1 &&
55063 (DCI.isBeforeLegalize() ||
55064 DAG.getTargetLoweringInfo().isTypeLegal(SrcVT)) &&
55065 Subtarget.hasSSSE3()) {
55066 unsigned NumElts = SrcVT.getVectorNumElements();
55067 SmallVector<int, 32> ReverseMask(NumElts);
55068 for (unsigned I = 0; I != NumElts; ++I)
55069 ReverseMask[I] = (NumElts - 1) - I;
55070 SDValue Rev =
55071 DAG.getVectorShuffle(SrcVT, SDLoc(N), Src, Src, ReverseMask);
55072 return DAG.getBitcast(VT, Rev);
55073 }
55074 }
55075
55076 return SDValue();
55077}
55078
55079// Various combines to try to convert to avgceilu.
55080static SDValue combineAVG(SDNode *N, SelectionDAG &DAG,
55081 TargetLowering::DAGCombinerInfo &DCI,
55082 const X86Subtarget &Subtarget) {
55083 unsigned Opcode = N->getOpcode();
55084 SDValue N0 = N->getOperand(0);
55085 SDValue N1 = N->getOperand(1);
55086 EVT VT = N->getValueType(0);
55087 EVT SVT = VT.getScalarType();
55088 SDLoc DL(N);
55089
55090 // avgceils(x,y) -> flipsign(avgceilu(flipsign(x),flipsign(y)))
55091 // Only useful on vXi8 which doesn't have good SRA handling.
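  // XORing with 0x80 maps signed i8 values onto unsigned offset-binary
  // ordering, so the unsigned rounding average of the flipped inputs, flipped
  // back, equals the signed rounding average.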
55092 if (Opcode == ISD::AVGCEILS && VT.isVector() && SVT == MVT::i8) {
55093 APInt SignBit = APInt::getSignMask(8);
55094 SDValue SignMask = DAG.getConstant(SignBit, DL, VT);
55095 N0 = DAG.getNode(ISD::XOR, DL, VT, N0, SignMask);
55096 N1 = DAG.getNode(ISD::XOR, DL, VT, N1, SignMask);
55097 return DAG.getNode(ISD::XOR, DL, VT,
55098 DAG.getNode(ISD::AVGCEILU, DL, VT, N0, N1), SignMask);
55099 }
55100
55101 return SDValue();
55102}
55103
55106 const X86Subtarget &Subtarget) {
55107 EVT VT = N->getValueType(0);
55108 unsigned NumBits = VT.getSizeInBits();
55109
55110 // TODO - Constant Folding.
55111
55112 // Simplify the inputs.
55113 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
55114 APInt DemandedMask(APInt::getAllOnes(NumBits));
55115 if (TLI.SimplifyDemandedBits(SDValue(N, 0), DemandedMask, DCI))
55116 return SDValue(N, 0);
55117
55118 return SDValue();
55119}
55120
55121 static bool isNullFPScalarOrVectorConst(SDValue V) {
55122   return isNullFPConstant(V) || ISD::isBuildVectorAllZeros(V.getNode());
55123}
55124
55125/// If a value is a scalar FP zero or a vector FP zero (potentially including
55126/// undefined elements), return a zero constant that may be used to fold away
55127/// that value. In the case of a vector, the returned constant will not contain
55128/// undefined elements even if the input parameter does. This makes it suitable
55129/// to be used as a replacement operand with operations (eg, bitwise-and) where
55130/// an undef should not propagate.
55131 static SDValue getNullFPConstForNullVal(SDValue V, SelectionDAG &DAG,
55132                                         const X86Subtarget &Subtarget) {
55133   if (!isNullFPScalarOrVectorConst(V))
55134     return SDValue();
55135
55136 if (V.getValueType().isVector())
55137 return getZeroVector(V.getSimpleValueType(), Subtarget, DAG, SDLoc(V));
55138
55139 return V;
55140}
55141
55142 static SDValue combineFAndFNotToFAndn(SDNode *N, SelectionDAG &DAG,
55143                                       const X86Subtarget &Subtarget) {
55144 SDValue N0 = N->getOperand(0);
55145 SDValue N1 = N->getOperand(1);
55146 EVT VT = N->getValueType(0);
55147 SDLoc DL(N);
55148
55149 // Vector types are handled in combineANDXORWithAllOnesIntoANDNP().
55150 if (!((VT == MVT::f32 && Subtarget.hasSSE1()) ||
55151 (VT == MVT::f64 && Subtarget.hasSSE2()) ||
55152 (VT == MVT::v4f32 && Subtarget.hasSSE1() && !Subtarget.hasSSE2())))
55153 return SDValue();
55154
55155 auto isAllOnesConstantFP = [](SDValue V) {
55156 if (V.getSimpleValueType().isVector())
55157 return ISD::isBuildVectorAllOnes(V.getNode());
55158 auto *C = dyn_cast<ConstantFPSDNode>(V);
55159 return C && C->getConstantFPValue()->isAllOnesValue();
55160 };
55161
55162 // fand (fxor X, -1), Y --> fandn X, Y
55163 if (N0.getOpcode() == X86ISD::FXOR && isAllOnesConstantFP(N0.getOperand(1)))
55164 return DAG.getNode(X86ISD::FANDN, DL, VT, N0.getOperand(0), N1);
55165
55166 // fand X, (fxor Y, -1) --> fandn Y, X
55167 if (N1.getOpcode() == X86ISD::FXOR && isAllOnesConstantFP(N1.getOperand(1)))
55168 return DAG.getNode(X86ISD::FANDN, DL, VT, N1.getOperand(0), N0);
55169
55170 return SDValue();
55171}
55172
55173/// Do target-specific dag combines on X86ISD::FAND nodes.
55174 static SDValue combineFAnd(SDNode *N, SelectionDAG &DAG,
55175                            const X86Subtarget &Subtarget) {
55176 // FAND(0.0, x) -> 0.0
55177 if (SDValue V = getNullFPConstForNullVal(N->getOperand(0), DAG, Subtarget))
55178 return V;
55179
55180 // FAND(x, 0.0) -> 0.0
55181 if (SDValue V = getNullFPConstForNullVal(N->getOperand(1), DAG, Subtarget))
55182 return V;
55183
55184 if (SDValue V = combineFAndFNotToFAndn(N, DAG, Subtarget))
55185 return V;
55186
55187 return lowerX86FPLogicOp(N, DAG, Subtarget);
55188}
55189
55190/// Do target-specific dag combines on X86ISD::FANDN nodes.
55191 static SDValue combineFAndn(SDNode *N, SelectionDAG &DAG,
55192                             const X86Subtarget &Subtarget) {
55193 // FANDN(0.0, x) -> x
55194 if (isNullFPScalarOrVectorConst(N->getOperand(0)))
55195 return N->getOperand(1);
55196
55197 // FANDN(x, 0.0) -> 0.0
55198 if (SDValue V = getNullFPConstForNullVal(N->getOperand(1), DAG, Subtarget))
55199 return V;
55200
55201 return lowerX86FPLogicOp(N, DAG, Subtarget);
55202}
55203
55204/// Do target-specific dag combines on X86ISD::FOR and X86ISD::FXOR nodes.
55205 static SDValue combineFOr(SDNode *N, SelectionDAG &DAG,
55206                           TargetLowering::DAGCombinerInfo &DCI,
55207                           const X86Subtarget &Subtarget) {
55208 assert(N->getOpcode() == X86ISD::FOR || N->getOpcode() == X86ISD::FXOR);
55209
55210 // F[X]OR(0.0, x) -> x
55211 if (isNullFPScalarOrVectorConst(N->getOperand(0)))
55212 return N->getOperand(1);
55213
55214 // F[X]OR(x, 0.0) -> x
55215 if (isNullFPScalarOrVectorConst(N->getOperand(1)))
55216 return N->getOperand(0);
55217
55218 if (SDValue NewVal = combineFneg(N, DAG, DCI, Subtarget))
55219 return NewVal;
55220
55221 return lowerX86FPLogicOp(N, DAG, Subtarget);
55222}
55223
55224/// Do target-specific dag combines on X86ISD::FMIN and X86ISD::FMAX nodes.
55225 static SDValue combineFMinFMax(SDNode *N, SelectionDAG &DAG) {
55226   assert(N->getOpcode() == X86ISD::FMIN || N->getOpcode() == X86ISD::FMAX);
55227
55228 // FMIN/FMAX are commutative if no NaNs and no negative zeros are allowed.
55229 if (!DAG.getTarget().Options.NoNaNsFPMath ||
55230       !DAG.getTarget().Options.NoSignedZerosFPMath)
55231     return SDValue();
55232
55233 // If we run in unsafe-math mode, then convert the FMAX and FMIN nodes
55234 // into FMINC and FMAXC, which are Commutative operations.
55235 unsigned NewOp = 0;
55236 switch (N->getOpcode()) {
55237 default: llvm_unreachable("unknown opcode");
55238 case X86ISD::FMIN: NewOp = X86ISD::FMINC; break;
55239 case X86ISD::FMAX: NewOp = X86ISD::FMAXC; break;
55240 }
55241
55242 return DAG.getNode(NewOp, SDLoc(N), N->getValueType(0),
55243 N->getOperand(0), N->getOperand(1));
55244}
55245
55246 static SDValue combineFMinNumFMaxNum(SDNode *N, SelectionDAG &DAG,
55247                                      const X86Subtarget &Subtarget) {
55248 EVT VT = N->getValueType(0);
55249 if (Subtarget.useSoftFloat() || isSoftF16(VT, Subtarget))
55250 return SDValue();
55251
55252 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
55253
55254 auto IsMinMaxLegal = [&](EVT VT) {
55255 if (!TLI.isTypeLegal(VT))
55256 return false;
55257 return VT.getScalarType() != MVT::f16 ||
55258 (Subtarget.hasFP16() && (VT == MVT::v32f16 || Subtarget.hasVLX()));
55259 };
55260
55261 if (!((Subtarget.hasSSE1() && VT == MVT::f32) ||
55262 (Subtarget.hasSSE2() && VT == MVT::f64) ||
55263 (Subtarget.hasFP16() && VT == MVT::f16) ||
55264 (VT.isVector() && IsMinMaxLegal(VT))))
55265 return SDValue();
55266
55267 SDValue Op0 = N->getOperand(0);
55268 SDValue Op1 = N->getOperand(1);
55269 SDLoc DL(N);
55270 auto MinMaxOp = N->getOpcode() == ISD::FMAXNUM ? X86ISD::FMAX : X86ISD::FMIN;
55271
55272 // If we don't have to respect NaN inputs, this is a direct translation to x86
55273 // min/max instructions.
55274 if (DAG.getTarget().Options.NoNaNsFPMath || N->getFlags().hasNoNaNs())
55275 return DAG.getNode(MinMaxOp, DL, VT, Op0, Op1, N->getFlags());
55276
55277 // If one of the operands is known non-NaN use the native min/max instructions
55278 // with the non-NaN input as second operand.
55279 if (DAG.isKnownNeverNaN(Op1))
55280 return DAG.getNode(MinMaxOp, DL, VT, Op0, Op1, N->getFlags());
55281 if (DAG.isKnownNeverNaN(Op0))
55282 return DAG.getNode(MinMaxOp, DL, VT, Op1, Op0, N->getFlags());
55283
55284 // If we have to respect NaN inputs, this takes at least 3 instructions.
55285 // Favor a library call when operating on a scalar and minimizing code size.
55286 if (!VT.isVector() && DAG.getMachineFunction().getFunction().hasMinSize())
55287 return SDValue();
55288
55289 EVT SetCCType = TLI.getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(),
55290 VT);
55291
55292 // There are 4 possibilities involving NaN inputs, and these are the required
55293 // outputs:
55294 // Op1
55295 // Num NaN
55296 // ----------------
55297 // Num | Max | Op0 |
55298 // Op0 ----------------
55299 // NaN | Op1 | NaN |
55300 // ----------------
55301 //
55302 // The SSE FP max/min instructions were not designed for this case, but rather
55303 // to implement:
55304 // Min = Op1 < Op0 ? Op1 : Op0
55305 // Max = Op1 > Op0 ? Op1 : Op0
55306 //
55307 // So they always return Op0 if either input is a NaN. However, we can still
55308 // use those instructions for fmaxnum by selecting away a NaN input.
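  // For example, for fmaxnum(Op0 = NaN, Op1 = 1.0): MAX(Op1, Op0) passes
  // through Op0 (NaN), but the SETUO test below detects that Op0 is NaN and
  // the select returns Op1 = 1.0, which is the required fmaxnum result.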
55309
55310 // If either operand is NaN, the 2nd source operand (Op0) is passed through.
55311 SDValue MinOrMax = DAG.getNode(MinMaxOp, DL, VT, Op1, Op0);
55312 SDValue IsOp0Nan = DAG.getSetCC(DL, SetCCType, Op0, Op0, ISD::SETUO);
55313
55314 // If Op0 is a NaN, select Op1. Otherwise, select the max. If both operands
55315 // are NaN, the NaN value of Op1 is the result.
55316 return DAG.getSelect(DL, VT, IsOp0Nan, Op1, MinOrMax);
55317}
55318
55319 static SDValue combineX86INT_TO_FP(SDNode *N, SelectionDAG &DAG,
55320                                    TargetLowering::DAGCombinerInfo &DCI) {
55321   EVT VT = N->getValueType(0);
55322 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
55323
55324 APInt DemandedElts = APInt::getAllOnes(VT.getVectorNumElements());
55325 if (TLI.SimplifyDemandedVectorElts(SDValue(N, 0), DemandedElts, DCI))
55326 return SDValue(N, 0);
55327
55328 // Convert a full vector load into vzload when not all bits are needed.
55329 SDValue In = N->getOperand(0);
55330 MVT InVT = In.getSimpleValueType();
55331 if (VT.getVectorNumElements() < InVT.getVectorNumElements() &&
55332 ISD::isNormalLoad(In.getNode()) && In.hasOneUse()) {
55333 assert(InVT.is128BitVector() && "Expected 128-bit input vector");
55334 LoadSDNode *LN = cast<LoadSDNode>(N->getOperand(0));
55335 unsigned NumBits = InVT.getScalarSizeInBits() * VT.getVectorNumElements();
55336 MVT MemVT = MVT::getIntegerVT(NumBits);
55337 MVT LoadVT = MVT::getVectorVT(MemVT, 128 / NumBits);
55338 if (SDValue VZLoad = narrowLoadToVZLoad(LN, MemVT, LoadVT, DAG)) {
55339 SDLoc dl(N);
55340 SDValue Convert = DAG.getNode(N->getOpcode(), dl, VT,
55341 DAG.getBitcast(InVT, VZLoad));
55342 DCI.CombineTo(N, Convert);
55343 DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), VZLoad.getValue(1));
55344       DCI.recursivelyDeleteUnusedNodes(LN);
55345       return SDValue(N, 0);
55346 }
55347 }
55348
55349 return SDValue();
55350}
55351
55352 static SDValue combineCVTP2I_CVTTP2I(SDNode *N, SelectionDAG &DAG,
55353                                      TargetLowering::DAGCombinerInfo &DCI) {
55354   const SelectionDAGTargetInfo &TSI = DAG.getSelectionDAGInfo();
55355   bool IsStrict = TSI.isTargetStrictFPOpcode(N->getOpcode());
55356 EVT VT = N->getValueType(0);
55357
55358 // Convert a full vector load into vzload when not all bits are needed.
55359 SDValue In = N->getOperand(IsStrict ? 1 : 0);
55360 MVT InVT = In.getSimpleValueType();
55361 if (VT.getVectorNumElements() < InVT.getVectorNumElements() &&
55362 ISD::isNormalLoad(In.getNode()) && In.hasOneUse()) {
55363 assert(InVT.is128BitVector() && "Expected 128-bit input vector");
55364 LoadSDNode *LN = cast<LoadSDNode>(In);
55365 unsigned NumBits = InVT.getScalarSizeInBits() * VT.getVectorNumElements();
55366 MVT MemVT = MVT::getFloatingPointVT(NumBits);
55367 MVT LoadVT = MVT::getVectorVT(MemVT, 128 / NumBits);
55368 if (SDValue VZLoad = narrowLoadToVZLoad(LN, MemVT, LoadVT, DAG)) {
55369 SDLoc dl(N);
55370 if (IsStrict) {
55371 SDValue Convert =
55372 DAG.getNode(N->getOpcode(), dl, {VT, MVT::Other},
55373 {N->getOperand(0), DAG.getBitcast(InVT, VZLoad)});
55374 DCI.CombineTo(N, Convert, Convert.getValue(1));
55375 } else {
55376 SDValue Convert =
55377 DAG.getNode(N->getOpcode(), dl, VT, DAG.getBitcast(InVT, VZLoad));
55378 DCI.CombineTo(N, Convert);
55379 }
55380 DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), VZLoad.getValue(1));
55381       DCI.recursivelyDeleteUnusedNodes(LN);
55382       return SDValue(N, 0);
55383 }
55384 }
55385
55386 return SDValue();
55387}
55388
55389/// Do target-specific dag combines on X86ISD::ANDNP nodes.
55390 static SDValue combineAndnp(SDNode *N, SelectionDAG &DAG,
55391                             TargetLowering::DAGCombinerInfo &DCI,
55392                             const X86Subtarget &Subtarget) {
55393 SDValue N0 = N->getOperand(0);
55394 SDValue N1 = N->getOperand(1);
55395 MVT VT = N->getSimpleValueType(0);
55396 int NumElts = VT.getVectorNumElements();
55397 unsigned EltSizeInBits = VT.getScalarSizeInBits();
55398 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
55399 SDLoc DL(N);
55400
55401 // ANDNP(undef, x) -> 0
55402 // ANDNP(x, undef) -> 0
55403 if (N0.isUndef() || N1.isUndef())
55404 return DAG.getConstant(0, DL, VT);
55405
55406 // ANDNP(0, x) -> x
55407   if (ISD::isBuildVectorAllZeros(N0.getNode()))
55408     return N1;
55409
55410 // ANDNP(x, 0) -> 0
55411   if (ISD::isBuildVectorAllZeros(N1.getNode()))
55412     return DAG.getConstant(0, DL, VT);
55413
55414 // ANDNP(x, -1) -> NOT(x) -> XOR(x, -1)
55415   if (ISD::isBuildVectorAllOnes(N1.getNode()))
55416     return DAG.getNOT(DL, N0, VT);
55417
55418 // Turn ANDNP back to AND if input is inverted.
55419 if (SDValue Not = IsNOT(N0, DAG))
55420 return DAG.getNode(ISD::AND, DL, VT, DAG.getBitcast(VT, Not), N1);
55421
55422 // On AVX512 targets, attempt to reverse foldVSelectToSignBitSplatMask.
55423 // to make use of predicated selects.
55424 // ANDN(SEXT(SETCC()),X) -> SELECT(NOT(SETCC()),X,0)
55425 if (DCI.isAfterLegalizeDAG() && N0.getOpcode() == ISD::SIGN_EXTEND) {
55426 SDValue Src = N0.getOperand(0);
55427 EVT SrcVT = Src.getValueType();
55428 if (Src.getOpcode() == ISD::SETCC && SrcVT.getScalarType() == MVT::i1 &&
55429 (VT.is512BitVector() || Subtarget.hasVLX()) &&
55430 (VT.getScalarSizeInBits() >= 32 || Subtarget.hasBWI()) &&
55431 TLI.isTypeLegal(SrcVT) && N0.hasOneUse() && Src.hasOneUse())
55432 return DAG.getSelect(DL, VT, DAG.getNOT(DL, Src, SrcVT), N1,
55433 getZeroVector(VT, Subtarget, DAG, DL));
55434 }
55435
55436 // Constant Folding
55437 APInt Undefs0, Undefs1;
55438 SmallVector<APInt> EltBits0, EltBits1;
55439 if (getTargetConstantBitsFromNode(N0, EltSizeInBits, Undefs0, EltBits0,
55440 /*AllowWholeUndefs*/ true,
55441 /*AllowPartialUndefs*/ true)) {
55442 if (getTargetConstantBitsFromNode(N1, EltSizeInBits, Undefs1, EltBits1,
55443 /*AllowWholeUndefs*/ true,
55444 /*AllowPartialUndefs*/ true)) {
55445 SmallVector<APInt> ResultBits;
55446 for (int I = 0; I != NumElts; ++I)
55447 ResultBits.push_back(~EltBits0[I] & EltBits1[I]);
55448 return getConstVector(ResultBits, VT, DAG, DL);
55449 }
55450
55451 // Constant fold NOT(N0) to allow us to use AND.
55452 // Ensure this is only performed if we can confirm that the bitcasted source
55453 // has oneuse to prevent an infinite loop with canonicalizeBitSelect.
55454 if (N0->hasOneUse()) {
55455       SDValue BC0 = peekThroughOneUseBitcasts(N0);
55456       if (BC0.getOpcode() != ISD::BITCAST) {
55457 for (APInt &Elt : EltBits0)
55458 Elt = ~Elt;
55459 SDValue Not = getConstVector(EltBits0, VT, DAG, DL);
55460 return DAG.getNode(ISD::AND, DL, VT, Not, N1);
55461 }
55462 }
55463 }
55464
55465 // Attempt to recursively combine a bitmask ANDNP with shuffles.
55466 if (VT.isVector() && (VT.getScalarSizeInBits() % 8) == 0) {
55467 SDValue Op(N, 0);
55468 if (SDValue Res = combineX86ShufflesRecursively(Op, DAG, Subtarget))
55469 return Res;
55470
55471 // If either operand is a constant mask, then only the elements that aren't
55472 // zero are actually demanded by the other operand.
55473 auto GetDemandedMasks = [&](SDValue Op, bool Invert = false) {
55474 APInt UndefElts;
55475 SmallVector<APInt> EltBits;
55476 APInt DemandedBits = APInt::getAllOnes(EltSizeInBits);
55477 APInt DemandedElts = APInt::getAllOnes(NumElts);
55478 if (getTargetConstantBitsFromNode(Op, EltSizeInBits, UndefElts,
55479 EltBits)) {
55480 DemandedBits.clearAllBits();
55481 DemandedElts.clearAllBits();
55482 for (int I = 0; I != NumElts; ++I) {
55483 if (UndefElts[I]) {
55484 // We can't assume an undef src element gives an undef dst - the
55485 // other src might be zero.
55486 DemandedBits.setAllBits();
55487 DemandedElts.setBit(I);
55488 } else if ((Invert && !EltBits[I].isAllOnes()) ||
55489 (!Invert && !EltBits[I].isZero())) {
55490 DemandedBits |= Invert ? ~EltBits[I] : EltBits[I];
55491 DemandedElts.setBit(I);
55492 }
55493 }
55494 }
55495 return std::make_pair(DemandedBits, DemandedElts);
55496 };
55497 APInt Bits0, Elts0;
55498 APInt Bits1, Elts1;
55499 std::tie(Bits0, Elts0) = GetDemandedMasks(N1);
55500 std::tie(Bits1, Elts1) = GetDemandedMasks(N0, true);
55501
55502 if (TLI.SimplifyDemandedVectorElts(N0, Elts0, DCI) ||
55503 TLI.SimplifyDemandedVectorElts(N1, Elts1, DCI) ||
55504 TLI.SimplifyDemandedBits(N0, Bits0, Elts0, DCI) ||
55505 TLI.SimplifyDemandedBits(N1, Bits1, Elts1, DCI)) {
55506 if (N->getOpcode() != ISD::DELETED_NODE)
55507 DCI.AddToWorklist(N);
55508 return SDValue(N, 0);
55509 }
55510 }
55511
55512 // Folds for better commutativity:
55513 if (N1->hasOneUse()) {
55514 // ANDNP(x,NOT(y)) -> AND(NOT(x),NOT(y)) -> NOT(OR(X,Y)).
55515 if (SDValue Not = IsNOT(N1, DAG))
55516 return DAG.getNOT(
55517 DL, DAG.getNode(ISD::OR, DL, VT, N0, DAG.getBitcast(VT, Not)), VT);
55518
55519 // ANDNP(x,PSHUFB(y,z)) -> PSHUFB(y,OR(z,x))
55520 // Zero out elements by setting the PSHUFB mask value to 0xFF.
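    // N0 is known to be all-ones or all-zeros per element here, so OR-ing it
    // into the PSHUFB mask sets the mask MSB exactly in the lanes that ANDNP
    // would clear, and PSHUFB emits zero for any mask byte with its MSB set.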
55521 if (DAG.ComputeNumSignBits(N0) == EltSizeInBits) {
55522       SDValue BC1 = peekThroughOneUseBitcasts(N1);
55523       if (BC1.getOpcode() == X86ISD::PSHUFB) {
55524 EVT ShufVT = BC1.getValueType();
55525 SDValue NewMask = DAG.getNode(ISD::OR, DL, ShufVT, BC1.getOperand(1),
55526 DAG.getBitcast(ShufVT, N0));
55527 SDValue NewShuf =
55528 DAG.getNode(X86ISD::PSHUFB, DL, ShufVT, BC1.getOperand(0), NewMask);
55529 return DAG.getBitcast(VT, NewShuf);
55530 }
55531 }
55532 }
55533
55534 return SDValue();
55535}
55536
55537 static SDValue combineBT(SDNode *N, SelectionDAG &DAG,
55538                          TargetLowering::DAGCombinerInfo &DCI) {
55539   SDValue N1 = N->getOperand(1);
55540
55541 // BT ignores high bits in the bit index operand.
55542 unsigned BitWidth = N1.getValueSizeInBits();
55543   APInt DemandedMask(APInt::getLowBitsSet(BitWidth, Log2_32(BitWidth)));
55544   if (DAG.getTargetLoweringInfo().SimplifyDemandedBits(N1, DemandedMask, DCI)) {
55545 if (N->getOpcode() != ISD::DELETED_NODE)
55546 DCI.AddToWorklist(N);
55547 return SDValue(N, 0);
55548 }
55549
55550 return SDValue();
55551}
55552
55553 static SDValue combineCVTPH2PS(SDNode *N, SelectionDAG &DAG,
55554                                TargetLowering::DAGCombinerInfo &DCI) {
55555   bool IsStrict = N->getOpcode() == X86ISD::STRICT_CVTPH2PS;
55556 SDValue Src = N->getOperand(IsStrict ? 1 : 0);
55557
55558 if (N->getValueType(0) == MVT::v4f32 && Src.getValueType() == MVT::v8i16) {
55559 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
55560 APInt DemandedElts = APInt::getLowBitsSet(8, 4);
55561 if (TLI.SimplifyDemandedVectorElts(Src, DemandedElts, DCI)) {
55562 if (N->getOpcode() != ISD::DELETED_NODE)
55563 DCI.AddToWorklist(N);
55564 return SDValue(N, 0);
55565 }
55566
55567 // Convert a full vector load into vzload when not all bits are needed.
55568 if (ISD::isNormalLoad(Src.getNode()) && Src.hasOneUse()) {
55569 LoadSDNode *LN = cast<LoadSDNode>(N->getOperand(IsStrict ? 1 : 0));
55570 if (SDValue VZLoad = narrowLoadToVZLoad(LN, MVT::i64, MVT::v2i64, DAG)) {
55571 SDLoc dl(N);
55572 if (IsStrict) {
55573 SDValue Convert = DAG.getNode(
55574 N->getOpcode(), dl, {MVT::v4f32, MVT::Other},
55575 {N->getOperand(0), DAG.getBitcast(MVT::v8i16, VZLoad)});
55576 DCI.CombineTo(N, Convert, Convert.getValue(1));
55577 } else {
55578 SDValue Convert = DAG.getNode(N->getOpcode(), dl, MVT::v4f32,
55579 DAG.getBitcast(MVT::v8i16, VZLoad));
55580 DCI.CombineTo(N, Convert);
55581 }
55582
55583 DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), VZLoad.getValue(1));
55584         DCI.recursivelyDeleteUnusedNodes(LN);
55585         return SDValue(N, 0);
55586 }
55587 }
55588 }
55589
55590 return SDValue();
55591}
55592
55593// Try to combine sext_in_reg of a cmov of constants by extending the constants.
55594 static SDValue combineSextInRegCmov(SDNode *N, SelectionDAG &DAG) {
55595   assert(N->getOpcode() == ISD::SIGN_EXTEND_INREG);
55596
55597 EVT DstVT = N->getValueType(0);
55598
55599 SDValue N0 = N->getOperand(0);
55600 SDValue N1 = N->getOperand(1);
55601 EVT ExtraVT = cast<VTSDNode>(N1)->getVT();
55602
55603 if (ExtraVT != MVT::i8 && ExtraVT != MVT::i16)
55604 return SDValue();
55605
55606 // Look through single use any_extends / truncs.
55607 SDValue IntermediateBitwidthOp;
55608 if ((N0.getOpcode() == ISD::ANY_EXTEND || N0.getOpcode() == ISD::TRUNCATE) &&
55609 N0.hasOneUse()) {
55610 IntermediateBitwidthOp = N0;
55611 N0 = N0.getOperand(0);
55612 }
55613
55614 // See if we have a single use cmov.
55615 if (N0.getOpcode() != X86ISD::CMOV || !N0.hasOneUse())
55616 return SDValue();
55617
55618 SDValue CMovOp0 = N0.getOperand(0);
55619 SDValue CMovOp1 = N0.getOperand(1);
55620
55621 // Make sure both operands are constants.
55622 if (!isa<ConstantSDNode>(CMovOp0.getNode()) ||
55623 !isa<ConstantSDNode>(CMovOp1.getNode()))
55624 return SDValue();
55625
55626 SDLoc DL(N);
55627
55628 // If we looked through an any_extend/trunc above, add one to the constants.
55629 if (IntermediateBitwidthOp) {
55630 unsigned IntermediateOpc = IntermediateBitwidthOp.getOpcode();
55631 CMovOp0 = DAG.getNode(IntermediateOpc, DL, DstVT, CMovOp0);
55632 CMovOp1 = DAG.getNode(IntermediateOpc, DL, DstVT, CMovOp1);
55633 }
55634
55635 CMovOp0 = DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, DstVT, CMovOp0, N1);
55636 CMovOp1 = DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, DstVT, CMovOp1, N1);
55637
55638 EVT CMovVT = DstVT;
55639 // We do not want i16 CMOV's. Promote to i32 and truncate afterwards.
55640 if (DstVT == MVT::i16) {
55641 CMovVT = MVT::i32;
55642 CMovOp0 = DAG.getNode(ISD::ZERO_EXTEND, DL, CMovVT, CMovOp0);
55643 CMovOp1 = DAG.getNode(ISD::ZERO_EXTEND, DL, CMovVT, CMovOp1);
55644 }
55645
55646 SDValue CMov = DAG.getNode(X86ISD::CMOV, DL, CMovVT, CMovOp0, CMovOp1,
55647 N0.getOperand(2), N0.getOperand(3));
55648
55649 if (CMovVT != DstVT)
55650 CMov = DAG.getNode(ISD::TRUNCATE, DL, DstVT, CMov);
55651
55652 return CMov;
55653}
55654
55655 static SDValue combineSignExtendInReg(SDNode *N, SelectionDAG &DAG,
55656                                       const X86Subtarget &Subtarget) {
55657 assert(N->getOpcode() == ISD::SIGN_EXTEND_INREG);
55658
55659 if (SDValue V = combineSextInRegCmov(N, DAG))
55660 return V;
55661
55662 EVT VT = N->getValueType(0);
55663 SDValue N0 = N->getOperand(0);
55664 SDValue N1 = N->getOperand(1);
55665 EVT ExtraVT = cast<VTSDNode>(N1)->getVT();
55666 SDLoc dl(N);
55667
55668   // The SIGN_EXTEND_INREG to v4i64 is an expensive operation on
55669   // both SSE and AVX2, since there is no sign-extended shift right
55670   // operation on a vector with 64-bit elements.
55671   // (sext_in_reg (v4i64 anyext (v4i32 x )), ExtraVT) ->
55672   //   (v4i64 sext (v4i32 sext_in_reg (v4i32 x , ExtraVT)))
55673 if (VT == MVT::v4i64 && (N0.getOpcode() == ISD::ANY_EXTEND ||
55674 N0.getOpcode() == ISD::SIGN_EXTEND)) {
55675 SDValue N00 = N0.getOperand(0);
55676
55677 // EXTLOAD has a better solution on AVX2,
55678 // it may be replaced with X86ISD::VSEXT node.
55679 if (N00.getOpcode() == ISD::LOAD && Subtarget.hasInt256())
55680 if (!ISD::isNormalLoad(N00.getNode()))
55681 return SDValue();
55682
55683 // Attempt to promote any comparison mask ops before moving the
55684 // SIGN_EXTEND_INREG in the way.
55685 if (SDValue Promote = PromoteMaskArithmetic(N0, dl, DAG, Subtarget))
55686 return DAG.getNode(ISD::SIGN_EXTEND_INREG, dl, VT, Promote, N1);
55687
55688 if (N00.getValueType() == MVT::v4i32 && ExtraVT.getSizeInBits() < 128) {
55689 SDValue Tmp =
55690 DAG.getNode(ISD::SIGN_EXTEND_INREG, dl, MVT::v4i32, N00, N1);
55691 return DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v4i64, Tmp);
55692 }
55693 }
55694 return SDValue();
55695}
55696
55697/// sext(add_nsw(x, C)) --> add(sext(x), C_sext)
55698/// zext(add_nuw(x, C)) --> add(zext(x), C_zext)
55699/// Promoting a sign/zero extension ahead of a no overflow 'add' exposes
55700/// opportunities to combine math ops, use an LEA, or use a complex addressing
55701/// mode. This can eliminate extend, add, and shift instructions.
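/// For example, with i32 x: 'sext i64 (add nsw x, 4)' becomes
/// 'add nsw (sext i64 x), 4', and a user such as 'shl ..., 3' can then fold
/// the whole expression into one LEA using a scaled index and displacement.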
55702 static SDValue promoteExtBeforeAdd(SDNode *Ext, SelectionDAG &DAG,
55703                                    const X86Subtarget &Subtarget) {
55704 if (Ext->getOpcode() != ISD::SIGN_EXTEND &&
55705 Ext->getOpcode() != ISD::ZERO_EXTEND)
55706 return SDValue();
55707
55708 // TODO: This should be valid for other integer types.
55709 EVT VT = Ext->getValueType(0);
55710 if (VT != MVT::i64)
55711 return SDValue();
55712
55713 SDValue Add = Ext->getOperand(0);
55714 if (Add.getOpcode() != ISD::ADD)
55715 return SDValue();
55716
55717 SDValue AddOp0 = Add.getOperand(0);
55718 SDValue AddOp1 = Add.getOperand(1);
55719 bool Sext = Ext->getOpcode() == ISD::SIGN_EXTEND;
55720 bool NSW = Add->getFlags().hasNoSignedWrap();
55721 bool NUW = Add->getFlags().hasNoUnsignedWrap();
55722 NSW = NSW || (Sext && DAG.willNotOverflowAdd(true, AddOp0, AddOp1));
55723 NUW = NUW || (!Sext && DAG.willNotOverflowAdd(false, AddOp0, AddOp1));
55724
55725 // We need an 'add nsw' feeding into the 'sext' or 'add nuw' feeding
55726 // into the 'zext'
55727 if ((Sext && !NSW) || (!Sext && !NUW))
55728 return SDValue();
55729
55730 // Having a constant operand to the 'add' ensures that we are not increasing
55731 // the instruction count because the constant is extended for free below.
55732 // A constant operand can also become the displacement field of an LEA.
55733 auto *AddOp1C = dyn_cast<ConstantSDNode>(AddOp1);
55734 if (!AddOp1C)
55735 return SDValue();
55736
55737 // Don't make the 'add' bigger if there's no hope of combining it with some
55738 // other 'add' or 'shl' instruction.
55739 // TODO: It may be profitable to generate simpler LEA instructions in place
55740 // of single 'add' instructions, but the cost model for selecting an LEA
55741 // currently has a high threshold.
55742 bool HasLEAPotential = false;
55743 for (auto *User : Ext->users()) {
55744 if (User->getOpcode() == ISD::ADD || User->getOpcode() == ISD::SHL) {
55745 HasLEAPotential = true;
55746 break;
55747 }
55748 }
55749 if (!HasLEAPotential)
55750 return SDValue();
55751
55752 // Everything looks good, so pull the '{s|z}ext' ahead of the 'add'.
55753 int64_t AddC = Sext ? AddOp1C->getSExtValue() : AddOp1C->getZExtValue();
55754 SDValue NewExt = DAG.getNode(Ext->getOpcode(), SDLoc(Ext), VT, AddOp0);
55755 SDValue NewConstant = DAG.getConstant(AddC, SDLoc(Add), VT);
55756
55757 // The wider add is guaranteed to not wrap because both operands are
55758 // sign-extended.
55759 SDNodeFlags Flags;
55760 Flags.setNoSignedWrap(NSW);
55761 Flags.setNoUnsignedWrap(NUW);
55762 return DAG.getNode(ISD::ADD, SDLoc(Add), VT, NewExt, NewConstant, Flags);
55763}
55764
55765// If we face {ANY,SIGN,ZERO}_EXTEND that is applied to a CMOV with constant
55766// operands and the result of CMOV is not used anywhere else - promote CMOV
55767// itself instead of promoting its result. This could be beneficial, because:
55768// 1) X86TargetLowering::EmitLoweredSelect later can do merging of two
55769// (or more) pseudo-CMOVs only when they go one-after-another and
55770// getting rid of result extension code after CMOV will help that.
55771// 2) Promotion of constant CMOV arguments is free, hence the
55772// {ANY,SIGN,ZERO}_EXTEND will just be deleted.
55773// 3) 16-bit CMOV encoding is 4 bytes, 32-bit CMOV is 3-byte, so this
55774// promotion is also good in terms of code-size.
55775// (64-bit CMOV is 4-bytes, that's why we don't do 32-bit => 64-bit
55776// promotion).
55777 static SDValue combineToExtendCMOV(SDNode *Extend, SelectionDAG &DAG) {
55778   SDValue CMovN = Extend->getOperand(0);
55779 if (CMovN.getOpcode() != X86ISD::CMOV || !CMovN.hasOneUse())
55780 return SDValue();
55781
55782 EVT TargetVT = Extend->getValueType(0);
55783 unsigned ExtendOpcode = Extend->getOpcode();
55784 SDLoc DL(Extend);
55785
55786 EVT VT = CMovN.getValueType();
55787 SDValue CMovOp0 = CMovN.getOperand(0);
55788 SDValue CMovOp1 = CMovN.getOperand(1);
55789
55790 if (!isa<ConstantSDNode>(CMovOp0.getNode()) ||
55791 !isa<ConstantSDNode>(CMovOp1.getNode()))
55792 return SDValue();
55793
55794 // Only extend to i32 or i64.
55795 if (TargetVT != MVT::i32 && TargetVT != MVT::i64)
55796 return SDValue();
55797
55798   // Only extend from i16 unless it's a sign_extend from i32. Zext/aext from i32
55799 // are free.
55800 if (VT != MVT::i16 && !(ExtendOpcode == ISD::SIGN_EXTEND && VT == MVT::i32))
55801 return SDValue();
55802
55803   // If this is a zero extend to i64, we should only extend to i32 and use a free
55804 // zero extend to finish.
55805 EVT ExtendVT = TargetVT;
55806 if (TargetVT == MVT::i64 && ExtendOpcode != ISD::SIGN_EXTEND)
55807 ExtendVT = MVT::i32;
55808
55809 CMovOp0 = DAG.getNode(ExtendOpcode, DL, ExtendVT, CMovOp0);
55810 CMovOp1 = DAG.getNode(ExtendOpcode, DL, ExtendVT, CMovOp1);
55811
55812 SDValue Res = DAG.getNode(X86ISD::CMOV, DL, ExtendVT, CMovOp0, CMovOp1,
55813 CMovN.getOperand(2), CMovN.getOperand(3));
55814
55815 // Finish extending if needed.
55816 if (ExtendVT != TargetVT)
55817 Res = DAG.getNode(ExtendOpcode, DL, TargetVT, Res);
55818
55819 return Res;
55820}
55821
55822// Attempt to combine a (sext/zext (setcc)) to a setcc with a xmm/ymm/zmm
55823// result type.
55824 static SDValue combineExtSetcc(SDNode *N, SelectionDAG &DAG,
55825                                const X86Subtarget &Subtarget) {
55826 SDValue N0 = N->getOperand(0);
55827 EVT VT = N->getValueType(0);
55828 SDLoc dl(N);
55829
55830 // Only do this combine with AVX512 for vector extends.
55831 if (!Subtarget.hasAVX512() || !VT.isVector() || N0.getOpcode() != ISD::SETCC)
55832 return SDValue();
55833
55834 // Only combine legal element types.
55835 EVT SVT = VT.getVectorElementType();
55836 if (SVT != MVT::i8 && SVT != MVT::i16 && SVT != MVT::i32 &&
55837 SVT != MVT::i64 && SVT != MVT::f32 && SVT != MVT::f64)
55838 return SDValue();
55839
55840   // We don't have a CMPP instruction for vXf16.
55841 if (N0.getOperand(0).getValueType().getVectorElementType() == MVT::f16)
55842 return SDValue();
55843   // We can only do this if the vector size is 256 bits or less.
55844 unsigned Size = VT.getSizeInBits();
55845 if (Size > 256 && Subtarget.useAVX512Regs())
55846 return SDValue();
55847
55848 EVT N00VT = N0.getOperand(0).getValueType();
55849
55850 // Don't fold if the condition code can't be handled by PCMPEQ/PCMPGT since
55851   // those are the only integer compares we have.
55852   ISD::CondCode CC = cast<CondCodeSDNode>(N0.getOperand(2))->get();
55853   if (N00VT.isInteger() && ISD::isUnsignedIntSetCC(CC))
55854 return SDValue();
55855
55856 // Only do this combine if the extension will be fully consumed by the setcc.
55857 EVT MatchingVecType = N00VT.changeVectorElementTypeToInteger();
55858 if (Size != MatchingVecType.getSizeInBits())
55859 return SDValue();
55860
55861 SDValue Res = DAG.getSetCC(dl, VT, N0.getOperand(0), N0.getOperand(1), CC);
55862
55863 if (N->getOpcode() == ISD::ZERO_EXTEND)
55864 Res = DAG.getZeroExtendInReg(Res, dl, N0.getValueType());
55865
55866 return Res;
55867}
55868
55869 static SDValue combineSext(SDNode *N, SelectionDAG &DAG,
55870                            TargetLowering::DAGCombinerInfo &DCI,
55871                            const X86Subtarget &Subtarget) {
55872 SDValue N0 = N->getOperand(0);
55873 EVT VT = N->getValueType(0);
55874 SDLoc DL(N);
55875
55876 // (i32 (sext (i8 (x86isd::setcc_carry)))) -> (i32 (x86isd::setcc_carry))
55877 if (!DCI.isBeforeLegalizeOps() &&
55878       N0.getOpcode() == X86ISD::SETCC_CARRY) {
55879     SDValue Setcc = DAG.getNode(X86ISD::SETCC_CARRY, DL, VT, N0->getOperand(0),
55880 N0->getOperand(1));
55881 bool ReplaceOtherUses = !N0.hasOneUse();
55882 DCI.CombineTo(N, Setcc);
55883 // Replace other uses with a truncate of the widened setcc_carry.
55884 if (ReplaceOtherUses) {
55885 SDValue Trunc = DAG.getNode(ISD::TRUNCATE, SDLoc(N0),
55886 N0.getValueType(), Setcc);
55887 DCI.CombineTo(N0.getNode(), Trunc);
55888 }
55889
55890 return SDValue(N, 0);
55891 }
55892
55893 if (SDValue NewCMov = combineToExtendCMOV(N, DAG))
55894 return NewCMov;
55895
55896 if (!DCI.isBeforeLegalizeOps())
55897 return SDValue();
55898
55899 if (SDValue V = combineExtSetcc(N, DAG, Subtarget))
55900 return V;
55901
55902 if (SDValue V = combineToExtendBoolVectorInReg(N->getOpcode(), DL, VT, N0,
55903 DAG, DCI, Subtarget))
55904 return V;
55905
55906 if (VT.isVector()) {
55907 if (SDValue R = PromoteMaskArithmetic(SDValue(N, 0), DL, DAG, Subtarget))
55908 return R;
55909
55911 return DAG.getNode(N0.getOpcode(), DL, VT, N0.getOperand(0));
55912 }
55913
55914 if (SDValue NewAdd = promoteExtBeforeAdd(N, DAG, Subtarget))
55915 return NewAdd;
55916
55917 return SDValue();
55918}
55919
55920// Inverting a constant vector is profitable if it can be eliminated and the
55921// inverted vector is already present in DAG. Otherwise, it will be loaded
55922// anyway.
55923//
55924// We determine which of the values can be completely eliminated and invert it.
55925// If both are eliminable, select a vector with the first negative element.
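// For example, if (+2.0, -3.0) is used only by FMAs but (-2.0, +3.0) also has
// other users (so it will be materialized anyway), negate the FMA operand and
// reuse (-2.0, +3.0), allowing the original constant to be dropped.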
55926 static SDValue getInvertedVectorForFMA(SDValue V, SelectionDAG &DAG) {
55927   assert(ISD::isBuildVectorOfConstantFPSDNodes(V.getNode()) &&
55928          "ConstantFP build vector expected");
55929   // Check if we can eliminate V. We assume that if a value is only used in
55930   // FMAs, we can eliminate it, since this function is invoked for each FMA
55931   // with this vector.
55932 auto IsNotFMA = [](SDNode *User) {
55933 return User->getOpcode() != ISD::FMA &&
55934 User->getOpcode() != ISD::STRICT_FMA;
55935 };
55936 if (llvm::any_of(V->users(), IsNotFMA))
55937 return SDValue();
55938
55939   SmallVector<SDValue, 8> Ops;
55940   EVT VT = V.getValueType();
55941 EVT EltVT = VT.getVectorElementType();
55942 for (const SDValue &Op : V->op_values()) {
55943 if (auto *Cst = dyn_cast<ConstantFPSDNode>(Op)) {
55944 Ops.push_back(DAG.getConstantFP(-Cst->getValueAPF(), SDLoc(Op), EltVT));
55945 } else {
55946 assert(Op.isUndef());
55947 Ops.push_back(DAG.getUNDEF(EltVT));
55948 }
55949 }
55950
55951   SDNode *NV = DAG.getNodeIfExists(ISD::BUILD_VECTOR, DAG.getVTList(VT), Ops);
55952   if (!NV)
55953 return SDValue();
55954
55955 // If an inverted version cannot be eliminated, choose it instead of the
55956 // original version.
55957 if (llvm::any_of(NV->users(), IsNotFMA))
55958 return SDValue(NV, 0);
55959
55960 // If the inverted version also can be eliminated, we have to consistently
55961 // prefer one of the values. We prefer a constant with a negative value on
55962 // the first place.
55963 // N.B. We need to skip undefs that may precede a value.
55964 for (const SDValue &Op : V->op_values()) {
55965 if (auto *Cst = dyn_cast<ConstantFPSDNode>(Op)) {
55966 if (Cst->isNegative())
55967 return SDValue();
55968 break;
55969 }
55970 }
55971 return SDValue(NV, 0);
55972}
55973
55974 static SDValue combineFMA(SDNode *N, SelectionDAG &DAG,
55975                           TargetLowering::DAGCombinerInfo &DCI,
55976                           const X86Subtarget &Subtarget) {
55977 SDLoc dl(N);
55978 EVT VT = N->getValueType(0);
55979   const SelectionDAGTargetInfo &TSI = DAG.getSelectionDAGInfo();
55980   bool IsStrict = N->isTargetOpcode()
55981 ? TSI.isTargetStrictFPOpcode(N->getOpcode())
55982 : N->isStrictFPOpcode();
55983
55984 // Let legalize expand this if it isn't a legal type yet.
55985 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
55986 if (!TLI.isTypeLegal(VT))
55987 return SDValue();
55988
55989 SDValue A = N->getOperand(IsStrict ? 1 : 0);
55990 SDValue B = N->getOperand(IsStrict ? 2 : 1);
55991 SDValue C = N->getOperand(IsStrict ? 3 : 2);
55992
55993 // If the operation allows fast-math and the target does not support FMA,
55994 // split this into mul+add to avoid libcall(s).
55995 SDNodeFlags Flags = N->getFlags();
55996 if (!IsStrict && Flags.hasAllowReassociation() &&
55997 TLI.isOperationExpand(ISD::FMA, VT)) {
55998 SDValue Fmul = DAG.getNode(ISD::FMUL, dl, VT, A, B, Flags);
55999 return DAG.getNode(ISD::FADD, dl, VT, Fmul, C, Flags);
56000 }
56001
56002 EVT ScalarVT = VT.getScalarType();
56003 if (((ScalarVT != MVT::f32 && ScalarVT != MVT::f64) ||
56004 !Subtarget.hasAnyFMA()) &&
56005 !(ScalarVT == MVT::f16 && Subtarget.hasFP16()) &&
56006 !(ScalarVT == MVT::bf16 && Subtarget.hasAVX10_2()))
56007 return SDValue();
56008
56009 auto invertIfNegative = [&DAG, &TLI, &DCI](SDValue &V) {
56010     bool CodeSize = DAG.getMachineFunction().getFunction().hasOptSize();
56011     bool LegalOperations = !DCI.isBeforeLegalizeOps();
56012 if (SDValue NegV = TLI.getCheaperNegatedExpression(V, DAG, LegalOperations,
56013 CodeSize)) {
56014 V = NegV;
56015 return true;
56016 }
56017 // Look through extract_vector_elts. If it comes from an FNEG, create a
56018 // new extract from the FNEG input.
56019 if (V.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
56020 isNullConstant(V.getOperand(1))) {
56021 SDValue Vec = V.getOperand(0);
56022 if (SDValue NegV = TLI.getCheaperNegatedExpression(
56023 Vec, DAG, LegalOperations, CodeSize)) {
56024 V = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SDLoc(V), V.getValueType(),
56025 NegV, V.getOperand(1));
56026 return true;
56027 }
56028 }
56029 // Lookup if there is an inverted version of constant vector V in DAG.
56030 if (ISD::isBuildVectorOfConstantFPSDNodes(V.getNode())) {
56031 if (SDValue NegV = getInvertedVectorForFMA(V, DAG)) {
56032 V = NegV;
56033 return true;
56034 }
56035 }
56036 return false;
56037 };
56038
56039 // Do not convert the passthru input of scalar intrinsics.
56040 // FIXME: We could allow negations of the lower element only.
56041 bool NegA = invertIfNegative(A);
56042 // Create a dummy use for A so that in the process of negating B or C
56043 // recursively, it is not deleted.
56044 HandleSDNode NegAHandle(A);
56045 bool NegB = invertIfNegative(B);
56046 // Similar to A, get a handle on B.
56047 HandleSDNode NegBHandle(B);
56048 bool NegC = invertIfNegative(C);
56049
56050 if (!NegA && !NegB && !NegC)
56051 return SDValue();
56052
56053 unsigned NewOpcode =
56054 negateFMAOpcode(N->getOpcode(), NegA != NegB, NegC, false);
56055
56056 // Propagate fast-math-flags to new FMA node.
56057 SelectionDAG::FlagInserter FlagsInserter(DAG, Flags);
56058 if (IsStrict) {
56059 assert(N->getNumOperands() == 4 && "Shouldn't be greater than 4");
56060 return DAG.getNode(NewOpcode, dl, {VT, MVT::Other},
56061 {N->getOperand(0), A, B, C});
56062 } else {
56063 if (N->getNumOperands() == 4)
56064 return DAG.getNode(NewOpcode, dl, VT, A, B, C, N->getOperand(3));
56065 return DAG.getNode(NewOpcode, dl, VT, A, B, C);
56066 }
56067}
56068
56069// Combine FMADDSUB(A, B, FNEG(C)) -> FMSUBADD(A, B, C)
56070// Combine FMSUBADD(A, B, FNEG(C)) -> FMADDSUB(A, B, C)
56071 static SDValue combineFMADDSUB(SDNode *N, SelectionDAG &DAG,
56072                                TargetLowering::DAGCombinerInfo &DCI) {
56073   SDLoc dl(N);
56074 EVT VT = N->getValueType(0);
56075 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
56076   bool CodeSize = DAG.getMachineFunction().getFunction().hasOptSize();
56077   bool LegalOperations = !DCI.isBeforeLegalizeOps();
56078
56079 SDValue N2 = N->getOperand(2);
56080
56081 SDValue NegN2 =
56082 TLI.getCheaperNegatedExpression(N2, DAG, LegalOperations, CodeSize);
56083 if (!NegN2)
56084 return SDValue();
56085 unsigned NewOpcode = negateFMAOpcode(N->getOpcode(), false, true, false);
56086
56087 if (N->getNumOperands() == 4)
56088 return DAG.getNode(NewOpcode, dl, VT, N->getOperand(0), N->getOperand(1),
56089 NegN2, N->getOperand(3));
56090 return DAG.getNode(NewOpcode, dl, VT, N->getOperand(0), N->getOperand(1),
56091 NegN2);
56092}
56093
56094// Try to widen the build vector and bitcast it to the type of zext.
56095// This is a special case for the 128-bit vector types. Intention is to remove
56096 // the zext and replace it with a bitcast to the wider type. While lowering,
56097 // the bitcast is removed and the extra shuffling due to the zext is avoided.
56098 // For example:
56099 // zext v4i16 (v4i8 build_vector (x, y, z, w)) ->
56100 //   bitcast v4i16 (v8i8 build_vector (x, 0, y, 0, z, 0, w, 0))
56101 static SDValue widenBuildVec(SDNode *Extend, SelectionDAG &DAG) {
56102
56103 if (Extend->getOpcode() != ISD::ZERO_EXTEND)
56104 return SDValue();
56105
56106 EVT ExtendVT = Extend->getValueType(0);
56107
56108 SDValue BV = Extend->getOperand(0);
56109 if (BV.getOpcode() != ISD::BUILD_VECTOR || !BV.hasOneUse())
56110 return SDValue();
56111
56112 if (any_of(BV->op_values(), [](SDValue Op) { return Op.isUndef(); })) {
56113 // If the build vector has undef elements, we cannot widen it.
56114 // The widening would create a vector with more undef elements, which
56115 // is not valid.
56116 return SDValue();
56117 }
56118
56119 if (!all_of(BV->op_values(),
56120 [](SDValue Op) { return Op.getOpcode() == ISD::LOAD; })) {
56121     // If the build vector has any element other than an ISD::LOAD, we cannot
56122     // widen it.
56123 return SDValue();
56124 }
56125
56126 SDLoc dl(BV);
56127 EVT VT = BV.getValueType();
56128 EVT EltVT = BV.getOperand(0).getValueType();
56129 unsigned NumElts = VT.getVectorNumElements();
56130
56131 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
56132
56133 if (TLI.getTypeAction(*DAG.getContext(), VT) !=
56134       TargetLowering::TypeWidenVector)
56135     return SDValue();
56136
56137 EVT WidenVT = TLI.getTypeToTransformTo(*DAG.getContext(), VT);
56138 unsigned WidenNumElts = WidenVT.getVectorNumElements();
56139
56140 SmallVector<SDValue, 16> NewOps(BV->op_begin(), BV->op_end());
56141 assert(WidenNumElts >= NumElts && "Shrinking vector instead of widening!");
56142 // Fill the new elements with Zero.
56143 NewOps.append(WidenNumElts - NumElts, DAG.getConstant(0, dl, EltVT));
56144 // Compute the step to place the elements in the right place and control the
56145 // iteration.
56146 unsigned step = WidenNumElts / NumElts;
56147 if (WidenVT.is128BitVector()) {
56148 if (step > 1 && Extend->getValueSizeInBits(0) == WidenVT.getSizeInBits()) {
56149 for (int i = NumElts - 1, j = WidenNumElts - step; i > 0;
56150 i--, j -= step) {
56151 SDValue temp = NewOps[i];
56152 NewOps[i] = NewOps[j];
56153 NewOps[j] = temp;
56154 }
56155 // Create new build vector with WidenVT and NewOps
56156 SDValue NewBV = DAG.getBuildVector(WidenVT, dl, NewOps);
56157 // Replace the old build vector with the new one. Bitcast the
56158 // new build vector to the type of the zext.
56159 SDValue NewBVBitcast = DAG.getBitcast(ExtendVT, NewBV);
56160 DAG.ReplaceAllUsesOfValueWith(SDValue(Extend, 0), NewBVBitcast);
56161 return NewBV;
56162 }
56163 }
56164 return SDValue();
56165}
56166
56167 static SDValue combineZext(SDNode *N, SelectionDAG &DAG,
56168                            TargetLowering::DAGCombinerInfo &DCI,
56169                            const X86Subtarget &Subtarget) {
56170 SDLoc dl(N);
56171 SDValue N0 = N->getOperand(0);
56172 EVT VT = N->getValueType(0);
56173
56174 // (i32 (aext (i8 (x86isd::setcc_carry)))) -> (i32 (x86isd::setcc_carry))
56175 // FIXME: Is this needed? We don't seem to have any tests for it.
56176 if (!DCI.isBeforeLegalizeOps() && N->getOpcode() == ISD::ANY_EXTEND &&
56177       N0.getOpcode() == X86ISD::SETCC_CARRY) {
56178     SDValue Setcc = DAG.getNode(X86ISD::SETCC_CARRY, dl, VT, N0->getOperand(0),
56179 N0->getOperand(1));
56180 bool ReplaceOtherUses = !N0.hasOneUse();
56181 DCI.CombineTo(N, Setcc);
56182 // Replace other uses with a truncate of the widened setcc_carry.
56183 if (ReplaceOtherUses) {
56184 SDValue Trunc = DAG.getNode(ISD::TRUNCATE, SDLoc(N0),
56185 N0.getValueType(), Setcc);
56186 DCI.CombineTo(N0.getNode(), Trunc);
56187 }
56188
56189 return SDValue(N, 0);
56190 }
56191
56192 if (SDValue NewCMov = combineToExtendCMOV(N, DAG))
56193 return NewCMov;
56194
56195 if (DCI.isBeforeLegalizeOps())
56196 if (SDValue V = combineExtSetcc(N, DAG, Subtarget))
56197 return V;
56198
56199 if (SDValue V = combineToExtendBoolVectorInReg(N->getOpcode(), dl, VT, N0,
56200 DAG, DCI, Subtarget))
56201 return V;
56202
56203 if (VT.isVector())
56204 if (SDValue R = PromoteMaskArithmetic(SDValue(N, 0), dl, DAG, Subtarget))
56205 return R;
56206
56207 if (SDValue NewAdd = promoteExtBeforeAdd(N, DAG, Subtarget))
56208 return NewAdd;
56209
56210 if (SDValue R = combineOrCmpEqZeroToCtlzSrl(N, DAG, DCI, Subtarget))
56211 return R;
56212
56213 // TODO: Combine with any target/faux shuffle.
56214 if (N0.getOpcode() == X86ISD::PACKUS && N0.getValueSizeInBits() == 128 &&
56215       VT.getScalarSizeInBits() == N0.getOperand(0).getScalarValueSizeInBits()) {
56216     SDValue N00 = N0.getOperand(0);
56217 SDValue N01 = N0.getOperand(1);
56218 unsigned NumSrcEltBits = N00.getScalarValueSizeInBits();
56219 APInt ZeroMask = APInt::getHighBitsSet(NumSrcEltBits, NumSrcEltBits / 2);
56220 if ((N00.isUndef() || DAG.MaskedValueIsZero(N00, ZeroMask)) &&
56221 (N01.isUndef() || DAG.MaskedValueIsZero(N01, ZeroMask))) {
56222 return concatSubVectors(N00, N01, DAG, dl);
56223 }
56224 }
56225
56226 if (SDValue V = widenBuildVec(N, DAG))
56227 return V;
56228
56229 return SDValue();
56230}
56231
56232/// If we have AVX512, but not BWI and this is a vXi16/vXi8 setcc, just
56233/// pre-promote its result type since vXi1 vectors don't get promoted
56234/// during type legalization.
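/// For example, without BWI a v32i8 compare is emitted as a v32i8 setcc and
/// then truncated to v32i1, instead of letting type legalization promote the
/// v32i1 result.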
56235 static SDValue truncateAVX512SetCCNoBWI(EVT VT, EVT OpVT, SDValue LHS,
56236                                         SDValue RHS, ISD::CondCode CC,
56237                                         const SDLoc &DL, SelectionDAG &DAG,
56238 const X86Subtarget &Subtarget) {
56239 if (Subtarget.hasAVX512() && !Subtarget.hasBWI() && VT.isVector() &&
56240 VT.getVectorElementType() == MVT::i1 &&
56241 (OpVT.getVectorElementType() == MVT::i8 ||
56242 OpVT.getVectorElementType() == MVT::i16)) {
56243 SDValue Setcc = DAG.getSetCC(DL, OpVT, LHS, RHS, CC);
56244 return DAG.getNode(ISD::TRUNCATE, DL, VT, Setcc);
56245 }
56246 return SDValue();
56247}
56248
56249// The pattern (setcc (and (broadcast x), (2^n, 2^{n+1}, ...)), (0, 0, ...),
56250// eq/ne) is generated when using an integer as a mask. Instead of generating a
56251// broadcast + vptest, we can directly move the integer to a mask register.
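// For example, with the constant vector (1, 2, 4, ..., 128), lane i of
// (and (broadcast x), mask) is nonzero exactly when bit i of x is set, so the
// whole compare reduces to moving the low bits of x into a mask register.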
56252 static SDValue combineAVX512SetCCToKMOV(EVT VT, SDValue Op0, ISD::CondCode CC,
56253                                         const SDLoc &DL, SelectionDAG &DAG,
56254 const X86Subtarget &Subtarget) {
56255 if (CC != ISD::SETNE && CC != ISD::SETEQ)
56256 return SDValue();
56257
56258 if (!Subtarget.hasAVX512())
56259 return SDValue();
56260
56261 if (Op0.getOpcode() != ISD::AND)
56262 return SDValue();
56263
56264 SDValue Broadcast = Op0.getOperand(0);
56265 if (Broadcast.getOpcode() != X86ISD::VBROADCAST &&
56266 Broadcast.getOpcode() != X86ISD::VBROADCAST_LOAD)
56267 return SDValue();
56268
56269 SDValue Load = Op0.getOperand(1);
56270 EVT LoadVT = Load.getSimpleValueType();
56271
56272 APInt UndefElts;
56273 SmallVector<APInt, 32> EltBits;
56274   if (!getTargetConstantBitsFromNode(Load, LoadVT.getScalarSizeInBits(),
56275                                      UndefElts, EltBits,
56276 /*AllowWholeUndefs*/ true,
56277 /*AllowPartialUndefs*/ false) ||
56278 UndefElts[0] || !EltBits[0].isPowerOf2() || UndefElts.getBitWidth() > 16)
56279 return SDValue();
56280
56281 // Check if the constant pool contains only powers of 2 starting from some
56282 // 2^N. The table may also contain undefs because of widening of vector
56283 // operands.
56284 unsigned N = EltBits[0].logBase2();
56285 unsigned Len = UndefElts.getBitWidth();
56286 for (unsigned I = 1; I != Len; ++I) {
56287 if (UndefElts[I]) {
56288 if (!UndefElts.extractBits(Len - (I + 1), I + 1).isAllOnes())
56289 return SDValue();
56290 break;
56291 }
56292
56293 if (EltBits[I].getBitWidth() <= N + I || !EltBits[I].isOneBitSet(N + I))
56294 return SDValue();
56295 }
56296
56297 MVT BroadcastOpVT = Broadcast.getSimpleValueType().getVectorElementType();
56298 SDValue BroadcastOp;
56299 if (Broadcast.getOpcode() != X86ISD::VBROADCAST) {
56300 BroadcastOp = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, BroadcastOpVT,
56301 Broadcast, DAG.getVectorIdxConstant(0, DL));
56302 } else {
56303 BroadcastOp = Broadcast.getOperand(0);
56304 if (BroadcastOp.getValueType().isVector())
56305 return SDValue();
56306 }
56307
56308 SDValue Masked = BroadcastOp;
56309 if (N != 0) {
56310 unsigned BroadcastOpBitWidth = BroadcastOpVT.getSizeInBits();
56311 unsigned NumDefinedElts = UndefElts.countTrailingZeros();
56312
56313 if (NumDefinedElts > BroadcastOpBitWidth)
56314 return SDValue();
56315
56316 APInt Mask = APInt::getLowBitsSet(BroadcastOpBitWidth, NumDefinedElts);
56317 SDValue ShiftedValue = DAG.getNode(ISD::SRL, DL, BroadcastOpVT, BroadcastOp,
56318 DAG.getConstant(N, DL, BroadcastOpVT));
56319 Masked = DAG.getNode(ISD::AND, DL, BroadcastOpVT, ShiftedValue,
56320 DAG.getConstant(Mask, DL, BroadcastOpVT));
56321 }
56322 // We can't extract more than 16 bits using this pattern, because 2^{17} will
56323 // not fit in an i16 and a vXi32 where X > 16 is more than 512 bits.
56324 SDValue Trunc = DAG.getAnyExtOrTrunc(Masked, DL, MVT::i16);
56325 SDValue Bitcast = DAG.getNode(ISD::BITCAST, DL, MVT::v16i1, Trunc);
56326
56327 if (CC == ISD::SETEQ)
56328 Bitcast = DAG.getNOT(DL, Bitcast, MVT::v16i1);
56329
56330 if (VT != MVT::v16i1)
56331 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Bitcast,
56332 DAG.getVectorIdxConstant(0, DL));
56333
56334 return Bitcast;
56335}
56336
56337 static SDValue combineSetCC(SDNode *N, SelectionDAG &DAG,
56338                             TargetLowering::DAGCombinerInfo &DCI,
56339                             const X86Subtarget &Subtarget) {
56340 const ISD::CondCode CC = cast<CondCodeSDNode>(N->getOperand(2))->get();
56341 const SDValue LHS = N->getOperand(0);
56342 const SDValue RHS = N->getOperand(1);
56343 EVT VT = N->getValueType(0);
56344 EVT OpVT = LHS.getValueType();
56345 SDLoc DL(N);
56346
56347 if (CC == ISD::SETNE || CC == ISD::SETEQ) {
56348 if (SDValue V = combineVectorSizedSetCCEquality(VT, LHS, RHS, CC, DL, DAG,
56349 Subtarget))
56350 return V;
56351 }
56352
56353 if (VT == MVT::i1) {
56354 X86::CondCode X86CC;
56355 if (SDValue V =
56356 MatchVectorAllEqualTest(LHS, RHS, CC, DL, Subtarget, DAG, X86CC))
56357 return DAG.getNode(ISD::TRUNCATE, DL, VT, getSETCC(X86CC, V, DL, DAG));
56358 }
56359
56360 if (CC == ISD::SETNE || CC == ISD::SETEQ) {
56361 if (OpVT.isScalarInteger()) {
56362 // cmpeq(or(X,Y),X) --> cmpeq(and(~X,Y),0)
56363 // cmpne(or(X,Y),X) --> cmpne(and(~X,Y),0)
56364 auto MatchOrCmpEq = [&](SDValue N0, SDValue N1) {
56365 if (N0.getOpcode() == ISD::OR && N0->hasOneUse()) {
56366 if (N0.getOperand(0) == N1)
56367 return DAG.getNode(ISD::AND, DL, OpVT, DAG.getNOT(DL, N1, OpVT),
56368 N0.getOperand(1));
56369 if (N0.getOperand(1) == N1)
56370 return DAG.getNode(ISD::AND, DL, OpVT, DAG.getNOT(DL, N1, OpVT),
56371 N0.getOperand(0));
56372 }
56373 return SDValue();
56374 };
56375 if (SDValue AndN = MatchOrCmpEq(LHS, RHS))
56376 return DAG.getSetCC(DL, VT, AndN, DAG.getConstant(0, DL, OpVT), CC);
56377 if (SDValue AndN = MatchOrCmpEq(RHS, LHS))
56378 return DAG.getSetCC(DL, VT, AndN, DAG.getConstant(0, DL, OpVT), CC);
56379
56380 // cmpeq(and(X,Y),Y) --> cmpeq(and(~X,Y),0)
56381 // cmpne(and(X,Y),Y) --> cmpne(and(~X,Y),0)
56382 auto MatchAndCmpEq = [&](SDValue N0, SDValue N1) {
56383 if (N0.getOpcode() == ISD::AND && N0->hasOneUse()) {
56384 if (N0.getOperand(0) == N1)
56385 return DAG.getNode(ISD::AND, DL, OpVT, N1,
56386 DAG.getNOT(DL, N0.getOperand(1), OpVT));
56387 if (N0.getOperand(1) == N1)
56388 return DAG.getNode(ISD::AND, DL, OpVT, N1,
56389 DAG.getNOT(DL, N0.getOperand(0), OpVT));
56390 }
56391 return SDValue();
56392 };
56393 if (SDValue AndN = MatchAndCmpEq(LHS, RHS))
56394 return DAG.getSetCC(DL, VT, AndN, DAG.getConstant(0, DL, OpVT), CC);
56395 if (SDValue AndN = MatchAndCmpEq(RHS, LHS))
56396 return DAG.getSetCC(DL, VT, AndN, DAG.getConstant(0, DL, OpVT), CC);
56397
56398 // cmpeq(trunc(x),C) --> cmpeq(x,C)
56399 // cmpne(trunc(x),C) --> cmpne(x,C)
56400 // iff x upper bits are zero.
56401 if (LHS.getOpcode() == ISD::TRUNCATE &&
56402           LHS.getOperand(0).getScalarValueSizeInBits() >= 32 &&
56403           isa<ConstantSDNode>(RHS)) {
56404         EVT SrcVT = LHS.getOperand(0).getValueType();
56405         APInt UpperBits = APInt::getBitsSetFrom(SrcVT.getScalarSizeInBits(),
56406                                                 OpVT.getScalarSizeInBits());
56407 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
56408 if (DAG.MaskedValueIsZero(LHS.getOperand(0), UpperBits) &&
56409 TLI.isTypeLegal(LHS.getOperand(0).getValueType()))
56410 return DAG.getSetCC(DL, VT, LHS.getOperand(0),
56411 DAG.getZExtOrTrunc(RHS, DL, SrcVT), CC);
56412 }
56413
56414 // With C as a power of 2 and C != 0 and C != INT_MIN:
56415 // icmp eq Abs(X) C ->
56416 // (icmp eq A, C) | (icmp eq A, -C)
56417 // icmp ne Abs(X) C ->
56418 // (icmp ne A, C) & (icmp ne A, -C)
56419 // Both of these patterns can be better optimized in
56420 // DAGCombiner::foldAndOrOfSETCC. Note this only applies for scalar
56421 // integers which is checked above.
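      // For example, with C == 4: (icmp eq (abs X), 4) becomes
      // (icmp eq X, 4) | (icmp eq X, -4).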
56422 if (LHS.getOpcode() == ISD::ABS && LHS.hasOneUse()) {
56423 if (auto *C = dyn_cast<ConstantSDNode>(RHS)) {
56424 const APInt &CInt = C->getAPIntValue();
56425 // We can better optimize this case in DAGCombiner::foldAndOrOfSETCC.
56426 if (CInt.isPowerOf2() && !CInt.isMinSignedValue()) {
56427 SDValue BaseOp = LHS.getOperand(0);
56428 SDValue SETCC0 = DAG.getSetCC(DL, VT, BaseOp, RHS, CC);
56429 SDValue SETCC1 = DAG.getSetCC(
56430 DL, VT, BaseOp, DAG.getConstant(-CInt, DL, OpVT), CC);
56431 return DAG.getNode(CC == ISD::SETEQ ? ISD::OR : ISD::AND, DL, VT,
56432 SETCC0, SETCC1);
56433 }
56434 }
56435 }
56436 }
56437 }
56438
56439 if (VT.isVector() && VT.getVectorElementType() == MVT::i1 &&
56440 (CC == ISD::SETNE || CC == ISD::SETEQ || ISD::isSignedIntSetCC(CC))) {
56441 // Using temporaries to avoid messing up operand ordering for later
56442 // transformations if this doesn't work.
56443 SDValue Op0 = LHS;
56444 SDValue Op1 = RHS;
56445 ISD::CondCode TmpCC = CC;
56446 // Put build_vector on the right.
56447 if (Op0.getOpcode() == ISD::BUILD_VECTOR) {
56448 std::swap(Op0, Op1);
56449 TmpCC = ISD::getSetCCSwappedOperands(TmpCC);
56450 }
56451
56452 bool IsSEXT0 =
56453 (Op0.getOpcode() == ISD::SIGN_EXTEND) &&
56454 (Op0.getOperand(0).getValueType().getVectorElementType() == MVT::i1);
56455 bool IsVZero1 = ISD::isBuildVectorAllZeros(Op1.getNode());
56456
56457 if (IsSEXT0 && IsVZero1) {
56458 assert(VT == Op0.getOperand(0).getValueType() &&
56459 "Unexpected operand type");
56460 if (TmpCC == ISD::SETGT)
56461 return DAG.getConstant(0, DL, VT);
56462 if (TmpCC == ISD::SETLE)
56463 return DAG.getConstant(1, DL, VT);
56464 if (TmpCC == ISD::SETEQ || TmpCC == ISD::SETGE)
56465 return DAG.getNOT(DL, Op0.getOperand(0), VT);
56466
56467 assert((TmpCC == ISD::SETNE || TmpCC == ISD::SETLT) &&
56468 "Unexpected condition code!");
56469 return Op0.getOperand(0);
56470 }
56471
56472 if (IsVZero1)
56473 if (SDValue V =
56474 combineAVX512SetCCToKMOV(VT, Op0, TmpCC, DL, DAG, Subtarget))
56475 return V;
56476 }
56477
56478   // Try and make an unsigned vector comparison signed. On pre-AVX512 targets
56479   // there are only signed comparisons (`PCMPGT`), and on AVX512 it is often
56480   // better to use `PCMPGT` if the result is meant to stay in a vector (and if
56481   // it is going to a mask, there are signed AVX512 comparisons).
56482 if (VT.isVector() && OpVT.isVector() && OpVT.isInteger()) {
56483 bool CanMakeSigned = false;
56484 if (ISD::isUnsignedIntSetCC(CC)) {
56485 KnownBits CmpKnown =
56486           DAG.computeKnownBits(LHS).intersectWith(DAG.computeKnownBits(RHS));
56487       // If we know LHS/RHS share the same sign bit at each element we can
56488 // make this signed.
56489 // NOTE: `computeKnownBits` on a vector type aggregates common bits
56490 // across all lanes. So a pattern where the sign varies from lane to
56491 // lane, but at each lane Sign(LHS) is known to equal Sign(RHS), will be
56492 // missed. We could get around this by demanding each lane
56493 // independently, but this isn't the most important optimization and
56494 // that may eat into compile time.
56495 CanMakeSigned =
56496 CmpKnown.Zero.isSignBitSet() || CmpKnown.One.isSignBitSet();
56497 }
56498 if (CanMakeSigned || ISD::isSignedIntSetCC(CC)) {
56499 SDValue LHSOut = LHS;
56500 SDValue RHSOut = RHS;
56501 ISD::CondCode NewCC = CC;
56502 switch (CC) {
56503 case ISD::SETGE:
56504 case ISD::SETUGE:
56505 if (SDValue NewLHS = incDecVectorConstant(LHS, DAG, /*IsInc*/ true,
56506 /*NSW*/ true))
56507 LHSOut = NewLHS;
56508 else if (SDValue NewRHS = incDecVectorConstant(
56509 RHS, DAG, /*IsInc*/ false, /*NSW*/ true))
56510 RHSOut = NewRHS;
56511 else
56512 break;
56513
56514 [[fallthrough]];
56515 case ISD::SETUGT:
56516 NewCC = ISD::SETGT;
56517 break;
56518
56519 case ISD::SETLE:
56520 case ISD::SETULE:
56521 if (SDValue NewLHS = incDecVectorConstant(LHS, DAG, /*IsInc*/ false,
56522 /*NSW*/ true))
56523 LHSOut = NewLHS;
56524 else if (SDValue NewRHS = incDecVectorConstant(RHS, DAG, /*IsInc*/ true,
56525 /*NSW*/ true))
56526 RHSOut = NewRHS;
56527 else
56528 break;
56529
56530 [[fallthrough]];
56531 case ISD::SETULT:
56532 // Will be swapped to SETGT in LowerVSETCC*.
56533 NewCC = ISD::SETLT;
56534 break;
56535 default:
56536 break;
56537 }
56538 if (NewCC != CC) {
56539 if (SDValue R = truncateAVX512SetCCNoBWI(VT, OpVT, LHSOut, RHSOut,
56540 NewCC, DL, DAG, Subtarget))
56541 return R;
56542 return DAG.getSetCC(DL, VT, LHSOut, RHSOut, NewCC);
56543 }
56544 }
56545 }
56546
56547 if (SDValue R =
56548 truncateAVX512SetCCNoBWI(VT, OpVT, LHS, RHS, CC, DL, DAG, Subtarget))
56549 return R;
56550
56551 // In the middle end transforms:
56552 // `(or (icmp eq X, C), (icmp eq X, C+1))`
56553 // -> `(icmp ult (add x, -C), 2)`
56554 // Likewise inverted cases with `ugt`.
56555 //
56556 // Since x86, pre avx512, doesn't have unsigned vector compares, this results
56557 // in worse codegen. So, undo the middle-end transform and go back to `(or
56558 // (icmp eq), (icmp eq))` form.
56559 // Also skip AVX1 with ymm vectors, as the umin approach combines better than
56560 // the xmm approach.
56561 //
56562   // NB: We don't handle the similar simplification of `(and (icmp ne), (icmp
56563   // ne))` as it doesn't end up being a net instruction win.
56564 // TODO: We might want to do this for avx512 as well if we `sext` the result.
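  // For example, (icmp ult (add X, -5), 2) is rewritten back into
  // (icmp eq X, 5) | (icmp eq X, 6), which maps onto PCMPEQ + POR.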
56565 if (VT.isVector() && OpVT.isVector() && OpVT.isInteger() &&
56566 ISD::isUnsignedIntSetCC(CC) && LHS.getOpcode() == ISD::ADD &&
56567 !Subtarget.hasAVX512() &&
56568 (OpVT.getSizeInBits() <= 128 || !Subtarget.hasAVX() ||
56569 Subtarget.hasAVX2()) &&
56570 LHS.hasOneUse()) {
56571
56572 APInt CmpC;
56573 SDValue AddC = LHS.getOperand(1);
56574     if (ISD::isConstantSplatVector(RHS.getNode(), CmpC) &&
56575         DAG.isConstantIntBuildVectorOrConstantInt(AddC)) {
56576 // See which form we have depending on the constant/condition.
56577 SDValue C0 = SDValue();
56578 SDValue C1 = SDValue();
56579
56580 // If we had `(add x, -1)` and can lower with `umin`, don't transform as
56581       // we will end up generating an additional constant. Keeping it in the
56582       // current form has a slight latency cost, but it is probably worth
56583       // saving a constant.
56584       if (ISD::isConstantSplatVectorAllOnes(AddC.getNode()) &&
56585           DAG.getTargetLoweringInfo().isOperationLegal(ISD::UMIN, OpVT)) {
56586         // Pass
56587 }
56588 // Normal Cases
56589 else if ((CC == ISD::SETULT && CmpC == 2) ||
56590 (CC == ISD::SETULE && CmpC == 1)) {
56591 // These will constant fold.
56592 C0 = DAG.getNegative(AddC, DL, OpVT);
56593 C1 = DAG.getNode(ISD::SUB, DL, OpVT, C0,
56594 DAG.getAllOnesConstant(DL, OpVT));
56595 }
56596 // Inverted Cases
56597 else if ((CC == ISD::SETUGT && (-CmpC) == 3) ||
56598 (CC == ISD::SETUGE && (-CmpC) == 2)) {
56599 // These will constant fold.
56600 C0 = DAG.getNOT(DL, AddC, OpVT);
56601 C1 = DAG.getNode(ISD::ADD, DL, OpVT, C0,
56602 DAG.getAllOnesConstant(DL, OpVT));
56603 }
56604 if (C0 && C1) {
56605 SDValue NewLHS =
56606 DAG.getSetCC(DL, VT, LHS.getOperand(0), C0, ISD::SETEQ);
56607 SDValue NewRHS =
56608 DAG.getSetCC(DL, VT, LHS.getOperand(0), C1, ISD::SETEQ);
56609 return DAG.getNode(ISD::OR, DL, VT, NewLHS, NewRHS);
56610 }
56611 }
56612 }
56613
56614 // For an SSE1-only target, lower a comparison of v4f32 to X86ISD::CMPP early
56615 // to avoid scalarization via legalization because v4i32 is not a legal type.
56616 if (Subtarget.hasSSE1() && !Subtarget.hasSSE2() && VT == MVT::v4i32 &&
56617 LHS.getValueType() == MVT::v4f32)
56618 return LowerVSETCC(SDValue(N, 0), Subtarget, DAG);
56619
56620 // X pred 0.0 --> X pred -X
56621 // If the negation of X already exists, use it in the comparison. This removes
56622 // the need to materialize 0.0 and allows matching to SSE's MIN/MAX
56623 // instructions in patterns with a 'select' node.
56624   if (isNullFPScalarOrVectorConst(RHS) && Subtarget.hasSSE1()) {
56625     SDVTList FNegVT = DAG.getVTList(OpVT);
56626 if (SDNode *FNeg = DAG.getNodeIfExists(ISD::FNEG, FNegVT, {LHS}))
56627 return DAG.getSetCC(DL, VT, LHS, SDValue(FNeg, 0), CC);
56628 }
56629
56630 return SDValue();
56631}
56632
56633 static SDValue combineMOVMSK(SDNode *N, SelectionDAG &DAG,
56634                              TargetLowering::DAGCombinerInfo &DCI,
56635                              const X86Subtarget &Subtarget) {
56636 SDValue Src = N->getOperand(0);
56637 MVT SrcVT = Src.getSimpleValueType();
56638 MVT VT = N->getSimpleValueType(0);
56639 unsigned NumBits = VT.getScalarSizeInBits();
56640 unsigned NumElts = SrcVT.getVectorNumElements();
56641 unsigned NumBitsPerElt = SrcVT.getScalarSizeInBits();
56642 assert(VT == MVT::i32 && NumElts <= NumBits && "Unexpected MOVMSK types");
56643
56644 // Perform constant folding.
56645 APInt UndefElts;
56646 SmallVector<APInt, 32> EltBits;
56647 if (getTargetConstantBitsFromNode(Src, NumBitsPerElt, UndefElts, EltBits,
56648 /*AllowWholeUndefs*/ true,
56649 /*AllowPartialUndefs*/ true)) {
56650 APInt Imm(32, 0);
56651 for (unsigned Idx = 0; Idx != NumElts; ++Idx)
56652 if (!UndefElts[Idx] && EltBits[Idx].isNegative())
56653 Imm.setBit(Idx);
56654
56655 return DAG.getConstant(Imm, SDLoc(N), VT);
56656 }
56657
56658 // Look through int->fp bitcasts that don't change the element width.
56659 unsigned EltWidth = SrcVT.getScalarSizeInBits();
56660 if (Subtarget.hasSSE2() && Src.getOpcode() == ISD::BITCAST &&
56661 Src.getOperand(0).getScalarValueSizeInBits() == EltWidth)
56662 return DAG.getNode(X86ISD::MOVMSK, SDLoc(N), VT, Src.getOperand(0));
56663
56664 // Fold movmsk(not(x)) -> not(movmsk(x)) to improve folding of movmsk results
56665 // with scalar comparisons.
56666 if (SDValue NotSrc = IsNOT(Src, DAG)) {
56667 SDLoc DL(N);
56668 APInt NotMask = APInt::getLowBitsSet(NumBits, NumElts);
56669 NotSrc = DAG.getBitcast(SrcVT, NotSrc);
56670 return DAG.getNode(ISD::XOR, DL, VT,
56671 DAG.getNode(X86ISD::MOVMSK, DL, VT, NotSrc),
56672 DAG.getConstant(NotMask, DL, VT));
56673 }
56674
56675 // Fold movmsk(icmp_sgt(x,-1)) -> not(movmsk(x)) to improve folding of movmsk
56676 // results with scalar comparisons.
56677 if (Src.getOpcode() == X86ISD::PCMPGT &&
56678 ISD::isBuildVectorAllOnes(Src.getOperand(1).getNode())) {
56679 SDLoc DL(N);
56680 APInt NotMask = APInt::getLowBitsSet(NumBits, NumElts);
56681 return DAG.getNode(ISD::XOR, DL, VT,
56682 DAG.getNode(X86ISD::MOVMSK, DL, VT, Src.getOperand(0)),
56683 DAG.getConstant(NotMask, DL, VT));
56684 }
56685
56686 // Fold movmsk(icmp_eq(and(x,c1),c1)) -> movmsk(shl(x,c2))
56687 // Fold movmsk(icmp_eq(and(x,c1),0)) -> movmsk(not(shl(x,c2)))
56688 // iff pow2splat(c1).
56689 // Use KnownBits to determine if only a single bit is non-zero
56690 // in each element (pow2 or zero), and shift that bit to the msb.
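// e.g. for v4i32 pcmpeq(and(x,4),4): only bit 2 of each element can be set,
// so ShiftAmt = 29 moves that bit into the sign bit before the MOVMSK.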
56691 if (Src.getOpcode() == X86ISD::PCMPEQ) {
56692 KnownBits KnownLHS = DAG.computeKnownBits(Src.getOperand(0));
56693 KnownBits KnownRHS = DAG.computeKnownBits(Src.getOperand(1));
56694 unsigned ShiftAmt = KnownLHS.countMinLeadingZeros();
56695 if (KnownLHS.countMaxPopulation() == 1 &&
56696 (KnownRHS.isZero() || (KnownRHS.countMaxPopulation() == 1 &&
56697 ShiftAmt == KnownRHS.countMinLeadingZeros()))) {
56698 SDLoc DL(N);
56699 MVT ShiftVT = SrcVT;
56700 SDValue ShiftLHS = Src.getOperand(0);
56701 SDValue ShiftRHS = Src.getOperand(1);
56702 if (ShiftVT.getScalarType() == MVT::i8) {
56703 // vXi8 shifts - we only care about the signbit so can use PSLLW.
56704 ShiftVT = MVT::getVectorVT(MVT::i16, NumElts / 2);
56705 ShiftLHS = DAG.getBitcast(ShiftVT, ShiftLHS);
56706 ShiftRHS = DAG.getBitcast(ShiftVT, ShiftRHS);
56707 }
56708 ShiftLHS = getTargetVShiftByConstNode(X86ISD::VSHLI, DL, ShiftVT,
56709 ShiftLHS, ShiftAmt, DAG);
56710 ShiftRHS = getTargetVShiftByConstNode(X86ISD::VSHLI, DL, ShiftVT,
56711 ShiftRHS, ShiftAmt, DAG);
56712 ShiftLHS = DAG.getBitcast(SrcVT, ShiftLHS);
56713 ShiftRHS = DAG.getBitcast(SrcVT, ShiftRHS);
56714 SDValue Res = DAG.getNode(ISD::XOR, DL, SrcVT, ShiftLHS, ShiftRHS);
56715 return DAG.getNode(X86ISD::MOVMSK, DL, VT, DAG.getNOT(DL, Res, SrcVT));
56716 }
56717 }
56718
56719 // Fold movmsk(logic(X,C)) -> logic(movmsk(X),C)
56720 if (N->isOnlyUserOf(Src.getNode())) {
56722 if (ISD::isBitwiseLogicOp(SrcBC.getOpcode())) {
56723 APInt UndefElts;
56724 SmallVector<APInt, 32> EltBits;
56725 if (getTargetConstantBitsFromNode(SrcBC.getOperand(1), NumBitsPerElt,
56726 UndefElts, EltBits)) {
56727 APInt Mask = APInt::getZero(NumBits);
56728 for (unsigned Idx = 0; Idx != NumElts; ++Idx) {
56729 if (!UndefElts[Idx] && EltBits[Idx].isNegative())
56730 Mask.setBit(Idx);
56731 }
56732 SDLoc DL(N);
56733 SDValue NewSrc = DAG.getBitcast(SrcVT, SrcBC.getOperand(0));
56734 SDValue NewMovMsk = DAG.getNode(X86ISD::MOVMSK, DL, VT, NewSrc);
56735 return DAG.getNode(SrcBC.getOpcode(), DL, VT, NewMovMsk,
56736 DAG.getConstant(Mask, DL, VT));
56737 }
56738 }
56739 }
56740
56741 // Simplify the inputs.
56742 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
56743 APInt DemandedMask(APInt::getAllOnes(NumBits));
56744 if (TLI.SimplifyDemandedBits(SDValue(N, 0), DemandedMask, DCI))
56745 return SDValue(N, 0);
56746
56747 return SDValue();
56748}
56749
56752 const X86Subtarget &Subtarget) {
56753 MVT VT = N->getSimpleValueType(0);
56754 unsigned NumBits = VT.getScalarSizeInBits();
56755
56756 // Simplify the inputs.
56757 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
56758 APInt DemandedMask(APInt::getAllOnes(NumBits));
56759 if (TLI.SimplifyDemandedBits(SDValue(N, 0), DemandedMask, DCI))
56760 return SDValue(N, 0);
56761
56762 return SDValue();
56763}
56764
56768 SDValue Mask = MemOp->getMask();
56769
56770 // With vector masks we only demand the upper bit of the mask.
56771 if (Mask.getScalarValueSizeInBits() != 1) {
56772 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
56773 APInt DemandedMask(APInt::getSignMask(Mask.getScalarValueSizeInBits()));
56774 if (TLI.SimplifyDemandedBits(Mask, DemandedMask, DCI)) {
56775 if (N->getOpcode() != ISD::DELETED_NODE)
56776 DCI.AddToWorklist(N);
56777 return SDValue(N, 0);
56778 }
56779 }
56780
56781 return SDValue();
56782}
56783
56785 SDValue Index, SDValue Base, SDValue Scale,
56786 SelectionDAG &DAG) {
56787 SDLoc DL(GorS);
56788
56789 if (auto *Gather = dyn_cast<MaskedGatherSDNode>(GorS)) {
56790 SDValue Ops[] = { Gather->getChain(), Gather->getPassThru(),
56791 Gather->getMask(), Base, Index, Scale } ;
56792 return DAG.getMaskedGather(Gather->getVTList(),
56793 Gather->getMemoryVT(), DL, Ops,
56794 Gather->getMemOperand(),
56795 Gather->getIndexType(),
56796 Gather->getExtensionType());
56797 }
56798 auto *Scatter = cast<MaskedScatterSDNode>(GorS);
56799 SDValue Ops[] = { Scatter->getChain(), Scatter->getValue(),
56800 Scatter->getMask(), Base, Index, Scale };
56801 return DAG.getMaskedScatter(Scatter->getVTList(),
56802 Scatter->getMemoryVT(), DL,
56803 Ops, Scatter->getMemOperand(),
56804 Scatter->getIndexType(),
56805 Scatter->isTruncatingStore());
56806}
56807
56810 SDLoc DL(N);
56811 auto *GorS = cast<MaskedGatherScatterSDNode>(N);
56812 SDValue Index = GorS->getIndex();
56813 SDValue Base = GorS->getBasePtr();
56814 SDValue Scale = GorS->getScale();
56815 EVT IndexVT = Index.getValueType();
56816 EVT IndexSVT = IndexVT.getVectorElementType();
56817 unsigned IndexWidth = Index.getScalarValueSizeInBits();
56818 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
56819 EVT PtrVT = TLI.getPointerTy(DAG.getDataLayout());
56820
56821 if (DCI.isBeforeLegalize()) {
56822 // Attempt to move shifted index into the address scale, allows further
56823 // index truncation below.
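// e.g. an index of (shl x, 2) with scale 2 can instead use (shl x, 1) with
// scale 4, which halves the index values and makes the 32-bit index
// shrinking below more likely to apply.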
56824 if (Index.getOpcode() == ISD::SHL && IndexSVT == PtrVT &&
56825 isa<ConstantSDNode>(Scale)) {
56826 unsigned ScaleAmt = Scale->getAsZExtVal();
56827 assert(isPowerOf2_32(ScaleAmt) && "Scale must be a power of 2");
56828 unsigned Log2ScaleAmt = Log2_32(ScaleAmt);
56829 unsigned MaskBits = IndexWidth - Log2ScaleAmt;
56830 APInt DemandedBits = APInt::getLowBitsSet(IndexWidth, MaskBits);
56831 if (TLI.SimplifyDemandedBits(Index, DemandedBits, DCI)) {
56832 if (N->getOpcode() != ISD::DELETED_NODE)
56833 DCI.AddToWorklist(N);
56834 return SDValue(N, 0);
56835 }
56836 if (auto MinShAmt = DAG.getValidMinimumShiftAmount(Index)) {
56837 if (*MinShAmt >= 1 && Log2ScaleAmt < 3 &&
56838 DAG.ComputeNumSignBits(Index.getOperand(0)) > 1) {
56839 SDValue ShAmt = Index.getOperand(1);
56840 SDValue NewShAmt =
56841 DAG.getNode(ISD::SUB, DL, ShAmt.getValueType(), ShAmt,
56842 DAG.getConstant(1, DL, ShAmt.getValueType()));
56843 SDValue NewIndex = DAG.getNode(ISD::SHL, DL, Index.getValueType(),
56844 Index.getOperand(0), NewShAmt);
56845 SDValue NewScale =
56846 DAG.getConstant(ScaleAmt * 2, DL, Scale.getValueType());
56847 return rebuildGatherScatter(GorS, NewIndex, Base, NewScale, DAG);
56848 }
56849 }
56850 }
56851
56852 // Shrink indices if they are larger than 32 bits.
56853 // Only do this before legalize types since v2i64 could become v2i32.
56854 // FIXME: We could check that the type is legal if we're after legalize
56855 // types, but then we would need to construct test cases where that happens.
56856 if (IndexWidth > 32 && DAG.ComputeNumSignBits(Index) > (IndexWidth - 32)) {
56857 EVT NewVT = IndexVT.changeVectorElementType(MVT::i32);
56858
56859 // FIXME: We could support more than just constant fold, but we need to be
56860 // careful with costing. A truncate that can be optimized out would be
56861 // fine. Otherwise we might only want to create a truncate if it avoids
56862 // a split.
56863 if (SDValue TruncIndex =
56864 DAG.FoldConstantArithmetic(ISD::TRUNCATE, DL, NewVT, Index))
56865 return rebuildGatherScatter(GorS, TruncIndex, Base, Scale, DAG);
56866
56867 // Shrink any sign/zero extend whose source is 32 bits or smaller and whose
56868 // result is wider than 32 bits, if there are sufficient sign bits. Only do
56869 // this before legalize types to avoid creating illegal types in truncate.
56870 if ((Index.getOpcode() == ISD::SIGN_EXTEND ||
56871 Index.getOpcode() == ISD::ZERO_EXTEND) &&
56872 Index.getOperand(0).getScalarValueSizeInBits() <= 32) {
56873 Index = DAG.getNode(ISD::TRUNCATE, DL, NewVT, Index);
56874 return rebuildGatherScatter(GorS, Index, Base, Scale, DAG);
56875 }
56876
56877 // Shrink if we remove an illegal type.
56878 if (!TLI.isTypeLegal(Index.getValueType()) && TLI.isTypeLegal(NewVT)) {
56879 Index = DAG.getNode(ISD::TRUNCATE, DL, NewVT, Index);
56880 return rebuildGatherScatter(GorS, Index, Base, Scale, DAG);
56881 }
56882 }
56883 }
56884
56885 // Try to move splat adders from the index operand to the base pointer
56886 // operand, taking care to multiply by the scale. We can only do this when
56887 // the index element type is the same as the pointer type. Otherwise we
56888 // would need to be sure the math doesn't wrap before the scale.
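// e.g. gather(Base, add(Index, splat(C))) -> gather(Base + C * Scale, Index)
// when C is constant.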
56889 if (Index.getOpcode() == ISD::ADD && IndexSVT == PtrVT &&
56890 isa<ConstantSDNode>(Scale)) {
56891 uint64_t ScaleAmt = Scale->getAsZExtVal();
56892
56893 for (unsigned I = 0; I != 2; ++I)
56894 if (auto *BV = dyn_cast<BuildVectorSDNode>(Index.getOperand(I))) {
56895 BitVector UndefElts;
56896 if (SDValue Splat = BV->getSplatValue(&UndefElts)) {
56897 if (UndefElts.none()) {
56898 // If the splat value is constant we can add the scaled splat value
56899 // to the existing base.
56900 if (auto *C = dyn_cast<ConstantSDNode>(Splat)) {
56901 APInt Adder = C->getAPIntValue() * ScaleAmt;
56902 SDValue NewBase = DAG.getNode(ISD::ADD, DL, PtrVT, Base,
56903 DAG.getConstant(Adder, DL, PtrVT));
56904 SDValue NewIndex = Index.getOperand(1 - I);
56905 return rebuildGatherScatter(GorS, NewIndex, NewBase, Scale, DAG);
56906 }
56907 // For non-constant cases, limit this to non-scaled cases.
56908 if (ScaleAmt == 1) {
56909 SDValue NewBase = DAG.getNode(ISD::ADD, DL, PtrVT, Base, Splat);
56910 SDValue NewIndex = Index.getOperand(1 - I);
56911 return rebuildGatherScatter(GorS, NewIndex, NewBase, Scale, DAG);
56912 }
56913 }
56914 }
56915 // It's also possible the base is just a constant. In that case, just
56916 // replace it with 0 and move the displacement into the index.
56917 if (ScaleAmt == 1 && BV->isConstant() && isa<ConstantSDNode>(Base)) {
56918 SDValue Splat = DAG.getSplatBuildVector(IndexVT, DL, Base);
56919 // Combine the constant build_vector and the constant base.
56920 Splat =
56921 DAG.getNode(ISD::ADD, DL, IndexVT, Index.getOperand(I), Splat);
56922 // Add to the other half of the original Index add.
56923 SDValue NewIndex = DAG.getNode(ISD::ADD, DL, IndexVT,
56924 Index.getOperand(1 - I), Splat);
56925 SDValue NewBase = DAG.getConstant(0, DL, PtrVT);
56926 return rebuildGatherScatter(GorS, NewIndex, NewBase, Scale, DAG);
56927 }
56928 }
56929 }
56930
56931 if (DCI.isBeforeLegalizeOps()) {
56932 // Make sure the index is either i32 or i64
56933 if (IndexWidth != 32 && IndexWidth != 64) {
56934 MVT EltVT = IndexWidth > 32 ? MVT::i64 : MVT::i32;
56935 IndexVT = IndexVT.changeVectorElementType(EltVT);
56936 Index = DAG.getSExtOrTrunc(Index, DL, IndexVT);
56937 return rebuildGatherScatter(GorS, Index, Base, Scale, DAG);
56938 }
56939 }
56940
56941 // With vector masks we only demand the upper bit of the mask.
56942 SDValue Mask = GorS->getMask();
56943 if (Mask.getScalarValueSizeInBits() != 1) {
56944 APInt DemandedMask(APInt::getSignMask(Mask.getScalarValueSizeInBits()));
56945 if (TLI.SimplifyDemandedBits(Mask, DemandedMask, DCI)) {
56946 if (N->getOpcode() != ISD::DELETED_NODE)
56947 DCI.AddToWorklist(N);
56948 return SDValue(N, 0);
56949 }
56950 }
56951
56952 return SDValue();
56953}
56954
56955// Optimize RES = X86ISD::SETCC CONDCODE, EFLAG_INPUT
56957 const X86Subtarget &Subtarget) {
56958 SDLoc DL(N);
56959 X86::CondCode CC = X86::CondCode(N->getConstantOperandVal(0));
56960 SDValue EFLAGS = N->getOperand(1);
56961
56962 // Try to simplify the EFLAGS and condition code operands.
56963 if (SDValue Flags = combineSetCCEFLAGS(EFLAGS, CC, DAG, Subtarget))
56964 return getSETCC(CC, Flags, DL, DAG);
56965
56966 return SDValue();
56967}
56968
56969/// Optimize branch condition evaluation.
56971 const X86Subtarget &Subtarget) {
56972 SDLoc DL(N);
56973 SDValue EFLAGS = N->getOperand(3);
56974 X86::CondCode CC = X86::CondCode(N->getConstantOperandVal(2));
56975
56976 // Try to simplify the EFLAGS and condition code operands.
56977 // Make sure to not keep references to operands, as combineSetCCEFLAGS can
56978 // RAUW them under us.
56979 if (SDValue Flags = combineSetCCEFLAGS(EFLAGS, CC, DAG, Subtarget)) {
56980 SDValue Cond = DAG.getTargetConstant(CC, DL, MVT::i8);
56981 return DAG.getNode(X86ISD::BRCOND, DL, N->getVTList(), N->getOperand(0),
56982 N->getOperand(1), Cond, Flags);
56983 }
56984
56985 return SDValue();
56986}
56987
56988// TODO: Could we move this to DAGCombine?
56990 SelectionDAG &DAG) {
56991 // Take advantage of vector comparisons (etc.) producing 0 or -1 in each lane
56992 // to optimize away operation when it's from a constant.
56993 //
56994 // The general transformation is:
56995 // UNARYOP(AND(VECTOR_CMP(x,y), constant)) -->
56996 // AND(VECTOR_CMP(x,y), constant2)
56997 // constant2 = UNARYOP(constant)
56998
56999 // Early exit if this isn't a vector operation, the operand of the
57000 // unary operation isn't a bitwise AND, or if the sizes of the operations
57001 // aren't the same.
57002 EVT VT = N->getValueType(0);
57003 bool IsStrict = N->isStrictFPOpcode();
57004 unsigned NumEltBits = VT.getScalarSizeInBits();
57005 SDValue Op0 = N->getOperand(IsStrict ? 1 : 0);
57006 if (!VT.isVector() || Op0.getOpcode() != ISD::AND ||
57007 DAG.ComputeNumSignBits(Op0.getOperand(0)) != NumEltBits ||
57008 VT.getSizeInBits() != Op0.getValueSizeInBits())
57009 return SDValue();
57010
57011 // Now check that the other operand of the AND is a constant. We could
57012 // make the transformation for non-constant splats as well, but it's unclear
57013 // that would be a benefit as it would not eliminate any operations, just
57014 // perform one more step in scalar code before moving to the vector unit.
57015 if (auto *BV = dyn_cast<BuildVectorSDNode>(Op0.getOperand(1))) {
57016 // Bail out if the vector isn't a constant.
57017 if (!BV->isConstant())
57018 return SDValue();
57019
57020 // Everything checks out. Build up the new and improved node.
57021 SDLoc DL(N);
57022 EVT IntVT = BV->getValueType(0);
57023 // Create a new constant of the appropriate type for the transformed
57024 // DAG.
57025 SDValue SourceConst;
57026 if (IsStrict)
57027 SourceConst = DAG.getNode(N->getOpcode(), DL, {VT, MVT::Other},
57028 {N->getOperand(0), SDValue(BV, 0)});
57029 else
57030 SourceConst = DAG.getNode(N->getOpcode(), DL, VT, SDValue(BV, 0));
57031 // The AND node needs bitcasts to/from an integer vector type around it.
57032 SDValue MaskConst = DAG.getBitcast(IntVT, SourceConst);
57033 SDValue NewAnd = DAG.getNode(ISD::AND, DL, IntVT, Op0->getOperand(0),
57034 MaskConst);
57035 SDValue Res = DAG.getBitcast(VT, NewAnd);
57036 if (IsStrict)
57037 return DAG.getMergeValues({Res, SourceConst.getValue(1)}, DL);
57038 return Res;
57039 }
57040
57041 return SDValue();
57042}
57043
57044/// If we are converting a value to floating-point, try to replace scalar
57045/// truncate of an extracted vector element with a bitcast. This tries to keep
57046/// the sequence on XMM registers rather than moving between vector and GPRs.
57048 // TODO: This is currently only used by combineSIntToFP, but it is generalized
57049 // to allow being called by any similar cast opcode.
57050 // TODO: Consider merging this into lowering: vectorizeExtractedCast().
57051 SDValue Trunc = N->getOperand(0);
57052 if (!Trunc.hasOneUse() || Trunc.getOpcode() != ISD::TRUNCATE)
57053 return SDValue();
57054
57055 SDValue ExtElt = Trunc.getOperand(0);
57056 if (!ExtElt.hasOneUse() || ExtElt.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
57057 !isNullConstant(ExtElt.getOperand(1)))
57058 return SDValue();
57059
57060 EVT TruncVT = Trunc.getValueType();
57061 EVT SrcVT = ExtElt.getValueType();
57062 unsigned DestWidth = TruncVT.getSizeInBits();
57063 unsigned SrcWidth = SrcVT.getSizeInBits();
57064 if (SrcWidth % DestWidth != 0)
57065 return SDValue();
57066
57067 // inttofp (trunc (extelt X, 0)) --> inttofp (extelt (bitcast X), 0)
57068 EVT SrcVecVT = ExtElt.getOperand(0).getValueType();
57069 unsigned VecWidth = SrcVecVT.getSizeInBits();
57070 unsigned NumElts = VecWidth / DestWidth;
57071 EVT BitcastVT = EVT::getVectorVT(*DAG.getContext(), TruncVT, NumElts);
57072 SDValue BitcastVec = DAG.getBitcast(BitcastVT, ExtElt.getOperand(0));
57073 SDLoc DL(N);
57074 SDValue NewExtElt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, TruncVT,
57075 BitcastVec, ExtElt.getOperand(1));
57076 return DAG.getNode(N->getOpcode(), DL, N->getValueType(0), NewExtElt);
57077}
57078
57080 const X86Subtarget &Subtarget) {
57081 bool IsStrict = N->isStrictFPOpcode();
57082 SDValue Op0 = N->getOperand(IsStrict ? 1 : 0);
57083 EVT VT = N->getValueType(0);
57084 EVT InVT = Op0.getValueType();
57085
57086 // Using i16 as an intermediate type is a bad idea, unless we have HW support
57087 // for it. Therefore, for type sizes equal to or smaller than 32, just go with i32.
57088 // if hasFP16 support:
57089 // UINT_TO_FP(vXi1~15) -> SINT_TO_FP(ZEXT(vXi1~15 to vXi16))
57090 // UINT_TO_FP(vXi17~31) -> SINT_TO_FP(ZEXT(vXi17~31 to vXi32))
57091 // else
57092 // UINT_TO_FP(vXi1~31) -> SINT_TO_FP(ZEXT(vXi1~31 to vXi32))
57093 // UINT_TO_FP(vXi33~63) -> SINT_TO_FP(ZEXT(vXi33~63 to vXi64))
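// e.g. with FP16: UINT_TO_FP(v8i8 -> v8f16) becomes
// SINT_TO_FP(ZEXT(v8i8 to v8i16)); the zero-extended value is never negative,
// so the signed conversion is safe.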
57094 if (InVT.isVector() && VT.getVectorElementType() == MVT::f16) {
57095 unsigned ScalarSize = InVT.getScalarSizeInBits();
57096 if ((ScalarSize == 16 && Subtarget.hasFP16()) || ScalarSize == 32 ||
57097 ScalarSize >= 64)
57098 return SDValue();
57099 SDLoc dl(N);
57100 EVT DstVT =
57102 (Subtarget.hasFP16() && ScalarSize < 16) ? MVT::i16
57103 : ScalarSize < 32 ? MVT::i32
57104 : MVT::i64,
57105 InVT.getVectorNumElements());
57106 SDValue P = DAG.getNode(ISD::ZERO_EXTEND, dl, DstVT, Op0);
57107 if (IsStrict)
57108 return DAG.getNode(ISD::STRICT_SINT_TO_FP, dl, {VT, MVT::Other},
57109 {N->getOperand(0), P});
57110 return DAG.getNode(ISD::SINT_TO_FP, dl, VT, P);
57111 }
57112
57113 // UINT_TO_FP(vXi1) -> SINT_TO_FP(ZEXT(vXi1 to vXi32))
57114 // UINT_TO_FP(vXi8) -> SINT_TO_FP(ZEXT(vXi8 to vXi32))
57115 // UINT_TO_FP(vXi16) -> SINT_TO_FP(ZEXT(vXi16 to vXi32))
57116 if (InVT.isVector() && InVT.getScalarSizeInBits() < 32 &&
57117 VT.getScalarType() != MVT::f16) {
57118 SDLoc dl(N);
57119 EVT DstVT = InVT.changeVectorElementType(MVT::i32);
57120 SDValue P = DAG.getNode(ISD::ZERO_EXTEND, dl, DstVT, Op0);
57121
57122 // UINT_TO_FP isn't legal without AVX512 so use SINT_TO_FP.
57123 if (IsStrict)
57124 return DAG.getNode(ISD::STRICT_SINT_TO_FP, dl, {VT, MVT::Other},
57125 {N->getOperand(0), P});
57126 return DAG.getNode(ISD::SINT_TO_FP, dl, VT, P);
57127 }
57128
57129 // Since UINT_TO_FP is legal (it's marked custom), dag combiner won't
57130 // optimize it to a SINT_TO_FP when the sign bit is known zero. Perform
57131 // the optimization here.
57132 SDNodeFlags Flags = N->getFlags();
57133 if (Flags.hasNonNeg() || DAG.SignBitIsZero(Op0)) {
57134 if (IsStrict)
57135 return DAG.getNode(ISD::STRICT_SINT_TO_FP, SDLoc(N), {VT, MVT::Other},
57136 {N->getOperand(0), Op0});
57137 return DAG.getNode(ISD::SINT_TO_FP, SDLoc(N), VT, Op0);
57138 }
57139
57140 return SDValue();
57141}
57142
57145 const X86Subtarget &Subtarget) {
57146 // First try to optimize away the conversion entirely when it's
57147 // conditionally from a constant. Vectors only.
57148 bool IsStrict = N->isStrictFPOpcode();
57150 return Res;
57151
57152 // Now move on to more general possibilities.
57153 SDValue Op0 = N->getOperand(IsStrict ? 1 : 0);
57154 EVT VT = N->getValueType(0);
57155 EVT InVT = Op0.getValueType();
57156
57157 // Using i16 as an intermediate type is a bad idea, unless we have HW support
57158 // for it. Therefore, for type sizes equal to or smaller than 32, just go with i32.
57159 // if hasFP16 support:
57160 // SINT_TO_FP(vXi1~15) -> SINT_TO_FP(SEXT(vXi1~15 to vXi16))
57161 // SINT_TO_FP(vXi17~31) -> SINT_TO_FP(SEXT(vXi17~31 to vXi32))
57162 // else
57163 // SINT_TO_FP(vXi1~31) -> SINT_TO_FP(ZEXT(vXi1~31 to vXi32))
57164 // SINT_TO_FP(vXi33~63) -> SINT_TO_FP(SEXT(vXi33~63 to vXi64))
57165 if (InVT.isVector() && VT.getVectorElementType() == MVT::f16) {
57166 unsigned ScalarSize = InVT.getScalarSizeInBits();
57167 if ((ScalarSize == 16 && Subtarget.hasFP16()) || ScalarSize == 32 ||
57168 ScalarSize >= 64)
57169 return SDValue();
57170 SDLoc dl(N);
57171 EVT DstVT =
57173 (Subtarget.hasFP16() && ScalarSize < 16) ? MVT::i16
57174 : ScalarSize < 32 ? MVT::i32
57175 : MVT::i64,
57176 InVT.getVectorNumElements());
57177 SDValue P = DAG.getNode(ISD::SIGN_EXTEND, dl, DstVT, Op0);
57178 if (IsStrict)
57179 return DAG.getNode(ISD::STRICT_SINT_TO_FP, dl, {VT, MVT::Other},
57180 {N->getOperand(0), P});
57181 return DAG.getNode(ISD::SINT_TO_FP, dl, VT, P);
57182 }
57183
57184 // SINT_TO_FP(vXi1) -> SINT_TO_FP(SEXT(vXi1 to vXi32))
57185 // SINT_TO_FP(vXi8) -> SINT_TO_FP(SEXT(vXi8 to vXi32))
57186 // SINT_TO_FP(vXi16) -> SINT_TO_FP(SEXT(vXi16 to vXi32))
57187 if (InVT.isVector() && InVT.getScalarSizeInBits() < 32 &&
57188 VT.getScalarType() != MVT::f16) {
57189 SDLoc dl(N);
57190 EVT DstVT = InVT.changeVectorElementType(MVT::i32);
57191 SDValue P = DAG.getNode(ISD::SIGN_EXTEND, dl, DstVT, Op0);
57192 if (IsStrict)
57193 return DAG.getNode(ISD::STRICT_SINT_TO_FP, dl, {VT, MVT::Other},
57194 {N->getOperand(0), P});
57195 return DAG.getNode(ISD::SINT_TO_FP, dl, VT, P);
57196 }
57197
57198 // Without AVX512DQ we only support i64 to float scalar conversion. For both
57199 // vectors and scalars, see if we know that the upper bits are all the sign
57200 // bit, in which case we can truncate the input to i32 and convert from that.
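// e.g. a scalar i64 source with at least 33 sign bits fits in i32, so we can
// truncate to i32 and use the 32-bit convert instead.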
57201 if (InVT.getScalarSizeInBits() > 32 && !Subtarget.hasDQI()) {
57202 unsigned BitWidth = InVT.getScalarSizeInBits();
57203 unsigned NumSignBits = DAG.ComputeNumSignBits(Op0);
57204 if (NumSignBits >= (BitWidth - 31)) {
57205 EVT TruncVT = MVT::i32;
57206 if (InVT.isVector())
57207 TruncVT = InVT.changeVectorElementType(TruncVT);
57208 SDLoc dl(N);
57209 if (DCI.isBeforeLegalize() || TruncVT != MVT::v2i32) {
57210 SDValue Trunc = DAG.getNode(ISD::TRUNCATE, dl, TruncVT, Op0);
57211 if (IsStrict)
57212 return DAG.getNode(ISD::STRICT_SINT_TO_FP, dl, {VT, MVT::Other},
57213 {N->getOperand(0), Trunc});
57214 return DAG.getNode(ISD::SINT_TO_FP, dl, VT, Trunc);
57215 }
57216 // If we're after legalize and the type is v2i32 we need to shuffle and
57217 // use CVTSI2P.
57218 assert(InVT == MVT::v2i64 && "Unexpected VT!");
57219 SDValue Cast = DAG.getBitcast(MVT::v4i32, Op0);
57220 SDValue Shuf = DAG.getVectorShuffle(MVT::v4i32, dl, Cast, Cast,
57221 { 0, 2, -1, -1 });
57222 if (IsStrict)
57223 return DAG.getNode(X86ISD::STRICT_CVTSI2P, dl, {VT, MVT::Other},
57224 {N->getOperand(0), Shuf});
57225 return DAG.getNode(X86ISD::CVTSI2P, dl, VT, Shuf);
57226 }
57227 }
57228
57229 // Transform (SINT_TO_FP (i64 ...)) into an x87 operation if we have
57230 // a 32-bit target where SSE doesn't support i64->FP operations.
57231 if (!Subtarget.useSoftFloat() && Subtarget.hasX87() &&
57232 Op0.getOpcode() == ISD::LOAD) {
57233 LoadSDNode *Ld = cast<LoadSDNode>(Op0.getNode());
57234
57235 // This transformation is not supported if the result type is f16 or f128.
57236 if (VT == MVT::f16 || VT == MVT::f128)
57237 return SDValue();
57238
57239 // If we have AVX512DQ we can use packed conversion instructions unless
57240 // the VT is f80.
57241 if (Subtarget.hasDQI() && VT != MVT::f80)
57242 return SDValue();
57243
57244 if (Ld->isSimple() && !VT.isVector() && ISD::isNormalLoad(Op0.getNode()) &&
57245 Op0.hasOneUse() && !Subtarget.is64Bit() && InVT == MVT::i64) {
57246 std::pair<SDValue, SDValue> Tmp =
57247 Subtarget.getTargetLowering()->BuildFILD(
57248 VT, InVT, SDLoc(N), Ld->getChain(), Ld->getBasePtr(),
57249 Ld->getPointerInfo(), Ld->getBaseAlign(), DAG);
57250 DAG.ReplaceAllUsesOfValueWith(Op0.getValue(1), Tmp.second);
57251 return Tmp.first;
57252 }
57253 }
57254
57255 if (IsStrict)
57256 return SDValue();
57257
57258 if (SDValue V = combineToFPTruncExtElt(N, DAG))
57259 return V;
57260
57261 return SDValue();
57262}
57263
57265 const X86Subtarget &Subtarget) {
57266 EVT VT = N->getValueType(0);
57267 SDValue Src = N->getOperand(0);
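// Fold lrint(frint(X)) -> lrint(X): both use the current rounding mode, so
// rounding to an integral float first is redundant.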
57268 if (Subtarget.hasSSE2() && Src.getOpcode() == ISD::FRINT &&
57269 VT.getScalarType() == MVT::i32 && Src.hasOneUse())
57270 return DAG.getNode(ISD::LRINT, SDLoc(N), VT, Src.getOperand(0));
57271
57272 return SDValue();
57273}
57274
57275// Custom handling for VCVTTPS2QQS/VCVTTPS2UQQS
57277 const X86Subtarget &Subtarget) {
57278 if (!Subtarget.hasAVX10_2())
57279 return SDValue();
57280
57281 bool IsSigned = N->getOpcode() == ISD::FP_TO_SINT_SAT;
57282 EVT SrcVT = N->getOperand(0).getValueType();
57283 EVT DstVT = N->getValueType(0);
57284 SDLoc dl(N);
57285
57286 if (SrcVT == MVT::v2f32 && DstVT == MVT::v2i64) {
57287 SDValue V2F32Value = DAG.getUNDEF(SrcVT);
57288
57289 // Concatenate the original v2f32 input and V2F32Value to create v4f32
57290 SDValue NewSrc = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f32,
57291 N->getOperand(0), V2F32Value);
57292
57293 // Select the FP_TO_SINT_SAT/FP_TO_UINT_SAT node
57294 if (IsSigned)
57295 return DAG.getNode(X86ISD::FP_TO_SINT_SAT, dl, MVT::v2i64, NewSrc);
57296
57297 return DAG.getNode(X86ISD::FP_TO_UINT_SAT, dl, MVT::v2i64, NewSrc);
57298 }
57299 return SDValue();
57300}
57301
57303 assert(Flags.getValueType() == MVT::i32 && "Unexpected VT!");
57304
57305 for (const SDNode *User : Flags->users()) {
57306 X86::CondCode CC;
57307 switch (User->getOpcode()) {
57308 default:
57309 // Be conservative.
57310 return true;
57311 case X86ISD::SETCC:
57313 CC = (X86::CondCode)User->getConstantOperandVal(0);
57314 break;
57315 case X86ISD::BRCOND:
57316 case X86ISD::CMOV:
57317 CC = (X86::CondCode)User->getConstantOperandVal(2);
57318 break;
57319 }
57320
57321 switch (CC) {
57322 // clang-format off
57323 default: break;
57324 case X86::COND_A: case X86::COND_AE:
57325 case X86::COND_B: case X86::COND_BE:
57326 case X86::COND_O: case X86::COND_NO:
57327 case X86::COND_G: case X86::COND_GE:
57328 case X86::COND_L: case X86::COND_LE:
57329 return true;
57330 // clang-format on
57331 }
57332 }
57333
57334 return false;
57335}
57336
57337static bool onlyZeroFlagUsed(SDValue Flags) {
57338 assert(Flags.getValueType() == MVT::i32 && "Unexpected VT!");
57339
57340 for (const SDNode *User : Flags->users()) {
57341 unsigned CCOpNo;
57342 switch (User->getOpcode()) {
57343 default:
57344 // Be conservative.
57345 return false;
57346 case X86ISD::SETCC:
57348 CCOpNo = 0;
57349 break;
57350 case X86ISD::BRCOND:
57351 case X86ISD::CMOV:
57352 CCOpNo = 2;
57353 break;
57354 }
57355
57356 X86::CondCode CC = (X86::CondCode)User->getConstantOperandVal(CCOpNo);
57357 if (CC != X86::COND_E && CC != X86::COND_NE)
57358 return false;
57359 }
57360
57361 return true;
57362}
57363
57366 const X86Subtarget &Subtarget) {
57367 // Only handle test patterns.
57368 if (!isNullConstant(N->getOperand(1)))
57369 return SDValue();
57370
57371 // If we have a CMP of a truncated binop, see if we can make a smaller binop
57372 // and use its flags directly.
57373 // TODO: Maybe we should try promoting compares that only use the zero flag
57374 // first if we can prove the upper bits with computeKnownBits?
57375 SDLoc dl(N);
57376 SDValue Op = N->getOperand(0);
57377 EVT VT = Op.getValueType();
57378 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
57379
57380 if (SDValue CMP =
57381 combineX86SubCmpForFlags(N, SDValue(N, 0), DAG, DCI, Subtarget))
57382 return CMP;
57383
57384 // If we have a constant logical shift that's only used in a comparison
57385 // against zero, turn it into an equivalent AND. This allows turning it into
57386 // a TEST instruction later.
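// e.g. (cmp (srl x, 5), 0) -> (cmp (and x, 0xffffffe0), 0).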
57387 if ((Op.getOpcode() == ISD::SRL || Op.getOpcode() == ISD::SHL) &&
57388 Op.hasOneUse() && isa<ConstantSDNode>(Op.getOperand(1)) &&
57389 onlyZeroFlagUsed(SDValue(N, 0))) {
57390 unsigned BitWidth = VT.getSizeInBits();
57391 const APInt &ShAmt = Op.getConstantOperandAPInt(1);
57392 if (ShAmt.ult(BitWidth)) { // Avoid undefined shifts.
57393 unsigned MaskBits = BitWidth - ShAmt.getZExtValue();
57394 APInt Mask = Op.getOpcode() == ISD::SRL
57395 ? APInt::getHighBitsSet(BitWidth, MaskBits)
57396 : APInt::getLowBitsSet(BitWidth, MaskBits);
57397 if (Mask.isSignedIntN(32)) {
57398 Op = DAG.getNode(ISD::AND, dl, VT, Op.getOperand(0),
57399 DAG.getConstant(Mask, dl, VT));
57400 return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op,
57401 DAG.getConstant(0, dl, VT));
57402 }
57403 }
57404 }
57405
57406 // If we're extracting from an AVX512 bool vector and comparing against zero,
57407 // then try to just bitcast the vector to an integer to use TEST/BT directly.
57408 // (and (extract_elt (kshiftr vXi1, C), 0), 1) -> (and (bc vXi1), 1<<C)
57409 if (Op.getOpcode() == ISD::AND && isOneConstant(Op.getOperand(1)) &&
57410 Op.hasOneUse() && onlyZeroFlagUsed(SDValue(N, 0))) {
57411 SDValue Src = Op.getOperand(0);
57412 if (Src.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
57413 isNullConstant(Src.getOperand(1)) &&
57414 Src.getOperand(0).getValueType().getScalarType() == MVT::i1) {
57415 SDValue BoolVec = Src.getOperand(0);
57416 unsigned ShAmt = 0;
57417 if (BoolVec.getOpcode() == X86ISD::KSHIFTR) {
57418 ShAmt = BoolVec.getConstantOperandVal(1);
57419 BoolVec = BoolVec.getOperand(0);
57420 }
57421 BoolVec = widenMaskVector(BoolVec, false, Subtarget, DAG, dl);
57422 EVT VecVT = BoolVec.getValueType();
57423 unsigned BitWidth = VecVT.getVectorNumElements();
57424 EVT BCVT = EVT::getIntegerVT(*DAG.getContext(), BitWidth);
57425 if (TLI.isTypeLegal(VecVT) && TLI.isTypeLegal(BCVT)) {
57426 APInt Mask = APInt::getOneBitSet(BitWidth, ShAmt);
57427 Op = DAG.getBitcast(BCVT, BoolVec);
57428 Op = DAG.getNode(ISD::AND, dl, BCVT, Op,
57429 DAG.getConstant(Mask, dl, BCVT));
57430 return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op,
57431 DAG.getConstant(0, dl, BCVT));
57432 }
57433 }
57434 }
57435
57436 // Peek through any zero-extend if we're only testing for a zero result.
57437 if (Op.getOpcode() == ISD::ZERO_EXTEND && onlyZeroFlagUsed(SDValue(N, 0))) {
57438 SDValue Src = Op.getOperand(0);
57439 EVT SrcVT = Src.getValueType();
57440 if (SrcVT.getScalarSizeInBits() >= 8 && TLI.isTypeLegal(SrcVT))
57441 return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Src,
57442 DAG.getConstant(0, dl, SrcVT));
57443 }
57444
57445 // Look for a truncate.
57446 if (Op.getOpcode() != ISD::TRUNCATE)
57447 return SDValue();
57448
57449 SDValue Trunc = Op;
57450 Op = Op.getOperand(0);
57451
57452 // See if we can compare with zero against the truncation source,
57453 // which should help using the Z flag from many ops. Only do this for
57454 // i32 truncated ops to prevent partial-reg compares of promoted ops.
57455 EVT OpVT = Op.getValueType();
57456 APInt UpperBits =
57458 if (OpVT == MVT::i32 && DAG.MaskedValueIsZero(Op, UpperBits) &&
57459 onlyZeroFlagUsed(SDValue(N, 0))) {
57460 return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op,
57461 DAG.getConstant(0, dl, OpVT));
57462 }
57463
57464 // After this the truncate and arithmetic op must have a single use.
57465 if (!Trunc.hasOneUse() || !Op.hasOneUse())
57466 return SDValue();
57467
57468 unsigned NewOpc;
57469 switch (Op.getOpcode()) {
57470 default: return SDValue();
57471 case ISD::AND:
57472 // Skip AND with a constant. We have special handling for AND with immediate
57473 // during isel to generate TEST instructions.
57474 if (isa<ConstantSDNode>(Op.getOperand(1)))
57475 return SDValue();
57476 NewOpc = X86ISD::AND;
57477 break;
57478 case ISD::OR: NewOpc = X86ISD::OR; break;
57479 case ISD::XOR: NewOpc = X86ISD::XOR; break;
57480 case ISD::ADD:
57481 // If the carry or overflow flag is used, we can't truncate.
57483 return SDValue();
57484 NewOpc = X86ISD::ADD;
57485 break;
57486 case ISD::SUB:
57487 // If the carry or overflow flag is used, we can't truncate.
57489 return SDValue();
57490 NewOpc = X86ISD::SUB;
57491 break;
57492 }
57493
57494 // We found an op we can narrow. Truncate its inputs.
57495 SDValue Op0 = DAG.getNode(ISD::TRUNCATE, dl, VT, Op.getOperand(0));
57496 SDValue Op1 = DAG.getNode(ISD::TRUNCATE, dl, VT, Op.getOperand(1));
57497
57498 // Use an X86 specific opcode to avoid DAG combine messing with it.
57499 SDVTList VTs = DAG.getVTList(VT, MVT::i32);
57500 Op = DAG.getNode(NewOpc, dl, VTs, Op0, Op1);
57501
57502 // For AND, keep a CMP so that we can match the test pattern.
57503 if (NewOpc == X86ISD::AND)
57504 return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op,
57505 DAG.getConstant(0, dl, VT));
57506
57507 // Return the flags.
57508 return Op.getValue(1);
57509}
57510
57513 const X86Subtarget &ST) {
57514 assert((X86ISD::ADD == N->getOpcode() || X86ISD::SUB == N->getOpcode()) &&
57515 "Expected X86ISD::ADD or X86ISD::SUB");
57516
57517 SDLoc DL(N);
57518 SDValue LHS = N->getOperand(0);
57519 SDValue RHS = N->getOperand(1);
57520 MVT VT = LHS.getSimpleValueType();
57521 bool IsSub = X86ISD::SUB == N->getOpcode();
57522 unsigned GenericOpc = IsSub ? ISD::SUB : ISD::ADD;
57523
57524 if (IsSub && isOneConstant(RHS) && !N->hasAnyUseOfValue(0))
57525 if (SDValue CMP = combineX86SubCmpForFlags(N, SDValue(N, 1), DAG, DCI, ST))
57526 return CMP;
57527
57528 // If we don't use the flag result, simplify back to a generic ADD/SUB.
57529 if (!N->hasAnyUseOfValue(1)) {
57530 SDValue Res = DAG.getNode(GenericOpc, DL, VT, LHS, RHS);
57531 return DAG.getMergeValues({Res, DAG.getConstant(0, DL, MVT::i32)}, DL);
57532 }
57533
57534 // Fold any similar generic ADD/SUB opcodes to reuse this node.
57535 auto MatchGeneric = [&](SDValue N0, SDValue N1, bool Negate) {
57536 SDValue Ops[] = {N0, N1};
57537 SDVTList VTs = DAG.getVTList(N->getValueType(0));
57538 if (SDNode *GenericAddSub = DAG.getNodeIfExists(GenericOpc, VTs, Ops)) {
57539 SDValue Op(N, 0);
57540 if (Negate) {
57541 // Bail if this is only used by a user of the x86 add/sub.
57542 if (GenericAddSub->hasOneUse() &&
57543 GenericAddSub->user_begin()->isOnlyUserOf(N))
57544 return;
57545 Op = DAG.getNegative(Op, DL, VT);
57546 }
57547 DCI.CombineTo(GenericAddSub, Op);
57548 }
57549 };
57550 MatchGeneric(LHS, RHS, false);
57551 MatchGeneric(RHS, LHS, X86ISD::SUB == N->getOpcode());
57552
57553 // TODO: Can we drop the ZeroSecondOpOnly limit? This is to guarantee that the
57554 // EFLAGS result doesn't change.
57555 return combineAddOrSubToADCOrSBB(IsSub, DL, VT, LHS, RHS, DAG,
57556 /*ZeroSecondOpOnly*/ true);
57557}
57558
57560 SDValue LHS = N->getOperand(0);
57561 SDValue RHS = N->getOperand(1);
57562 SDValue BorrowIn = N->getOperand(2);
57563
57564 if (SDValue Flags = combineCarryThroughADD(BorrowIn, DAG)) {
57565 MVT VT = N->getSimpleValueType(0);
57566 SDVTList VTs = DAG.getVTList(VT, MVT::i32);
57567 return DAG.getNode(X86ISD::SBB, SDLoc(N), VTs, LHS, RHS, Flags);
57568 }
57569
57570 // Fold SBB(SUB(X,Y),0,Carry) -> SBB(X,Y,Carry)
57571 // iff the flag result is dead.
57572 if (LHS.getOpcode() == ISD::SUB && isNullConstant(RHS) &&
57573 !N->hasAnyUseOfValue(1))
57574 return DAG.getNode(X86ISD::SBB, SDLoc(N), N->getVTList(), LHS.getOperand(0),
57575 LHS.getOperand(1), BorrowIn);
57576
57577 return SDValue();
57578}
57579
57580// Optimize RES, EFLAGS = X86ISD::ADC LHS, RHS, EFLAGS
57583 SDValue LHS = N->getOperand(0);
57584 SDValue RHS = N->getOperand(1);
57585 SDValue CarryIn = N->getOperand(2);
57586 auto *LHSC = dyn_cast<ConstantSDNode>(LHS);
57587 auto *RHSC = dyn_cast<ConstantSDNode>(RHS);
57588
57589 // Canonicalize constant to RHS.
57590 if (LHSC && !RHSC)
57591 return DAG.getNode(X86ISD::ADC, SDLoc(N), N->getVTList(), RHS, LHS,
57592 CarryIn);
57593
57594 // If the LHS and RHS of the ADC node are zero, then it can't overflow and
57595 // the result is either zero or one (depending on the input carry bit).
57596 // Strength reduce this down to a "set on carry" aka SETCC_CARRY&1.
57597 if (LHSC && RHSC && LHSC->isZero() && RHSC->isZero() &&
57598 // We don't have a good way to replace an EFLAGS use, so only do this when
57599 // dead right now.
57600 SDValue(N, 1).use_empty()) {
57601 SDLoc DL(N);
57602 EVT VT = N->getValueType(0);
57603 SDValue CarryOut = DAG.getConstant(0, DL, N->getValueType(1));
57604 SDValue Res1 = DAG.getNode(
57605 ISD::AND, DL, VT,
57607 DAG.getTargetConstant(X86::COND_B, DL, MVT::i8), CarryIn),
57608 DAG.getConstant(1, DL, VT));
57609 return DCI.CombineTo(N, Res1, CarryOut);
57610 }
57611
57612 // Fold ADC(C1,C2,Carry) -> ADC(0,C1+C2,Carry)
57613 // iff the flag result is dead.
57614 // TODO: Allow flag result if C1+C2 doesn't signed/unsigned overflow.
57615 if (LHSC && RHSC && !LHSC->isZero() && !N->hasAnyUseOfValue(1)) {
57616 SDLoc DL(N);
57617 APInt Sum = LHSC->getAPIntValue() + RHSC->getAPIntValue();
57618 return DAG.getNode(X86ISD::ADC, DL, N->getVTList(),
57619 DAG.getConstant(0, DL, LHS.getValueType()),
57620 DAG.getConstant(Sum, DL, LHS.getValueType()), CarryIn);
57621 }
57622
57623 if (SDValue Flags = combineCarryThroughADD(CarryIn, DAG)) {
57624 MVT VT = N->getSimpleValueType(0);
57625 SDVTList VTs = DAG.getVTList(VT, MVT::i32);
57626 return DAG.getNode(X86ISD::ADC, SDLoc(N), VTs, LHS, RHS, Flags);
57627 }
57628
57629 // Fold ADC(ADD(X,Y),0,Carry) -> ADC(X,Y,Carry)
57630 // iff the flag result is dead.
57631 if (LHS.getOpcode() == ISD::ADD && RHSC && RHSC->isZero() &&
57632 !N->hasAnyUseOfValue(1))
57633 return DAG.getNode(X86ISD::ADC, SDLoc(N), N->getVTList(), LHS.getOperand(0),
57634 LHS.getOperand(1), CarryIn);
57635
57636 return SDValue();
57637}
57638
57640 const SDLoc &DL, EVT VT,
57641 const X86Subtarget &Subtarget) {
57642 using namespace SDPatternMatch;
57643
57644 // Example of pattern we try to detect:
57645 // t := (v8i32 mul (sext (v8i16 x0), (sext (v8i16 x1))))
57646 // (add (build_vector (extract_elt t, 0),
57647 // (extract_elt t, 2),
57648 // (extract_elt t, 4),
57649 // (extract_elt t, 6)),
57650 // (build_vector (extract_elt t, 1),
57651 // (extract_elt t, 3),
57652 // (extract_elt t, 5),
57653 // (extract_elt t, 7)))
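// i.e. each 32-bit result element is x0[2i]*x1[2i] + x0[2i+1]*x1[2i+1], which
// is exactly the operation VPMADDWD performs.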
57654
57655 if (!Subtarget.hasSSE2())
57656 return SDValue();
57657
57658 if (!VT.isVector() || VT.getVectorElementType() != MVT::i32 ||
57659 VT.getVectorNumElements() < 4 ||
57661 return SDValue();
57662
57663 SDValue Op0, Op1, Accum;
57668 m_Value(Op1))))))
57669 return SDValue();
57670
57671 // Check if one of Op0,Op1 is of the form:
57672 // (build_vector (extract_elt Mul, 0),
57673 // (extract_elt Mul, 2),
57674 // (extract_elt Mul, 4),
57675 // ...
57676 // the other is of the form:
57677 // (build_vector (extract_elt Mul, 1),
57678 // (extract_elt Mul, 3),
57679 // (extract_elt Mul, 5),
57680 // ...
57681 // and identify Mul.
57682 SDValue Mul;
57683 for (unsigned i = 0, e = VT.getVectorNumElements(); i != e; i += 2) {
57684 SDValue Op0L = Op0->getOperand(i), Op1L = Op1->getOperand(i),
57685 Op0H = Op0->getOperand(i + 1), Op1H = Op1->getOperand(i + 1);
57686 // TODO: Be more tolerant to undefs.
57687 APInt Idx0L, Idx0H, Idx1L, Idx1H;
57688 SDValue Vec0L, Vec0H, Vec1L, Vec1H;
57689 if (!sd_match(Op0L, m_ExtractElt(m_Value(Vec0L), m_ConstInt(Idx0L))) ||
57690 !sd_match(Op0H, m_ExtractElt(m_Value(Vec0H), m_ConstInt(Idx0H))) ||
57691 !sd_match(Op1L, m_ExtractElt(m_Value(Vec1L), m_ConstInt(Idx1L))) ||
57692 !sd_match(Op1H, m_ExtractElt(m_Value(Vec1H), m_ConstInt(Idx1H))))
57693 return SDValue();
57694 // Commutativity of mul allows factors of a product to reorder.
57695 if (Idx0L.getZExtValue() > Idx1L.getZExtValue())
57696 std::swap(Idx0L, Idx1L);
57697 if (Idx0H.getZExtValue() > Idx1H.getZExtValue())
57698 std::swap(Idx0H, Idx1H);
57699 // Commutativity of add allows pairs of factors to reorder.
57700 if (Idx0L.getZExtValue() > Idx0H.getZExtValue()) {
57701 std::swap(Idx0L, Idx0H);
57702 std::swap(Idx1L, Idx1H);
57703 }
57704 if (Idx0L != 2 * i || Idx1L != 2 * i + 1 || Idx0H != 2 * i + 2 ||
57705 Idx1H != 2 * i + 3)
57706 return SDValue();
57707 if (!Mul) {
57708 // First time an extract_elt's source vector is visited. It must be a MUL
57709 // with 2X the number of vector elements of the BUILD_VECTOR.
57710 // Both extracts must be from the same MUL.
57711 Mul = Vec0L;
57712 if (Mul.getOpcode() != ISD::MUL ||
57713 Mul.getValueType().getVectorNumElements() != 2 * e)
57714 return SDValue();
57715 }
57716 // Check that the extract is from the same MUL previously seen.
57717 if (Mul != Vec0L || Mul != Vec1L || Mul != Vec0H || Mul != Vec1H)
57718 return SDValue();
57719 }
57720
57721 // Check if the Mul source can be safely shrunk.
57723 if (!canReduceVMulWidth(Mul.getNode(), DAG, Mode) ||
57725 return SDValue();
57726
57727 EVT TruncVT = EVT::getVectorVT(*DAG.getContext(), MVT::i16,
57728 VT.getVectorNumElements() * 2);
57729 SDValue N0 = DAG.getNode(ISD::TRUNCATE, DL, TruncVT, Mul.getOperand(0));
57730 SDValue N1 = DAG.getNode(ISD::TRUNCATE, DL, TruncVT, Mul.getOperand(1));
57731
57732 auto PMADDBuilder = [](SelectionDAG &DAG, const SDLoc &DL,
57734 EVT InVT = Ops[0].getValueType();
57735 assert(InVT == Ops[1].getValueType() && "Operands' types mismatch");
57736 EVT ResVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32,
57737 InVT.getVectorNumElements() / 2);
57738 return DAG.getNode(X86ISD::VPMADDWD, DL, ResVT, Ops[0], Ops[1]);
57739 };
57740 SDValue R = SplitOpsAndApply(DAG, Subtarget, DL, VT, {N0, N1}, PMADDBuilder);
57741 if (Accum)
57742 R = DAG.getNode(ISD::ADD, DL, VT, R, Accum);
57743 return R;
57744}
57745
57746// Attempt to turn this pattern into PMADDWD.
57747// (add (mul (sext (build_vector)), (sext (build_vector))),
57748// (mul (sext (build_vector)), (sext (build_vector)))
57750 const SDLoc &DL, EVT VT,
57751 const X86Subtarget &Subtarget) {
57752 using namespace SDPatternMatch;
57753
57754 if (!Subtarget.hasSSE2())
57755 return SDValue();
57756
57757 if (!VT.isVector() || VT.getVectorElementType() != MVT::i32 ||
57758 VT.getVectorNumElements() < 4 ||
57760 return SDValue();
57761
57762 // All inputs need to be sign extends.
57763 // TODO: Support ZERO_EXTEND from known positive?
57764 SDValue N00, N01, N10, N11;
57765 if (!sd_match(N, m_Add(m_Mul(m_SExt(m_Value(N00)), m_SExt(m_Value(N01))),
57766 m_Mul(m_SExt(m_Value(N10)), m_SExt(m_Value(N11))))))
57767 return SDValue();
57768
57769 // Must be extending from vXi16.
57770 EVT InVT = N00.getValueType();
57771 if (InVT.getVectorElementType() != MVT::i16 || N01.getValueType() != InVT ||
57772 N10.getValueType() != InVT || N11.getValueType() != InVT)
57773 return SDValue();
57774
57775 // All inputs should be build_vectors.
57776 if (N00.getOpcode() != ISD::BUILD_VECTOR ||
57777 N01.getOpcode() != ISD::BUILD_VECTOR ||
57778 N10.getOpcode() != ISD::BUILD_VECTOR ||
57780 return SDValue();
57781
57782 // For each result element we need the even element of one input vector
57783 // multiplied by the even element of the other vector, and the odd element
57784 // of the first multiplied by the odd element of the other. In other words,
57785 // for each element i we need to make sure this computation is being
57786 // performed:
57787 // A[2 * i] * B[2 * i] + A[2 * i + 1] * B[2 * i + 1]
57788 SDValue In0, In1;
57789 for (unsigned i = 0; i != N00.getNumOperands(); ++i) {
57790 SDValue N00Elt = N00.getOperand(i);
57791 SDValue N01Elt = N01.getOperand(i);
57792 SDValue N10Elt = N10.getOperand(i);
57793 SDValue N11Elt = N11.getOperand(i);
57794 // TODO: Be more tolerant to undefs.
57795 SDValue N00In, N01In, N10In, N11In;
57796 APInt IdxN00, IdxN01, IdxN10, IdxN11;
57797 if (!sd_match(N00Elt, m_ExtractElt(m_Value(N00In), m_ConstInt(IdxN00))) ||
57798 !sd_match(N01Elt, m_ExtractElt(m_Value(N01In), m_ConstInt(IdxN01))) ||
57799 !sd_match(N10Elt, m_ExtractElt(m_Value(N10In), m_ConstInt(IdxN10))) ||
57800 !sd_match(N11Elt, m_ExtractElt(m_Value(N11In), m_ConstInt(IdxN11))))
57801 return SDValue();
57802 // Add is commutative so indices can be reordered.
57803 if (IdxN00.getZExtValue() > IdxN10.getZExtValue()) {
57804 std::swap(IdxN00, IdxN10);
57805 std::swap(IdxN01, IdxN11);
57806 }
57807 // N0 indices must be the even element. N1 indices must be the next odd element.
57808 if (IdxN00 != 2 * i || IdxN10 != 2 * i + 1 || IdxN01 != 2 * i ||
57809 IdxN11 != 2 * i + 1)
57810 return SDValue();
57811
57812 // First time we find an input capture it.
57813 if (!In0) {
57814 In0 = N00In;
57815 In1 = N01In;
57816
57817 // The input vectors must be at least as wide as the output.
57818 // If they are larger than the output, we extract a subvector below.
57819 if (In0.getValueSizeInBits() < VT.getSizeInBits() ||
57820 In1.getValueSizeInBits() < VT.getSizeInBits())
57821 return SDValue();
57822 }
57823 // Mul is commutative so the input vectors can be in any order.
57824 // Canonicalize to make the compares easier.
57825 if (In0 != N00In)
57826 std::swap(N00In, N01In);
57827 if (In0 != N10In)
57828 std::swap(N10In, N11In);
57829 if (In0 != N00In || In1 != N01In || In0 != N10In || In1 != N11In)
57830 return SDValue();
57831 }
57832
57833 auto PMADDBuilder = [](SelectionDAG &DAG, const SDLoc &DL,
57835 EVT OpVT = Ops[0].getValueType();
57836 assert(OpVT.getScalarType() == MVT::i16 &&
57837 "Unexpected scalar element type");
57838 assert(OpVT == Ops[1].getValueType() && "Operands' types mismatch");
57839 EVT ResVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32,
57840 OpVT.getVectorNumElements() / 2);
57841 return DAG.getNode(X86ISD::VPMADDWD, DL, ResVT, Ops[0], Ops[1]);
57842 };
57843
57844 // If the output is narrower than an input, extract the low part of the input
57845 // vector.
57846 EVT OutVT16 = EVT::getVectorVT(*DAG.getContext(), MVT::i16,
57847 VT.getVectorNumElements() * 2);
57848 if (OutVT16.bitsLT(In0.getValueType())) {
57849 In0 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, OutVT16, In0,
57850 DAG.getVectorIdxConstant(0, DL));
57851 }
57852 if (OutVT16.bitsLT(In1.getValueType())) {
57853 In1 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, OutVT16, In1,
57854 DAG.getVectorIdxConstant(0, DL));
57855 }
57856 return SplitOpsAndApply(DAG, Subtarget, DL, VT, { In0, In1 },
57857 PMADDBuilder);
57858}
57859
57860// ADD(VPMADDWD(X,Y),VPMADDWD(Z,W)) -> VPMADDWD(SHUFFLE(X,Z), SHUFFLE(Y,W))
57861 // If the upper element of each pair in both VPMADDWD nodes is zero then we
57862 // can merge the operand elements and use the implicit add of VPMADDWD.
57863// TODO: Add support for VPMADDUBSW (which isn't commutable).
57865 const SDLoc &DL, EVT VT) {
57866 if (N0.getOpcode() != N1.getOpcode() || N0.getOpcode() != X86ISD::VPMADDWD)
57867 return SDValue();
57868
57869 // TODO: Add 256/512-bit support once VPMADDWD combines with shuffles.
57870 if (VT.getSizeInBits() > 128)
57871 return SDValue();
57872
57873 unsigned NumElts = VT.getVectorNumElements();
57874 MVT OpVT = N0.getOperand(0).getSimpleValueType();
57876 APInt DemandedHiElts = APInt::getSplat(2 * NumElts, APInt(2, 2));
57877
57878 bool Op0HiZero =
57879 DAG.MaskedValueIsZero(N0.getOperand(0), DemandedBits, DemandedHiElts) ||
57880 DAG.MaskedValueIsZero(N0.getOperand(1), DemandedBits, DemandedHiElts);
57881 bool Op1HiZero =
57882 DAG.MaskedValueIsZero(N1.getOperand(0), DemandedBits, DemandedHiElts) ||
57883 DAG.MaskedValueIsZero(N1.getOperand(1), DemandedBits, DemandedHiElts);
57884
57885 // TODO: Check for zero lower elements once we have actual codegen that
57886 // creates them.
57887 if (!Op0HiZero || !Op1HiZero)
57888 return SDValue();
57889
57890 // Create a shuffle mask packing the lower elements from each VPMADDWD.
57891 SmallVector<int> Mask;
57892 for (int i = 0; i != (int)NumElts; ++i) {
57893 Mask.push_back(2 * i);
57894 Mask.push_back(2 * (i + NumElts));
57895 }
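// e.g. for a v4i32 result (OpVT = v8i16) the mask is {0,8,2,10,4,12,6,14},
// interleaving the low i16 of each pair from the N0 and N1 operands.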
57896
57897 SDValue LHS =
57898 DAG.getVectorShuffle(OpVT, DL, N0.getOperand(0), N1.getOperand(0), Mask);
57899 SDValue RHS =
57900 DAG.getVectorShuffle(OpVT, DL, N0.getOperand(1), N1.getOperand(1), Mask);
57901 return DAG.getNode(X86ISD::VPMADDWD, DL, VT, LHS, RHS);
57902}
57903
57904/// CMOV of constants requires materializing constant operands in registers.
57905/// Try to fold those constants into an 'add' instruction to reduce instruction
57906 /// count. We do this with CMOV rather than the generic 'select' because there are
57907/// earlier folds that may be used to turn select-of-constants into logic hacks.
57909 SelectionDAG &DAG,
57910 const X86Subtarget &Subtarget) {
57911 // If an operand is zero, add-of-0 gets simplified away, so that's clearly
57912 // better because we eliminate 1-2 instructions. This transform is still
57913 // an improvement without zero operands because we trade 2 move constants and
57914 // 1 add for 2 adds (LEA) as long as the constants can be represented as
57915 // immediate asm operands (fit in 32-bits).
57916 auto isSuitableCmov = [](SDValue V) {
57917 if (V.getOpcode() != X86ISD::CMOV || !V.hasOneUse())
57918 return false;
57919 if (!isa<ConstantSDNode>(V.getOperand(0)) ||
57920 !isa<ConstantSDNode>(V.getOperand(1)))
57921 return false;
57922 return isNullConstant(V.getOperand(0)) || isNullConstant(V.getOperand(1)) ||
57923 (V.getConstantOperandAPInt(0).isSignedIntN(32) &&
57924 V.getConstantOperandAPInt(1).isSignedIntN(32));
57925 };
57926
57927 // Match an appropriate CMOV as the first operand of the add.
57928 SDValue Cmov = N->getOperand(0);
57929 SDValue OtherOp = N->getOperand(1);
57930 if (!isSuitableCmov(Cmov))
57931 std::swap(Cmov, OtherOp);
57932 if (!isSuitableCmov(Cmov))
57933 return SDValue();
57934
57935 // Don't remove a load folding opportunity for the add. That would neutralize
57936 // any improvements from removing constant materializations.
57937 if (X86::mayFoldLoad(OtherOp, Subtarget))
57938 return SDValue();
57939
57940 EVT VT = N->getValueType(0);
57941 SDValue FalseOp = Cmov.getOperand(0);
57942 SDValue TrueOp = Cmov.getOperand(1);
57943
57944 // We will push the add through the select, but we can potentially do better
57945 // if we know there is another add in the sequence and this is pointer math.
57946 // In that case, we can absorb an add into the trailing memory op and avoid
57947 // a 3-operand LEA which is likely slower than a 2-operand LEA.
57948 // TODO: If target has "slow3OpsLEA", do this even without the trailing memop?
57949 if (OtherOp.getOpcode() == ISD::ADD && OtherOp.hasOneUse() &&
57950 !isa<ConstantSDNode>(OtherOp.getOperand(0)) &&
57951 all_of(N->users(), [&](SDNode *Use) {
57952 auto *MemNode = dyn_cast<MemSDNode>(Use);
57953 return MemNode && MemNode->getBasePtr().getNode() == N;
57954 })) {
57955 // add (cmov C1, C2), add (X, Y) --> add (cmov (add X, C1), (add X, C2)), Y
57956 // TODO: We are arbitrarily choosing op0 as the 1st piece of the sum, but
57957 // it is possible that choosing op1 might be better.
57958 SDValue X = OtherOp.getOperand(0), Y = OtherOp.getOperand(1);
57959 FalseOp = DAG.getNode(ISD::ADD, DL, VT, X, FalseOp);
57960 TrueOp = DAG.getNode(ISD::ADD, DL, VT, X, TrueOp);
57961 Cmov = DAG.getNode(X86ISD::CMOV, DL, VT, FalseOp, TrueOp,
57962 Cmov.getOperand(2), Cmov.getOperand(3));
57963 return DAG.getNode(ISD::ADD, DL, VT, Cmov, Y);
57964 }
57965
57966 // add (cmov C1, C2), OtherOp --> cmov (add OtherOp, C1), (add OtherOp, C2)
57967 FalseOp = DAG.getNode(ISD::ADD, DL, VT, OtherOp, FalseOp);
57968 TrueOp = DAG.getNode(ISD::ADD, DL, VT, OtherOp, TrueOp);
57969 return DAG.getNode(X86ISD::CMOV, DL, VT, FalseOp, TrueOp, Cmov.getOperand(2),
57970 Cmov.getOperand(3));
57971}
57972
57975 const X86Subtarget &Subtarget) {
57976 using namespace SDPatternMatch;
57977 EVT VT = N->getValueType(0);
57978 SDValue Op0 = N->getOperand(0);
57979 SDValue Op1 = N->getOperand(1);
57980 SDLoc DL(N);
57981
57982 if (SDValue Select = pushAddIntoCmovOfConsts(N, DL, DAG, Subtarget))
57983 return Select;
57984
57985 if (SDValue MAdd = matchPMADDWD(DAG, N, DL, VT, Subtarget))
57986 return MAdd;
57987 if (SDValue MAdd = matchPMADDWD_2(DAG, N, DL, VT, Subtarget))
57988 return MAdd;
57989 if (SDValue MAdd = combineAddOfPMADDWD(DAG, Op0, Op1, DL, VT))
57990 return MAdd;
57991
57992 // Try to synthesize horizontal adds from adds of shuffles.
57993 if (SDValue V = combineToHorizontalAddSub(N, DAG, Subtarget))
57994 return V;
57995
57996 // Canonicalize hidden LEA pattern:
57997 // Fold (add (sub (shl x, c), y), z) -> (sub (add (shl x, c), z), y)
57998 // iff c < 4
57999 if (VT == MVT::i32 || VT == MVT::i64) {
58000 SDValue Y, Z, Shift;
58001 APInt Amt;
58002 if (sd_match(
58004 m_Shl(m_Value(), m_ConstInt(Amt))),
58005 m_Value(Y))),
58006 m_Value(Z))) &&
58007 Amt.ult(4) && !isa<ConstantSDNode>(Z)) {
58008 return DAG.getNode(ISD::SUB, DL, VT,
58009 DAG.getNode(ISD::ADD, DL, VT, Shift, Z), Y);
58010 }
58011 }
58012
58013 SDValue X, Y;
58014
58015 // add(psadbw(X,0),psadbw(Y,0)) -> psadbw(add(X,Y),0)
58016 // iff X and Y won't overflow.
58017 if (sd_match(Op0, m_c_BinOp(X86ISD::PSADBW, m_Value(X), m_Zero())) &&
58019 DAG.willNotOverflowAdd(/*IsSigned=*/false, X, Y)) {
58020 MVT OpVT = X.getSimpleValueType();
58021 SDValue Sum = DAG.getNode(ISD::ADD, DL, OpVT, X, Y);
58022 return DAG.getNode(X86ISD::PSADBW, DL, VT, Sum,
58023 getZeroVector(OpVT, Subtarget, DAG, DL));
58024 }
58025
58026 if (VT.isVector()) {
58027 EVT BoolVT = EVT::getVectorVT(*DAG.getContext(), MVT::i1,
58029
58030 // If vectors of i1 are legal, turn (add (zext (vXi1 X)), Y) into
58031 // (sub Y, (sext (vXi1 X))).
58032 // FIXME: We have the (sub Y, (zext (vXi1 X))) -> (add (sext (vXi1 X)), Y)
58033 // in generic DAG combine without a legal type check, but adding this there
58034 // caused regressions.
58035 if (DAG.getTargetLoweringInfo().isTypeLegal(BoolVT) &&
58037 m_Value(Y)))) {
58038 SDValue SExt = DAG.getNode(ISD::SIGN_EXTEND, DL, VT, X);
58039 return DAG.getNode(ISD::SUB, DL, VT, Y, SExt);
58040 }
58041
58042 // Fold (add X, (srl Y, 7)) -> (sub X, (icmp_sgt 0, Y)) to undo instcombine
58043 // canonicalisation as we don't have good vXi8 shifts.
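// Per i8 lane, (srl Y, 7) is the sign bit as 0 or 1, while (setgt 0, Y)
// sign-extends to 0 or -1, so subtracting the compare adds the same amount.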
58044 if (VT.getScalarType() == MVT::i8 &&
58046 SDValue Cmp =
58047 DAG.getSetCC(DL, BoolVT, DAG.getConstant(0, DL, VT), Y, ISD::SETGT);
58048 return DAG.getNode(ISD::SUB, DL, VT, X, DAG.getSExtOrTrunc(Cmp, DL, VT));
58049 }
58050 }
58051
58052 // Peephole for 512-bit VPDPBSSD on non-VLX targets.
58053 // TODO: Should this be part of matchPMADDWD/matchPMADDWD_2?
58054 if (Subtarget.hasVNNI() && Subtarget.useAVX512Regs() && VT == MVT::v16i32) {
58055 SDValue Accum, Lo0, Lo1, Hi0, Hi1;
58056 if (sd_match(N, m_Add(m_Value(Accum),
58059 m_Value(Lo1)),
58061 m_Value(Hi1)))))) {
58062 return DAG.getNode(X86ISD::VPDPWSSD, DL, VT, Accum,
58063 concatSubVectors(Lo0, Hi0, DAG, DL),
58064 concatSubVectors(Lo1, Hi1, DAG, DL));
58065 }
58066 }
58067
58068 // Fold ADD(ADC(Y,0,W),X) -> ADC(X,Y,W)
58069 if (Op0.getOpcode() == X86ISD::ADC && Op0->hasOneUse() &&
58070 X86::isZeroNode(Op0.getOperand(1))) {
58071 assert(!Op0->hasAnyUseOfValue(1) && "Overflow bit in use");
58072 return DAG.getNode(X86ISD::ADC, SDLoc(Op0), Op0->getVTList(), Op1,
58073 Op0.getOperand(0), Op0.getOperand(2));
58074 }
58075
58076 return combineAddOrSubToADCOrSBB(N, DL, DAG);
58077}
58078
58079// Try to fold (sub Y, cmovns X, -X) -> (add Y, cmovns -X, X) if the cmov
58080// condition comes from the subtract node that produced -X. This matches the
58081// cmov expansion for absolute value. By swapping the operands we convert abs
58082// to nabs.
58083static SDValue combineSubABS(EVT VT, const SDLoc &DL, SDValue N0, SDValue N1,
58084 SelectionDAG &DAG) {
58085 if (N1.getOpcode() != X86ISD::CMOV || !N1.hasOneUse())
58086 return SDValue();
58087
58088 SDValue Cond = N1.getOperand(3);
58089 if (Cond.getOpcode() != X86ISD::SUB)
58090 return SDValue();
58091 assert(Cond.getResNo() == 1 && "Unexpected result number");
58092
58093 SDValue FalseOp = N1.getOperand(0);
58094 SDValue TrueOp = N1.getOperand(1);
58096
58097 // ABS condition should come from a negate operation.
58098 if ((CC == X86::COND_S || CC == X86::COND_NS) &&
58099 isNullConstant(Cond.getOperand(0))) {
58100 // Get the X and -X from the negate.
58101 SDValue NegX = Cond.getValue(0);
58102 SDValue X = Cond.getOperand(1);
58103
58104 // Cmov operands should be X and NegX. Order doesn't matter.
58105 if (!(TrueOp == X && FalseOp == NegX) && !(TrueOp == NegX && FalseOp == X))
58106 return SDValue();
58107
58108 // Build a new CMOV with the operands swapped.
58109 SDValue Cmov = DAG.getNode(X86ISD::CMOV, DL, VT, TrueOp, FalseOp,
58110 N1.getOperand(2), Cond);
58111 // Convert sub to add.
58112 return DAG.getNode(ISD::ADD, DL, VT, N0, Cmov);
58113 }
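// Illustrative note: with abs(X) expanded as a CMOV of X and 0-X keyed on the
// sign of 0-X, swapping the CMOV operands yields nabs(X) = -abs(X), and
// N0 - abs(X) == N0 + nabs(X), which is why the outer SUB becomes an ADD.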
58114
58115 // Handle ABD special case:
58116 // NEG(ABD(X,Y)) -> NEG(CMOV(SUB(X,Y),SUB(Y,X))) -> CMOV(SUB(Y,X),SUB(X,Y)).
58117 // ABD condition should come from a pair of matching subtracts.
58118 if ((CC == X86::COND_L || CC == X86::COND_B) && isNullConstant(N0) &&
58119 (FalseOp == Cond.getValue(0) || TrueOp == Cond.getValue(0)) &&
58120 (TrueOp.getOpcode() == ISD::SUB || TrueOp.getOpcode() == X86ISD::SUB) &&
58121 (FalseOp.getOpcode() == ISD::SUB || FalseOp.getOpcode() == X86ISD::SUB) &&
58122 (TrueOp.getOperand(0) == FalseOp.getOperand(1)) &&
58123 (TrueOp.getOperand(1) == FalseOp.getOperand(0))) {
58124 // Build a new CMOV with the operands swapped.
58125 return DAG.getNode(X86ISD::CMOV, DL, VT, TrueOp, FalseOp, N1.getOperand(2),
58126 Cond);
58127 }
58128
58129 return SDValue();
58130}
58131
58132 static SDValue combineSubSetcc(SDNode *N, SelectionDAG &DAG) {
58133 SDValue Op0 = N->getOperand(0);
58134 SDValue Op1 = N->getOperand(1);
58135
58136 // (sub C (zero_extend (setcc)))
58137 // =>
58138 // (add (zero_extend (setcc inverted)) C-1) if C is a nonzero immediate
58139 // Don't disturb (sub 0 setcc), which is easily done with neg.
58140 EVT VT = N->getValueType(0);
58141 auto *Op0C = dyn_cast<ConstantSDNode>(Op0);
58142 if (Op1.getOpcode() == ISD::ZERO_EXTEND && Op1.hasOneUse() && Op0C &&
58143 !Op0C->isZero() && Op1.getOperand(0).getOpcode() == X86ISD::SETCC &&
58144 Op1.getOperand(0).hasOneUse()) {
58145 SDValue SetCC = Op1.getOperand(0);
58146 X86::CondCode CC = (X86::CondCode)SetCC.getConstantOperandVal(0);
58147 X86::CondCode NewCC = X86::GetOppositeBranchCondition(CC);
58148 APInt NewImm = Op0C->getAPIntValue() - 1;
58149 SDLoc DL(Op1);
58150 SDValue NewSetCC = getSETCC(NewCC, SetCC.getOperand(1), DL, DAG);
58151 NewSetCC = DAG.getNode(ISD::ZERO_EXTEND, DL, VT, NewSetCC);
58152 return DAG.getNode(X86ISD::ADD, DL, DAG.getVTList(VT, VT), NewSetCC,
58153 DAG.getConstant(NewImm, DL, VT));
58154 }
58155
58156 return SDValue();
58157}
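// Worked example (illustrative): (sub 5, (zext (setcc cc))) becomes
// (add (zext (setcc !cc)), 4); if cc holds, 5 - 1 == 0 + 4, and if it fails,
// 5 - 0 == 1 + 4, so the two forms agree for any nonzero immediate.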
58158
58159 static SDValue combineX86CloadCstore(SDNode *N, SelectionDAG &DAG) {
58160 if (N->getConstantOperandVal(3) != X86::COND_NE)
58161 return SDValue();
58162
58163 SDValue Sub = N->getOperand(4);
58164 if (Sub.getOpcode() != X86ISD::SUB)
58165 return SDValue();
58166
58167 SDValue Op1 = Sub.getOperand(1);
58168
58169 if (!X86::isZeroNode(Sub.getOperand(0)))
58170 return SDValue();
58171
58172 SDLoc DL(N);
58173 SmallVector<SDValue, 5> Ops(N->op_values());
58174 if (Op1.getOpcode() == X86ISD::SETCC) {
58175 // res, flags2 = sub 0, (setcc cc, flag)
58176 // cload/cstore ..., cond_ne, flag2
58177 // ->
58178 // cload/cstore cc, flag
58179 Ops[3] = Op1.getOperand(0);
58180 Ops[4] = Op1.getOperand(1);
58181 } else if (Op1.getOpcode() == ISD::AND && Sub.getValue(0).use_empty()) {
58182 SDValue Src = Op1;
58183 SDValue Op10 = Op1.getOperand(0);
58184 if (Op10.getOpcode() == ISD::XOR && isAllOnesConstant(Op10.getOperand(1))) {
58185 // res, flags2 = sub 0, (and (xor X, -1), Y)
58186 // cload/cstore ..., cond_ne, flag2
58187 // ->
58188 // res, flags2 = sub 0, (and X, Y)
58189 // cload/cstore ..., cond_e, flag2
58190 Src = DAG.getNode(ISD::AND, DL, Op1.getValueType(), Op10.getOperand(0),
58191 Op1.getOperand(1));
58192 Ops[3] = DAG.getTargetConstant(X86::COND_E, DL, MVT::i8);
58193 }
58194 // res, flags2 = sub 0, (and X, Y)
58195 // cload/cstore ..., cc, flag2
58196 // ->
58197 // res, flags2 = cmp (and X, Y), 0
58198 // cload/cstore ..., cc, flag2
58199 Ops[4] = DAG.getNode(X86ISD::CMP, DL, MVT::i32, Src, Sub.getOperand(0));
58200 } else {
58201 return SDValue();
58202 }
58203
58204 return DAG.getMemIntrinsicNode(N->getOpcode(), DL, N->getVTList(), Ops,
58205 cast<MemSDNode>(N)->getMemoryVT(),
58206 cast<MemSDNode>(N)->getMemOperand());
58207}
58208
58209 static SDValue combineSub(SDNode *N, SelectionDAG &DAG,
58210 TargetLowering::DAGCombinerInfo &DCI,
58211 const X86Subtarget &Subtarget) {
58212 EVT VT = N->getValueType(0);
58213 SDValue Op0 = N->getOperand(0);
58214 SDValue Op1 = N->getOperand(1);
58215 SDLoc DL(N);
58216
58217 auto IsNonOpaqueConstant = [&](SDValue Op) {
58218 return DAG.isConstantIntBuildVectorOrConstantInt(Op,
58219 /*AllowOpaques*/ false);
58220 };
58221
58222 // X86 can't encode an immediate LHS of a sub. See if we can push the
58223 // negation into a preceding instruction. If the RHS of the sub is a XOR with
58224 // one use and a constant, invert the immediate, saving one register.
58225 // However, ignore cases where C1 is 0, as those will become a NEG.
58226 // sub(C1, xor(X, C2)) -> add(xor(X, ~C2), C1+1)
58227 if (Op1.getOpcode() == ISD::XOR && IsNonOpaqueConstant(Op0) &&
58228 !isNullConstant(Op0) && IsNonOpaqueConstant(Op1.getOperand(1)) &&
58229 Op1->hasOneUse()) {
58230 SDValue NewXor = DAG.getNode(ISD::XOR, SDLoc(Op1), VT, Op1.getOperand(0),
58231 DAG.getNOT(SDLoc(Op1), Op1.getOperand(1), VT));
58232 SDValue NewAdd =
58233 DAG.getNode(ISD::ADD, DL, VT, Op0, DAG.getConstant(1, DL, VT));
58234 return DAG.getNode(ISD::ADD, DL, VT, NewXor, NewAdd);
58235 }
58236
58237 if (SDValue V = combineSubABS(VT, DL, Op0, Op1, DAG))
58238 return V;
58239
58240 // Try to synthesize horizontal subs from subs of shuffles.
58241 if (SDValue V = combineToHorizontalAddSub(N, DAG, Subtarget))
58242 return V;
58243
58244 // Fold SUB(X,ADC(Y,0,W)) -> SBB(X,Y,W)
58245 if (Op1.getOpcode() == X86ISD::ADC && Op1->hasOneUse() &&
58246 X86::isZeroNode(Op1.getOperand(1))) {
58247 assert(!Op1->hasAnyUseOfValue(1) && "Overflow bit in use");
58248 return DAG.getNode(X86ISD::SBB, SDLoc(Op1), Op1->getVTList(), Op0,
58249 Op1.getOperand(0), Op1.getOperand(2));
58250 }
58251
58252 // Fold SUB(X,SBB(Y,Z,W)) -> SUB(ADC(X,Z,W),Y)
58253 // Don't fold to ADC(0,0,W)/SETCC_CARRY pattern which will prevent more folds.
58254 if (Op1.getOpcode() == X86ISD::SBB && Op1->hasOneUse() &&
58255 !(X86::isZeroNode(Op0) && X86::isZeroNode(Op1.getOperand(1)))) {
58256 assert(!Op1->hasAnyUseOfValue(1) && "Overflow bit in use");
58257 SDValue ADC = DAG.getNode(X86ISD::ADC, SDLoc(Op1), Op1->getVTList(), Op0,
58258 Op1.getOperand(1), Op1.getOperand(2));
58259 return DAG.getNode(ISD::SUB, DL, VT, ADC.getValue(0), Op1.getOperand(0));
58260 }
58261
58262 if (SDValue V = combineXorSubCTLZ(N, DL, DAG, Subtarget))
58263 return V;
58264
58265 if (SDValue V = combineAddOrSubToADCOrSBB(N, DL, DAG))
58266 return V;
58267
58268 return combineSubSetcc(N, DAG);
58269}
58270
58271 static SDValue combineVectorCompare(SDNode *N, SelectionDAG &DAG,
58272 const X86Subtarget &Subtarget) {
58273 unsigned Opcode = N->getOpcode();
58274 assert((Opcode == X86ISD::PCMPEQ || Opcode == X86ISD::PCMPGT) &&
58275 "Unknown PCMP opcode");
58276
58277 SDValue LHS = N->getOperand(0);
58278 SDValue RHS = N->getOperand(1);
58279 MVT VT = N->getSimpleValueType(0);
58280 unsigned EltBits = VT.getScalarSizeInBits();
58281 unsigned NumElts = VT.getVectorNumElements();
58282 SDLoc DL(N);
58283
58284 if (LHS == RHS)
58285 return (Opcode == X86ISD::PCMPEQ) ? DAG.getAllOnesConstant(DL, VT)
58286 : DAG.getConstant(0, DL, VT);
58287
58288 // Constant Folding.
58289 // PCMPEQ(X,UNDEF) -> UNDEF
58290 // PCMPGT(X,UNDEF) -> 0
58291 // PCMPGT(UNDEF,X) -> 0
58292 APInt LHSUndefs, RHSUndefs;
58293 SmallVector<APInt> LHSBits, RHSBits;
58294 if (getTargetConstantBitsFromNode(LHS, EltBits, LHSUndefs, LHSBits) &&
58295 getTargetConstantBitsFromNode(RHS, EltBits, RHSUndefs, RHSBits)) {
58296 APInt Ones = APInt::getAllOnes(EltBits);
58297 APInt Zero = APInt::getZero(EltBits);
58298 SmallVector<APInt> Results(NumElts);
58299 for (unsigned I = 0; I != NumElts; ++I) {
58300 if (Opcode == X86ISD::PCMPEQ) {
58301 Results[I] = (LHSBits[I] == RHSBits[I]) ? Ones : Zero;
58302 } else {
58303 bool AnyUndef = LHSUndefs[I] || RHSUndefs[I];
58304 Results[I] = (!AnyUndef && LHSBits[I].sgt(RHSBits[I])) ? Ones : Zero;
58305 }
58306 }
58307 if (Opcode == X86ISD::PCMPEQ)
58308 return getConstVector(Results, LHSUndefs | RHSUndefs, VT, DAG, DL);
58309 return getConstVector(Results, VT, DAG, DL);
58310 }
58311
58312 return SDValue();
58313}
58314
58315// Helper to determine if we can convert an integer comparison to a float
58316 // comparison by casting the operands.
58317static std::optional<unsigned>
58318CastIntSETCCtoFP(MVT VT, ISD::CondCode CC, unsigned NumSignificantBitsLHS,
58319 unsigned NumSignificantBitsRHS) {
58320 MVT SVT = VT.getScalarType();
58321 assert(SVT == MVT::f32 && "Only tested for float so far");
58322 const fltSemantics &Sem = SVT.getFltSemantics();
58323 assert((CC == ISD::SETEQ || CC == ISD::SETGT) &&
58324 "Only PCMPEQ/PCMPGT currently supported");
58325
58326 // TODO: Handle bitcastable integers.
58327
58328 // For cvt + signed compare we need lhs and rhs to be exactly representable as
58329 // a fp value.
58330 unsigned FPPrec = APFloat::semanticsPrecision(Sem);
58331 if (FPPrec >= NumSignificantBitsLHS && FPPrec >= NumSignificantBitsRHS)
58332 return ISD::SINT_TO_FP;
58333
58334 return std::nullopt;
58335}
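// Worked example (illustrative): f32 carries 24 bits of precision, so i32
// operands with at most 24 significant bits convert exactly via SINT_TO_FP
// and the float compare matches the integer compare; with more significant
// bits the conversion may round and flip the result, hence the bail-out.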
58336
58337/// Helper that combines an array of subvector ops as if they were the operands
58338 /// of an ISD::CONCAT_VECTORS node, but may have come from another source (e.g.
58339/// ISD::INSERT_SUBVECTOR). The ops are assumed to be of the same type.
58340 static SDValue combineConcatVectorOps(const SDLoc &DL, MVT VT,
58341 ArrayRef<SDValue> Ops, SelectionDAG &DAG,
58342 const X86Subtarget &Subtarget,
58343 unsigned Depth) {
58344 assert(Subtarget.hasAVX() && "AVX assumed for concat_vectors");
58345 unsigned EltSizeInBits = VT.getScalarSizeInBits();
58346
58347 if (llvm::all_of(Ops, [](SDValue Op) { return Op.isUndef(); }))
58348 return DAG.getUNDEF(VT);
58349
58350 if (llvm::all_of(Ops, [](SDValue Op) {
58351 return Op.isUndef() || ISD::isBuildVectorAllZeros(Op.getNode());
58352 }))
58353 return getZeroVector(VT, Subtarget, DAG, DL);
58354
58355 if (Depth >= X86::MaxShuffleCombineDepth)
58356 return SDValue(); // Limit search depth.
58357
58358 SDValue Op0 = Ops[0];
58359 bool IsSplat = llvm::all_equal(Ops);
58360 unsigned NumOps = Ops.size();
58361 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
58362 LLVMContext &Ctx = *DAG.getContext();
58363
58364 // Repeated subvectors.
58365 if (IsSplat &&
58366 (VT.is256BitVector() || (VT.is512BitVector() && Subtarget.hasAVX512()))) {
58367 // If this broadcast is inserted into both halves, use a larger broadcast.
58368 if (Op0.getOpcode() == X86ISD::VBROADCAST)
58369 return DAG.getNode(Op0.getOpcode(), DL, VT, Op0.getOperand(0));
58370
58371 // concat_vectors(movddup(x),movddup(x)) -> broadcast(x)
58372 if (Op0.getOpcode() == X86ISD::MOVDDUP && VT == MVT::v4f64 &&
58373 (Subtarget.hasAVX2() ||
58374 X86::mayFoldLoadIntoBroadcastFromMem(Op0.getOperand(0),
58375 VT.getScalarType(), Subtarget)))
58376 return DAG.getNode(X86ISD::VBROADCAST, DL, VT,
58377 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f64,
58378 Op0.getOperand(0),
58379 DAG.getVectorIdxConstant(0, DL)));
58380
58381 // concat_vectors(scalar_to_vector(x),scalar_to_vector(x)) -> broadcast(x)
58382 if (Op0.getOpcode() == ISD::SCALAR_TO_VECTOR &&
58383 (Subtarget.hasAVX2() ||
58384 (EltSizeInBits >= 32 &&
58385 X86::mayFoldLoad(Op0.getOperand(0), Subtarget))) &&
58386 Op0.getOperand(0).getValueType() == VT.getScalarType())
58387 return DAG.getNode(X86ISD::VBROADCAST, DL, VT, Op0.getOperand(0));
58388
58389 // concat_vectors(extract_subvector(splat(x)),
58390 // extract_subvector(splat(x))) -> splat(x)
58391 // concat_vectors(extract_subvector(subv_broadcast(x)),
58392 // extract_subvector(subv_broadcast(x))) -> subv_broadcast(x)
58393 if (Op0.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
58394 Op0.getOperand(0).getValueType() == VT) {
58395 SDValue SrcVec = Op0.getOperand(0);
58396 if (DAG.isSplatValue(SrcVec, /*AllowUndefs*/ false))
58397 return SrcVec;
58398 if (SrcVec.getOpcode() == X86ISD::SUBV_BROADCAST_LOAD &&
58399 Op0.getValueType() == cast<MemSDNode>(SrcVec)->getMemoryVT())
58400 return SrcVec;
58401 }
58402
58403 // concat_vectors(permq(x),permq(x)) -> permq(concat_vectors(x,x))
58404 if (Op0.getOpcode() == X86ISD::VPERMI && Subtarget.useAVX512Regs() &&
58405 !X86::mayFoldLoad(Op0.getOperand(0), Subtarget))
58406 return DAG.getNode(Op0.getOpcode(), DL, VT,
58407 DAG.getNode(ISD::CONCAT_VECTORS, DL, VT,
58408 Op0.getOperand(0), Op0.getOperand(0)),
58409 Op0.getOperand(1));
58410 }
58411
58412 // TODO: This should go in combineX86ShufflesRecursively eventually.
58413 if (NumOps == 2) {
58414 SDValue Src0 = peekThroughBitcasts(Ops[0]);
58415 SDValue Src1 = peekThroughBitcasts(Ops[1]);
58416 if (Src0.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
58417 Src1.getOpcode() == ISD::EXTRACT_SUBVECTOR) {
58418 EVT SrcVT0 = Src0.getOperand(0).getValueType();
58419 EVT SrcVT1 = Src1.getOperand(0).getValueType();
58420 unsigned NumSrcElts0 = SrcVT0.getVectorNumElements();
58421 unsigned NumSrcElts1 = SrcVT1.getVectorNumElements();
58422 const APInt &SrcIdx0 = Src0.getConstantOperandAPInt(1);
58423 const APInt &SrcIdx1 = Src1.getConstantOperandAPInt(1);
58424 // concat(extract_subvector(v0), extract_subvector(v1)) -> vperm2x128.
58425 // Only concat subvector high halves (which vperm2x128 handles best), or
58426 // cases where the result should fold into a subvector broadcast.
58427 if (VT.is256BitVector() && SrcVT0.is256BitVector() &&
58428 SrcVT1.is256BitVector()) {
58429 assert((SrcIdx0 == 0 || SrcIdx0 == (NumSrcElts0 / 2)) &&
58430 (SrcIdx1 == 0 || SrcIdx1 == (NumSrcElts1 / 2)) &&
58431 "Bad subvector index");
58432 if ((SrcIdx0 == (NumSrcElts0 / 2) && SrcIdx1 == (NumSrcElts1 / 2)) ||
58433 (IsSplat && ISD::isNormalLoad(Src0.getOperand(0).getNode()))) {
58434 unsigned Index = 0;
58435 Index |= SrcIdx0 == 0 ? 0x00 : 0x01;
58436 Index |= SrcIdx1 == 0 ? 0x20 : 0x30;
58437 return DAG.getNode(X86ISD::VPERM2X128, DL, VT,
58438 DAG.getBitcast(VT, Src0.getOperand(0)),
58439 DAG.getBitcast(VT, Src1.getOperand(0)),
58440 DAG.getTargetConstant(Index, DL, MVT::i8));
58441 }
58442 }
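// Illustrative note: each nibble of the VPERM2X128 immediate selects a
// 128-bit half (0/1 from the first source, 2/3 from the second), so
// concatenating the high halves of both sources encodes as 0x31.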
58443 // Widen extract_subvector
58444 // concat(extract_subvector(x,lo), extract_subvector(x,hi))
58445 // --> extract_subvector(x,lo)
58446 unsigned NumSubElts0 = Src0.getValueType().getVectorNumElements();
58447 if (Src0.getOperand(0) == Src1.getOperand(0) &&
58448 (SrcIdx0 == 0 || SrcIdx0 == (NumSrcElts0 / 2)) &&
58449 SrcIdx1 == (SrcIdx0 + NumSubElts0)) {
58450 return DAG.getBitcast(VT,
58451 extractSubVector(Src0.getOperand(0),
58452 Src0.getConstantOperandVal(1),
58453 DAG, DL, VT.getSizeInBits()));
58454 }
58455 }
58456 }
58457
58458 // Repeated opcode.
58459 // TODO - combineX86ShufflesRecursively should handle shuffle concatenation
58460 // but it currently struggles with different vector widths.
58461 if (llvm::all_of(Ops, [Op0](SDValue Op) {
58462 return Op.getOpcode() == Op0.getOpcode() && Op.hasOneUse();
58463 })) {
58464 auto ConcatSubOperand = [&](EVT VT, ArrayRef<SDValue> SubOps, unsigned I) {
58465 SmallVector<SDValue> Subs;
58466 for (SDValue SubOp : SubOps)
58467 Subs.push_back(SubOp.getOperand(I));
58468 // Attempt to peek through bitcasts and concat the original subvectors.
58469 EVT SubVT = peekThroughBitcasts(Subs[0]).getValueType();
58470 if (SubVT.isSimple() && SubVT.isVector()) {
58471 MVT ConcatVT =
58472 MVT::getVectorVT(SubVT.getSimpleVT().getScalarType(),
58473 SubVT.getVectorElementCount() * Subs.size());
58474 for (SDValue &Sub : Subs)
58475 Sub = DAG.getBitcast(SubVT, Sub);
58476 if (SDValue ConcatSrc = combineConcatVectorOps(DL, ConcatVT, Subs, DAG,
58477 Subtarget, Depth + 1))
58478 return DAG.getBitcast(VT, ConcatSrc);
58479 return DAG.getBitcast(
58480 VT, DAG.getNode(ISD::CONCAT_VECTORS, DL, ConcatVT, Subs));
58481 }
58482 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Subs);
58483 };
58484 auto IsConcatFree = [](MVT VT, ArrayRef<SDValue> SubOps, unsigned Op) {
58485 bool AllConstants = true;
58486 bool AllSubs = true;
58487 unsigned VecSize = VT.getSizeInBits();
58488 SDValue BC0 = peekThroughBitcasts(SubOps[0].getOperand(Op));
58489 if (isa<LoadSDNode>(BC0) && all_of(SubOps, [&](SDValue SubOp) {
58490 return BC0 == peekThroughBitcasts(SubOp.getOperand(Op));
58491 }))
58492 return true;
58493 for (unsigned I = 0, E = SubOps.size(); I != E; ++I) {
58494 SDValue BC = peekThroughBitcasts(SubOps[I].getOperand(Op));
58495 unsigned SubSize = BC.getValueSizeInBits();
58496 unsigned EltSize = BC.getScalarValueSizeInBits();
58497 AllConstants &= ISD::isBuildVectorOfConstantSDNodes(BC.getNode()) ||
58498 ISD::isBuildVectorOfConstantFPSDNodes(BC.getNode());
58499 AllSubs &= BC.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
58500 BC.getOperand(0).getValueSizeInBits() == VecSize &&
58501 (BC.getConstantOperandVal(1) * EltSize) == (I * SubSize);
58502 }
58503 return AllConstants || AllSubs;
58504 };
58505 auto CombineSubOperand = [&](MVT VT, ArrayRef<SDValue> SubOps, unsigned I) {
58506 bool AllConstants = true;
58507 SmallVector<SDValue> Subs;
58508 for (SDValue SubOp : SubOps) {
58509 SDValue BC = peekThroughBitcasts(SubOp.getOperand(I));
58510 AllConstants &= ISD::isBuildVectorOfConstantSDNodes(BC.getNode()) ||
58511 ISD::isBuildVectorOfConstantFPSDNodes(BC.getNode());
58512 Subs.push_back(SubOp.getOperand(I));
58513 }
58514 if (AllConstants)
58515 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Subs);
58516 return combineConcatVectorOps(DL, VT, Subs, DAG, Subtarget, Depth + 1);
58517 };
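// Roughly: ConcatSubOperand always materializes the concatenated operand
// (recursing through combineConcatVectorOps first), IsConcatFree is a
// cheapness heuristic, and CombineSubOperand only returns a value when the
// recursive combine or constant concatenation succeeds, letting callers gate
// a fold on at least one operand concatenating for free.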
58518
58519 unsigned Opcode = Op0.getOpcode();
58520 switch (Opcode) {
58521 case ISD::BITCAST: {
58522 // TODO: Support AVX1/AVX2 bitcasts.
58523 SmallVector<SDValue, 4> SubOps;
58524 for (SDValue SubOp : Ops)
58525 SubOps.push_back(peekThroughBitcasts(SubOp.getOperand(0)));
58526 EVT InnerVT = SubOps[0].getValueType();
58527 unsigned InnerSizeInBits = InnerVT.getScalarSizeInBits();
58528 if (!IsSplat && InnerVT.isSimple() && InnerVT.isVector() &&
58529 (Subtarget.hasBWI() ||
58530 (EltSizeInBits >= 32 && InnerSizeInBits >= 32)) &&
58531 ((VT.is256BitVector() && Subtarget.hasVLX()) ||
58532 (VT.is512BitVector() && Subtarget.useAVX512Regs())) &&
58533 llvm::all_of(SubOps, [InnerVT](SDValue Op) {
58534 return Op.getValueType() == InnerVT;
58535 })) {
58536 MVT ConcatSVT = InnerVT.getScalarType().getSimpleVT();
58537 MVT ConcatVT = MVT::getVectorVT(
58538 ConcatSVT, VT.getSizeInBits() / ConcatSVT.getSizeInBits());
58539 if (SDValue ConcatSrc = combineConcatVectorOps(
58540 DL, ConcatVT, SubOps, DAG, Subtarget, Depth + 1))
58541 return DAG.getBitcast(VT, ConcatSrc);
58542 }
58543 break;
58544 }
58545 case ISD::VECTOR_SHUFFLE: {
58546 // TODO: Generalize NumOps support.
58547 if (!IsSplat && NumOps == 2 &&
58548 ((VT.is256BitVector() &&
58549 (EltSizeInBits >= 32 || Subtarget.hasInt256())) ||
58550 (VT.is512BitVector() && Subtarget.useAVX512Regs() &&
58551 (EltSizeInBits >= 32 || Subtarget.useBWIRegs())))) {
58552 SDValue Concat0 = CombineSubOperand(VT, Ops, 0);
58553 SDValue Concat1 = CombineSubOperand(VT, Ops, 1);
58554 if (Concat0 || Concat1 ||
58555 (Ops[0].getOperand(0) == Ops[1].getOperand(0) &&
58556 Ops[0].getOperand(1) == Ops[1].getOperand(1) &&
58557 Subtarget.hasVBMI())) {
58558 int NumSubElts = Op0.getValueType().getVectorNumElements();
58559 SmallVector<int> NewMask;
58560 for (int M : cast<ShuffleVectorSDNode>(Ops[0])->getMask()) {
58561 M = M >= NumSubElts ? M + NumSubElts : M;
58562 NewMask.push_back(M);
58563 }
58564 for (int M : cast<ShuffleVectorSDNode>(Ops[1])->getMask()) {
58565 if (0 <= M)
58566 M = (M >= NumSubElts ? M + NumSubElts : M) + NumSubElts;
58567 NewMask.push_back(M);
58568 }
58569 Concat0 = Concat0 ? Concat0 : ConcatSubOperand(VT, Ops, 0);
58570 Concat1 = Concat1 ? Concat1 : ConcatSubOperand(VT, Ops, 1);
58571 return DAG.getVectorShuffle(VT, DL, Concat0, Concat1, NewMask);
58572 }
58573 }
58574 break;
58575 }
58576 case X86ISD::VBROADCAST: {
58577 // TODO: 512-bit VBROADCAST concatenation.
58578 if (!IsSplat && llvm::all_of(Ops, [](SDValue Op) {
58579 return Op.getOperand(0).getValueType().is128BitVector();
58580 })) {
58581 if (VT == MVT::v4f64 || VT == MVT::v4i64)
58582 return DAG.getNode(X86ISD::UNPCKL, DL, VT,
58583 ConcatSubOperand(VT, Ops, 0),
58584 ConcatSubOperand(VT, Ops, 0));
58585 // TODO: Add pseudo v8i32 PSHUFD handling to AVX1Only targets.
58586 if (VT == MVT::v8f32 || (VT == MVT::v8i32 && Subtarget.hasInt256()))
58587 return DAG.getNode(VT == MVT::v8f32 ? X86ISD::VPERMILPI
58588 : X86ISD::PSHUFD,
58589 DL, VT, ConcatSubOperand(VT, Ops, 0),
58590 getV4X86ShuffleImm8ForMask({0, 0, 0, 0}, DL, DAG));
58591 }
58592 break;
58593 }
58594 case X86ISD::MOVDDUP:
58595 case X86ISD::MOVSHDUP:
58596 case X86ISD::MOVSLDUP: {
58597 if (!IsSplat && (VT.is256BitVector() ||
58598 (VT.is512BitVector() && Subtarget.useAVX512Regs())))
58599 return DAG.getNode(Opcode, DL, VT, ConcatSubOperand(VT, Ops, 0));
58600 break;
58601 }
58602 case X86ISD::SHUFP: {
58603 if (!IsSplat &&
58604 (VT == MVT::v8f32 ||
58605 (VT == MVT::v16f32 && Subtarget.useAVX512Regs())) &&
58606 llvm::all_of(Ops, [Op0](SDValue Op) {
58607 return Op.getOperand(2) == Op0.getOperand(2);
58608 })) {
58609 SDValue Concat0 = CombineSubOperand(VT, Ops, 0);
58610 SDValue Concat1 = CombineSubOperand(VT, Ops, 1);
58611 if (Concat0 || Concat1)
58612 return DAG.getNode(Opcode, DL, VT,
58613 Concat0 ? Concat0 : ConcatSubOperand(VT, Ops, 0),
58614 Concat1 ? Concat1 : ConcatSubOperand(VT, Ops, 1),
58615 Op0.getOperand(2));
58616 }
58617 break;
58618 }
58619 case X86ISD::UNPCKH:
58620 case X86ISD::UNPCKL: {
58621 // TODO: UNPCK should use CombineSubOperand
58622 // Don't concatenate build_vector patterns.
58623 if (!IsSplat &&
58624 ((VT.is256BitVector() &&
58625 (EltSizeInBits >= 32 || Subtarget.hasInt256())) ||
58626 (VT.is512BitVector() && Subtarget.useAVX512Regs() &&
58627 (EltSizeInBits >= 32 || Subtarget.useBWIRegs()))) &&
58628 none_of(Ops, [](SDValue Op) {
58629 return peekThroughBitcasts(Op.getOperand(0)).getOpcode() ==
58630 ISD::BUILD_VECTOR ||
58631 peekThroughBitcasts(Op.getOperand(1)).getOpcode() ==
58632 ISD::BUILD_VECTOR;
58633 })) {
58634 SDValue Concat0 = CombineSubOperand(VT, Ops, 0);
58635 SDValue Concat1 = CombineSubOperand(VT, Ops, 1);
58636 if (Concat0 || Concat1 ||
58637 (Subtarget.hasInt256() && EltSizeInBits == 64))
58638 return DAG.getNode(Opcode, DL, VT,
58639 Concat0 ? Concat0 : ConcatSubOperand(VT, Ops, 0),
58640 Concat1 ? Concat1 : ConcatSubOperand(VT, Ops, 1));
58641 }
58642 break;
58643 }
58644 case X86ISD::PSHUFHW:
58645 case X86ISD::PSHUFLW:
58646 case X86ISD::PSHUFD:
58647 if (!IsSplat &&
58648 ((VT.is256BitVector() && Subtarget.hasInt256()) ||
58649 (VT.is512BitVector() && Subtarget.useAVX512Regs() &&
58650 (EltSizeInBits >= 32 || Subtarget.useBWIRegs()))) &&
58651 llvm::all_of(Ops, [Op0](SDValue Op) {
58652 return Op.getOperand(1) == Op0.getOperand(1);
58653 })) {
58654 return DAG.getNode(Opcode, DL, VT, ConcatSubOperand(VT, Ops, 0),
58655 Op0.getOperand(1));
58656 }
58657 [[fallthrough]];
58658 case X86ISD::VPERMILPI:
58659 if (!IsSplat && EltSizeInBits == 32 &&
58660 (VT.is256BitVector() ||
58661 (VT.is512BitVector() && Subtarget.useAVX512Regs())) &&
58662 all_of(Ops, [&Op0](SDValue Op) {
58663 return Op0.getOperand(1) == Op.getOperand(1);
58664 })) {
58665 MVT FloatVT = VT.changeVectorElementType(MVT::f32);
58666 SDValue Res = DAG.getBitcast(FloatVT, ConcatSubOperand(VT, Ops, 0));
58667 Res =
58668 DAG.getNode(X86ISD::VPERMILPI, DL, FloatVT, Res, Op0.getOperand(1));
58669 return DAG.getBitcast(VT, Res);
58670 }
58671 break;
58672 case X86ISD::VPERMILPV:
58673 if (!IsSplat && (VT.is256BitVector() ||
58674 (VT.is512BitVector() && Subtarget.useAVX512Regs()))) {
58675 SDValue Concat0 = CombineSubOperand(VT, Ops, 0);
58676 SDValue Concat1 = CombineSubOperand(VT, Ops, 1);
58677 if (Concat0 || Concat1)
58678 return DAG.getNode(Opcode, DL, VT,
58679 Concat0 ? Concat0 : ConcatSubOperand(VT, Ops, 0),
58680 Concat1 ? Concat1 : ConcatSubOperand(VT, Ops, 1));
58681 }
58682 break;
58683 case X86ISD::PSHUFB:
58684 case X86ISD::PSADBW:
58685 case X86ISD::VPMADDUBSW:
58686 case X86ISD::VPMADDWD:
58687 if (!IsSplat && ((VT.is256BitVector() && Subtarget.hasInt256()) ||
58688 (VT.is512BitVector() && Subtarget.useBWIRegs()))) {
58689 MVT SrcVT = Op0.getOperand(0).getSimpleValueType();
58690 SrcVT = MVT::getVectorVT(SrcVT.getScalarType(),
58691 NumOps * SrcVT.getVectorNumElements());
58692 SDValue Concat0 = CombineSubOperand(SrcVT, Ops, 0);
58693 SDValue Concat1 = CombineSubOperand(SrcVT, Ops, 1);
58694 if (Concat0 || Concat1)
58695 return DAG.getNode(
58696 Opcode, DL, VT,
58697 Concat0 ? Concat0 : ConcatSubOperand(SrcVT, Ops, 0),
58698 Concat1 ? Concat1 : ConcatSubOperand(SrcVT, Ops, 1));
58699 }
58700 break;
58701 case X86ISD::VPERMV:
58702 // TODO: Handle 256-bit and NumOps == 4 cases.
58703 if (!IsSplat && NumOps == 2 &&
58704 (VT.is512BitVector() && Subtarget.useAVX512Regs())) {
58705 MVT OpVT = Op0.getSimpleValueType();
58706 int NumSrcElts = OpVT.getVectorNumElements();
58707 SmallVector<int, 64> ConcatMask;
58708 for (unsigned i = 0; i != NumOps; ++i) {
58709 SmallVector<int, 64> SubMask;
58710 SmallVector<SDValue, 2> SubOps;
58711 if (!getTargetShuffleMask(Ops[i], false, SubOps, SubMask))
58712 break;
58713 for (int M : SubMask) {
58714 if (0 <= M)
58715 M += i * NumSrcElts;
58716 ConcatMask.push_back(M);
58717 }
58718 }
58719 if (ConcatMask.size() == (NumOps * NumSrcElts))
58720 return lowerShuffleWithPERMV(DL, VT, ConcatMask,
58721 ConcatSubOperand(VT, Ops, 1),
58722 DAG.getUNDEF(VT), Subtarget, DAG);
58723 }
58724 break;
58725 case X86ISD::VPERMV3:
58726 // TODO: Handle 256-bit and NumOps == 4 cases.
58727 if (!IsSplat && NumOps == 2 &&
58728 (VT.is512BitVector() && Subtarget.useAVX512Regs())) {
58729 MVT OpVT = Op0.getSimpleValueType();
58730 int NumSrcElts = OpVT.getVectorNumElements();
58731 SmallVector<int, 64> ConcatMask;
58732 for (unsigned i = 0; i != NumOps; ++i) {
58733 SmallVector<int, 64> SubMask;
58734 SmallVector<SDValue, 2> SubOps;
58735 if (!getTargetShuffleMask(Ops[i], false, SubOps, SubMask))
58736 break;
58737 for (int M : SubMask) {
58738 if (0 <= M) {
58739 int Src = M < NumSrcElts ? 0 : 2;
58740 M += M < NumSrcElts ? 0 : NumSrcElts;
58741
58742 // Reference the lowest sub if the upper sub is the same.
58743 if (Ops[0].getOperand(Src) != Ops[i].getOperand(Src))
58744 M += i * NumSrcElts;
58745 }
58746 ConcatMask.push_back(M);
58747 }
58748 }
58749 if (ConcatMask.size() == (NumOps * NumSrcElts)) {
58750 SDValue Concat0 = CombineSubOperand(VT, Ops, 0);
58751 SDValue Concat1 = CombineSubOperand(VT, Ops, 2);
58752 if (Concat0 || Concat1)
58753 return lowerShuffleWithPERMV(
58754 DL, VT, ConcatMask,
58755 Concat0 ? Concat0 : ConcatSubOperand(VT, Ops, 0),
58756 Concat1 ? Concat1 : ConcatSubOperand(VT, Ops, 2), Subtarget,
58757 DAG);
58758 }
58759 }
58760 break;
58761 case X86ISD::VPERM2X128: {
58762 if (!IsSplat && VT.is512BitVector() && Subtarget.useAVX512Regs()) {
58763 assert(NumOps == 2 && "Bad concat_vectors operands");
58764 unsigned Imm0 = Ops[0].getConstantOperandVal(2);
58765 unsigned Imm1 = Ops[1].getConstantOperandVal(2);
58766 // TODO: Handle zero'd subvectors.
58767 if ((Imm0 & 0x88) == 0 && (Imm1 & 0x88) == 0) {
58768 int Mask[4] = {(int)(Imm0 & 0x03), (int)((Imm0 >> 4) & 0x3), (int)(Imm1 & 0x03),
58769 (int)((Imm1 >> 4) & 0x3)};
58770 MVT ShuffleVT = VT.isFloatingPoint() ? MVT::v8f64 : MVT::v8i64;
58771 SDValue LHS = concatSubVectors(Ops[0].getOperand(0),
58772 Ops[0].getOperand(1), DAG, DL);
58773 SDValue RHS = concatSubVectors(Ops[1].getOperand(0),
58774 Ops[1].getOperand(1), DAG, DL);
58775 SDValue Res = DAG.getNode(X86ISD::SHUF128, DL, ShuffleVT,
58776 DAG.getBitcast(ShuffleVT, LHS),
58777 DAG.getBitcast(ShuffleVT, RHS),
58778 getV4X86ShuffleImm8ForMask(Mask, DL, DAG));
58779 return DAG.getBitcast(VT, Res);
58780 }
58781 }
58782 break;
58783 }
58784 case X86ISD::SHUF128: {
58785 if (!IsSplat && NumOps == 2 && VT.is512BitVector()) {
58786 unsigned Imm0 = Ops[0].getConstantOperandVal(2);
58787 unsigned Imm1 = Ops[1].getConstantOperandVal(2);
58788 unsigned Imm = ((Imm0 & 1) << 0) | ((Imm0 & 2) << 1) | 0x08 |
58789 ((Imm1 & 1) << 4) | ((Imm1 & 2) << 5) | 0x80;
58790 SDValue LHS = concatSubVectors(Ops[0].getOperand(0),
58791 Ops[0].getOperand(1), DAG, DL);
58792 SDValue RHS = concatSubVectors(Ops[1].getOperand(0),
58793 Ops[1].getOperand(1), DAG, DL);
58794 return DAG.getNode(X86ISD::SHUF128, DL, VT, LHS, RHS,
58795 DAG.getTargetConstant(Imm, DL, MVT::i8));
58796 }
58797 break;
58798 }
58799 case ISD::TRUNCATE:
58800 if (!IsSplat && NumOps == 2 && VT.is256BitVector()) {
58801 EVT SrcVT = Ops[0].getOperand(0).getValueType();
58802 if (SrcVT.is256BitVector() && SrcVT.isSimple() &&
58803 SrcVT == Ops[1].getOperand(0).getValueType() &&
58804 Subtarget.useAVX512Regs() &&
58805 Subtarget.getPreferVectorWidth() >= 512 &&
58806 (SrcVT.getScalarSizeInBits() > 16 || Subtarget.useBWIRegs())) {
58807 EVT NewSrcVT = SrcVT.getDoubleNumVectorElementsVT(Ctx);
58808 return DAG.getNode(ISD::TRUNCATE, DL, VT,
58809 ConcatSubOperand(NewSrcVT, Ops, 0));
58810 }
58811 }
58812 break;
58813 case ISD::ANY_EXTEND:
58814 case ISD::SIGN_EXTEND:
58815 case ISD::ZERO_EXTEND:
58816 // TODO: Handle ANY_EXTEND combos with SIGN/ZERO_EXTEND.
58817 if (!IsSplat && NumOps == 2 &&
58818 ((VT.is256BitVector() && Subtarget.hasInt256()) ||
58819 (VT.is512BitVector() && Subtarget.useAVX512Regs() &&
58820 (EltSizeInBits >= 32 || Subtarget.useBWIRegs())))) {
58821 EVT SrcVT = Ops[0].getOperand(0).getValueType();
58822 if (SrcVT.isSimple() && SrcVT.is128BitVector() &&
58823 SrcVT == Ops[1].getOperand(0).getValueType()) {
58824 EVT NewSrcVT = SrcVT.getDoubleNumVectorElementsVT(Ctx);
58825 return DAG.getNode(Opcode, DL, VT,
58826 ConcatSubOperand(NewSrcVT, Ops, 0));
58827 }
58828 }
58829 break;
58830 case ISD::ANY_EXTEND_VECTOR_INREG:
58831 case ISD::SIGN_EXTEND_VECTOR_INREG:
58832 case ISD::ZERO_EXTEND_VECTOR_INREG: {
58833 // TODO: Handle ANY_EXTEND_INREG combos with SIGN/ZERO_EXTEND_INREG.
58834 if (!IsSplat && NumOps == 2 &&
58835 ((VT.is256BitVector() && Subtarget.hasInt256()) ||
58836 (VT.is512BitVector() && Subtarget.useAVX512Regs() &&
58837 (EltSizeInBits >= 32 || Subtarget.useBWIRegs()))) &&
58839 Op0.getOperand(0).getValueType() ==
58840 Ops[0].getOperand(0).getValueType()) {
58841 EVT SrcVT = Op0.getOperand(0).getValueType();
58842 unsigned NumElts = VT.getVectorNumElements();
58843 MVT UnpackSVT =
58844 MVT::getIntegerVT(SrcVT.getScalarSizeInBits() * (NumElts / 2));
58845 MVT UnpackVT =
58846 MVT::getVectorVT(UnpackSVT, 128 / UnpackSVT.getScalarSizeInBits());
58847 SDValue Unpack =
58848 DAG.getNode(X86ISD::UNPCKL, DL, UnpackVT,
58849 DAG.getBitcast(UnpackVT, Ops[0].getOperand(0)),
58850 DAG.getBitcast(UnpackVT, Ops[1].getOperand(0)));
58851 return getEXTEND_VECTOR_INREG(Opcode, DL, VT,
58852 DAG.getBitcast(SrcVT, Unpack), DAG);
58853 }
58854 break;
58855 }
58856 case X86ISD::VSHLI:
58857 case X86ISD::VSRLI:
58858 // Special case: SHL/SRL AVX1 V4i64 by 32-bits can lower as a shuffle.
58859 if (VT == MVT::v4i64 && !Subtarget.hasInt256() &&
58860 llvm::all_of(Ops, [](SDValue Op) {
58861 return Op.getConstantOperandAPInt(1) == 32;
58862 })) {
58863 if (SDValue Res = CombineSubOperand(VT, Ops, 0)) {
58864 SDValue Zero = getZeroVector(MVT::v8i32, Subtarget, DAG, DL);
58865 Res = DAG.getBitcast(MVT::v8i32, Res);
58866 if (Opcode == X86ISD::VSHLI) {
58867 Res = DAG.getVectorShuffle(MVT::v8i32, DL, Res, Zero,
58868 {8, 0, 8, 2, 8, 4, 8, 6});
58869 } else {
58870 Res = DAG.getVectorShuffle(MVT::v8i32, DL, Res, Zero,
58871 {1, 8, 3, 8, 5, 8, 7, 8});
58872 }
58873 return DAG.getBitcast(VT, Res);
58874 }
58875 }
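// Illustrative note: shifting a v4i64 lane left by 32 leaves its old low
// 32 bits in the high half and zeroes the rest, which viewed as v8i32 is an
// interleave with zero ({8,0,8,2,...}); the logical right shift is the
// mirrored mask, so AVX1 handles this without 64-bit vector shifts.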
58876 [[fallthrough]];
58877 case X86ISD::VSRAI:
58878 case X86ISD::VSHL:
58879 case X86ISD::VSRL:
58880 case X86ISD::VSRA:
58881 if (((VT.is256BitVector() && Subtarget.hasInt256()) ||
58882 (VT.is512BitVector() && Subtarget.useAVX512Regs() &&
58883 (EltSizeInBits >= 32 || Subtarget.useBWIRegs()))) &&
58884 llvm::all_of(Ops, [Op0](SDValue Op) {
58885 return Op0.getOperand(1) == Op.getOperand(1);
58886 })) {
58887 return DAG.getNode(Opcode, DL, VT, ConcatSubOperand(VT, Ops, 0),
58888 Op0.getOperand(1));
58889 }
58890 break;
58891 case X86ISD::VPERMI:
58892 case X86ISD::VROTLI:
58893 case X86ISD::VROTRI:
58894 if (!IsSplat &&
58895 ((VT.is256BitVector() && Subtarget.hasVLX()) ||
58896 (VT.is512BitVector() && Subtarget.useAVX512Regs())) &&
58897 llvm::all_of(Ops, [Op0](SDValue Op) {
58898 return Op0.getOperand(1) == Op.getOperand(1);
58899 })) {
58900 assert(!(Opcode == X86ISD::VPERMI &&
58901 Op0.getValueType().is128BitVector()) &&
58902 "Illegal 128-bit X86ISD::VPERMI nodes");
58903 return DAG.getNode(Opcode, DL, VT, ConcatSubOperand(VT, Ops, 0),
58904 Op0.getOperand(1));
58905 }
58906 break;
58907 case ISD::AND:
58908 case ISD::OR:
58909 case ISD::XOR:
58910 case X86ISD::ANDNP:
58911 // TODO: AVX512 targets should only use CombineSubOperand like AVX1/2.
58912 if (!IsSplat && (VT.is256BitVector() ||
58913 (VT.is512BitVector() && Subtarget.useAVX512Regs()))) {
58914 // Don't concatenate root AVX1 NOT patterns.
58915 // TODO: Allow NOT folding if Concat0 succeeds.
58916 if (Opcode == ISD::XOR && Depth == 0 && !Subtarget.hasInt256() &&
58917 llvm::all_of(Ops, [](SDValue X) {
58918 return ISD::isBuildVectorAllOnes(X.getOperand(1).getNode());
58919 }))
58920 break;
58921 SDValue Concat0 = CombineSubOperand(VT, Ops, 0);
58922 SDValue Concat1 = CombineSubOperand(VT, Ops, 1);
58923 if (Concat0 || Concat1 || Subtarget.useAVX512Regs())
58924 return DAG.getNode(Opcode, DL, VT,
58925 Concat0 ? Concat0 : ConcatSubOperand(VT, Ops, 0),
58926 Concat1 ? Concat1 : ConcatSubOperand(VT, Ops, 1));
58927 }
58928 break;
58929 case X86ISD::PCMPEQ:
58930 case X86ISD::PCMPGT:
58931 // TODO: 512-bit PCMPEQ/PCMPGT -> VPCMP+VPMOVM2 handling.
58932 if (!IsSplat && VT.is256BitVector() && Subtarget.hasInt256()) {
58933 SDValue Concat0 = CombineSubOperand(VT, Ops, 0);
58934 SDValue Concat1 = CombineSubOperand(VT, Ops, 1);
58935 if (Concat0 || Concat1)
58936 return DAG.getNode(Opcode, DL, VT,
58937 Concat0 ? Concat0 : ConcatSubOperand(VT, Ops, 0),
58938 Concat1 ? Concat1 : ConcatSubOperand(VT, Ops, 1));
58939 break;
58940 }
58941
58942 if (!IsSplat && VT == MVT::v8i32) {
58943 // Without AVX2, see if we can cast the values to v8f32 and use fcmp.
58944 // TODO: Handle v4f64 as well?
58945 unsigned MaxSigBitsLHS = 0, MaxSigBitsRHS = 0;
58946 for (unsigned I = 0; I != NumOps; ++I) {
58947 MaxSigBitsLHS =
58948 std::max(MaxSigBitsLHS,
58949 DAG.ComputeMaxSignificantBits(Ops[I].getOperand(0)));
58950 MaxSigBitsRHS =
58951 std::max(MaxSigBitsRHS,
58952 DAG.ComputeMaxSignificantBits(Ops[I].getOperand(1)));
58953 if (MaxSigBitsLHS == EltSizeInBits && MaxSigBitsRHS == EltSizeInBits)
58954 break;
58955 }
58956
58957 ISD::CondCode ICC =
58958 Opcode == X86ISD::PCMPEQ ? ISD::SETEQ : ISD::SETGT;
58959 ISD::CondCode FCC =
58960 Opcode == X86ISD::PCMPEQ ? ISD::SETOEQ : ISD::SETOGT;
58961
58962 MVT FpSVT = MVT::getFloatingPointVT(EltSizeInBits);
58963 MVT FpVT = VT.changeVectorElementType(FpSVT);
58964
58965 if (std::optional<unsigned> CastOpc =
58966 CastIntSETCCtoFP(FpVT, ICC, MaxSigBitsLHS, MaxSigBitsRHS)) {
58967 SDValue LHS = CombineSubOperand(VT, Ops, 0);
58968 SDValue RHS = CombineSubOperand(VT, Ops, 1);
58969 LHS = LHS ? LHS : ConcatSubOperand(VT, Ops, 0);
58970 RHS = RHS ? RHS : ConcatSubOperand(VT, Ops, 1);
58971 LHS = DAG.getNode(*CastOpc, DL, FpVT, LHS);
58972 RHS = DAG.getNode(*CastOpc, DL, FpVT, RHS);
58973
58974 bool IsAlwaysSignaling;
58975 unsigned FSETCC =
58976 translateX86FSETCC(FCC, LHS, RHS, IsAlwaysSignaling);
58977 return DAG.getBitcast(
58978 VT, DAG.getNode(X86ISD::CMPP, DL, FpVT, LHS, RHS,
58979 DAG.getTargetConstant(FSETCC, DL, MVT::i8)));
58980 }
58981 }
58982 break;
58983 case ISD::CTPOP:
58984 case ISD::CTTZ:
58985 case ISD::CTLZ:
58986 case ISD::CTTZ_ZERO_UNDEF:
58987 case ISD::CTLZ_ZERO_UNDEF:
58988 if (!IsSplat && ((VT.is256BitVector() && Subtarget.hasInt256()) ||
58989 (VT.is512BitVector() && Subtarget.useBWIRegs()))) {
58990 return DAG.getNode(Opcode, DL, VT, ConcatSubOperand(VT, Ops, 0));
58991 }
58992 break;
58993 case X86ISD::GF2P8AFFINEQB:
58994 // TODO: GF2P8AFFINEQB should use CombineSubOperand.
58995 if (!IsSplat &&
58996 (VT.is256BitVector() ||
58997 (VT.is512BitVector() && Subtarget.useAVX512Regs())) &&
58998 llvm::all_of(Ops, [Op0](SDValue Op) {
58999 return Op0.getOperand(2) == Op.getOperand(2);
59000 })) {
59001 return DAG.getNode(Opcode, DL, VT, ConcatSubOperand(VT, Ops, 0),
59002 ConcatSubOperand(VT, Ops, 1), Op0.getOperand(2));
59003 }
59004 break;
59005 case ISD::ADD:
59006 case ISD::SUB:
59007 case ISD::MUL:
59008 // TODO: Add more integer binops?
59009 if (!IsSplat && ((VT.is256BitVector() && Subtarget.hasInt256()) ||
59010 (VT.is512BitVector() && Subtarget.useAVX512Regs() &&
59011 (EltSizeInBits >= 32 || Subtarget.useBWIRegs())))) {
59012 SDValue Concat0 = CombineSubOperand(VT, Ops, 0);
59013 SDValue Concat1 = CombineSubOperand(VT, Ops, 1);
59014 if (Concat0 || Concat1 || llvm::all_of(Ops, [](SDValue Op) {
59015 return Op.getOperand(0) == Op.getOperand(1);
59016 }))
59017 return DAG.getNode(Opcode, DL, VT,
59018 Concat0 ? Concat0 : ConcatSubOperand(VT, Ops, 0),
59019 Concat1 ? Concat1 : ConcatSubOperand(VT, Ops, 1));
59020 }
59021 break;
59022 // Because VADD, VSUB and VMUL can execute on more ports than VINSERT and
59023 // their latency is short, we don't replace them here unless doing so won't
59024 // introduce extra VINSERTs.
59025 case ISD::FADD:
59026 case ISD::FSUB:
59027 case ISD::FMUL:
59028 if (!IsSplat && (VT.is256BitVector() ||
59029 (VT.is512BitVector() && Subtarget.useAVX512Regs()))) {
59030 SDValue Concat0 = CombineSubOperand(VT, Ops, 0);
59031 SDValue Concat1 = CombineSubOperand(VT, Ops, 1);
59032 if (Concat0 || Concat1)
59033 return DAG.getNode(Opcode, DL, VT,
59034 Concat0 ? Concat0 : ConcatSubOperand(VT, Ops, 0),
59035 Concat1 ? Concat1 : ConcatSubOperand(VT, Ops, 1));
59036 }
59037 break;
59038 // Always prefer to concatenate high latency FDIV instructions.
59039 case ISD::FDIV:
59040 if (!IsSplat && (VT.is256BitVector() ||
59041 (VT.is512BitVector() && Subtarget.useAVX512Regs()))) {
59042 return DAG.getNode(Opcode, DL, VT, ConcatSubOperand(VT, Ops, 0),
59043 ConcatSubOperand(VT, Ops, 1));
59044 }
59045 break;
59046 case X86ISD::HADD:
59047 case X86ISD::HSUB:
59048 case X86ISD::FHADD:
59049 case X86ISD::FHSUB:
59050 if (!IsSplat && VT.is256BitVector() &&
59051 (VT.isFloatingPoint() || Subtarget.hasInt256())) {
59052 SDValue Concat0 = CombineSubOperand(VT, Ops, 0);
59053 SDValue Concat1 = CombineSubOperand(VT, Ops, 1);
59054 if (Concat0 || Concat1)
59055 return DAG.getNode(Opcode, DL, VT,
59056 Concat0 ? Concat0 : ConcatSubOperand(VT, Ops, 0),
59057 Concat1 ? Concat1 : ConcatSubOperand(VT, Ops, 1));
59058 }
59059 break;
59060 case X86ISD::PACKSS:
59061 case X86ISD::PACKUS:
59062 if (!IsSplat && ((VT.is256BitVector() && Subtarget.hasInt256()) ||
59063 (VT.is512BitVector() && Subtarget.useBWIRegs()))) {
59064 MVT SrcVT = Op0.getOperand(0).getSimpleValueType();
59065 SrcVT = MVT::getVectorVT(SrcVT.getScalarType(),
59066 NumOps * SrcVT.getVectorNumElements());
59067 SDValue Concat0 = CombineSubOperand(SrcVT, Ops, 0);
59068 SDValue Concat1 = CombineSubOperand(SrcVT, Ops, 1);
59069 if (Concat0 || Concat1)
59070 return DAG.getNode(
59071 Opcode, DL, VT,
59072 Concat0 ? Concat0 : ConcatSubOperand(SrcVT, Ops, 0),
59073 Concat1 ? Concat1 : ConcatSubOperand(SrcVT, Ops, 1));
59074 }
59075 break;
59076 case X86ISD::VSHLD:
59077 case X86ISD::VSHRD:
59078 case X86ISD::PALIGNR:
59079 if (!IsSplat &&
59080 ((VT.is256BitVector() && Subtarget.hasInt256()) ||
59081 (VT.is512BitVector() && Subtarget.useBWIRegs())) &&
59082 llvm::all_of(Ops, [Op0](SDValue Op) {
59083 return Op0.getOperand(2) == Op.getOperand(2);
59084 })) {
59085 SDValue Concat0 = CombineSubOperand(VT, Ops, 0);
59086 SDValue Concat1 = CombineSubOperand(VT, Ops, 1);
59087 if (Concat0 || Concat1)
59088 return DAG.getNode(Opcode, DL, VT,
59089 Concat0 ? Concat0 : ConcatSubOperand(VT, Ops, 0),
59090 Concat1 ? Concat1 : ConcatSubOperand(VT, Ops, 1),
59091 Op0.getOperand(2));
59092 }
59093 break;
59094 case X86ISD::BLENDI:
59095 if (VT.is256BitVector() && NumOps == 2 &&
59096 (EltSizeInBits >= 32 ||
59097 (Subtarget.hasInt256() &&
59098 Ops[0].getOperand(2) == Ops[1].getOperand(2)))) {
59099 SDValue Concat0 = CombineSubOperand(VT, Ops, 0);
59100 SDValue Concat1 = CombineSubOperand(VT, Ops, 1);
59101 if (Concat0 || Concat1) {
59102 unsigned NumElts = VT.getVectorNumElements();
59103 APInt Mask = getBLENDIBlendMask(Ops[0]).zext(NumElts);
59104 Mask.insertBits(getBLENDIBlendMask(Ops[1]), NumElts / 2);
59105 Mask = Mask.zextOrTrunc(8);
59106 return DAG.getNode(Opcode, DL, VT,
59107 Concat0 ? Concat0 : ConcatSubOperand(VT, Ops, 0),
59108 Concat1 ? Concat1 : ConcatSubOperand(VT, Ops, 1),
59109 DAG.getTargetConstant(Mask, DL, MVT::i8));
59110 }
59111 }
59112 // TODO: BWI targets should only use CombineSubOperand.
59113 if (((VT.is256BitVector() && Subtarget.hasVLX()) ||
59114 (VT.is512BitVector() && Subtarget.useAVX512Regs())) &&
59115 (EltSizeInBits >= 32 || Subtarget.useBWIRegs())) {
59116 SDValue Concat0 = CombineSubOperand(VT, Ops, 0);
59117 SDValue Concat1 = CombineSubOperand(VT, Ops, 1);
59118 if (Concat0 || Concat1 || Subtarget.useBWIRegs()) {
59119 unsigned NumElts = VT.getVectorNumElements();
59120 APInt Mask = getBLENDIBlendMask(Ops[0]).zext(NumElts);
59121 for (unsigned I = 1; I != NumOps; ++I)
59122 Mask.insertBits(getBLENDIBlendMask(Ops[I]), I * (NumElts / NumOps));
59123 unsigned NumMaskBits = NumElts >= 8 ? NumElts : 8;
59124 Mask = Mask.zextOrTrunc(NumMaskBits);
59125 MVT MaskSVT = MVT::getIntegerVT(NumMaskBits);
59126 MVT MaskVT = MVT::getVectorVT(MVT::i1, NumMaskBits);
59127 SDValue Sel =
59128 DAG.getBitcast(MaskVT, DAG.getConstant(Mask, DL, MaskSVT));
59129 Sel = extractSubVector(Sel, 0, DAG, DL, NumElts);
59130 Concat0 = Concat0 ? Concat0 : ConcatSubOperand(VT, Ops, 0);
59131 Concat1 = Concat1 ? Concat1 : ConcatSubOperand(VT, Ops, 1);
59132 return DAG.getSelect(DL, VT, Sel, Concat1, Concat0);
59133 }
59134 }
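// Illustrative note: rather than re-encode the 8-bit BLENDI immediate for the
// wider type, the per-subvector blend masks are concatenated into a vXi1
// constant and the blend is re-expressed as a select, which AVX-512 can lower
// through a mask register.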
59135 break;
59136 case ISD::VSELECT:
59137 // TODO: VSELECT should use CombineSubOperand.
59138 if (!IsSplat && Subtarget.hasAVX512() &&
59139 (VT.is256BitVector() ||
59140 (VT.is512BitVector() && Subtarget.useAVX512Regs())) &&
59141 (EltSizeInBits >= 32 || Subtarget.hasBWI())) {
59142 EVT SelVT = Ops[0].getOperand(0).getValueType();
59143 if (SelVT.getVectorElementType() == MVT::i1) {
59144 SelVT = EVT::getVectorVT(Ctx, MVT::i1,
59145 NumOps * SelVT.getVectorNumElements());
59146 if (TLI.isTypeLegal(SelVT))
59147 return DAG.getNode(
59148 Opcode, DL, VT, ConcatSubOperand(SelVT.getSimpleVT(), Ops, 0),
59149 ConcatSubOperand(VT, Ops, 1), ConcatSubOperand(VT, Ops, 2));
59150 }
59151 }
59152 [[fallthrough]];
59153 case X86ISD::BLENDV:
59154 // TODO: BLENDV should use CombineSubOperand.
59155 if (!IsSplat && VT.is256BitVector() && NumOps == 2 &&
59156 (EltSizeInBits >= 32 || Subtarget.hasInt256()) &&
59157 IsConcatFree(VT, Ops, 1) && IsConcatFree(VT, Ops, 2)) {
59158 EVT SelVT = Ops[0].getOperand(0).getValueType();
59159 SelVT = SelVT.getDoubleNumVectorElementsVT(Ctx);
59160 if (TLI.isTypeLegal(SelVT))
59161 return DAG.getNode(
59162 Opcode, DL, VT, ConcatSubOperand(SelVT.getSimpleVT(), Ops, 0),
59163 ConcatSubOperand(VT, Ops, 1), ConcatSubOperand(VT, Ops, 2));
59164 }
59165 break;
59166 }
59167 }
59168
59169 // Fold subvector loads into one.
59170 // If needed, look through bitcasts to get to the load.
59171 if (auto *FirstLd = dyn_cast<LoadSDNode>(peekThroughBitcasts(Op0))) {
59172 unsigned Fast;
59173 const X86TargetLowering *TLI = Subtarget.getTargetLowering();
59174 if (TLI->allowsMemoryAccess(Ctx, DAG.getDataLayout(), VT,
59175 *FirstLd->getMemOperand(), &Fast) &&
59176 Fast) {
59177 if (SDValue Ld =
59178 EltsFromConsecutiveLoads(VT, Ops, DL, DAG, Subtarget, false))
59179 return Ld;
59180 }
59181 }
59182
59183 // Attempt to fold target constant loads.
59184 if (all_of(Ops, [](SDValue Op) { return getTargetConstantFromNode(Op); })) {
59185 SmallVector<APInt> EltBits;
59186 APInt UndefElts = APInt::getZero(VT.getVectorNumElements());
59187 for (unsigned I = 0; I != NumOps; ++I) {
59188 APInt OpUndefElts;
59189 SmallVector<APInt> OpEltBits;
59190 if (!getTargetConstantBitsFromNode(Ops[I], EltSizeInBits, OpUndefElts,
59191 OpEltBits, /*AllowWholeUndefs*/ true,
59192 /*AllowPartialUndefs*/ false))
59193 break;
59194 EltBits.append(OpEltBits);
59195 UndefElts.insertBits(OpUndefElts, I * OpUndefElts.getBitWidth());
59196 }
59197 if (EltBits.size() == VT.getVectorNumElements()) {
59198 Constant *C = getConstantVector(VT, EltBits, UndefElts, Ctx);
59199 MVT PVT = TLI.getPointerTy(DAG.getDataLayout());
59200 SDValue CV = DAG.getConstantPool(C, PVT);
59201 MachinePointerInfo MPI =
59202 MachinePointerInfo::getConstantPool(DAG.getMachineFunction());
59203 SDValue Ld = DAG.getLoad(VT, DL, DAG.getEntryNode(), CV, MPI);
59204 SDValue Sub = extractSubVector(Ld, 0, DAG, DL, Op0.getValueSizeInBits());
59205 DAG.ReplaceAllUsesOfValueWith(Op0, Sub);
59206 return Ld;
59207 }
59208 }
59209
59210 // If this simple subvector or scalar/subvector broadcast_load is inserted
59211 // into both halves, use a larger broadcast_load. Update other uses to use
59212 // an extracted subvector.
59213 if (IsSplat &&
59214 (VT.is256BitVector() || (VT.is512BitVector() && Subtarget.hasAVX512()))) {
59215 if (ISD::isNormalLoad(Op0.getNode()) ||
59216 Op0.getOpcode() == X86ISD::VBROADCAST_LOAD ||
59217 Op0.getOpcode() == X86ISD::SUBV_BROADCAST_LOAD) {
59218 auto *Mem = cast<MemSDNode>(Op0);
59219 unsigned Opc = Op0.getOpcode() == X86ISD::VBROADCAST_LOAD
59220 ? X86ISD::VBROADCAST_LOAD
59221 : X86ISD::SUBV_BROADCAST_LOAD;
59222 if (SDValue BcastLd =
59223 getBROADCAST_LOAD(Opc, DL, VT, Mem->getMemoryVT(), Mem, 0, DAG)) {
59224 SDValue BcastSrc =
59225 extractSubVector(BcastLd, 0, DAG, DL, Op0.getValueSizeInBits());
59226 DAG.ReplaceAllUsesOfValueWith(Op0, BcastSrc);
59227 return BcastLd;
59228 }
59229 }
59230 }
59231
59232 // If we're splatting a 128-bit subvector to 512-bits, use SHUF128 directly.
59233 if (IsSplat && NumOps == 4 && VT.is512BitVector() &&
59234 Subtarget.useAVX512Regs()) {
59235 MVT ShuffleVT = VT.isFloatingPoint() ? MVT::v8f64 : MVT::v8i64;
59236 SDValue Res = widenSubVector(Op0, false, Subtarget, DAG, DL, 512);
59237 Res = DAG.getBitcast(ShuffleVT, Res);
59238 Res = DAG.getNode(X86ISD::SHUF128, DL, ShuffleVT, Res, Res,
59239 getV4X86ShuffleImm8ForMask({0, 0, 0, 0}, DL, DAG));
59240 return DAG.getBitcast(VT, Res);
59241 }
59242
59243 // We can always convert per-lane vXf64 shuffles into VSHUFPD.
59244 if (!IsSplat &&
59245 ((NumOps == 2 && VT == MVT::v4f64) ||
59246 (NumOps == 4 && VT == MVT::v8f64 && Subtarget.useAVX512Regs())) &&
59247 all_of(Ops, [](SDValue Op) { return Op.hasOneUse(); })) {
59248 // Collect the individual per-lane v2f64/v4f64 shuffles.
59249 MVT OpVT = Ops[0].getSimpleValueType();
59250 unsigned NumOpElts = OpVT.getVectorNumElements();
59251 SmallVector<SmallVector<SDValue, 2>, 4> SrcOps(NumOps);
59252 SmallVector<SmallVector<int, 8>, 4> SrcMasks(NumOps);
59253 if (all_of(seq<int>(NumOps), [&](int I) {
59254 return getTargetShuffleInputs(Ops[I], SrcOps[I], SrcMasks[I], DAG,
59255 Depth + 1) &&
59256 !is128BitLaneCrossingShuffleMask(OpVT, SrcMasks[I]) &&
59257 none_of(SrcMasks[I], isUndefOrZero) &&
59258 SrcMasks[I].size() == NumOpElts &&
59259 all_of(SrcOps[I], [&OpVT](SDValue V) {
59260 return V.getValueType() == OpVT;
59261 });
59262 })) {
59263 // Concatenate the shuffle masks into SHUFPD mask and collect subops.
59264 bool Unary = true;
59265 unsigned SHUFPDMask = 0;
59266 SmallVector<SDValue, 4> LHS(NumOps), RHS(NumOps);
59267 for (unsigned I = 0; I != NumOps; ++I) {
59268 LHS[I] = SrcOps[I][SrcMasks[I][0] / NumOpElts];
59269 RHS[I] = SrcOps[I][SrcMasks[I][1] / NumOpElts];
59270 Unary &= LHS[I] == RHS[I];
59271 for (unsigned J = 0; J != NumOpElts; ++J)
59272 SHUFPDMask |= (SrcMasks[I][J] & 1) << ((I * NumOpElts) + J);
59273 }
59274 // Concat SHUFPD LHS/RHS operands - if they match then it will become a
59275 // PERMILPD mask and we can always profitably concatenate them.
59276 SDValue Concat0 =
59277 combineConcatVectorOps(DL, VT, LHS, DAG, Subtarget, Depth + 1);
59278 SDValue Concat1 =
59279 combineConcatVectorOps(DL, VT, RHS, DAG, Subtarget, Depth + 1);
59280 if (Unary || Concat0 || Concat1) {
59281 Concat0 =
59282 Concat0 ? Concat0 : DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, LHS);
59283 Concat1 =
59284 Concat1 ? Concat1 : DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, RHS);
59285 return DAG.getNode(X86ISD::SHUFP, DL, VT, Concat0, Concat1,
59286 DAG.getTargetConstant(SHUFPDMask, DL, MVT::i8));
59287 }
59288 }
59289 }
59290
59291 return SDValue();
59292}
59293
59294 static SDValue combineCONCAT_VECTORS(SDNode *N, SelectionDAG &DAG,
59295 TargetLowering::DAGCombinerInfo &DCI,
59296 const X86Subtarget &Subtarget) {
59297 EVT VT = N->getValueType(0);
59298 EVT SrcVT = N->getOperand(0).getValueType();
59299 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
59300 SmallVector<SDValue, 4> Ops(N->ops());
59301
59302 if (VT.getVectorElementType() == MVT::i1) {
59303 // Attempt to constant fold.
59304 unsigned SubSizeInBits = SrcVT.getSizeInBits();
59305 APInt Constant = APInt::getZero(VT.getSizeInBits());
59306 for (unsigned I = 0, E = Ops.size(); I != E; ++I) {
59307 auto *C = dyn_cast<ConstantSDNode>(peekThroughBitcasts(Ops[I]));
59308 if (!C) break;
59309 Constant.insertBits(C->getAPIntValue(), I * SubSizeInBits);
59310 if (I == (E - 1)) {
59311 EVT IntVT = EVT::getIntegerVT(*DAG.getContext(), VT.getSizeInBits());
59312 if (TLI.isTypeLegal(IntVT))
59313 return DAG.getBitcast(VT, DAG.getConstant(Constant, SDLoc(N), IntVT));
59314 }
59315 }
59316
59317 // Don't do anything else for i1 vectors.
59318 return SDValue();
59319 }
59320
59321 if (Subtarget.hasAVX() && TLI.isTypeLegal(VT) && TLI.isTypeLegal(SrcVT)) {
59322 if (SDValue R = combineConcatVectorOps(SDLoc(N), VT.getSimpleVT(), Ops, DAG,
59323 Subtarget))
59324 return R;
59325 }
59326
59327 return SDValue();
59328}
59329
59330 static SDValue combineINSERT_SUBVECTOR(SDNode *N, SelectionDAG &DAG,
59331 TargetLowering::DAGCombinerInfo &DCI,
59332 const X86Subtarget &Subtarget) {
59333 if (DCI.isBeforeLegalizeOps())
59334 return SDValue();
59335
59336 MVT OpVT = N->getSimpleValueType(0);
59337
59338 bool IsI1Vector = OpVT.getVectorElementType() == MVT::i1;
59339
59340 SDLoc dl(N);
59341 SDValue Vec = N->getOperand(0);
59342 SDValue SubVec = N->getOperand(1);
59343
59344 uint64_t IdxVal = N->getConstantOperandVal(2);
59345 MVT SubVecVT = SubVec.getSimpleValueType();
59346 int VecNumElts = OpVT.getVectorNumElements();
59347 int SubVecNumElts = SubVecVT.getVectorNumElements();
59348
59349 if (Vec.isUndef() && SubVec.isUndef())
59350 return DAG.getUNDEF(OpVT);
59351
59352 // Inserting undefs/zeros into zeros/undefs is a zero vector.
59353 if ((Vec.isUndef() || ISD::isBuildVectorAllZeros(Vec.getNode())) &&
59354 (SubVec.isUndef() || ISD::isBuildVectorAllZeros(SubVec.getNode())))
59355 return getZeroVector(OpVT, Subtarget, DAG, dl);
59356
59358 // If we're inserting into a zero vector and then into a larger zero vector,
59359 // just insert into the larger zero vector directly.
59360 if (SubVec.getOpcode() == ISD::INSERT_SUBVECTOR &&
59361 ISD::isBuildVectorAllZeros(SubVec.getOperand(0).getNode())) {
59362 uint64_t Idx2Val = SubVec.getConstantOperandVal(2);
59363 return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, OpVT,
59364 getZeroVector(OpVT, Subtarget, DAG, dl),
59365 SubVec.getOperand(1),
59366 DAG.getVectorIdxConstant(IdxVal + Idx2Val, dl));
59367 }
59368
59369 // If we're inserting into a zero vector and our input was extracted from an
59370 // insert into a zero vector of the same type and the extraction was at
59371 // least as large as the original insertion. Just insert the original
59372 // subvector into a zero vector.
59373 if (SubVec.getOpcode() == ISD::EXTRACT_SUBVECTOR && IdxVal == 0 &&
59374 isNullConstant(SubVec.getOperand(1)) &&
59375 SubVec.getOperand(0).getOpcode() == ISD::INSERT_SUBVECTOR) {
59376 SDValue Ins = SubVec.getOperand(0);
59377 if (isNullConstant(Ins.getOperand(2)) &&
59378 ISD::isBuildVectorAllZeros(Ins.getOperand(0).getNode()) &&
59379 Ins.getOperand(1).getValueSizeInBits().getFixedValue() <=
59380 SubVecVT.getFixedSizeInBits())
59381 return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, OpVT,
59382 getZeroVector(OpVT, Subtarget, DAG, dl),
59383 Ins.getOperand(1), N->getOperand(2));
59384 }
59385 }
59386
59387 // Stop here if this is an i1 vector.
59388 if (IsI1Vector)
59389 return SDValue();
59390
59391 // Eliminate an intermediate vector widening:
59392 // insert_subvector X, (insert_subvector undef, Y, 0), Idx -->
59393 // insert_subvector X, Y, Idx
59394 // TODO: This is a more general version of a DAGCombiner fold, can we move it
59395 // there?
59396 if (SubVec.getOpcode() == ISD::INSERT_SUBVECTOR &&
59397 SubVec.getOperand(0).isUndef() && isNullConstant(SubVec.getOperand(2)))
59398 return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, OpVT, Vec,
59399 SubVec.getOperand(1), N->getOperand(2));
59400
59401 // If this is an insert of an extract, combine to a shuffle. Don't do this
59402 // if the insert or extract can be represented with a subregister operation.
59403 if (SubVec.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
59404 SubVec.getOperand(0).getSimpleValueType() == OpVT &&
59405 (IdxVal != 0 ||
59406 !(Vec.isUndef() || ISD::isBuildVectorAllZeros(Vec.getNode())))) {
59407 SDValue ExtSrc = SubVec.getOperand(0);
59408 int ExtIdxVal = SubVec.getConstantOperandVal(1);
59409 // Create a shuffle mask matching the extraction and insertion.
59410 SmallVector<int, 64> Mask(VecNumElts);
59411 std::iota(Mask.begin(), Mask.end(), 0);
59412 std::iota(Mask.begin() + IdxVal, Mask.begin() + IdxVal + SubVecNumElts,
59413 ExtIdxVal + VecNumElts);
59414 if (ExtIdxVal != 0)
59415 return DAG.getVectorShuffle(OpVT, dl, Vec, ExtSrc, Mask);
59416 // See if we can use a blend instead of extract/insert pair.
59417 SmallVector<int, 64> BlendMask(VecNumElts);
59418 std::iota(BlendMask.begin(), BlendMask.end(), 0);
59419 std::iota(BlendMask.begin() + IdxVal,
59420 BlendMask.begin() + IdxVal + SubVecNumElts, VecNumElts + IdxVal);
59421 if (isShuffleEquivalent(Mask, BlendMask, Vec, ExtSrc) &&
59422 VecNumElts == (2 * SubVecNumElts)) {
59423 assert((IdxVal % SubVecNumElts) == 0 && "Unaligned subvector insertion");
59424 if (OpVT.is256BitVector() && SubVecVT.is128BitVector()) {
59425 SDValue Blend = DAG.getNode(
59426 X86ISD::BLENDI, dl, MVT::v8f32, DAG.getBitcast(MVT::v8f32, Vec),
59427 DAG.getBitcast(MVT::v8f32, ExtSrc),
59428 DAG.getTargetConstant(IdxVal == 0 ? 0x0F : 0xF0, dl, MVT::i8));
59429 return DAG.getBitcast(OpVT, Blend);
59430 } else if (OpVT.is512BitVector() && SubVecVT.is256BitVector()) {
59431 MVT ShufVT = OpVT.isInteger() ? MVT::v8i64 : MVT::v8f64;
59432 SDValue Lo = DAG.getBitcast(ShufVT, IdxVal == 0 ? ExtSrc : Vec);
59433 SDValue Hi = DAG.getBitcast(ShufVT, IdxVal == 0 ? Vec : ExtSrc);
59434 SDValue Shuffle =
59435 DAG.getNode(X86ISD::SHUF128, dl, ShufVT, Lo, Hi,
59436 getV4X86ShuffleImm8ForMask({0, 1, 2, 3}, dl, DAG));
59437 return DAG.getBitcast(OpVT, Shuffle);
59438 }
59439 }
59440 }
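// Worked example (illustrative): inserting the low 128 bits extracted from a
// 256-bit source into the low half of a 256-bit vector is a v8f32 BLENDI that
// takes its first four elements from the extract source, i.e. immediate 0x0F;
// inserting into the high half uses 0xF0 instead.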
59441
59442 // Match concat_vector style patterns.
59443 SmallVector<SDValue, 2> SubVectorOps;
59444 if (collectConcatOps(N, SubVectorOps, DAG)) {
59445 if (SDValue Fold =
59446 combineConcatVectorOps(dl, OpVT, SubVectorOps, DAG, Subtarget))
59447 return Fold;
59448
59449 // If we're inserting all zeros into the upper half, change this to
59450 // a concat with zero. We will match this to a move
59451 // with implicit upper bit zeroing during isel.
59452 // We do this here because we don't want combineConcatVectorOps to
59453 // create INSERT_SUBVECTOR from CONCAT_VECTORS.
59454 if (SubVectorOps.size() == 2 &&
59455 ISD::isBuildVectorAllZeros(SubVectorOps[1].getNode()))
59456 return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, OpVT,
59457 getZeroVector(OpVT, Subtarget, DAG, dl),
59458 SubVectorOps[0], DAG.getVectorIdxConstant(0, dl));
59459
59460 // Attempt to recursively combine to a shuffle.
59461 if (all_of(SubVectorOps, [](SDValue SubOp) {
59463 })) {
59464 SDValue Op(N, 0);
59465 if (SDValue Res = combineX86ShufflesRecursively(Op, DAG, Subtarget))
59466 return Res;
59467 }
59468 }
59469
59470 // If this is a broadcast insert into an upper undef, use a larger broadcast.
59471 if (Vec.isUndef() && IdxVal != 0 && SubVec.getOpcode() == X86ISD::VBROADCAST)
59472 return DAG.getNode(X86ISD::VBROADCAST, dl, OpVT, SubVec.getOperand(0));
59473
59474 // If this is a broadcast load inserted into an upper undef, use a larger
59475 // broadcast load.
59476 if (Vec.isUndef() && IdxVal != 0 && SubVec.hasOneUse() &&
59477 SubVec.getOpcode() == X86ISD::VBROADCAST_LOAD) {
59478 auto *MemIntr = cast<MemIntrinsicSDNode>(SubVec);
59479 return getBROADCAST_LOAD(X86ISD::VBROADCAST_LOAD, dl, OpVT,
59480 MemIntr->getMemoryVT(), MemIntr, 0, DAG);
59481 }
59482
59483 // If we're splatting the lower half subvector of a full vector load into the
59484 // upper half, attempt to create a subvector broadcast.
59485 if ((int)IdxVal == (VecNumElts / 2) &&
59486 Vec.getValueSizeInBits() == (2 * SubVec.getValueSizeInBits())) {
59487 auto *VecLd = dyn_cast<LoadSDNode>(Vec);
59488 auto *SubLd = dyn_cast<LoadSDNode>(SubVec);
59489 if (VecLd && SubLd &&
59490 DAG.areNonVolatileConsecutiveLoads(
59491 SubLd, VecLd, SubVec.getValueSizeInBits() / 8, 0)) {
59492 SDValue BcastLd = getBROADCAST_LOAD(X86ISD::SUBV_BROADCAST_LOAD, dl, OpVT,
59493 SubVecVT, SubLd, 0, DAG);
59494 SDValue NewSubVec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, SubVecVT,
59495 BcastLd, DAG.getVectorIdxConstant(0, dl));
59496 DCI.CombineTo(SubLd, NewSubVec, BcastLd.getValue(1));
59497 return BcastLd;
59498 }
59499 }
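// Illustrative note: when the inserted half reloads the same address as the
// start of the full-width load, the pattern is exactly a subvector broadcast
// from memory; CombineTo re-points the narrow load at the low half of the
// broadcast so the location is only read once.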
59500
59501 // Attempt to constant fold (if we're not widening).
59502 if (!Vec.isUndef() && !ISD::isBuildVectorAllZeros(Vec.getNode())) {
59503 unsigned EltSizeInBits = OpVT.getScalarSizeInBits();
59504 APInt VecUndefElts, SubUndefElts;
59505 SmallVector<APInt, 16> VecEltBits, SubEltBits;
59506 if (getTargetConstantBitsFromNode(Vec, EltSizeInBits, VecUndefElts,
59507 VecEltBits) &&
59508 getTargetConstantBitsFromNode(SubVec, EltSizeInBits, SubUndefElts,
59509 SubEltBits)) {
59510 VecUndefElts.insertBits(SubUndefElts, IdxVal);
59511 llvm::copy(SubEltBits, VecEltBits.begin() + IdxVal);
59512 return getConstVector(VecEltBits, VecUndefElts, OpVT, DAG, dl);
59513 }
59514 }
59515
59516 // Attempt to recursively combine to a shuffle.
59519 SDValue Op(N, 0);
59520 if (SDValue Res = combineX86ShufflesRecursively(Op, DAG, Subtarget))
59521 return Res;
59522 }
59523
59524 // Match insertion of subvector load that perfectly aliases a base load.
59525 if ((IdxVal % SubVecNumElts) == 0 && ISD::isNormalLoad(Vec.getNode()) &&
59526 ISD::isNormalLoad(SubVec.getNode()) &&
59527 DAG.areNonVolatileConsecutiveLoads(
59528 cast<LoadSDNode>(SubVec), cast<LoadSDNode>(Vec),
59529 SubVec.getValueSizeInBits() / 8, IdxVal / SubVecNumElts))
59530 return Vec;
59531
59532 return SDValue();
59533}
59534
59535/// If we are extracting a subvector of a vector select and the select condition
59536/// is composed of concatenated vectors, try to narrow the select width. This
59537/// is a common pattern for AVX1 integer code because 256-bit selects may be
59538/// legal, but there is almost no integer math/logic available for 256-bit.
59539/// This function should only be called with legal types (otherwise, the calls
59540/// to get simple value types will assert).
59541 static SDValue narrowExtractedVectorSelect(SDNode *Ext, const SDLoc &DL,
59542 SelectionDAG &DAG) {
59543 SDValue Sel = Ext->getOperand(0);
59544 if (Sel.getOpcode() != ISD::VSELECT ||
59545 !isFreeToSplitVector(Sel.getOperand(0), DAG))
59546 return SDValue();
59547
59548 // Note: We assume simple value types because this should only be called with
59549 // legal operations/types.
59550 // TODO: This can be extended to handle extraction to 256-bits.
59551 MVT VT = Ext->getSimpleValueType(0);
59552 if (!VT.is128BitVector())
59553 return SDValue();
59554
59555 MVT SelCondVT = Sel.getOperand(0).getSimpleValueType();
59556 if (!SelCondVT.is256BitVector() && !SelCondVT.is512BitVector())
59557 return SDValue();
59558
59559 MVT WideVT = Ext->getOperand(0).getSimpleValueType();
59560 MVT SelVT = Sel.getSimpleValueType();
59561 assert((SelVT.is256BitVector() || SelVT.is512BitVector()) &&
59562 "Unexpected vector type with legal operations");
59563
59564 unsigned SelElts = SelVT.getVectorNumElements();
59565 unsigned CastedElts = WideVT.getVectorNumElements();
59566 unsigned ExtIdx = Ext->getConstantOperandVal(1);
59567 if (SelElts % CastedElts == 0) {
59568 // The select has the same or more (narrower) elements than the extract
59569 // operand. The extraction index gets scaled by that factor.
59570 ExtIdx *= (SelElts / CastedElts);
59571 } else if (CastedElts % SelElts == 0) {
59572 // The select has fewer (wider) elements than the extract operand. Make sure
59573 // that the extraction index can be divided evenly.
59574 unsigned IndexDivisor = CastedElts / SelElts;
59575 if (ExtIdx % IndexDivisor != 0)
59576 return SDValue();
59577 ExtIdx /= IndexDivisor;
59578 } else {
59579 llvm_unreachable("Element counts of simple vector types are not divisible?");
59580 }
59581
59582 unsigned NarrowingFactor = WideVT.getSizeInBits() / VT.getSizeInBits();
59583 unsigned NarrowElts = SelElts / NarrowingFactor;
59584 MVT NarrowSelVT = MVT::getVectorVT(SelVT.getVectorElementType(), NarrowElts);
59585 SDValue ExtCond = extract128BitVector(Sel.getOperand(0), ExtIdx, DAG, DL);
59586 SDValue ExtT = extract128BitVector(Sel.getOperand(1), ExtIdx, DAG, DL);
59587 SDValue ExtF = extract128BitVector(Sel.getOperand(2), ExtIdx, DAG, DL);
59588 SDValue NarrowSel = DAG.getSelect(DL, NarrowSelVT, ExtCond, ExtT, ExtF);
59589 return DAG.getBitcast(VT, NarrowSel);
59590}
59591
59592static SDValue combineEXTRACT_SUBVECTOR(SDNode *N, SelectionDAG &DAG,
59593 TargetLowering::DAGCombinerInfo &DCI,
59594 const X86Subtarget &Subtarget) {
59595 if (!N->getValueType(0).isSimple())
59596 return SDValue();
59597
59598 MVT VT = N->getSimpleValueType(0);
59599 SDValue InVec = N->getOperand(0);
59600 unsigned IdxVal = N->getConstantOperandVal(1);
59601 EVT InVecVT = InVec.getValueType();
59602 unsigned SizeInBits = VT.getSizeInBits();
59603 unsigned InSizeInBits = InVecVT.getSizeInBits();
59604 unsigned NumSubElts = VT.getVectorNumElements();
59605 unsigned NumInElts = InVecVT.getVectorNumElements();
59606 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
59607 SDLoc DL(N);
59608
59609 // For AVX1 only, if we are extracting from a 256-bit and+not (which will
59610 // eventually get combined/lowered into ANDNP) with a concatenated operand,
59611 // split the 'and' into 128-bit ops to avoid the concatenate and extract.
59612 // We let generic combining take over from there to simplify the
59613 // insert/extract and 'not'.
59614 // This pattern emerges during AVX1 legalization. We handle it before lowering
59615 // to avoid complications like splitting constant vector loads.
59616 if (Subtarget.hasAVX() && !Subtarget.hasAVX2() && TLI.isTypeLegal(InVecVT) &&
59617 InSizeInBits == 256 && InVec.getOpcode() == ISD::AND) {
59618 auto isConcatenatedNot = [](SDValue V) {
59619 V = peekThroughBitcasts(V);
59620 if (!isBitwiseNot(V))
59621 return false;
59622 SDValue NotOp = V->getOperand(0);
59623 return peekThroughBitcasts(NotOp).getOpcode() == ISD::CONCAT_VECTORS;
59624 };
59625 if (isConcatenatedNot(InVec.getOperand(0)) ||
59626 isConcatenatedNot(InVec.getOperand(1))) {
59627 // extract (and v4i64 X, (not (concat Y1, Y2))), n -> andnp v2i64 X(n), Y1
59628 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT,
59629 splitVectorIntBinary(InVec, DAG, DL),
59630 N->getOperand(1));
59631 }
59632 }
59633
59634 if (DCI.isBeforeLegalizeOps())
59635 return SDValue();
59636
59637 if (SDValue V = narrowExtractedVectorSelect(N, DL, DAG))
59638 return V;
59639
59641 return getZeroVector(VT, Subtarget, DAG, DL);
59642
59643 if (ISD::isBuildVectorAllOnes(InVec.getNode())) {
59644 if (VT.getScalarType() == MVT::i1)
59645 return DAG.getConstant(1, DL, VT);
59646 return getOnesVector(VT, DAG, DL);
59647 }
59648
59649 if (InVec.getOpcode() == ISD::BUILD_VECTOR)
59650 return DAG.getBuildVector(VT, DL, InVec->ops().slice(IdxVal, NumSubElts));
59651
59652 // EXTRACT_SUBVECTOR(EXTRACT_SUBVECTOR(V,C1)),C2) - EXTRACT_SUBVECTOR(V,C1+C2)
59653 if (IdxVal != 0 && InVec.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
59654 InVec.hasOneUse() && TLI.isTypeLegal(VT) &&
59655 TLI.isTypeLegal(InVec.getOperand(0).getValueType())) {
59656 unsigned NewIdx = IdxVal + InVec.getConstantOperandVal(1);
59657 return extractSubVector(InVec.getOperand(0), NewIdx, DAG, DL, SizeInBits);
59658 }
59659
59660 // EXTRACT_SUBVECTOR(INSERT_SUBVECTOR(SRC,SUB,C1),C2)
59661 // --> INSERT_SUBVECTOR(EXTRACT_SUBVECTOR(SRC,C2),SUB,C1-C2)
59662 // iff SUB is entirely contained in the extraction.
59663 if (VT.getVectorElementType() != MVT::i1 && TLI.isTypeLegal(VT) &&
59664 InVec.getOpcode() == ISD::INSERT_SUBVECTOR && InVec.hasOneUse()) {
59665 SDValue Src = InVec.getOperand(0);
59666 SDValue Sub = InVec.getOperand(1);
59667 EVT SubVT = Sub.getValueType();
59668 uint64_t InsIdx = InVec.getConstantOperandVal(2);
59669 if (IdxVal <= InsIdx &&
59670 (IdxVal + NumSubElts) >= (InsIdx + SubVT.getVectorNumElements())) {
59671 SDValue NewSrc = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Src,
59672 DAG.getVectorIdxConstant(IdxVal, DL));
59673 return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, NewSrc, Sub,
59674 DAG.getVectorIdxConstant(InsIdx - IdxVal, DL));
59675 }
59676 }
59677
59678 // If we're extracting an upper subvector, see if we'd get the same elements
59679 // by extracting the lowest subvector instead, which should allow
59680 // SimplifyDemandedVectorElts to do more simplifications.
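// e.g. for a splatted or per-lane-duplicated source, the upper and lower
// subvectors hold identical elements, so extract index 0 instead.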
59681 if (IdxVal != 0) {
59682 bool AllEquiv = all_of(seq<unsigned>(NumSubElts), [&](unsigned I) {
59683 return IsElementEquivalent(NumInElts, InVec, InVec, I, I + IdxVal);
59684 });
59685 if (AllEquiv)
59686 return extractSubVector(InVec, 0, DAG, DL, SizeInBits);
59687 }
59688
59689 // Check if we're extracting a whole broadcasted subvector.
59690 if (InVec.getOpcode() == X86ISD::SUBV_BROADCAST_LOAD) {
59691 auto *MemIntr = cast<MemIntrinsicSDNode>(InVec);
59692 EVT MemVT = MemIntr->getMemoryVT();
59693 if (MemVT == VT) {
59694 // If this is the only use, we can replace with a regular load (this may
59695 // have been missed by SimplifyDemandedVectorElts due to extra uses of the
59696 // memory chain).
59697 if (InVec.hasOneUse()) {
59698 SDValue Ld =
59699 DAG.getLoad(MemVT, DL, MemIntr->getChain(), MemIntr->getBasePtr(),
59700 MemIntr->getMemOperand());
59701 DAG.makeEquivalentMemoryOrdering(SDValue(MemIntr, 1), Ld.getValue(1));
59702 return Ld;
59703 }
59704 }
59705 }
59706
59707 // Attempt to extract from the source of a shuffle vector.
59708 if ((InSizeInBits % SizeInBits) == 0 && (IdxVal % NumSubElts) == 0) {
59709 SmallVector<int, 32> ShuffleMask;
59710 SmallVector<int, 32> ScaledMask;
59711 SmallVector<SDValue, 2> ShuffleInputs;
59712 unsigned NumSubVecs = InSizeInBits / SizeInBits;
59713 // Decode the shuffle mask and scale it so it's shuffling whole subvectors.
59714 if (getTargetShuffleInputs(InVec, ShuffleInputs, ShuffleMask, DAG) &&
59715 scaleShuffleElements(ShuffleMask, NumSubVecs, ScaledMask)) {
59716 unsigned SubVecIdx = IdxVal / NumSubElts;
59717 if (ScaledMask[SubVecIdx] == SM_SentinelUndef)
59718 return DAG.getUNDEF(VT);
59719 if (ScaledMask[SubVecIdx] == SM_SentinelZero)
59720 return getZeroVector(VT, Subtarget, DAG, DL);
59721 SDValue Src = ShuffleInputs[ScaledMask[SubVecIdx] / NumSubVecs];
59722 if (Src.getValueSizeInBits() == InSizeInBits) {
59723 unsigned SrcSubVecIdx = ScaledMask[SubVecIdx] % NumSubVecs;
59724 unsigned SrcEltIdx = SrcSubVecIdx * NumSubElts;
59725 return extractSubVector(DAG.getBitcast(InVecVT, Src), SrcEltIdx, DAG,
59726 DL, SizeInBits);
59727 }
59728 }
59729 }
59730
59731 auto IsExtractFree = [](SDValue V) {
59732 if (V.hasOneUse()) {
59734 if (V.getOpcode() == ISD::LOAD)
59735 return true;
59736 }
59737 V = peekThroughBitcasts(V);
59738 if (ISD::isBuildVectorOfConstantSDNodes(V.getNode()))
59739 return true;
59741 return true;
59742 return V.isUndef();
59743 };
59744
59745 // If we're extracting the lowest subvector and we're the only user,
59746 // we may be able to perform this with a smaller vector width.
59747 unsigned InOpcode = InVec.getOpcode();
59748 if (InVec.hasOneUse()) {
59749 if (IdxVal == 0 && VT == MVT::v2f64 && InVecVT == MVT::v4f64) {
59750 // v2f64 CVTDQ2PD(v4i32).
59751 if (InOpcode == ISD::SINT_TO_FP &&
59752 InVec.getOperand(0).getValueType() == MVT::v4i32) {
59753 return DAG.getNode(X86ISD::CVTSI2P, DL, VT, InVec.getOperand(0));
59754 }
59755 // v2f64 CVTUDQ2PD(v4i32).
59756 if (InOpcode == ISD::UINT_TO_FP && Subtarget.hasVLX() &&
59757 InVec.getOperand(0).getValueType() == MVT::v4i32) {
59758 return DAG.getNode(X86ISD::CVTUI2P, DL, VT, InVec.getOperand(0));
59759 }
59760 // v2f64 CVTPS2PD(v4f32).
59761 if (InOpcode == ISD::FP_EXTEND &&
59762 InVec.getOperand(0).getValueType() == MVT::v4f32) {
59763 return DAG.getNode(X86ISD::VFPEXT, DL, VT, InVec.getOperand(0));
59764 }
59765 }
59766 // v4i32 CVTPS2DQ(v4f32) / CVTPS2UDQ(v4f32).
59767 // v4f32 CVTDQ2PS(v4i32) / CVTUDQ2PS(v4i32).
59768 if ((InOpcode == ISD::FP_TO_SINT || InOpcode == ISD::SINT_TO_FP ||
59769 ((InOpcode == ISD::FP_TO_UINT || InOpcode == ISD::UINT_TO_FP) &&
59770 Subtarget.hasVLX())) &&
59771 (VT == MVT::v4i32 || VT == MVT::v4f32)) {
59772 SDValue Src = InVec.getOperand(0);
59773 if (Src.getValueType().getScalarSizeInBits() == 32)
59774 return DAG.getNode(InOpcode, DL, VT,
59775 extractSubVector(Src, IdxVal, DAG, DL, SizeInBits));
59776 }
59777 if (IdxVal == 0 &&
59778 (ISD::isExtOpcode(InOpcode) || ISD::isExtVecInRegOpcode(InOpcode)) &&
59779 (SizeInBits == 128 || SizeInBits == 256) &&
59780 InVec.getOperand(0).getValueSizeInBits() >= SizeInBits) {
59781 SDValue Ext = InVec.getOperand(0);
59782 if (Ext.getValueSizeInBits() > SizeInBits)
59783 Ext = extractSubVector(Ext, 0, DAG, DL, SizeInBits);
59784 unsigned ExtOp = DAG.getOpcode_EXTEND_VECTOR_INREG(InOpcode);
59785 return DAG.getNode(ExtOp, DL, VT, Ext);
59786 }
59787 if (IdxVal == 0 && InOpcode == ISD::VSELECT &&
59788 InVec.getOperand(0).getValueType().is256BitVector() &&
59789 InVec.getOperand(1).getValueType().is256BitVector() &&
59790 InVec.getOperand(2).getValueType().is256BitVector()) {
59791 SDValue Ext0 = extractSubVector(InVec.getOperand(0), 0, DAG, DL, 128);
59792 SDValue Ext1 = extractSubVector(InVec.getOperand(1), 0, DAG, DL, 128);
59793 SDValue Ext2 = extractSubVector(InVec.getOperand(2), 0, DAG, DL, 128);
59794 return DAG.getNode(InOpcode, DL, VT, Ext0, Ext1, Ext2);
59795 }
59796 if (IdxVal == 0 && InOpcode == ISD::TRUNCATE && Subtarget.hasVLX() &&
59797 (SizeInBits == 128 || SizeInBits == 256)) {
59798 SDValue InVecSrc = InVec.getOperand(0);
59799 unsigned Scale = InVecSrc.getValueSizeInBits() / InSizeInBits;
59800 SDValue Ext = extractSubVector(InVecSrc, 0, DAG, DL, Scale * SizeInBits);
59801 return DAG.getNode(InOpcode, DL, VT, Ext);
59802 }
59803
59804 if (SizeInBits == 128 || SizeInBits == 256) {
59805 switch (InOpcode) {
59806 case X86ISD::MOVDDUP:
59807 return DAG.getNode(
59808 InOpcode, DL, VT,
59809 extractSubVector(InVec.getOperand(0), IdxVal, DAG, DL, SizeInBits));
59810 case X86ISD::PSHUFD:
59811 case X86ISD::VPERMILPI:
59812 if (InVec.getOperand(0).hasOneUse()) {
59813 uint64_t M = InVec.getConstantOperandVal(1) & 255;
59814 M = VT.getScalarSizeInBits() < 64 ? M : (M >> IdxVal);
59815 return DAG.getNode(InOpcode, DL, VT,
59816 extractSubVector(InVec.getOperand(0), IdxVal, DAG,
59817 DL, SizeInBits),
59818 DAG.getTargetConstant(M, DL, MVT::i8));
59819 }
59820 break;
59821 case X86ISD::PCMPEQ:
59822 case X86ISD::PCMPGT:
59823 case X86ISD::UNPCKH:
59824 case X86ISD::UNPCKL:
59825 if (IsExtractFree(InVec.getOperand(0)) ||
59826 IsExtractFree(InVec.getOperand(1)))
59827 return DAG.getNode(InOpcode, DL, VT,
59828 extractSubVector(InVec.getOperand(0), IdxVal, DAG,
59829 DL, SizeInBits),
59830 extractSubVector(InVec.getOperand(1), IdxVal, DAG,
59831 DL, SizeInBits));
59832 break;
59833 case X86ISD::CMPP:
59834 if (IsExtractFree(InVec.getOperand(0)) ||
59835 IsExtractFree(InVec.getOperand(1)))
59836 return DAG.getNode(InOpcode, DL, VT,
59837 extractSubVector(InVec.getOperand(0), IdxVal, DAG,
59838 DL, SizeInBits),
59839 extractSubVector(InVec.getOperand(1), IdxVal, DAG,
59840 DL, SizeInBits),
59841 InVec.getOperand(2));
59842 break;
59843 case X86ISD::BLENDI:
59844 if (IsExtractFree(InVec.getOperand(0)) ||
59845 IsExtractFree(InVec.getOperand(1))) {
59846 uint64_t M = InVec.getConstantOperandVal(2) & 255;
59847 M = VT.getScalarType() == MVT::i16 ? M : (M >> IdxVal);
59848 return DAG.getNode(InOpcode, DL, VT,
59849 extractSubVector(InVec.getOperand(0), IdxVal, DAG,
59850 DL, SizeInBits),
59851 extractSubVector(InVec.getOperand(1), IdxVal, DAG,
59852 DL, SizeInBits),
59853 DAG.getTargetConstant(M, DL, MVT::i8));
59854 }
59855 break;
59856 case X86ISD::VPERMV:
59857 if (IdxVal != 0) {
59858 SDValue Mask = InVec.getOperand(0);
59859 SDValue Src = InVec.getOperand(1);
59860 Mask = extractSubVector(Mask, IdxVal, DAG, DL, SizeInBits);
59861 Mask = widenSubVector(Mask, /*ZeroNewElements=*/false, Subtarget, DAG,
59862 DL, InSizeInBits);
59863 SDValue Shuffle = DAG.getNode(InOpcode, DL, InVecVT, Mask, Src);
59864 return extractSubVector(Shuffle, 0, DAG, DL, SizeInBits);
59865 }
59866 break;
59867 case X86ISD::VPERMV3:
59868 if (IdxVal != 0) {
59869 SDValue Src0 = InVec.getOperand(0);
59870 SDValue Mask = InVec.getOperand(1);
59871 SDValue Src1 = InVec.getOperand(2);
59872 Mask = extractSubVector(Mask, IdxVal, DAG, DL, SizeInBits);
59873 Mask = widenSubVector(Mask, /*ZeroNewElements=*/false, Subtarget, DAG,
59874 DL, InSizeInBits);
59875 SDValue Shuffle =
59876 DAG.getNode(InOpcode, DL, InVecVT, Src0, Mask, Src1);
59877 return extractSubVector(Shuffle, 0, DAG, DL, SizeInBits);
59878 }
59879 break;
59880 }
59881 }
59882 }
59883
59884 // Always split vXi64 logical shifts where we're extracting the upper 32 bits,
59885 // as this is very likely to fold into a shuffle/truncation.
59886 if ((InOpcode == X86ISD::VSHLI || InOpcode == X86ISD::VSRLI) &&
59887 InVecVT.getScalarSizeInBits() == 64 &&
59888 InVec.getConstantOperandAPInt(1) == 32) {
59889 SDValue Ext =
59890 extractSubVector(InVec.getOperand(0), IdxVal, DAG, DL, SizeInBits);
59891 return DAG.getNode(InOpcode, DL, VT, Ext, InVec.getOperand(1));
59892 }
59893
59894 return SDValue();
59895}
59896
59897static SDValue combineSCALAR_TO_VECTOR(SDNode *N, SelectionDAG &DAG,
59898 const X86Subtarget &Subtarget) {
59899 using namespace SDPatternMatch;
59900 EVT VT = N->getValueType(0);
59901 SDValue Src = N->getOperand(0);
59902 SDLoc DL(N);
59903
59904 // If this is a scalar to vector to v1i1 from an AND with 1, bypass the and.
59905 // This occurs frequently in our masked scalar intrinsic code and our
59906 // floating point select lowering with AVX512.
59907 // TODO: SimplifyDemandedBits instead?
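// e.g. (v1i1 scalar_to_vector (and x, 1)) --> (v1i1 scalar_to_vector x),
// since only bit 0 of x is consumed by the v1i1 result.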
59908 if (VT == MVT::v1i1 && Src.getOpcode() == ISD::AND && Src.hasOneUse() &&
59909 isOneConstant(Src.getOperand(1)))
59910 return DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v1i1, Src.getOperand(0));
59911
59912 // Combine scalar_to_vector of an extract_vector_elt into an extract_subvec.
59913 if (VT == MVT::v1i1 && Src.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
59914 Src.hasOneUse() && Src.getOperand(0).getValueType().isVector() &&
59915 Src.getOperand(0).getValueType().getVectorElementType() == MVT::i1 &&
59916 isNullConstant(Src.getOperand(1)))
59917 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Src.getOperand(0),
59918 Src.getOperand(1));
59919
59920 // Reduce v2i64 to v4i32 if we don't need the upper bits or they are known zero.
59921 // TODO: Move to DAGCombine/SimplifyDemandedBits?
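// e.g. (v2i64 scalar_to_vector (i64 zext x)) can instead build a v4i32 vector
// from the low 32 bits and rely on VZEXT_MOVL to zero the remaining elements.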
59922 if ((VT == MVT::v2i64 || VT == MVT::v2f64) && Src.hasOneUse()) {
59923 auto IsExt64 = [&DAG](SDValue Op, bool IsZeroExt) {
59924 if (Op.getValueType() != MVT::i64)
59925 return SDValue();
59926 unsigned Opc = IsZeroExt ? ISD::ZERO_EXTEND : ISD::ANY_EXTEND;
59927 if (Op.getOpcode() == Opc &&
59928 Op.getOperand(0).getScalarValueSizeInBits() <= 32)
59929 return Op.getOperand(0);
59930 unsigned Ext = IsZeroExt ? ISD::ZEXTLOAD : ISD::EXTLOAD;
59931 if (auto *Ld = dyn_cast<LoadSDNode>(Op))
59932 if (Ld->getExtensionType() == Ext &&
59933 Ld->getMemoryVT().getScalarSizeInBits() <= 32)
59934 return Op;
59935 if (IsZeroExt) {
59936 KnownBits Known = DAG.computeKnownBits(Op);
59937 if (!Known.isConstant() && Known.countMinLeadingZeros() >= 32)
59938 return Op;
59939 }
59940 return SDValue();
59941 };
59942
59943 if (SDValue AnyExt = IsExt64(peekThroughOneUseBitcasts(Src), false))
59944 return DAG.getBitcast(
59945 VT, DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v4i32,
59946 DAG.getAnyExtOrTrunc(AnyExt, DL, MVT::i32)));
59947
59948 if (SDValue ZeroExt = IsExt64(peekThroughOneUseBitcasts(Src), true))
59949 return DAG.getBitcast(
59950 VT,
59951 DAG.getNode(X86ISD::VZEXT_MOVL, DL, MVT::v4i32,
59952 DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v4i32,
59953 DAG.getZExtOrTrunc(ZeroExt, DL, MVT::i32))));
59954 }
59955
59956 if (Src.getOpcode() == ISD::BITCAST) {
59957 SDValue SrcOp = Src.getOperand(0);
59958 // Combine (v4i32 (scalar_to_vector (i32 (bitcast (float))))) to MOVD.
59959 if (VT == MVT::v4i32 && SrcOp.getValueType() == MVT::f32)
59960 return DAG.getBitcast(
59961 VT, DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v4f32, SrcOp));
59962 // Combine (v2i64 (scalar_to_vector (i64 (bitcast (double))))) to MOVQ.
59963 if (VT == MVT::v2i64 && SrcOp.getValueType() == MVT::f64)
59964 return DAG.getBitcast(
59965 VT, DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v2f64, SrcOp));
59966 // Combine (v2i64 (scalar_to_vector (i64 (bitcast (mmx))))) to MOVQ2DQ.
59967 if (VT == MVT::v2i64 && SrcOp.getValueType() == MVT::x86mmx)
59968 return DAG.getNode(X86ISD::MOVQ2DQ, DL, VT, SrcOp);
59969 }
59970
59971 if (VT == MVT::v4i32) {
59972 SDValue HalfSrc;
59973 // Combine (v4i32 (scalar_to_vector (i32 (anyext (bitcast (f16))))))
59974 // to remove XMM->GPR->XMM moves.
59975 if (sd_match(Src, m_AnyExt(m_BitCast(
59976 m_AllOf(m_SpecificVT(MVT::f16), m_Value(HalfSrc))))))
59977 return DAG.getBitcast(
59978 VT, DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v8f16, HalfSrc));
59979 }
59980
59981 // See if we're broadcasting the scalar value, in which case just reuse that.
59983 // Ensure the broadcast user is using this exact SDValue, not just the same SDNode.
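// e.g. if (v8f32 X86ISD::VBROADCAST x) already exists, a v4f32 scalar_to_vector
// of x can simply reuse the low 128 bits of that broadcast.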
59983 if (VT.getScalarType() == Src.getValueType())
59984 for (SDNode *User : Src->users())
59985 if (User->getOpcode() == X86ISD::VBROADCAST &&
59986 Src == User->getOperand(0)) {
59987 unsigned SizeInBits = VT.getFixedSizeInBits();
59988 unsigned BroadcastSizeInBits =
59989 User->getValueSizeInBits(0).getFixedValue();
59990 if (BroadcastSizeInBits == SizeInBits)
59991 return SDValue(User, 0);
59992 if (BroadcastSizeInBits > SizeInBits)
59993 return extractSubVector(SDValue(User, 0), 0, DAG, DL, SizeInBits);
59994 // TODO: Handle BroadcastSizeInBits < SizeInBits when we have test
59995 // coverage.
59996 }
59997
59998 // Check for cases where we've ended up with a scalarized shift, typically
59999 // during type legalization.
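// e.g. (v2i64 scalar_to_vector (i64 shl x, 7))
// --> (X86ISD::VSHLI (v2i64 scalar_to_vector x), 7)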
60000 switch (Src.getOpcode()) {
60001 case ISD::SHL:
60002 case ISD::SRL:
60003 case ISD::SRA:
60004 if (auto *Amt = dyn_cast<ConstantSDNode>(Src.getOperand(1))) {
60005 if (supportedVectorShiftWithImm(VT, Subtarget, Src.getOpcode()) &&
60006 Src.hasOneUse()) {
60007 SDValue SrcVec =
60008 DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VT, Src.getOperand(0));
60009 unsigned Opc = getTargetVShiftUniformOpcode(Src.getOpcode(), false);
60010 return getTargetVShiftByConstNode(Opc, DL, VT.getSimpleVT(), SrcVec,
60011 Amt->getZExtValue(), DAG);
60012 }
60013 }
60014 break;
60015 case ISD::FSHL:
60016 case ISD::FSHR:
60017 if (auto *Amt = dyn_cast<ConstantSDNode>(Src.getOperand(2))) {
60018 if (supportedVectorShiftWithImm(VT, Subtarget, ISD::SHL) &&
60019 Src.getOperand(0).getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
60020 Src.getOperand(1).getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
60021 Src.hasOneUse()) {
60022 uint64_t AmtVal =
60023 Amt->getAPIntValue().urem(Src.getScalarValueSizeInBits());
60024 SDValue SrcVec0 =
60025 DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VT, Src.getOperand(0));
60026 SDValue SrcVec1 =
60027 DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VT, Src.getOperand(1));
60028 return DAG.getNode(Src.getOpcode(), DL, VT, SrcVec0, SrcVec1,
60029 DAG.getConstant(AmtVal, DL, VT));
60030 }
60031 }
60032 break;
60033 }
60034
60035 return SDValue();
60036}
60037
60038// Simplify PMULDQ and PMULUDQ operations.
60039static SDValue combinePMULDQ(SDNode *N, SelectionDAG &DAG,
60040 TargetLowering::DAGCombinerInfo &DCI,
60041 const X86Subtarget &Subtarget) {
60042 SDValue LHS = N->getOperand(0);
60043 SDValue RHS = N->getOperand(1);
60044
60045 // Canonicalize constant to RHS.
60046 if (DAG.isConstantIntBuildVectorOrConstantInt(LHS) &&
60047 !DAG.isConstantIntBuildVectorOrConstantInt(RHS))
60048 return DAG.getNode(N->getOpcode(), SDLoc(N), N->getValueType(0), RHS, LHS);
60049
60050 // Multiply by zero.
60051 // Don't return RHS as it may contain UNDEFs.
60052 if (ISD::isBuildVectorAllZeros(RHS.getNode()))
60053 return DAG.getConstant(0, SDLoc(N), N->getValueType(0));
60054
60055 // PMULDQ/PMULUDQ only uses lower 32 bits from each vector element.
60056 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
60057 if (TLI.SimplifyDemandedBits(SDValue(N, 0), APInt::getAllOnes(64), DCI))
60058 return SDValue(N, 0);
60059
60060 // If the input is an extend_invec and the SimplifyDemandedBits call didn't
60061 // convert it to any_extend_invec, due to the LegalOperations check, do the
60062 // conversion directly to a vector shuffle manually. This exposes combine
60063 // opportunities missed by combineEXTEND_VECTOR_INREG not calling
60064 // combineX86ShufflesRecursively on SSE4.1 targets.
60065 // FIXME: This is basically a hack around several other issues related to
60066 // ANY_EXTEND_VECTOR_INREG.
60067 if (N->getValueType(0) == MVT::v2i64 && LHS.hasOneUse() &&
60068 (LHS.getOpcode() == ISD::ZERO_EXTEND_VECTOR_INREG ||
60069 LHS.getOpcode() == ISD::SIGN_EXTEND_VECTOR_INREG) &&
60070 LHS.getOperand(0).getValueType() == MVT::v4i32) {
60071 SDLoc dl(N);
60072 LHS = DAG.getVectorShuffle(MVT::v4i32, dl, LHS.getOperand(0),
60073 LHS.getOperand(0), { 0, -1, 1, -1 });
60074 LHS = DAG.getBitcast(MVT::v2i64, LHS);
60075 return DAG.getNode(N->getOpcode(), dl, MVT::v2i64, LHS, RHS);
60076 }
60077 if (N->getValueType(0) == MVT::v2i64 && RHS.hasOneUse() &&
60078 (RHS.getOpcode() == ISD::ZERO_EXTEND_VECTOR_INREG ||
60079 RHS.getOpcode() == ISD::SIGN_EXTEND_VECTOR_INREG) &&
60080 RHS.getOperand(0).getValueType() == MVT::v4i32) {
60081 SDLoc dl(N);
60082 RHS = DAG.getVectorShuffle(MVT::v4i32, dl, RHS.getOperand(0),
60083 RHS.getOperand(0), { 0, -1, 1, -1 });
60084 RHS = DAG.getBitcast(MVT::v2i64, RHS);
60085 return DAG.getNode(N->getOpcode(), dl, MVT::v2i64, LHS, RHS);
60086 }
60087
60088 return SDValue();
60089}
60090
60091// Simplify VPMADDUBSW/VPMADDWD operations.
60092static SDValue combineVPMADD(SDNode *N, SelectionDAG &DAG,
60093 TargetLowering::DAGCombinerInfo &DCI) {
60094 MVT VT = N->getSimpleValueType(0);
60095 SDValue LHS = N->getOperand(0);
60096 SDValue RHS = N->getOperand(1);
60097 unsigned Opc = N->getOpcode();
60098 bool IsPMADDWD = Opc == X86ISD::VPMADDWD;
60099 assert((Opc == X86ISD::VPMADDWD || Opc == X86ISD::VPMADDUBSW) &&
60100 "Unexpected PMADD opcode");
60101
60102 // Multiply by zero.
60103 // Don't return LHS/RHS as they may contain UNDEFs.
60104 if (ISD::isBuildVectorAllZeros(LHS.getNode()) ||
60105 ISD::isBuildVectorAllZeros(RHS.getNode()))
60106 return DAG.getConstant(0, SDLoc(N), VT);
60107
60108 // Constant folding.
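// Each result element folds a pair of adjacent source elements:
// VPMADDWD:   res = sext(l0)*sext(r0) + sext(l1)*sext(r1)
// VPMADDUBSW: res = sadd_sat(zext(l0)*sext(r0), zext(l1)*sext(r1))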
60109 APInt LHSUndefs, RHSUndefs;
60110 SmallVector<APInt> LHSBits, RHSBits;
60111 unsigned SrcEltBits = LHS.getScalarValueSizeInBits();
60112 unsigned DstEltBits = VT.getScalarSizeInBits();
60113 if (getTargetConstantBitsFromNode(LHS, SrcEltBits, LHSUndefs, LHSBits) &&
60114 getTargetConstantBitsFromNode(RHS, SrcEltBits, RHSUndefs, RHSBits)) {
60115 SmallVector<APInt> Result;
60116 for (unsigned I = 0, E = LHSBits.size(); I != E; I += 2) {
60117 APInt LHSLo = LHSBits[I + 0], LHSHi = LHSBits[I + 1];
60118 APInt RHSLo = RHSBits[I + 0], RHSHi = RHSBits[I + 1];
60119 LHSLo = IsPMADDWD ? LHSLo.sext(DstEltBits) : LHSLo.zext(DstEltBits);
60120 LHSHi = IsPMADDWD ? LHSHi.sext(DstEltBits) : LHSHi.zext(DstEltBits);
60121 APInt Lo = LHSLo * RHSLo.sext(DstEltBits);
60122 APInt Hi = LHSHi * RHSHi.sext(DstEltBits);
60123 APInt Res = IsPMADDWD ? (Lo + Hi) : Lo.sadd_sat(Hi);
60124 Result.push_back(Res);
60125 }
60126 return getConstVector(Result, VT, DAG, SDLoc(N));
60127 }
60128
60129 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
60130 APInt DemandedElts = APInt::getAllOnes(VT.getVectorNumElements());
60131 if (TLI.SimplifyDemandedVectorElts(SDValue(N, 0), DemandedElts, DCI))
60132 return SDValue(N, 0);
60133
60134 return SDValue();
60135}
60136
60137// Simplify VPMADD52L/VPMADD52H operations.
60138static SDValue combineVPMADD52LH(SDNode *N, SelectionDAG &DAG,
60139 TargetLowering::DAGCombinerInfo &DCI) {
60140 MVT VT = N->getSimpleValueType(0);
60141 unsigned NumEltBits = VT.getScalarSizeInBits();
60142 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
60143 if (TLI.SimplifyDemandedBits(SDValue(N, 0), APInt::getAllOnes(NumEltBits),
60144 DCI))
60145 return SDValue(N, 0);
60146
60147 return SDValue();
60148}
60149
60150static SDValue combineEXTEND_VECTOR_INREG(SDNode *N, SelectionDAG &DAG,
60151 TargetLowering::DAGCombinerInfo &DCI,
60152 const X86Subtarget &Subtarget) {
60153 EVT VT = N->getValueType(0);
60154 SDValue In = N->getOperand(0);
60155 unsigned Opcode = N->getOpcode();
60156 unsigned InOpcode = In.getOpcode();
60157 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
60158 SDLoc DL(N);
60159
60160 // Try to merge vector loads and extend_inreg to an extload.
60161 if (!DCI.isBeforeLegalizeOps() && ISD::isNormalLoad(In.getNode()) &&
60162 In.hasOneUse()) {
60163 auto *Ld = cast<LoadSDNode>(In);
60164 if (Ld->isSimple()) {
60165 MVT SVT = In.getSimpleValueType().getVectorElementType();
60166 ISD::LoadExtType Ext = Opcode == ISD::SIGN_EXTEND_VECTOR_INREG
60167 ? ISD::SEXTLOAD
60168 : ISD::ZEXTLOAD;
60169 EVT MemVT = VT.changeVectorElementType(SVT);
60170 if (TLI.isLoadExtLegal(Ext, VT, MemVT)) {
60171 SDValue Load = DAG.getExtLoad(
60172 Ext, DL, VT, Ld->getChain(), Ld->getBasePtr(), Ld->getPointerInfo(),
60173 MemVT, Ld->getBaseAlign(), Ld->getMemOperand()->getFlags());
60174 DAG.ReplaceAllUsesOfValueWith(SDValue(Ld, 1), Load.getValue(1));
60175 return Load;
60176 }
60177 }
60178 }
60179
60180 // Fold EXTEND_VECTOR_INREG(EXTEND_VECTOR_INREG(X)) -> EXTEND_VECTOR_INREG(X).
60181 if (Opcode == InOpcode)
60182 return DAG.getNode(Opcode, DL, VT, In.getOperand(0));
60183
60184 // Fold EXTEND_VECTOR_INREG(EXTRACT_SUBVECTOR(EXTEND(X),0))
60185 // -> EXTEND_VECTOR_INREG(X).
60186 // TODO: Handle non-zero subvector indices.
60187 if (InOpcode == ISD::EXTRACT_SUBVECTOR && In.getConstantOperandVal(1) == 0 &&
60188 In.getOperand(0).getOpcode() == DAG.getOpcode_EXTEND(Opcode) &&
60189 In.getOperand(0).getOperand(0).getValueSizeInBits() ==
60190 In.getValueSizeInBits())
60191 return DAG.getNode(Opcode, DL, VT, In.getOperand(0).getOperand(0));
60192
60193 // Fold EXTEND_VECTOR_INREG(BUILD_VECTOR(X,Y,?,?)) -> BUILD_VECTOR(X,0,Y,0).
60194 // TODO: Move to DAGCombine?
60195 if (!DCI.isBeforeLegalizeOps() && Opcode == ISD::ZERO_EXTEND_VECTOR_INREG &&
60196 In.getOpcode() == ISD::BUILD_VECTOR && In.hasOneUse() &&
60197 In.getValueSizeInBits() == VT.getSizeInBits()) {
60198 unsigned NumElts = VT.getVectorNumElements();
60199 unsigned Scale = VT.getScalarSizeInBits() / In.getScalarValueSizeInBits();
60200 EVT EltVT = In.getOperand(0).getValueType();
60201 SmallVector<SDValue> Elts(Scale * NumElts, DAG.getConstant(0, DL, EltVT));
60202 for (unsigned I = 0; I != NumElts; ++I)
60203 Elts[I * Scale] = In.getOperand(I);
60204 return DAG.getBitcast(VT, DAG.getBuildVector(In.getValueType(), DL, Elts));
60205 }
60206
60207 // Attempt to combine as a shuffle on SSE41+ targets.
60208 if (Subtarget.hasSSE41()) {
60209 SDValue Op(N, 0);
60210 if (TLI.isTypeLegal(VT) && TLI.isTypeLegal(In.getValueType()))
60211 if (SDValue Res = combineX86ShufflesRecursively(Op, DAG, Subtarget))
60212 return Res;
60213 }
60214
60215 return SDValue();
60216}
60217
60218static SDValue combineKSHIFT(SDNode *N, SelectionDAG &DAG,
60219 TargetLowering::DAGCombinerInfo &DCI) {
60220 EVT VT = N->getValueType(0);
60221 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
60222 if (ISD::isBuildVectorAllZeros(N->getOperand(0).getNode()))
60223 return DAG.getConstant(0, SDLoc(N), VT);
60224
60225 // Fold kshiftr(extract_subvector(X,C1),C2)
60226 // --> extract_subvector(kshiftr(X,C1+C2),0)
60227 // Fold kshiftr(kshiftr(X,C1),C2) --> kshiftr(X,C1+C2)
60228 if (N->getOpcode() == X86ISD::KSHIFTR) {
60229 SDLoc DL(N);
60230 if (N->getOperand(0).getOpcode() == ISD::EXTRACT_SUBVECTOR ||
60231 N->getOperand(0).getOpcode() == X86ISD::KSHIFTR) {
60232 SDValue Src = N->getOperand(0).getOperand(0);
60233 uint64_t Amt = N->getConstantOperandVal(1) +
60234 N->getOperand(0).getConstantOperandVal(1);
60235 EVT SrcVT = Src.getValueType();
60236 if (TLI.isTypeLegal(SrcVT) && Amt < SrcVT.getVectorNumElements()) {
60237 SDValue Shift = DAG.getNode(X86ISD::KSHIFTR, DL, SrcVT, Src,
60238 DAG.getTargetConstant(Amt, DL, MVT::i8));
60239 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Shift,
60240 DAG.getVectorIdxConstant(0, DL));
60241 }
60242 }
60243 }
60244
60245 APInt DemandedElts = APInt::getAllOnes(VT.getVectorNumElements());
60246 if (TLI.SimplifyDemandedVectorElts(SDValue(N, 0), DemandedElts, DCI))
60247 return SDValue(N, 0);
60248
60249 return SDValue();
60250}
60251
60252// Optimize (fp16_to_fp (fp_to_fp16 X)) to VCVTPS2PH followed by VCVTPH2PS.
60253 // Done as a combine because the lowerings for fp16_to_fp and fp_to_fp16
60254 // produce extra instructions between the conversions by going to scalar and back.
60255static SDValue combineFP16_TO_FP(SDNode *N, SelectionDAG &DAG,
60256 const X86Subtarget &Subtarget) {
60257 if (Subtarget.useSoftFloat() || !Subtarget.hasF16C())
60258 return SDValue();
60259
60260 if (N->getOperand(0).getOpcode() != ISD::FP_TO_FP16)
60261 return SDValue();
60262
60263 if (N->getValueType(0) != MVT::f32 ||
60264 N->getOperand(0).getOperand(0).getValueType() != MVT::f32)
60265 return SDValue();
60266
60267 SDLoc dl(N);
60268 SDValue Res = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4f32,
60269 N->getOperand(0).getOperand(0));
60270 Res = DAG.getNode(X86ISD::CVTPS2PH, dl, MVT::v8i16, Res,
60271 DAG.getTargetConstant(4, dl, MVT::i32));
60272 Res = DAG.getNode(X86ISD::CVTPH2PS, dl, MVT::v4f32, Res);
60273 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f32, Res,
60274 DAG.getVectorIdxConstant(0, dl));
60275}
60276
60277static SDValue combineFP_EXTEND(SDNode *N, SelectionDAG &DAG,
60278 TargetLowering::DAGCombinerInfo &DCI,
60279 const X86Subtarget &Subtarget) {
60280 EVT VT = N->getValueType(0);
60281 bool IsStrict = N->isStrictFPOpcode();
60282 SDValue Src = N->getOperand(IsStrict ? 1 : 0);
60283 EVT SrcVT = Src.getValueType();
60284
60285 SDLoc dl(N);
60286 if (SrcVT.getScalarType() == MVT::bf16) {
60287 if (DCI.isAfterLegalizeDAG() && Src.getOpcode() == ISD::FP_ROUND &&
60288 !IsStrict && Src.getOperand(0).getValueType() == VT)
60289 return Src.getOperand(0);
60290
60291 if (!SrcVT.isVector())
60292 return SDValue();
60293
60294 assert(!IsStrict && "Strict FP doesn't support BF16");
60295 if (VT.getVectorElementType() == MVT::f64) {
60296 EVT TmpVT = VT.changeVectorElementType(MVT::f32);
60297 return DAG.getNode(ISD::FP_EXTEND, dl, VT,
60298 DAG.getNode(ISD::FP_EXTEND, dl, TmpVT, Src));
60299 }
60300 assert(VT.getVectorElementType() == MVT::f32 && "Unexpected fpext");
60301 EVT NVT = SrcVT.changeVectorElementType(MVT::i32);
60302 Src = DAG.getBitcast(SrcVT.changeTypeToInteger(), Src);
60303 Src = DAG.getNode(ISD::ZERO_EXTEND, dl, NVT, Src);
60304 Src = DAG.getNode(ISD::SHL, dl, NVT, Src, DAG.getConstant(16, dl, NVT));
60305 return DAG.getBitcast(VT, Src);
60306 }
60307
60308 if (!Subtarget.hasF16C() || Subtarget.useSoftFloat())
60309 return SDValue();
60310
60311 if (Subtarget.hasFP16())
60312 return SDValue();
60313
60314 if (!SrcVT.isVector() || SrcVT.getVectorElementType() != MVT::f16)
60315 return SDValue();
60316
60317 if (VT.getVectorElementType() != MVT::f32 &&
60318 VT.getVectorElementType() != MVT::f64)
60319 return SDValue();
60320
60321 unsigned NumElts = VT.getVectorNumElements();
60322 if (NumElts == 1 || !isPowerOf2_32(NumElts))
60323 return SDValue();
60324
60325 // Convert the input to vXi16.
60326 EVT IntVT = SrcVT.changeVectorElementTypeToInteger();
60327 Src = DAG.getBitcast(IntVT, Src);
60328
60329 // Widen to at least 8 input elements.
60330 if (NumElts < 8) {
60331 unsigned NumConcats = 8 / NumElts;
60332 SDValue Fill = NumElts == 4 ? DAG.getUNDEF(IntVT)
60333 : DAG.getConstant(0, dl, IntVT);
60334 SmallVector<SDValue, 4> Ops(NumConcats, Fill);
60335 Ops[0] = Src;
60336 Src = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v8i16, Ops);
60337 }
60338
60339 // Destination is vXf32 with at least 4 elements.
60340 EVT CvtVT = EVT::getVectorVT(*DAG.getContext(), MVT::f32,
60341 std::max(4U, NumElts));
60342 SDValue Cvt, Chain;
60343 if (IsStrict) {
60344 Cvt = DAG.getNode(X86ISD::STRICT_CVTPH2PS, dl, {CvtVT, MVT::Other},
60345 {N->getOperand(0), Src});
60346 Chain = Cvt.getValue(1);
60347 } else {
60348 Cvt = DAG.getNode(X86ISD::CVTPH2PS, dl, CvtVT, Src);
60349 }
60350
60351 if (NumElts < 4) {
60352 assert(NumElts == 2 && "Unexpected size");
60353 Cvt = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v2f32, Cvt,
60354 DAG.getVectorIdxConstant(0, dl));
60355 }
60356
60357 if (IsStrict) {
60358 // Extend to the original VT if necessary.
60359 if (Cvt.getValueType() != VT) {
60360 Cvt = DAG.getNode(ISD::STRICT_FP_EXTEND, dl, {VT, MVT::Other},
60361 {Chain, Cvt});
60362 Chain = Cvt.getValue(1);
60363 }
60364 return DAG.getMergeValues({Cvt, Chain}, dl);
60365 }
60366
60367 // Extend to the original VT if necessary.
60368 return DAG.getNode(ISD::FP_EXTEND, dl, VT, Cvt);
60369}
60370
60371 // Try to find a larger VBROADCAST_LOAD/SUBV_BROADCAST_LOAD that we can extract from.
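// e.g. a v4f32 VBROADCAST_LOAD of p can reuse the low 128 bits of an existing
// wider VBROADCAST_LOAD of p on the same chain and memory type.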
60372static SDValue combineBROADCAST_LOAD(SDNode *N, SelectionDAG &DAG,
60373 TargetLowering::DAGCombinerInfo &DCI) {
60374 assert((N->getOpcode() == X86ISD::VBROADCAST_LOAD ||
60375 N->getOpcode() == X86ISD::SUBV_BROADCAST_LOAD) &&
60376 "Unknown broadcast load type");
60377
60378 auto *MemIntrin = cast<MemIntrinsicSDNode>(N);
60379 SDValue Ptr = MemIntrin->getBasePtr();
60380 SDValue Chain = MemIntrin->getChain();
60381 EVT VT = N->getSimpleValueType(0);
60382 EVT MemVT = MemIntrin->getMemoryVT();
60383
60384 // Look at other users of our base pointer and try to find a wider broadcast.
60385 // The input chain and the size of the memory VT must match.
60386 for (SDNode *User : Ptr->users())
60387 if (User != N && User->getOpcode() == N->getOpcode() &&
60388 cast<MemIntrinsicSDNode>(User)->getBasePtr() == Ptr &&
60389 cast<MemIntrinsicSDNode>(User)->getChain() == Chain &&
60390 cast<MemIntrinsicSDNode>(User)->getMemoryVT().getSizeInBits() ==
60391 MemVT.getSizeInBits() &&
60392 User->getValueSizeInBits(0).getFixedValue() > VT.getFixedSizeInBits()) {
60393 assert(cast<MemIntrinsicSDNode>(User)->isSimple() &&
60394 MemIntrin->isSimple() && "Illegal broadcast load type");
60396 SDValue Extract = extractSubVector(SDValue(User, 0), 0, DAG, SDLoc(N),
60397 VT.getSizeInBits());
60398 Extract = DAG.getBitcast(VT, Extract);
60399 Extract = DCI.CombineTo(N, Extract, SDValue(User, 1));
60400 return Extract;
60401 }
60402
60403 return SDValue();
60404}
60405
60406static SDValue combineFP_ROUND(SDNode *N, SelectionDAG &DAG,
60407 const X86Subtarget &Subtarget) {
60408 if (!Subtarget.hasF16C() || Subtarget.useSoftFloat())
60409 return SDValue();
60410
60411 bool IsStrict = N->isStrictFPOpcode();
60412 EVT VT = N->getValueType(0);
60413 SDValue Src = N->getOperand(IsStrict ? 1 : 0);
60414 EVT SrcVT = Src.getValueType();
60415
60416 if (!VT.isVector() || VT.getVectorElementType() != MVT::f16 ||
60417 SrcVT.getVectorElementType() != MVT::f32)
60418 return SDValue();
60419
60420 SDLoc dl(N);
60421
60422 SDValue Cvt, Chain;
60423 unsigned NumElts = VT.getVectorNumElements();
60424 if (Subtarget.hasFP16()) {
60425 // Combine (v8f16 fp_round(concat_vectors(v4f32 (xint_to_fp v4i64),
60426 // v4f32 (xint_to_fp v4i64))))
60427 // into (v8f16 vector_shuffle(v8f16 (CVTXI2P v4i64),
60428 // v8f16 (CVTXI2P v4i64)))
60429 if (NumElts == 8 && Src.getOpcode() == ISD::CONCAT_VECTORS &&
60430 Src.getNumOperands() == 2) {
60431 SDValue Cvt0, Cvt1;
60432 SDValue Op0 = Src.getOperand(0);
60433 SDValue Op1 = Src.getOperand(1);
60434 bool IsOp0Strict = Op0->isStrictFPOpcode();
60435 if (Op0.getOpcode() != Op1.getOpcode() ||
60436 Op0.getOperand(IsOp0Strict ? 1 : 0).getValueType() != MVT::v4i64 ||
60437 Op1.getOperand(IsOp0Strict ? 1 : 0).getValueType() != MVT::v4i64) {
60438 return SDValue();
60439 }
60440 int Mask[8] = {0, 1, 2, 3, 8, 9, 10, 11};
60441 if (IsStrict) {
60442 assert(IsOp0Strict && "Op0 must be strict node");
60443 unsigned Opc = Op0.getOpcode() == ISD::STRICT_SINT_TO_FP
60444 ? X86ISD::STRICT_CVTSI2P
60445 : X86ISD::STRICT_CVTUI2P;
60446 Cvt0 = DAG.getNode(Opc, dl, {MVT::v8f16, MVT::Other},
60447 {Op0.getOperand(0), Op0.getOperand(1)});
60448 Cvt1 = DAG.getNode(Opc, dl, {MVT::v8f16, MVT::Other},
60449 {Op1.getOperand(0), Op1.getOperand(1)});
60450 Cvt = DAG.getVectorShuffle(MVT::v8f16, dl, Cvt0, Cvt1, Mask);
60451 return DAG.getMergeValues({Cvt, Cvt0.getValue(1)}, dl);
60452 }
60453 unsigned Opc = Op0.getOpcode() == ISD::SINT_TO_FP ? X86ISD::CVTSI2P
60454 : X86ISD::CVTUI2P;
60455 Cvt0 = DAG.getNode(Opc, dl, MVT::v8f16, Op0.getOperand(0));
60456 Cvt1 = DAG.getNode(Opc, dl, MVT::v8f16, Op1.getOperand(0));
60457 return Cvt = DAG.getVectorShuffle(MVT::v8f16, dl, Cvt0, Cvt1, Mask);
60458 }
60459 return SDValue();
60460 }
60461
60462 if (NumElts == 1 || !isPowerOf2_32(NumElts))
60463 return SDValue();
60464
60465 // Widen to at least 4 input elements.
60466 if (NumElts < 4)
60467 Src = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f32, Src,
60468 DAG.getConstantFP(0.0, dl, SrcVT));
60469
60470 // Destination is v8i16 with at least 8 elements.
60471 EVT CvtVT =
60472 EVT::getVectorVT(*DAG.getContext(), MVT::i16, std::max(8U, NumElts));
60473 SDValue Rnd = DAG.getTargetConstant(4, dl, MVT::i32);
60474 if (IsStrict) {
60475 Cvt = DAG.getNode(X86ISD::STRICT_CVTPS2PH, dl, {CvtVT, MVT::Other},
60476 {N->getOperand(0), Src, Rnd});
60477 Chain = Cvt.getValue(1);
60478 } else {
60479 Cvt = DAG.getNode(X86ISD::CVTPS2PH, dl, CvtVT, Src, Rnd);
60480 }
60481
60482 // Extract down to real number of elements.
60483 if (NumElts < 8) {
60484 EVT IntVT = EVT::getVectorVT(*DAG.getContext(), MVT::i16, NumElts);
60485 Cvt = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, IntVT, Cvt,
60486 DAG.getVectorIdxConstant(0, dl));
60487 }
60488
60489 Cvt = DAG.getBitcast(VT, Cvt);
60490
60491 if (IsStrict)
60492 return DAG.getMergeValues({Cvt, Chain}, dl);
60493
60494 return Cvt;
60495}
60496
60497static SDValue combineMOVDQ2Q(SDNode *N, SelectionDAG &DAG) {
60498 SDValue Src = N->getOperand(0);
60499
60500 // Turn MOVDQ2Q+simple_load into an mmx load.
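// i.e. (MOVDQ2Q (v2i64 load p)) --> (x86mmx load p), with chain users updated.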
60501 if (ISD::isNormalLoad(Src.getNode()) && Src.hasOneUse()) {
60502 LoadSDNode *LN = cast<LoadSDNode>(Src.getNode());
60503
60504 if (LN->isSimple()) {
60505 SDValue NewLd =
60506 DAG.getLoad(MVT::x86mmx, SDLoc(N), LN->getChain(), LN->getBasePtr(),
60507 LN->getPointerInfo(), LN->getBaseAlign(),
60508 LN->getMemOperand()->getFlags());
60509 DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), NewLd.getValue(1));
60510 return NewLd;
60511 }
60512 }
60513
60514 return SDValue();
60515}
60516
60517static SDValue combinePDEP(SDNode *N, SelectionDAG &DAG,
60518 TargetLowering::DAGCombinerInfo &DCI) {
60519 unsigned NumBits = N->getSimpleValueType(0).getSizeInBits();
60520 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
60521 if (TLI.SimplifyDemandedBits(SDValue(N, 0), APInt::getAllOnes(NumBits), DCI))
60522 return SDValue(N, 0);
60523
60524 return SDValue();
60525}
60526
60527// Fixup the MMX intrinsics' types: in IR they are expressed with <1 x i64>,
60528// and so SelectionDAGBuilder creates them with v1i64 types, but they need to
60529// use x86mmx instead.
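// e.g. an intrinsic operand of type v1i64 is bitcast to x86mmx, and an x86mmx
// result is bitcast back to v1i64 for the original users.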
60530static SDValue FixupMMXIntrinsicTypes(SDNode *N, SelectionDAG &DAG) {
60531 SDLoc dl(N);
60532
60533 bool MadeChange = false, CastReturnVal = false;
60534 SmallVector<SDValue> Args;
60535 for (const SDValue &Arg : N->op_values()) {
60536 if (Arg.getValueType() == MVT::v1i64) {
60537 MadeChange = true;
60538 Args.push_back(DAG.getBitcast(MVT::x86mmx, Arg));
60539 } else
60540 Args.push_back(Arg);
60541 }
60542 SDVTList VTs = N->getVTList();
60543 SDVTList NewVTs = VTs;
60544 if (VTs.NumVTs > 0 && VTs.VTs[0] == MVT::v1i64) {
60545 SmallVector<EVT> NewVTArr(ArrayRef<EVT>(VTs.VTs, VTs.NumVTs));
60546 NewVTArr[0] = MVT::x86mmx;
60547 NewVTs = DAG.getVTList(NewVTArr);
60548 MadeChange = true;
60549 CastReturnVal = true;
60550 }
60551
60552 if (MadeChange) {
60553 SDValue Result = DAG.getNode(N->getOpcode(), dl, NewVTs, Args);
60554 if (CastReturnVal) {
60555 SmallVector<SDValue> Returns;
60556 for (unsigned i = 0, e = Result->getNumValues(); i != e; ++i)
60557 Returns.push_back(Result.getValue(i));
60558 Returns[0] = DAG.getBitcast(MVT::v1i64, Returns[0]);
60559 return DAG.getMergeValues(Returns, dl);
60560 }
60561 return Result;
60562 }
60563 return SDValue();
60564}
60565static SDValue combineINTRINSIC_WO_CHAIN(SDNode *N, SelectionDAG &DAG,
60566 TargetLowering::DAGCombinerInfo &DCI) {
60567 if (!DCI.isBeforeLegalize())
60568 return SDValue();
60569
60570 unsigned IntNo = N->getConstantOperandVal(0);
60571 const IntrinsicData *IntrData = getIntrinsicWithoutChain(IntNo);
60572
60573 if (IntrData && IntrData->Type == INTR_TYPE_CAST_MMX)
60574 return FixupMMXIntrinsicTypes(N, DAG);
60575
60576 return SDValue();
60577}
60578
60579static SDValue combineINTRINSIC_W_CHAIN(SDNode *N, SelectionDAG &DAG,
60580 TargetLowering::DAGCombinerInfo &DCI) {
60581 if (!DCI.isBeforeLegalize())
60582 return SDValue();
60583
60584 unsigned IntNo = N->getConstantOperandVal(1);
60585 const IntrinsicData *IntrData = getIntrinsicWithChain(IntNo);
60586
60587 if (IntrData && IntrData->Type == INTR_TYPE_CAST_MMX)
60588 return FixupMMXIntrinsicTypes(N, DAG);
60589
60590 return SDValue();
60591}
60592
60593static SDValue combineINTRINSIC_VOID(SDNode *N, SelectionDAG &DAG,
60594 TargetLowering::DAGCombinerInfo &DCI) {
60595 if (!DCI.isBeforeLegalize())
60596 return SDValue();
60597
60598 unsigned IntNo = N->getConstantOperandVal(1);
60599 const IntrinsicData *IntrData = getIntrinsicWithChain(IntNo);
60600
60601 if (IntrData && IntrData->Type == INTR_TYPE_CAST_MMX)
60602 return FixupMMXIntrinsicTypes(N, DAG);
60603
60604 return SDValue();
60605}
60606
60607SDValue X86TargetLowering::PerformDAGCombine(SDNode *N,
60608 DAGCombinerInfo &DCI) const {
60609 SelectionDAG &DAG = DCI.DAG;
60610 switch (N->getOpcode()) {
60611 // clang-format off
60612 default: break;
60613 case ISD::SCALAR_TO_VECTOR:
60614 return combineSCALAR_TO_VECTOR(N, DAG, Subtarget);
60615 case ISD::EXTRACT_VECTOR_ELT:
60616 case X86ISD::PEXTRW:
60617 case X86ISD::PEXTRB:
60618 return combineExtractVectorElt(N, DAG, DCI, Subtarget);
60619 case ISD::CONCAT_VECTORS:
60620 return combineCONCAT_VECTORS(N, DAG, DCI, Subtarget);
60621 case ISD::INSERT_SUBVECTOR:
60622 return combineINSERT_SUBVECTOR(N, DAG, DCI, Subtarget);
60623 case ISD::EXTRACT_SUBVECTOR:
60624 return combineEXTRACT_SUBVECTOR(N, DAG, DCI, Subtarget);
60625 case ISD::VSELECT:
60626 case ISD::SELECT:
60627 case X86ISD::BLENDV: return combineSelect(N, DAG, DCI, Subtarget);
60628 case ISD::BITCAST: return combineBitcast(N, DAG, DCI, Subtarget);
60629 case X86ISD::CMOV: return combineCMov(N, DAG, DCI, Subtarget);
60630 case X86ISD::CMP: return combineCMP(N, DAG, DCI, Subtarget);
60631 case ISD::ADD: return combineAdd(N, DAG, DCI, Subtarget);
60632 case ISD::SUB: return combineSub(N, DAG, DCI, Subtarget);
60633 case X86ISD::ADD:
60634 case X86ISD::SUB: return combineX86AddSub(N, DAG, DCI, Subtarget);
60635 case X86ISD::CLOAD:
60636 case X86ISD::CSTORE: return combineX86CloadCstore(N, DAG);
60637 case X86ISD::SBB: return combineSBB(N, DAG);
60638 case X86ISD::ADC: return combineADC(N, DAG, DCI);
60639 case ISD::MUL: return combineMul(N, DAG, DCI, Subtarget);
60640 case ISD::SHL: return combineShiftLeft(N, DAG, Subtarget);
60641 case ISD::SRA: return combineShiftRightArithmetic(N, DAG, Subtarget);
60642 case ISD::SRL: return combineShiftRightLogical(N, DAG, DCI, Subtarget);
60643 case ISD::AND: return combineAnd(N, DAG, DCI, Subtarget);
60644 case ISD::OR: return combineOr(N, DAG, DCI, Subtarget);
60645 case ISD::XOR: return combineXor(N, DAG, DCI, Subtarget);
60646 case ISD::BITREVERSE: return combineBITREVERSE(N, DAG, DCI, Subtarget);
60647 case ISD::AVGCEILS:
60648 case ISD::AVGCEILU:
60649 case ISD::AVGFLOORS:
60650 case ISD::AVGFLOORU: return combineAVG(N, DAG, DCI, Subtarget);
60651 case X86ISD::BEXTR:
60652 case X86ISD::BEXTRI: return combineBEXTR(N, DAG, DCI, Subtarget);
60653 case ISD::LOAD: return combineLoad(N, DAG, DCI, Subtarget);
60654 case ISD::MLOAD: return combineMaskedLoad(N, DAG, DCI, Subtarget);
60655 case ISD::STORE: return combineStore(N, DAG, DCI, Subtarget);
60656 case ISD::MSTORE: return combineMaskedStore(N, DAG, DCI, Subtarget);
60657 case X86ISD::VEXTRACT_STORE:
60658 return combineVEXTRACT_STORE(N, DAG, DCI, Subtarget);
60659 case ISD::SINT_TO_FP:
60660 case ISD::STRICT_SINT_TO_FP:
60661 return combineSIntToFP(N, DAG, DCI, Subtarget);
60662 case ISD::UINT_TO_FP:
60663 case ISD::STRICT_UINT_TO_FP:
60664 return combineUIntToFP(N, DAG, Subtarget);
60665 case ISD::FP_TO_SINT: return combineFPToSInt(N, DAG, Subtarget);
60666 case ISD::LRINT:
60667 case ISD::LLRINT: return combineLRINT_LLRINT(N, DAG, Subtarget);
60668 case ISD::FADD:
60669 case ISD::FSUB: return combineFaddFsub(N, DAG, Subtarget);
60670 case X86ISD::VFCMULC:
60671 case X86ISD::VFMULC: return combineFMulcFCMulc(N, DAG, Subtarget);
60672 case ISD::FNEG: return combineFneg(N, DAG, DCI, Subtarget);
60673 case ISD::TRUNCATE: return combineTruncate(N, DAG, Subtarget);
60674 case X86ISD::VTRUNC: return combineVTRUNC(N, DAG, DCI);
60675 case X86ISD::ANDNP: return combineAndnp(N, DAG, DCI, Subtarget);
60676 case X86ISD::FAND: return combineFAnd(N, DAG, Subtarget);
60677 case X86ISD::FANDN: return combineFAndn(N, DAG, Subtarget);
60678 case X86ISD::FXOR:
60679 case X86ISD::FOR: return combineFOr(N, DAG, DCI, Subtarget);
60680 case X86ISD::FMIN:
60681 case X86ISD::FMAX: return combineFMinFMax(N, DAG);
60682 case ISD::FMINNUM:
60683 case ISD::FMAXNUM: return combineFMinNumFMaxNum(N, DAG, Subtarget);
60684 case X86ISD::CVTSI2P:
60685 case X86ISD::CVTUI2P: return combineX86INT_TO_FP(N, DAG, DCI);
60686 case X86ISD::CVTP2SI:
60687 case X86ISD::CVTP2UI:
60688 case X86ISD::STRICT_CVTTP2SI:
60689 case X86ISD::CVTTP2SI:
60690 case X86ISD::STRICT_CVTTP2UI:
60691 case X86ISD::CVTTP2UI:
60692 return combineCVTP2I_CVTTP2I(N, DAG, DCI);
60693 case X86ISD::STRICT_CVTPH2PS:
60694 case X86ISD::CVTPH2PS: return combineCVTPH2PS(N, DAG, DCI);
60695 case X86ISD::BT: return combineBT(N, DAG, DCI);
60696 case ISD::ANY_EXTEND:
60697 case ISD::ZERO_EXTEND: return combineZext(N, DAG, DCI, Subtarget);
60698 case ISD::SIGN_EXTEND: return combineSext(N, DAG, DCI, Subtarget);
60699 case ISD::SIGN_EXTEND_INREG: return combineSignExtendInReg(N, DAG, Subtarget);
60700 case ISD::ANY_EXTEND_VECTOR_INREG:
60701 case ISD::SIGN_EXTEND_VECTOR_INREG:
60702 case ISD::ZERO_EXTEND_VECTOR_INREG:
60703 return combineEXTEND_VECTOR_INREG(N, DAG, DCI, Subtarget);
60704 case ISD::SETCC: return combineSetCC(N, DAG, DCI, Subtarget);
60705 case X86ISD::SETCC: return combineX86SetCC(N, DAG, Subtarget);
60706 case X86ISD::BRCOND: return combineBrCond(N, DAG, Subtarget);
60707 case X86ISD::PACKSS:
60708 case X86ISD::PACKUS: return combineVectorPack(N, DAG, DCI, Subtarget);
60709 case X86ISD::HADD:
60710 case X86ISD::HSUB:
60711 case X86ISD::FHADD:
60712 case X86ISD::FHSUB: return combineVectorHADDSUB(N, DAG, DCI, Subtarget);
60713 case X86ISD::VSHL:
60714 case X86ISD::VSRA:
60715 case X86ISD::VSRL:
60716 return combineVectorShiftVar(N, DAG, DCI, Subtarget);
60717 case X86ISD::VSHLI:
60718 case X86ISD::VSRAI:
60719 case X86ISD::VSRLI:
60720 return combineVectorShiftImm(N, DAG, DCI, Subtarget);
60721 case ISD::INSERT_VECTOR_ELT:
60722 case X86ISD::PINSRB:
60723 case X86ISD::PINSRW: return combineVectorInsert(N, DAG, DCI, Subtarget);
60724 case X86ISD::SHUFP: // Handle all target specific shuffles
60725 case X86ISD::INSERTPS:
60726 case X86ISD::EXTRQI:
60727 case X86ISD::INSERTQI:
60728 case X86ISD::VALIGN:
60729 case X86ISD::PALIGNR:
60730 case X86ISD::VSHLDQ:
60731 case X86ISD::VSRLDQ:
60732 case X86ISD::BLENDI:
60733 case X86ISD::UNPCKH:
60734 case X86ISD::UNPCKL:
60735 case X86ISD::MOVHLPS:
60736 case X86ISD::MOVLHPS:
60737 case X86ISD::PSHUFB:
60738 case X86ISD::PSHUFD:
60739 case X86ISD::PSHUFHW:
60740 case X86ISD::PSHUFLW:
60741 case X86ISD::MOVSHDUP:
60742 case X86ISD::MOVSLDUP:
60743 case X86ISD::MOVDDUP:
60744 case X86ISD::MOVSS:
60745 case X86ISD::MOVSD:
60746 case X86ISD::MOVSH:
60747 case X86ISD::VBROADCAST:
60748 case X86ISD::VPPERM:
60749 case X86ISD::VPERMI:
60750 case X86ISD::VPERMV:
60751 case X86ISD::VPERMV3:
60752 case X86ISD::VPERMIL2:
60753 case X86ISD::VPERMILPI:
60754 case X86ISD::VPERMILPV:
60755 case X86ISD::VPERM2X128:
60756 case X86ISD::SHUF128:
60757 case X86ISD::VZEXT_MOVL:
60758 case ISD::VECTOR_SHUFFLE: return combineShuffle(N, DAG, DCI,Subtarget);
60759 case X86ISD::FMADD_RND:
60760 case X86ISD::FMSUB:
60761 case X86ISD::STRICT_FMSUB:
60762 case X86ISD::FMSUB_RND:
60763 case X86ISD::FNMADD:
60764 case X86ISD::STRICT_FNMADD:
60765 case X86ISD::FNMADD_RND:
60766 case X86ISD::FNMSUB:
60767 case X86ISD::STRICT_FNMSUB:
60768 case X86ISD::FNMSUB_RND:
60769 case ISD::FMA:
60770 case ISD::STRICT_FMA: return combineFMA(N, DAG, DCI, Subtarget);
60771 case X86ISD::FMADDSUB_RND:
60772 case X86ISD::FMSUBADD_RND:
60773 case X86ISD::FMADDSUB:
60774 case X86ISD::FMSUBADD: return combineFMADDSUB(N, DAG, DCI);
60775 case X86ISD::MOVMSK: return combineMOVMSK(N, DAG, DCI, Subtarget);
60776 case X86ISD::TESTP: return combineTESTP(N, DAG, DCI, Subtarget);
60777 case X86ISD::MGATHER:
60778 case X86ISD::MSCATTER: return combineX86GatherScatter(N, DAG, DCI);
60779 case ISD::MGATHER:
60780 case ISD::MSCATTER: return combineGatherScatter(N, DAG, DCI);
60781 case X86ISD::PCMPEQ:
60782 case X86ISD::PCMPGT: return combineVectorCompare(N, DAG, Subtarget);
60783 case X86ISD::PMULDQ:
60784 case X86ISD::PMULUDQ: return combinePMULDQ(N, DAG, DCI, Subtarget);
60785 case X86ISD::VPMADDUBSW:
60786 case X86ISD::VPMADDWD: return combineVPMADD(N, DAG, DCI);
60787 case X86ISD::VPMADD52L:
60788 case X86ISD::VPMADD52H: return combineVPMADD52LH(N, DAG, DCI);
60789 case X86ISD::KSHIFTL:
60790 case X86ISD::KSHIFTR: return combineKSHIFT(N, DAG, DCI);
60791 case ISD::FP16_TO_FP: return combineFP16_TO_FP(N, DAG, Subtarget);
60792 case ISD::STRICT_FP_EXTEND:
60793 case ISD::FP_EXTEND: return combineFP_EXTEND(N, DAG, DCI, Subtarget);
60794 case ISD::STRICT_FP_ROUND:
60795 case ISD::FP_ROUND: return combineFP_ROUND(N, DAG, Subtarget);
60796 case X86ISD::VBROADCAST_LOAD:
60797 case X86ISD::SUBV_BROADCAST_LOAD: return combineBROADCAST_LOAD(N, DAG, DCI);
60798 case X86ISD::MOVDQ2Q: return combineMOVDQ2Q(N, DAG);
60799 case X86ISD::PDEP: return combinePDEP(N, DAG, DCI);
60800 case ISD::INTRINSIC_WO_CHAIN: return combineINTRINSIC_WO_CHAIN(N, DAG, DCI);
60801 case ISD::INTRINSIC_W_CHAIN: return combineINTRINSIC_W_CHAIN(N, DAG, DCI);
60802 case ISD::INTRINSIC_VOID: return combineINTRINSIC_VOID(N, DAG, DCI);
60803 case ISD::FP_TO_SINT_SAT:
60804 case ISD::FP_TO_UINT_SAT: return combineFP_TO_xINT_SAT(N, DAG, Subtarget);
60805 // clang-format on
60806 }
60807
60808 return SDValue();
60809}
60810
60812 return Subtarget.canUseCMOV() && (VT == MVT::i32 || VT == MVT::i64);
60813}
60814
60815// Prefer (non-AVX512) vector TRUNCATE(SIGN_EXTEND_INREG(X)) to use of PACKSS.
60817 EVT ExtVT) const {
60818 return Subtarget.hasAVX512() || !VT.isVector();
60819}
60820
60821bool X86TargetLowering::isTypeDesirableForOp(unsigned Opc, EVT VT) const {
60822 if (!isTypeLegal(VT))
60823 return false;
60824
60825 // There are no vXi8 shifts.
60826 if (Opc == ISD::SHL && VT.isVector() && VT.getVectorElementType() == MVT::i8)
60827 return false;
60828
60829 // TODO: Almost no 8-bit ops are desirable because they have no actual
60830 // size/speed advantages vs. 32-bit ops, but they do have a major
60831 // potential disadvantage by causing partial register stalls.
60832 //
60833 // 8-bit multiply/shl is probably not cheaper than 32-bit multiply/shl, and
60834 // we have specializations to turn 32-bit multiply/shl into LEA or other ops.
60835 // Also, see the comment in "IsDesirableToPromoteOp" - where we additionally
60836 // check for a constant operand to the multiply.
60837 if ((Opc == ISD::MUL || Opc == ISD::SHL) && VT == MVT::i8)
60838 return false;
60839
60840 // i16 instruction encodings are longer and some i16 instructions are slow,
60841 // so those are not desirable.
60842 if (VT == MVT::i16) {
60843 switch (Opc) {
60844 default:
60845 break;
60846 case ISD::LOAD:
60847 case ISD::SIGN_EXTEND:
60848 case ISD::ZERO_EXTEND:
60849 case ISD::ANY_EXTEND:
60850 case ISD::MUL:
60851 return false;
60852 case ISD::SHL:
60853 case ISD::SRA:
60854 case ISD::SRL:
60855 case ISD::SUB:
60856 case ISD::ADD:
60857 case ISD::AND:
60858 case ISD::OR:
60859 case ISD::XOR:
60860 // NDD instructions never have the "partial register write" issue because the
60861 // destination register's upper bits [63:OSIZE] are zeroed even when
60862 // OSIZE=8/16.
60863 return Subtarget.hasNDD();
60864 }
60865 }
60866
60867 // Any legal type not explicitly accounted for above here is desirable.
60868 return true;
60869}
60870
60871SDValue X86TargetLowering::expandIndirectJTBranch(const SDLoc &dl,
60872 SDValue Value, SDValue Addr,
60873 int JTI,
60874 SelectionDAG &DAG) const {
60875 const Module *M = DAG.getMachineFunction().getFunction().getParent();
60876 Metadata *IsCFProtectionSupported = M->getModuleFlag("cf-protection-branch");
60877 if (IsCFProtectionSupported) {
60878 // In case control-flow branch protection is enabled, we need to add
60879 // notrack prefix to the indirect branch.
60880 // In order to do that we create NT_BRIND SDNode.
60881 // Upon ISEL, the pattern will convert it to jmp with NoTrack prefix.
60882 SDValue Chain = Value;
60883 // Jump table debug info is only needed if CodeView is enabled.
60885 Chain = DAG.getJumpTableDebugInfo(JTI, Chain, dl);
60886 return DAG.getNode(X86ISD::NT_BRIND, dl, MVT::Other, Chain, Addr);
60887 }
60888
60889 return TargetLowering::expandIndirectJTBranch(dl, Value, Addr, JTI, DAG);
60890}
60891
60892TargetLowering::AndOrSETCCFoldKind
60893X86TargetLowering::isDesirableToCombineLogicOpOfSETCC(
60894 const SDNode *LogicOp, const SDNode *SETCC0, const SDNode *SETCC1) const {
60896 EVT VT = LogicOp->getValueType(0);
60897 EVT OpVT = SETCC0->getOperand(0).getValueType();
60898 if (!VT.isInteger())
60900
60901 if (VT.isVector())
60906
60907 // Don't use `NotAnd` as even though `not` is generally shorter code size than
60908 // `add`, `add` can lower to LEA, which can save moves / spills. In any case
60909 // where `NotAnd` applies, `AddAnd` does as well.
60910 // TODO: Currently we lower (icmp eq/ne (and ~X, Y), 0) -> `test (not X), Y`;
60911 // if we change that to `andn Y, X` it may be worth preferring `NotAnd` here.
60913}
60914
60915bool X86TargetLowering::IsDesirableToPromoteOp(SDValue Op, EVT &PVT) const {
60916 EVT VT = Op.getValueType();
60917 bool Is8BitMulByConstant = VT == MVT::i8 && Op.getOpcode() == ISD::MUL &&
60918 isa<ConstantSDNode>(Op.getOperand(1));
60919
60920 // i16 is legal, but undesirable since i16 instruction encodings are longer
60921 // and some i16 instructions are slow.
60922 // 8-bit multiply-by-constant can usually be expanded to something cheaper
60923 // using LEA and/or other ALU ops.
60924 if (VT != MVT::i16 && !Is8BitMulByConstant)
60925 return false;
60926
60927 auto IsFoldableRMW = [](SDValue Load, SDValue Op) {
60928 if (!Op.hasOneUse())
60929 return false;
60930 SDNode *User = *Op->user_begin();
60931 if (User->getOpcode() != ISD::STORE)
60932 return false;
60933 auto *Ld = cast<LoadSDNode>(Load);
60934 auto *St = cast<StoreSDNode>(User);
60935 return Ld->getBasePtr() == St->getBasePtr();
60936 };
60937
60938 auto IsFoldableAtomicRMW = [](SDValue Load, SDValue Op) {
60939 if (!Load.hasOneUse() || Load.getOpcode() != ISD::ATOMIC_LOAD)
60940 return false;
60941 if (!Op.hasOneUse())
60942 return false;
60943 SDNode *User = *Op->user_begin();
60944 if (User->getOpcode() != ISD::ATOMIC_STORE)
60945 return false;
60946 auto *Ld = cast<AtomicSDNode>(Load);
60947 auto *St = cast<AtomicSDNode>(User);
60948 return Ld->getBasePtr() == St->getBasePtr();
60949 };
60950
60951 auto IsFoldableZext = [](SDValue Op) {
60952 if (!Op.hasOneUse())
60953 return false;
60954 SDNode *User = *Op->user_begin();
60955 EVT VT = User->getValueType(0);
60956 return (User->getOpcode() == ISD::ZERO_EXTEND &&
60957 (VT == MVT::i32 || VT == MVT::i64));
60958 };
60959
60960 bool Commute = false;
60961 switch (Op.getOpcode()) {
60962 default: return false;
60963 case ISD::SIGN_EXTEND:
60964 case ISD::ZERO_EXTEND:
60965 case ISD::ANY_EXTEND:
60966 break;
60967 case ISD::SHL:
60968 case ISD::SRA:
60969 case ISD::SRL: {
60970 SDValue N0 = Op.getOperand(0);
60971 // Look out for (store (shl (load), x)).
60972 if (X86::mayFoldLoad(N0, Subtarget) && IsFoldableRMW(N0, Op))
60973 return false;
60974 break;
60975 }
60976 case ISD::MUL:
60977 // When ZU is enabled, we prefer to not promote for MUL by a constant
60978 // when there is an opportunity to fold a zext with imulzu.
60979 if (Subtarget.hasZU() && IsFoldableZext(Op) &&
60980 (isa<ConstantSDNode>(Op.getOperand(0)) ||
60981 isa<ConstantSDNode>(Op.getOperand(1))))
60982 return false;
60983 [[fallthrough]];
60984 case ISD::ADD:
60985 case ISD::AND:
60986 case ISD::OR:
60987 case ISD::XOR:
60988 Commute = true;
60989 [[fallthrough]];
60990 case ISD::SUB: {
60991 SDValue N0 = Op.getOperand(0);
60992 SDValue N1 = Op.getOperand(1);
60993 // Avoid disabling potential load folding opportunities.
60994 if (X86::mayFoldLoad(N1, Subtarget) &&
60995 (!Commute || !isa<ConstantSDNode>(N0) ||
60996 (Op.getOpcode() != ISD::MUL && IsFoldableRMW(N1, Op))))
60997 return false;
60998 if (X86::mayFoldLoad(N0, Subtarget) &&
60999 ((Commute && !isa<ConstantSDNode>(N1)) ||
61000 (Op.getOpcode() != ISD::MUL && IsFoldableRMW(N0, Op))))
61001 return false;
61002 if (IsFoldableAtomicRMW(N0, Op) ||
61003 (Commute && IsFoldableAtomicRMW(N1, Op)))
61004 return false;
61005 }
61006 }
61007
61008 PVT = MVT::i32;
61009 return true;
61010}
61011
61012//===----------------------------------------------------------------------===//
61013// X86 Inline Assembly Support
61014//===----------------------------------------------------------------------===//
61015
61016static X86::CondCode parseConstraintCode(llvm::StringRef Constraint) {
61017 X86::CondCode Cond = StringSwitch<X86::CondCode>(Constraint)
61018 .Case("{@cca}", X86::COND_A)
61019 .Case("{@ccae}", X86::COND_AE)
61020 .Case("{@ccb}", X86::COND_B)
61021 .Case("{@ccbe}", X86::COND_BE)
61022 .Case("{@ccc}", X86::COND_B)
61023 .Case("{@cce}", X86::COND_E)
61024 .Case("{@ccz}", X86::COND_E)
61025 .Case("{@ccg}", X86::COND_G)
61026 .Case("{@ccge}", X86::COND_GE)
61027 .Case("{@ccl}", X86::COND_L)
61028 .Case("{@ccle}", X86::COND_LE)
61029 .Case("{@ccna}", X86::COND_BE)
61030 .Case("{@ccnae}", X86::COND_B)
61031 .Case("{@ccnb}", X86::COND_AE)
61032 .Case("{@ccnbe}", X86::COND_A)
61033 .Case("{@ccnc}", X86::COND_AE)
61034 .Case("{@ccne}", X86::COND_NE)
61035 .Case("{@ccnz}", X86::COND_NE)
61036 .Case("{@ccng}", X86::COND_LE)
61037 .Case("{@ccnge}", X86::COND_L)
61038 .Case("{@ccnl}", X86::COND_GE)
61039 .Case("{@ccnle}", X86::COND_G)
61040 .Case("{@ccno}", X86::COND_NO)
61041 .Case("{@ccnp}", X86::COND_NP)
61042 .Case("{@ccns}", X86::COND_NS)
61043 .Case("{@cco}", X86::COND_O)
61044 .Case("{@ccp}", X86::COND_P)
61045 .Case("{@ccs}", X86::COND_S)
61046                           .Default(X86::COND_INVALID);
61047  return Cond;
61048}
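The "{@cc*}" strings above are the brace-wrapped form of the GCC/Clang flag-output constraints ("=@cc<cond>"). A minimal x86-64 sketch of user code that feeds this path follows; the helper name and operands are hypothetical, not part of this file.

// Illustrative sketch only: "=@ccc" reaches the backend as "{@ccc}" and is
// mapped to X86::COND_B by parseConstraintCode() above.
static bool u64_add_overflow(unsigned long a, unsigned long b,
                             unsigned long *sum) {
  int carry;
  asm("addq %2, %0"             // ADD sets CF on unsigned overflow.
      : "+r"(a), "=@ccc"(carry) // CF is read back as a 0/1 integer.
      : "r"(b));
  *sum = a;
  return carry;
}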
61049
61050/// Given a constraint letter, return the type of constraint for this target.
61051X86TargetLowering::ConstraintType
61052X86TargetLowering::getConstraintType(StringRef Constraint) const {
61053  if (Constraint.size() == 1) {
61054 switch (Constraint[0]) {
61055 case 'R':
61056 case 'q':
61057 case 'Q':
61058 case 'f':
61059 case 't':
61060 case 'u':
61061 case 'y':
61062 case 'x':
61063 case 'v':
61064 case 'l':
61065 case 'k': // AVX512 masking registers.
61066 return C_RegisterClass;
61067 case 'a':
61068 case 'b':
61069 case 'c':
61070 case 'd':
61071 case 'S':
61072 case 'D':
61073 case 'A':
61074 return C_Register;
61075 case 'I':
61076 case 'J':
61077 case 'K':
61078 case 'N':
61079 case 'G':
61080 case 'L':
61081 case 'M':
61082 return C_Immediate;
61083 case 'C':
61084 case 'e':
61085 case 'Z':
61086 return C_Other;
61087 default:
61088 break;
61089 }
61090 }
61091 else if (Constraint.size() == 2) {
61092 switch (Constraint[0]) {
61093 default:
61094 break;
61095 case 'W':
61096 if (Constraint[1] != 's')
61097 break;
61098 return C_Other;
61099 case 'Y':
61100 switch (Constraint[1]) {
61101 default:
61102 break;
61103 case 'z':
61104 return C_Register;
61105 case 'i':
61106 case 'm':
61107 case 'k':
61108 case 't':
61109 case '2':
61110 return C_RegisterClass;
61111 }
61112 break;
61113 case 'j':
61114 switch (Constraint[1]) {
61115 default:
61116 break;
61117 case 'r':
61118 case 'R':
61119 return C_RegisterClass;
61120 }
61121 }
61122 } else if (parseConstraintCode(Constraint) != X86::COND_INVALID)
61123 return C_Other;
61124 return TargetLowering::getConstraintType(Constraint);
61125}
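A rough illustration of the classification above (hypothetical helper, not part of this file): 'a' is a specific register (C_Register), 'r'/'x' name register classes (C_RegisterClass), and 'I' is an immediate that must fit in [0, 31] (C_Immediate).

// Illustrative sketch only.
static unsigned rotate_left5(unsigned v) {
  // "+r": any GR32 register (C_RegisterClass); "I": constant 0..31 (C_Immediate).
  asm("roll %1, %0" : "+r"(v) : "I"(5));
  return v;
}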
61126
61127/// Examine constraint type and operand type and determine a weight value.
61128/// This object must already have been set up with the operand type
61129/// and the current alternative constraint selected.
61130TargetLowering::ConstraintWeight
61131X86TargetLowering::getSingleConstraintMatchWeight(
61132    AsmOperandInfo &Info, const char *Constraint) const {
61133  ConstraintWeight Wt = CW_Invalid;
61134 Value *CallOperandVal = Info.CallOperandVal;
61135 // If we don't have a value, we can't do a match,
61136 // but allow it at the lowest weight.
61137 if (!CallOperandVal)
61138 return CW_Default;
61139 Type *Ty = CallOperandVal->getType();
61140 // Look at the constraint type.
61141 switch (*Constraint) {
61142  default:
61143    Wt = TargetLowering::getSingleConstraintMatchWeight(Info, Constraint);
61144 [[fallthrough]];
61145 case 'R':
61146 case 'q':
61147 case 'Q':
61148 case 'a':
61149 case 'b':
61150 case 'c':
61151 case 'd':
61152 case 'S':
61153 case 'D':
61154 case 'A':
61155 if (CallOperandVal->getType()->isIntegerTy())
61156 Wt = CW_SpecificReg;
61157 break;
61158 case 'f':
61159 case 't':
61160 case 'u':
61161 if (Ty->isFloatingPointTy())
61162 Wt = CW_SpecificReg;
61163 break;
61164 case 'y':
61165 if (Ty->getPrimitiveSizeInBits() == 64 && Subtarget.hasMMX())
61166 Wt = CW_SpecificReg;
61167 break;
61168 case 'Y':
61169 if (StringRef(Constraint).size() != 2)
61170 break;
61171 switch (Constraint[1]) {
61172 default:
61173 return CW_Invalid;
61174 // XMM0
61175 case 'z':
61176 if (((Ty->getPrimitiveSizeInBits() == 128) && Subtarget.hasSSE1()) ||
61177 ((Ty->getPrimitiveSizeInBits() == 256) && Subtarget.hasAVX()) ||
61178 ((Ty->getPrimitiveSizeInBits() == 512) && Subtarget.hasAVX512()))
61179 return CW_SpecificReg;
61180 return CW_Invalid;
61181 // Conditional OpMask regs (AVX512)
61182 case 'k':
61183 if ((Ty->getPrimitiveSizeInBits() == 64) && Subtarget.hasAVX512())
61184 return CW_Register;
61185 return CW_Invalid;
61186 // Any MMX reg
61187 case 'm':
61188 if (Ty->getPrimitiveSizeInBits() == 64 && Subtarget.hasMMX())
61189 return CW_SpecificReg;
61190 return CW_Invalid;
61191 // Any SSE reg when ISA >= SSE2, same as 'x'
61192 case 'i':
61193 case 't':
61194 case '2':
61195 if (!Subtarget.hasSSE2())
61196 return CW_Invalid;
61197 break;
61198 }
61199 break;
61200 case 'j':
61201 if (StringRef(Constraint).size() != 2)
61202 break;
61203 switch (Constraint[1]) {
61204 default:
61205 return CW_Invalid;
61206 case 'r':
61207 case 'R':
61208 if (CallOperandVal->getType()->isIntegerTy())
61209 Wt = CW_SpecificReg;
61210 break;
61211 }
61212 break;
61213 case 'v':
61214 if ((Ty->getPrimitiveSizeInBits() == 512) && Subtarget.hasAVX512())
61215 Wt = CW_Register;
61216 [[fallthrough]];
61217 case 'x':
61218 if (((Ty->getPrimitiveSizeInBits() == 128) && Subtarget.hasSSE1()) ||
61219 ((Ty->getPrimitiveSizeInBits() == 256) && Subtarget.hasAVX()))
61220 Wt = CW_Register;
61221 break;
61222 case 'k':
61223 // Enable conditional vector operations using %k<#> registers.
61224 if ((Ty->getPrimitiveSizeInBits() == 64) && Subtarget.hasAVX512())
61225 Wt = CW_Register;
61226 break;
61227 case 'I':
61228 if (auto *C = dyn_cast<ConstantInt>(Info.CallOperandVal))
61229 if (C->getZExtValue() <= 31)
61230 Wt = CW_Constant;
61231 break;
61232 case 'J':
61233 if (auto *C = dyn_cast<ConstantInt>(CallOperandVal))
61234 if (C->getZExtValue() <= 63)
61235 Wt = CW_Constant;
61236 break;
61237 case 'K':
61238 if (auto *C = dyn_cast<ConstantInt>(CallOperandVal))
61239 if ((C->getSExtValue() >= -0x80) && (C->getSExtValue() <= 0x7f))
61240 Wt = CW_Constant;
61241 break;
61242 case 'L':
61243 if (auto *C = dyn_cast<ConstantInt>(CallOperandVal))
61244 if ((C->getZExtValue() == 0xff) || (C->getZExtValue() == 0xffff))
61245 Wt = CW_Constant;
61246 break;
61247 case 'M':
61248 if (auto *C = dyn_cast<ConstantInt>(CallOperandVal))
61249 if (C->getZExtValue() <= 3)
61250 Wt = CW_Constant;
61251 break;
61252 case 'N':
61253 if (auto *C = dyn_cast<ConstantInt>(CallOperandVal))
61254 if (C->getZExtValue() <= 0xff)
61255 Wt = CW_Constant;
61256 break;
61257 case 'G':
61258 case 'C':
61259 if (isa<ConstantFP>(CallOperandVal))
61260 Wt = CW_Constant;
61261 break;
61262 case 'e':
61263 if (auto *C = dyn_cast<ConstantInt>(CallOperandVal))
61264 if ((C->getSExtValue() >= -0x80000000LL) &&
61265 (C->getSExtValue() <= 0x7fffffffLL))
61266 Wt = CW_Constant;
61267 break;
61268 case 'Z':
61269 if (auto *C = dyn_cast<ConstantInt>(CallOperandVal))
61270 if (C->getZExtValue() <= 0xffffffff)
61271 Wt = CW_Constant;
61272 break;
61273 }
61274 return Wt;
61275}
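These weights matter mostly when a constraint string offers several alternatives and the generic selection code must pick one; the per-letter scores computed above feed that choice. A hedged sketch (hypothetical function) using a register-or-memory alternative:

// Illustrative sketch only.
static int increment(int x) {
  asm("addl $1, %0" : "+rm"(x)); // "r" and "m" are both acceptable here.
  return x;
}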
61276
61277/// Try to replace an X constraint, which matches anything, with another that
61278/// has more specific requirements based on the type of the corresponding
61279/// operand.
61280const char *X86TargetLowering::
61281LowerXConstraint(EVT ConstraintVT) const {
61282 // FP X constraints get lowered to SSE1/2 registers if available, otherwise
61283 // 'f' like normal targets.
61284 if (ConstraintVT.isFloatingPoint()) {
61285 if (Subtarget.hasSSE1())
61286 return "x";
61287 }
61288
61289 return TargetLowering::LowerXConstraint(ConstraintVT);
61290}
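A hedged example of the effect (hypothetical function): an "X" (anything) constraint on a floating-point operand is retyped as "x" when SSE1 is available, so the value stays in an XMM register rather than on the x87 stack.

// Illustrative sketch only.
static float keep_in_reg(float f) {
  asm("" : "+X"(f)); // Lowered as "+x" on SSE targets, "+f" otherwise.
  return f;
}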
61291
61292// Lower @cc targets via setcc.
61293SDValue X86TargetLowering::LowerAsmOutputForConstraint(
61294    SDValue &Chain, SDValue &Glue, const SDLoc &DL,
61295 const AsmOperandInfo &OpInfo, SelectionDAG &DAG) const {
61296 X86::CondCode Cond = parseConstraintCode(OpInfo.ConstraintCode);
61297 if (Cond == X86::COND_INVALID)
61298 return SDValue();
61299 // Check that return type is valid.
61300 if (OpInfo.ConstraintVT.isVector() || !OpInfo.ConstraintVT.isInteger() ||
61301 OpInfo.ConstraintVT.getSizeInBits() < 8)
61302 report_fatal_error("Glue output operand is of invalid type");
61303
61304 // Get EFLAGS register. Only update chain when copyfrom is glued.
61305 if (Glue.getNode()) {
61306 Glue = DAG.getCopyFromReg(Chain, DL, X86::EFLAGS, MVT::i32, Glue);
61307 Chain = Glue.getValue(1);
61308 } else
61309 Glue = DAG.getCopyFromReg(Chain, DL, X86::EFLAGS, MVT::i32);
61310 // Extract CC code.
61311 SDValue CC = getSETCC(Cond, Glue, DL, DAG);
61312 // Extend to 32-bits
61313 SDValue Result = DAG.getNode(ISD::ZERO_EXTEND, DL, OpInfo.ConstraintVT, CC);
61314
61315 return Result;
61316}
61317
61318/// Lower the specified operand into the Ops vector.
61319/// If it is invalid, don't add anything to Ops.
61320void X86TargetLowering::LowerAsmOperandForConstraint(SDValue Op,
61321                                                     StringRef Constraint,
61322 std::vector<SDValue> &Ops,
61323 SelectionDAG &DAG) const {
61324 SDValue Result;
61325 char ConstraintLetter = Constraint[0];
61326 switch (ConstraintLetter) {
61327 default: break;
61328 case 'I':
61329 if (auto *C = dyn_cast<ConstantSDNode>(Op)) {
61330 if (C->getZExtValue() <= 31) {
61331 Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op),
61332 Op.getValueType());
61333 break;
61334 }
61335 }
61336 return;
61337 case 'J':
61338 if (auto *C = dyn_cast<ConstantSDNode>(Op)) {
61339 if (C->getZExtValue() <= 63) {
61340 Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op),
61341 Op.getValueType());
61342 break;
61343 }
61344 }
61345 return;
61346 case 'K':
61347 if (auto *C = dyn_cast<ConstantSDNode>(Op)) {
61348 if (isInt<8>(C->getSExtValue())) {
61349 Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op),
61350 Op.getValueType());
61351 break;
61352 }
61353 }
61354 return;
61355 case 'L':
61356 if (auto *C = dyn_cast<ConstantSDNode>(Op)) {
61357 if (C->getZExtValue() == 0xff || C->getZExtValue() == 0xffff ||
61358 (Subtarget.is64Bit() && C->getZExtValue() == 0xffffffff)) {
61359 Result = DAG.getTargetConstant(C->getSExtValue(), SDLoc(Op),
61360 Op.getValueType());
61361 break;
61362 }
61363 }
61364 return;
61365 case 'M':
61366 if (auto *C = dyn_cast<ConstantSDNode>(Op)) {
61367 if (C->getZExtValue() <= 3) {
61368 Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op),
61369 Op.getValueType());
61370 break;
61371 }
61372 }
61373 return;
61374 case 'N':
61375 if (auto *C = dyn_cast<ConstantSDNode>(Op)) {
61376 if (C->getZExtValue() <= 255) {
61377 Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op),
61378 Op.getValueType());
61379 break;
61380 }
61381 }
61382 return;
61383 case 'O':
61384 if (auto *C = dyn_cast<ConstantSDNode>(Op)) {
61385 if (C->getZExtValue() <= 127) {
61386 Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op),
61387 Op.getValueType());
61388 break;
61389 }
61390 }
61391 return;
61392 case 'e': {
61393 // 32-bit signed value
61394 if (auto *C = dyn_cast<ConstantSDNode>(Op)) {
61395      if (ConstantInt::isValueValidForType(Type::getInt32Ty(*DAG.getContext()),
61396                                           C->getSExtValue())) {
61397 // Widen to 64 bits here to get it sign extended.
61398 Result = DAG.getTargetConstant(C->getSExtValue(), SDLoc(Op), MVT::i64);
61399 break;
61400 }
61401 // FIXME gcc accepts some relocatable values here too, but only in certain
61402 // memory models; it's complicated.
61403 }
61404 return;
61405 }
61406 case 'W': {
61407 assert(Constraint[1] == 's');
61408 // Op is a BlockAddressSDNode or a GlobalAddressSDNode with an optional
61409 // offset.
61410 if (const auto *BA = dyn_cast<BlockAddressSDNode>(Op)) {
61411 Ops.push_back(DAG.getTargetBlockAddress(BA->getBlockAddress(),
61412 BA->getValueType(0)));
61413 } else {
61414 int64_t Offset = 0;
61415 if (Op->getOpcode() == ISD::ADD &&
61416 isa<ConstantSDNode>(Op->getOperand(1))) {
61417 Offset = cast<ConstantSDNode>(Op->getOperand(1))->getSExtValue();
61418 Op = Op->getOperand(0);
61419 }
61420 if (const auto *GA = dyn_cast<GlobalAddressSDNode>(Op))
61421 Ops.push_back(DAG.getTargetGlobalAddress(GA->getGlobal(), SDLoc(Op),
61422 GA->getValueType(0), Offset));
61423 }
61424 return;
61425 }
61426 case 'Z': {
61427 // 32-bit unsigned value
61428 if (auto *C = dyn_cast<ConstantSDNode>(Op)) {
61429      if (ConstantInt::isValueValidForType(Type::getInt32Ty(*DAG.getContext()),
61430                                           C->getZExtValue())) {
61431 Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op),
61432 Op.getValueType());
61433 break;
61434 }
61435 }
61436 // FIXME gcc accepts some relocatable values here too, but only in certain
61437 // memory models; it's complicated.
61438 return;
61439 }
61440 case 'i': {
61441 // Literal immediates are always ok.
61442 if (auto *CST = dyn_cast<ConstantSDNode>(Op)) {
61443 bool IsBool = CST->getConstantIntValue()->getBitWidth() == 1;
61444 BooleanContent BCont = getBooleanContents(MVT::i64);
61445 ISD::NodeType ExtOpc = IsBool ? getExtendForContent(BCont)
61446                                    : ISD::SIGN_EXTEND;
61447      int64_t ExtVal = ExtOpc == ISD::ZERO_EXTEND ? CST->getZExtValue()
61448 : CST->getSExtValue();
61449 Result = DAG.getTargetConstant(ExtVal, SDLoc(Op), MVT::i64);
61450 break;
61451 }
61452
61453 // In any sort of PIC mode addresses need to be computed at runtime by
61454 // adding in a register or some sort of table lookup. These can't
61455 // be used as immediates. BlockAddresses and BasicBlocks are fine though.
61456 if ((Subtarget.isPICStyleGOT() || Subtarget.isPICStyleStubPIC()) &&
61457        !(isa<BlockAddressSDNode>(Op) || isa<BasicBlockSDNode>(Op)))
61458      return;
61459
61460 // If we are in non-pic codegen mode, we allow the address of a global (with
61461 // an optional displacement) to be used with 'i'.
61462 if (auto *GA = dyn_cast<GlobalAddressSDNode>(Op))
61463 // If we require an extra load to get this address, as in PIC mode, we
61464 // can't accept it.
61465      if (isGlobalStubReference(
61466              Subtarget.classifyGlobalReference(GA->getGlobal())))
61467 return;
61468 break;
61469 }
61470 }
61471
61472 if (Result.getNode()) {
61473 Ops.push_back(Result);
61474 return;
61475 }
61476 return TargetLowering::LowerAsmOperandForConstraint(Op, Constraint, Ops, DAG);
61477}
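A hedged usage sketch for one of the immediate constraints validated above (hypothetical helper): 'N' requires an unsigned 8-bit value, which matches the I/O-port immediate form of IN/OUT.

// Illustrative sketch only: "a" binds v to AL, "N" accepts 0..255.
static void out_byte(unsigned char v) {
  asm volatile("outb %0, %1" : : "a"(v), "N"(0x80));
}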
61478
61479/// Check if \p RC is a general purpose register class.
61480/// I.e., GR* or one of their variant.
61481static bool isGRClass(const TargetRegisterClass &RC) {
61482 return RC.hasSuperClassEq(&X86::GR8RegClass) ||
61483 RC.hasSuperClassEq(&X86::GR16RegClass) ||
61484 RC.hasSuperClassEq(&X86::GR32RegClass) ||
61485 RC.hasSuperClassEq(&X86::GR64RegClass) ||
61486 RC.hasSuperClassEq(&X86::LOW32_ADDR_ACCESS_RBPRegClass);
61487}
61488
61489/// Check if \p RC is a vector register class.
61490/// I.e., FR* / VR* or one of their variant.
61491static bool isFRClass(const TargetRegisterClass &RC) {
61492 return RC.hasSuperClassEq(&X86::FR16XRegClass) ||
61493 RC.hasSuperClassEq(&X86::FR32XRegClass) ||
61494 RC.hasSuperClassEq(&X86::FR64XRegClass) ||
61495 RC.hasSuperClassEq(&X86::VR128XRegClass) ||
61496 RC.hasSuperClassEq(&X86::VR256XRegClass) ||
61497 RC.hasSuperClassEq(&X86::VR512RegClass);
61498}
61499
61500/// Check if \p RC is a mask register class.
61501/// I.e., VK* or one of their variant.
61502static bool isVKClass(const TargetRegisterClass &RC) {
61503 return RC.hasSuperClassEq(&X86::VK1RegClass) ||
61504 RC.hasSuperClassEq(&X86::VK2RegClass) ||
61505 RC.hasSuperClassEq(&X86::VK4RegClass) ||
61506 RC.hasSuperClassEq(&X86::VK8RegClass) ||
61507 RC.hasSuperClassEq(&X86::VK16RegClass) ||
61508 RC.hasSuperClassEq(&X86::VK32RegClass) ||
61509 RC.hasSuperClassEq(&X86::VK64RegClass);
61510}
61511
61512static bool useEGPRInlineAsm(const X86Subtarget &Subtarget) {
61513 return Subtarget.hasEGPR() && Subtarget.useInlineAsmGPR32();
61514}
61515
61516std::pair<unsigned, const TargetRegisterClass *>
61517X86TargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI,
61518                                                StringRef Constraint,
61519 MVT VT) const {
61520 // First, see if this is a constraint that directly corresponds to an LLVM
61521 // register class.
61522 if (Constraint.size() == 1) {
61523 // GCC Constraint Letters
61524 switch (Constraint[0]) {
61525 default: break;
61526 // 'A' means [ER]AX + [ER]DX.
61527 case 'A':
61528 if (Subtarget.is64Bit())
61529 return std::make_pair(X86::RAX, &X86::GR64_ADRegClass);
61530 assert((Subtarget.is32Bit() || Subtarget.is16Bit()) &&
61531 "Expecting 64, 32 or 16 bit subtarget");
61532 return std::make_pair(X86::EAX, &X86::GR32_ADRegClass);
61533
61534 // TODO: Slight differences here in allocation order and leaving
61535 // RIP in the class. Do they matter any more here than they do
61536 // in the normal allocation?
61537 case 'k':
61538 if (Subtarget.hasAVX512()) {
61539 if (VT == MVT::v1i1 || VT == MVT::i1)
61540 return std::make_pair(0U, &X86::VK1RegClass);
61541 if (VT == MVT::v8i1 || VT == MVT::i8)
61542 return std::make_pair(0U, &X86::VK8RegClass);
61543 if (VT == MVT::v16i1 || VT == MVT::i16)
61544 return std::make_pair(0U, &X86::VK16RegClass);
61545 }
61546 if (Subtarget.hasBWI()) {
61547 if (VT == MVT::v32i1 || VT == MVT::i32)
61548 return std::make_pair(0U, &X86::VK32RegClass);
61549 if (VT == MVT::v64i1 || VT == MVT::i64)
61550 return std::make_pair(0U, &X86::VK64RegClass);
61551 }
61552 break;
61553 case 'q': // GENERAL_REGS in 64-bit mode, Q_REGS in 32-bit mode.
61554 if (Subtarget.is64Bit()) {
61555 if (VT == MVT::i8 || VT == MVT::i1)
61556 return std::make_pair(0U, useEGPRInlineAsm(Subtarget)
61557 ? &X86::GR8RegClass
61558 : &X86::GR8_NOREX2RegClass);
61559 if (VT == MVT::i16)
61560 return std::make_pair(0U, useEGPRInlineAsm(Subtarget)
61561 ? &X86::GR16RegClass
61562 : &X86::GR16_NOREX2RegClass);
61563 if (VT == MVT::i32 || VT == MVT::f32)
61564 return std::make_pair(0U, useEGPRInlineAsm(Subtarget)
61565 ? &X86::GR32RegClass
61566 : &X86::GR32_NOREX2RegClass);
61567 if (VT != MVT::f80 && !VT.isVector())
61568 return std::make_pair(0U, useEGPRInlineAsm(Subtarget)
61569 ? &X86::GR64RegClass
61570 : &X86::GR64_NOREX2RegClass);
61571 break;
61572 }
61573 [[fallthrough]];
61574 // 32-bit fallthrough
61575 case 'Q': // Q_REGS
61576 if (VT == MVT::i8 || VT == MVT::i1)
61577 return std::make_pair(0U, &X86::GR8_ABCD_LRegClass);
61578 if (VT == MVT::i16)
61579 return std::make_pair(0U, &X86::GR16_ABCDRegClass);
61580 if (VT == MVT::i32 || VT == MVT::f32 ||
61581 (!VT.isVector() && !Subtarget.is64Bit()))
61582 return std::make_pair(0U, &X86::GR32_ABCDRegClass);
61583 if (VT != MVT::f80 && !VT.isVector())
61584 return std::make_pair(0U, &X86::GR64_ABCDRegClass);
61585 break;
61586 case 'r': // GENERAL_REGS
61587 case 'l': // INDEX_REGS
61588 if (VT == MVT::i8 || VT == MVT::i1)
61589 return std::make_pair(0U, useEGPRInlineAsm(Subtarget)
61590 ? &X86::GR8RegClass
61591 : &X86::GR8_NOREX2RegClass);
61592 if (VT == MVT::i16)
61593 return std::make_pair(0U, useEGPRInlineAsm(Subtarget)
61594 ? &X86::GR16RegClass
61595 : &X86::GR16_NOREX2RegClass);
61596 if (VT == MVT::i32 || VT == MVT::f32 ||
61597 (!VT.isVector() && !Subtarget.is64Bit()))
61598 return std::make_pair(0U, useEGPRInlineAsm(Subtarget)
61599 ? &X86::GR32RegClass
61600 : &X86::GR32_NOREX2RegClass);
61601 if (VT != MVT::f80 && !VT.isVector())
61602 return std::make_pair(0U, useEGPRInlineAsm(Subtarget)
61603 ? &X86::GR64RegClass
61604 : &X86::GR64_NOREX2RegClass);
61605 break;
61606 case 'R': // LEGACY_REGS
61607 if (VT == MVT::i8 || VT == MVT::i1)
61608 return std::make_pair(0U, &X86::GR8_NOREXRegClass);
61609 if (VT == MVT::i16)
61610 return std::make_pair(0U, &X86::GR16_NOREXRegClass);
61611 if (VT == MVT::i32 || VT == MVT::f32 ||
61612 (!VT.isVector() && !Subtarget.is64Bit()))
61613 return std::make_pair(0U, &X86::GR32_NOREXRegClass);
61614 if (VT != MVT::f80 && !VT.isVector())
61615 return std::make_pair(0U, &X86::GR64_NOREXRegClass);
61616 break;
61617 case 'f': // FP Stack registers.
61618 // If SSE is enabled for this VT, use f80 to ensure the isel moves the
61619 // value to the correct fpstack register class.
61620 if (VT == MVT::f32 && !isScalarFPTypeInSSEReg(VT))
61621 return std::make_pair(0U, &X86::RFP32RegClass);
61622 if (VT == MVT::f64 && !isScalarFPTypeInSSEReg(VT))
61623 return std::make_pair(0U, &X86::RFP64RegClass);
61624 if (VT == MVT::f32 || VT == MVT::f64 || VT == MVT::f80)
61625 return std::make_pair(0U, &X86::RFP80RegClass);
61626 break;
61627 case 'y': // MMX_REGS if MMX allowed.
61628 if (!Subtarget.hasMMX()) break;
61629 return std::make_pair(0U, &X86::VR64RegClass);
61630 case 'v':
61631 case 'x': // SSE_REGS if SSE1 allowed or AVX_REGS if AVX allowed
61632 if (!Subtarget.hasSSE1()) break;
61633 bool VConstraint = (Constraint[0] == 'v');
61634
61635 switch (VT.SimpleTy) {
61636 default: break;
61637 // Scalar SSE types.
61638 case MVT::f16:
61639 if (VConstraint && Subtarget.hasFP16())
61640 return std::make_pair(0U, &X86::FR16XRegClass);
61641 break;
61642 case MVT::f32:
61643 case MVT::i32:
61644 if (VConstraint && Subtarget.hasVLX())
61645 return std::make_pair(0U, &X86::FR32XRegClass);
61646 return std::make_pair(0U, &X86::FR32RegClass);
61647 case MVT::f64:
61648 case MVT::i64:
61649 if (VConstraint && Subtarget.hasVLX())
61650 return std::make_pair(0U, &X86::FR64XRegClass);
61651 return std::make_pair(0U, &X86::FR64RegClass);
61652 case MVT::i128:
61653 if (Subtarget.is64Bit()) {
61654 if (VConstraint && Subtarget.hasVLX())
61655 return std::make_pair(0U, &X86::VR128XRegClass);
61656 return std::make_pair(0U, &X86::VR128RegClass);
61657 }
61658 break;
61659 // Vector types and fp128.
61660 case MVT::v8f16:
61661 if (!Subtarget.hasFP16())
61662 break;
61663 if (VConstraint)
61664 return std::make_pair(0U, &X86::VR128XRegClass);
61665 return std::make_pair(0U, &X86::VR128RegClass);
61666 case MVT::v8bf16:
61667 if (!Subtarget.hasBF16() || !Subtarget.hasVLX())
61668 break;
61669 if (VConstraint)
61670 return std::make_pair(0U, &X86::VR128XRegClass);
61671 return std::make_pair(0U, &X86::VR128RegClass);
61672 case MVT::f128:
61673 if (!Subtarget.is64Bit())
61674 break;
61675 [[fallthrough]];
61676 case MVT::v16i8:
61677 case MVT::v8i16:
61678 case MVT::v4i32:
61679 case MVT::v2i64:
61680 case MVT::v4f32:
61681 case MVT::v2f64:
61682 if (VConstraint && Subtarget.hasVLX())
61683 return std::make_pair(0U, &X86::VR128XRegClass);
61684 return std::make_pair(0U, &X86::VR128RegClass);
61685 // AVX types.
61686 case MVT::v16f16:
61687 if (!Subtarget.hasFP16())
61688 break;
61689 if (VConstraint)
61690 return std::make_pair(0U, &X86::VR256XRegClass);
61691 return std::make_pair(0U, &X86::VR256RegClass);
61692 case MVT::v16bf16:
61693 if (!Subtarget.hasBF16() || !Subtarget.hasVLX())
61694 break;
61695 if (VConstraint)
61696 return std::make_pair(0U, &X86::VR256XRegClass);
61697 return std::make_pair(0U, &X86::VR256RegClass);
61698 case MVT::v32i8:
61699 case MVT::v16i16:
61700 case MVT::v8i32:
61701 case MVT::v4i64:
61702 case MVT::v8f32:
61703 case MVT::v4f64:
61704 if (VConstraint && Subtarget.hasVLX())
61705 return std::make_pair(0U, &X86::VR256XRegClass);
61706 if (Subtarget.hasAVX())
61707 return std::make_pair(0U, &X86::VR256RegClass);
61708 break;
61709 case MVT::v32f16:
61710 if (!Subtarget.hasFP16())
61711 break;
61712 if (VConstraint)
61713 return std::make_pair(0U, &X86::VR512RegClass);
61714 return std::make_pair(0U, &X86::VR512_0_15RegClass);
61715 case MVT::v32bf16:
61716 if (!Subtarget.hasBF16())
61717 break;
61718 if (VConstraint)
61719 return std::make_pair(0U, &X86::VR512RegClass);
61720 return std::make_pair(0U, &X86::VR512_0_15RegClass);
61721 case MVT::v64i8:
61722 case MVT::v32i16:
61723 case MVT::v8f64:
61724 case MVT::v16f32:
61725 case MVT::v16i32:
61726 case MVT::v8i64:
61727 if (!Subtarget.hasAVX512()) break;
61728 if (VConstraint)
61729 return std::make_pair(0U, &X86::VR512RegClass);
61730 return std::make_pair(0U, &X86::VR512_0_15RegClass);
61731 }
61732 break;
61733 }
61734 } else if (Constraint.size() == 2 && Constraint[0] == 'Y') {
61735 switch (Constraint[1]) {
61736 default:
61737 break;
61738 case 'i':
61739 case 't':
61740 case '2':
61741 return getRegForInlineAsmConstraint(TRI, "x", VT);
61742 case 'm':
61743 if (!Subtarget.hasMMX()) break;
61744 return std::make_pair(0U, &X86::VR64RegClass);
61745 case 'z':
61746 if (!Subtarget.hasSSE1()) break;
61747 switch (VT.SimpleTy) {
61748 default: break;
61749 // Scalar SSE types.
61750 case MVT::f16:
61751 if (!Subtarget.hasFP16())
61752 break;
61753 return std::make_pair(X86::XMM0, &X86::FR16XRegClass);
61754 case MVT::f32:
61755 case MVT::i32:
61756 return std::make_pair(X86::XMM0, &X86::FR32RegClass);
61757 case MVT::f64:
61758 case MVT::i64:
61759 return std::make_pair(X86::XMM0, &X86::FR64RegClass);
61760 case MVT::v8f16:
61761 if (!Subtarget.hasFP16())
61762 break;
61763 return std::make_pair(X86::XMM0, &X86::VR128RegClass);
61764 case MVT::v8bf16:
61765 if (!Subtarget.hasBF16() || !Subtarget.hasVLX())
61766 break;
61767 return std::make_pair(X86::XMM0, &X86::VR128RegClass);
61768 case MVT::f128:
61769 case MVT::v16i8:
61770 case MVT::v8i16:
61771 case MVT::v4i32:
61772 case MVT::v2i64:
61773 case MVT::v4f32:
61774 case MVT::v2f64:
61775 return std::make_pair(X86::XMM0, &X86::VR128RegClass);
61776 // AVX types.
61777 case MVT::v16f16:
61778 if (!Subtarget.hasFP16())
61779 break;
61780 return std::make_pair(X86::YMM0, &X86::VR256RegClass);
61781 case MVT::v16bf16:
61782 if (!Subtarget.hasBF16() || !Subtarget.hasVLX())
61783 break;
61784 return std::make_pair(X86::YMM0, &X86::VR256RegClass);
61785 case MVT::v32i8:
61786 case MVT::v16i16:
61787 case MVT::v8i32:
61788 case MVT::v4i64:
61789 case MVT::v8f32:
61790 case MVT::v4f64:
61791 if (Subtarget.hasAVX())
61792 return std::make_pair(X86::YMM0, &X86::VR256RegClass);
61793 break;
61794 case MVT::v32f16:
61795 if (!Subtarget.hasFP16())
61796 break;
61797 return std::make_pair(X86::ZMM0, &X86::VR512_0_15RegClass);
61798 case MVT::v32bf16:
61799 if (!Subtarget.hasBF16())
61800 break;
61801 return std::make_pair(X86::ZMM0, &X86::VR512_0_15RegClass);
61802 case MVT::v64i8:
61803 case MVT::v32i16:
61804 case MVT::v8f64:
61805 case MVT::v16f32:
61806 case MVT::v16i32:
61807 case MVT::v8i64:
61808 if (Subtarget.hasAVX512())
61809 return std::make_pair(X86::ZMM0, &X86::VR512_0_15RegClass);
61810 break;
61811 }
61812 break;
61813 case 'k':
61814 // This register class doesn't allocate k0 for masked vector operation.
61815 if (Subtarget.hasAVX512()) {
61816 if (VT == MVT::v1i1 || VT == MVT::i1)
61817 return std::make_pair(0U, &X86::VK1WMRegClass);
61818 if (VT == MVT::v8i1 || VT == MVT::i8)
61819 return std::make_pair(0U, &X86::VK8WMRegClass);
61820 if (VT == MVT::v16i1 || VT == MVT::i16)
61821 return std::make_pair(0U, &X86::VK16WMRegClass);
61822 }
61823 if (Subtarget.hasBWI()) {
61824 if (VT == MVT::v32i1 || VT == MVT::i32)
61825 return std::make_pair(0U, &X86::VK32WMRegClass);
61826 if (VT == MVT::v64i1 || VT == MVT::i64)
61827 return std::make_pair(0U, &X86::VK64WMRegClass);
61828 }
61829 break;
61830 }
61831 } else if (Constraint.size() == 2 && Constraint[0] == 'j') {
61832 switch (Constraint[1]) {
61833 default:
61834 break;
61835 case 'r':
61836 if (VT == MVT::i8 || VT == MVT::i1)
61837 return std::make_pair(0U, &X86::GR8_NOREX2RegClass);
61838 if (VT == MVT::i16)
61839 return std::make_pair(0U, &X86::GR16_NOREX2RegClass);
61840 if (VT == MVT::i32 || VT == MVT::f32)
61841 return std::make_pair(0U, &X86::GR32_NOREX2RegClass);
61842 if (VT != MVT::f80 && !VT.isVector())
61843 return std::make_pair(0U, &X86::GR64_NOREX2RegClass);
61844 break;
61845 case 'R':
61846 if (VT == MVT::i8 || VT == MVT::i1)
61847 return std::make_pair(0U, &X86::GR8RegClass);
61848 if (VT == MVT::i16)
61849 return std::make_pair(0U, &X86::GR16RegClass);
61850 if (VT == MVT::i32 || VT == MVT::f32)
61851 return std::make_pair(0U, &X86::GR32RegClass);
61852 if (VT != MVT::f80 && !VT.isVector())
61853 return std::make_pair(0U, &X86::GR64RegClass);
61854 break;
61855 }
61856 }
61857
61858 if (parseConstraintCode(Constraint) != X86::COND_INVALID)
61859 return std::make_pair(0U, &X86::GR32RegClass);
61860
61861 // Use the default implementation in TargetLowering to convert the register
61862 // constraint into a member of a register class.
61863 std::pair<Register, const TargetRegisterClass*> Res;
61864  Res = TargetLowering::getRegForInlineAsmConstraint(TRI, Constraint, VT);
61865
61866 // Not found as a standard register?
61867 if (!Res.second) {
61868 // Only match x87 registers if the VT is one SelectionDAGBuilder can convert
61869 // to/from f80.
61870 if (VT == MVT::Other || VT == MVT::f32 || VT == MVT::f64 || VT == MVT::f80) {
61871 // Map st(0) -> st(7) -> ST0
61872 if (Constraint.size() == 7 && Constraint[0] == '{' &&
61873 tolower(Constraint[1]) == 's' && tolower(Constraint[2]) == 't' &&
61874 Constraint[3] == '(' &&
61875 (Constraint[4] >= '0' && Constraint[4] <= '7') &&
61876 Constraint[5] == ')' && Constraint[6] == '}') {
61877 // st(7) is not allocatable and thus not a member of RFP80. Return
61878 // singleton class in cases where we have a reference to it.
61879 if (Constraint[4] == '7')
61880 return std::make_pair(X86::FP7, &X86::RFP80_7RegClass);
61881 return std::make_pair(X86::FP0 + Constraint[4] - '0',
61882 &X86::RFP80RegClass);
61883 }
61884
61885 // GCC allows "st(0)" to be called just plain "st".
61886 if (StringRef("{st}").equals_insensitive(Constraint))
61887 return std::make_pair(X86::FP0, &X86::RFP80RegClass);
61888 }
61889
61890 // flags -> EFLAGS
61891 if (StringRef("{flags}").equals_insensitive(Constraint))
61892 return std::make_pair(X86::EFLAGS, &X86::CCRRegClass);
61893
61894 // dirflag -> DF
61895 // Only allow for clobber.
61896 if (StringRef("{dirflag}").equals_insensitive(Constraint) &&
61897 VT == MVT::Other)
61898 return std::make_pair(X86::DF, &X86::DFCCRRegClass);
61899
61900 // fpsr -> FPSW
61901 // Only allow for clobber.
61902 if (StringRef("{fpsr}").equals_insensitive(Constraint) && VT == MVT::Other)
61903 return std::make_pair(X86::FPSW, &X86::FPCCRRegClass);
61904
61905 return Res;
61906 }
61907
61908 // Make sure it isn't a register that requires 64-bit mode.
61909 if (!Subtarget.is64Bit() &&
61910 (isFRClass(*Res.second) || isGRClass(*Res.second)) &&
61911 TRI->getEncodingValue(Res.first) >= 8) {
61912 // Register requires REX prefix, but we're in 32-bit mode.
61913 return std::make_pair(0, nullptr);
61914 }
61915
61916 // Make sure it isn't a register that requires AVX512.
61917 if (!Subtarget.hasAVX512() && isFRClass(*Res.second) &&
61918 TRI->getEncodingValue(Res.first) & 0x10) {
61919 // Register requires EVEX prefix.
61920 return std::make_pair(0, nullptr);
61921 }
61922
61923 // Otherwise, check to see if this is a register class of the wrong value
61924 // type. For example, we want to map "{ax},i32" -> {eax}, we don't want it to
61925 // turn into {ax},{dx}.
61926 // MVT::Other is used to specify clobber names.
61927 if (TRI->isTypeLegalForClass(*Res.second, VT) || VT == MVT::Other)
61928 return Res; // Correct type already, nothing to do.
61929
61930 // Get a matching integer of the correct size. i.e. "ax" with MVT::32 should
61931 // return "eax". This should even work for things like getting 64bit integer
61932 // registers when given an f64 type.
61933 const TargetRegisterClass *Class = Res.second;
61934 // The generic code will match the first register class that contains the
61935 // given register. Thus, based on the ordering of the tablegened file,
61936 // the "plain" GR classes might not come first.
61937 // Therefore, use a helper method.
61938 if (isGRClass(*Class)) {
61939 unsigned Size = VT.getSizeInBits();
61940 if (Size == 1) Size = 8;
61941 if (Size != 8 && Size != 16 && Size != 32 && Size != 64)
61942 return std::make_pair(0, nullptr);
61943 Register DestReg = getX86SubSuperRegister(Res.first, Size);
61944 if (DestReg.isValid()) {
61945 bool is64Bit = Subtarget.is64Bit();
61946 const TargetRegisterClass *RC =
61947 Size == 8 ? (is64Bit ? &X86::GR8RegClass : &X86::GR8_NOREXRegClass)
61948 : Size == 16 ? (is64Bit ? &X86::GR16RegClass : &X86::GR16_NOREXRegClass)
61949 : Size == 32 ? (is64Bit ? &X86::GR32RegClass : &X86::GR32_NOREXRegClass)
61950 : /*Size == 64*/ (is64Bit ? &X86::GR64RegClass : nullptr);
61951 if (Size == 64 && !is64Bit) {
61952 // Model GCC's behavior here and select a fixed pair of 32-bit
61953 // registers.
61954 switch (DestReg) {
61955 case X86::RAX:
61956 return std::make_pair(X86::EAX, &X86::GR32_ADRegClass);
61957 case X86::RDX:
61958 return std::make_pair(X86::EDX, &X86::GR32_DCRegClass);
61959 case X86::RCX:
61960 return std::make_pair(X86::ECX, &X86::GR32_CBRegClass);
61961 case X86::RBX:
61962 return std::make_pair(X86::EBX, &X86::GR32_BSIRegClass);
61963 case X86::RSI:
61964 return std::make_pair(X86::ESI, &X86::GR32_SIDIRegClass);
61965 case X86::RDI:
61966 return std::make_pair(X86::EDI, &X86::GR32_DIBPRegClass);
61967 case X86::RBP:
61968 return std::make_pair(X86::EBP, &X86::GR32_BPSPRegClass);
61969 default:
61970 return std::make_pair(0, nullptr);
61971 }
61972 }
61973 if (RC && RC->contains(DestReg))
61974 return std::make_pair(DestReg, RC);
61975 return Res;
61976 }
61977 // No register found/type mismatch.
61978 return std::make_pair(0, nullptr);
61979 } else if (isFRClass(*Class)) {
61980 // Handle references to XMM physical registers that got mapped into the
61981 // wrong class. This can happen with constraints like {xmm0} where the
61982 // target independent register mapper will just pick the first match it can
61983 // find, ignoring the required type.
61984
61985 // TODO: Handle f128 and i128 in FR128RegClass after it is tested well.
61986 if (VT == MVT::f16)
61987 Res.second = &X86::FR16XRegClass;
61988 else if (VT == MVT::f32 || VT == MVT::i32)
61989 Res.second = &X86::FR32XRegClass;
61990 else if (VT == MVT::f64 || VT == MVT::i64)
61991 Res.second = &X86::FR64XRegClass;
61992 else if (TRI->isTypeLegalForClass(X86::VR128XRegClass, VT))
61993 Res.second = &X86::VR128XRegClass;
61994 else if (TRI->isTypeLegalForClass(X86::VR256XRegClass, VT))
61995 Res.second = &X86::VR256XRegClass;
61996 else if (TRI->isTypeLegalForClass(X86::VR512RegClass, VT))
61997 Res.second = &X86::VR512RegClass;
61998 else {
61999 // Type mismatch and not a clobber: Return an error;
62000 Res.first = 0;
62001 Res.second = nullptr;
62002 }
62003 } else if (isVKClass(*Class)) {
62004 if (VT == MVT::v1i1 || VT == MVT::i1)
62005 Res.second = &X86::VK1RegClass;
62006 else if (VT == MVT::v8i1 || VT == MVT::i8)
62007 Res.second = &X86::VK8RegClass;
62008 else if (VT == MVT::v16i1 || VT == MVT::i16)
62009 Res.second = &X86::VK16RegClass;
62010 else if (VT == MVT::v32i1 || VT == MVT::i32)
62011 Res.second = &X86::VK32RegClass;
62012 else if (VT == MVT::v64i1 || VT == MVT::i64)
62013 Res.second = &X86::VK64RegClass;
62014 else {
62015 // Type mismatch and not a clobber: Return an error;
62016 Res.first = 0;
62017 Res.second = nullptr;
62018 }
62019 }
62020
62021 return Res;
62022}
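A hedged sketch of how the single-letter register constraints resolved above are typically used (hypothetical function); explicit names such as "{xmm0}", "{st}" and "{flags}" follow the fallback path later in the same routine.

// Illustrative sketch only: "=a"/"=d" resolve to the EAX/EDX register classes.
static unsigned long long read_tsc(void) {
  unsigned int lo, hi;
  asm volatile("rdtsc" : "=a"(lo), "=d"(hi));
  return ((unsigned long long)hi << 32) | lo;
}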
62023
62024bool X86TargetLowering::isIntDivCheap(EVT VT, AttributeList Attr) const {
62025 // Integer division on x86 is expensive. However, when aggressively optimizing
62026 // for code size, we prefer to use a div instruction, as it is usually smaller
62027 // than the alternative sequence.
62028 // The exception to this is vector division. Since x86 doesn't have vector
62029 // integer division, leaving the division as-is is a loss even in terms of
62030 // size, because it will have to be scalarized, while the alternative code
62031 // sequence can be performed in vector form.
62032 bool OptSize = Attr.hasFnAttr(Attribute::MinSize);
62033 return OptSize && !VT.isVector();
62034}
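A hedged example of the policy above: under Clang's minsize attribute the scalar division below is kept as a single DIV rather than being expanded into a multiply-by-reciprocal sequence (the function name is illustrative).

// Illustrative sketch only.
__attribute__((minsize)) static unsigned div_by_10(unsigned x) {
  return x / 10; // kept as DIV when optimizing hard for size
}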
62035
62036void X86TargetLowering::initializeSplitCSR(MachineBasicBlock *Entry) const {
62037 if (!Subtarget.is64Bit())
62038 return;
62039
62040 // Update IsSplitCSR in X86MachineFunctionInfo.
62041  X86MachineFunctionInfo *AFI =
62042      Entry->getParent()->getInfo<X86MachineFunctionInfo>();
62043 AFI->setIsSplitCSR(true);
62044}
62045
62046void X86TargetLowering::insertCopiesSplitCSR(
62047 MachineBasicBlock *Entry,
62048 const SmallVectorImpl<MachineBasicBlock *> &Exits) const {
62049 const X86RegisterInfo *TRI = Subtarget.getRegisterInfo();
62050 const MCPhysReg *IStart = TRI->getCalleeSavedRegsViaCopy(Entry->getParent());
62051 if (!IStart)
62052 return;
62053
62054 const TargetInstrInfo *TII = Subtarget.getInstrInfo();
62055 MachineRegisterInfo *MRI = &Entry->getParent()->getRegInfo();
62056 MachineBasicBlock::iterator MBBI = Entry->begin();
62057 for (const MCPhysReg *I = IStart; *I; ++I) {
62058 const TargetRegisterClass *RC = nullptr;
62059 if (X86::GR64RegClass.contains(*I))
62060 RC = &X86::GR64RegClass;
62061 else
62062 llvm_unreachable("Unexpected register class in CSRsViaCopy!");
62063
62064 Register NewVR = MRI->createVirtualRegister(RC);
62065 // Create copy from CSR to a virtual register.
62066 // FIXME: this currently does not emit CFI pseudo-instructions, it works
62067 // fine for CXX_FAST_TLS since the C++-style TLS access functions should be
62068 // nounwind. If we want to generalize this later, we may need to emit
62069 // CFI pseudo-instructions.
62070 assert(
62071 Entry->getParent()->getFunction().hasFnAttribute(Attribute::NoUnwind) &&
62072 "Function should be nounwind in insertCopiesSplitCSR!");
62073 Entry->addLiveIn(*I);
62074 BuildMI(*Entry, MBBI, MIMetadata(), TII->get(TargetOpcode::COPY), NewVR)
62075 .addReg(*I);
62076
62077 // Insert the copy-back instructions right before the terminator.
62078 for (auto *Exit : Exits)
62079 BuildMI(*Exit, Exit->getFirstTerminator(), MIMetadata(),
62080 TII->get(TargetOpcode::COPY), *I)
62081 .addReg(NewVR);
62082 }
62083}
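The split-CSR machinery above is exercised by the CXX_FAST_TLS convention mentioned in the FIXME; a hedged C++ sketch of source that, on Darwin targets, produces a TLS wrapper using that convention (names and helper are illustrative assumptions):

// Illustrative sketch only.
extern int seed();
thread_local int counter = seed(); // non-trivial init -> TLS wrapper function
static int bump() { return ++counter; }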
62084
62085bool X86TargetLowering::supportSwiftError() const {
62086  return Subtarget.is64Bit();
62087}
62088
62089MachineInstr *
62090X86TargetLowering::EmitKCFICheck(MachineBasicBlock &MBB,
62091                                 MachineBasicBlock::iterator &MBBI,
62092                                 const TargetInstrInfo *TII) const {
62093 assert(MBBI->isCall() && MBBI->getCFIType() &&
62094 "Invalid call instruction for a KCFI check");
62095
62096 MachineFunction &MF = *MBB.getParent();
62097 // If the call target is a memory operand, unfold it and use R11 for the
62098 // call, so KCFI_CHECK won't have to recompute the address.
62099 switch (MBBI->getOpcode()) {
62100 case X86::CALL64m:
62101 case X86::CALL64m_NT:
62102 case X86::TAILJMPm64:
62103 case X86::TAILJMPm64_REX: {
62104    MachineBasicBlock::iterator OrigCall = MBBI;
62105    SmallVector<MachineInstr *, 2> NewMIs;
62106    if (!TII->unfoldMemoryOperand(MF, *OrigCall, X86::R11, /*UnfoldLoad=*/true,
62107 /*UnfoldStore=*/false, NewMIs))
62108 report_fatal_error("Failed to unfold memory operand for a KCFI check");
62109 for (auto *NewMI : NewMIs)
62110 MBBI = MBB.insert(OrigCall, NewMI);
62111 assert(MBBI->isCall() &&
62112 "Unexpected instruction after memory operand unfolding");
62113 if (OrigCall->shouldUpdateAdditionalCallInfo())
62114 MF.moveAdditionalCallInfo(&*OrigCall, &*MBBI);
62115 MBBI->setCFIType(MF, OrigCall->getCFIType());
62116 OrigCall->eraseFromParent();
62117 break;
62118 }
62119 default:
62120 break;
62121 }
62122
62123 MachineOperand &Target = MBBI->getOperand(0);
62124 Register TargetReg;
62125 switch (MBBI->getOpcode()) {
62126 case X86::CALL64r:
62127 case X86::CALL64r_ImpCall:
62128 case X86::CALL64r_NT:
62129 case X86::TAILJMPr64:
62130 case X86::TAILJMPr64_REX:
62131 assert(Target.isReg() && "Unexpected target operand for an indirect call");
62132 Target.setIsRenamable(false);
62133 TargetReg = Target.getReg();
62134 break;
62135 case X86::CALL64pcrel32:
62136 case X86::TAILJMPd64:
62137 assert(Target.isSymbol() && "Unexpected target operand for a direct call");
62138 // X86TargetLowering::EmitLoweredIndirectThunk always uses r11 for
62139 // 64-bit indirect thunk calls.
62140 assert(StringRef(Target.getSymbolName()).ends_with("_r11") &&
62141 "Unexpected register for an indirect thunk call");
62142 TargetReg = X86::R11;
62143 break;
62144 default:
62145 llvm_unreachable("Unexpected CFI call opcode");
62146 break;
62147 }
62148
62149 return BuildMI(MBB, MBBI, MIMetadata(*MBBI), TII->get(X86::KCFI_CHECK))
62150 .addReg(TargetReg)
62151 .addImm(MBBI->getCFIType())
62152 .getInstr();
62153}
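A hedged sketch of code that reaches this path: with Clang's -fsanitize=kcfi, indirect calls carry a type hash, and the KCFI_CHECK emitted above validates the callee held in a register (R11 after unfolding memory-operand calls). Names below are illustrative.

// Illustrative sketch only: build with -fsanitize=kcfi.
typedef void (*handler_t)(int);
static void dispatch(handler_t h, int v) {
  h(v); // indirect call: preceded by a KCFI_CHECK of the target register
}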
62154
62155/// Returns true if stack probing through a function call is requested.
62156bool X86TargetLowering::hasStackProbeSymbol(const MachineFunction &MF) const {
62157  return !getStackProbeSymbolName(MF).empty();
62158}
62159
62160/// Returns true if stack probing through inline assembly is requested.
62161bool X86TargetLowering::hasInlineStackProbe(const MachineFunction &MF) const {
62162
62163 // No inline stack probe for Windows, they have their own mechanism.
62164 if (Subtarget.isOSWindows() || Subtarget.isUEFI() ||
62165 MF.getFunction().hasFnAttribute("no-stack-arg-probe"))
62166 return false;
62167
62168 // If the function specifically requests inline stack probes, emit them.
62169 if (MF.getFunction().hasFnAttribute("probe-stack"))
62170 return MF.getFunction().getFnAttribute("probe-stack").getValueAsString() ==
62171 "inline-asm";
62172
62173 return false;
62174}
62175
62176/// Returns the name of the symbol used to emit stack probes or the empty
62177/// string if not applicable.
62178StringRef
62179X86TargetLowering::getStackProbeSymbolName(const MachineFunction &MF) const {
62180  // Inline Stack probes disable stack probe call
62181 if (hasInlineStackProbe(MF))
62182 return "";
62183
62184 // If the function specifically requests stack probes, emit them.
62185 if (MF.getFunction().hasFnAttribute("probe-stack"))
62186 return MF.getFunction().getFnAttribute("probe-stack").getValueAsString();
62187
62188 // Generally, if we aren't on Windows, the platform ABI does not include
62189 // support for stack probes, so don't emit them.
62190 if ((!Subtarget.isOSWindows() && !Subtarget.isUEFI()) ||
62191 Subtarget.isTargetMachO() ||
62192 MF.getFunction().hasFnAttribute("no-stack-arg-probe"))
62193 return "";
62194
62195 // We need a stack probe to conform to the Windows ABI. Choose the right
62196 // symbol.
62197 if (Subtarget.is64Bit())
62198 return Subtarget.isTargetCygMing() ? "___chkstk_ms" : "__chkstk";
62199 return Subtarget.isTargetCygMing() ? "_alloca" : "_chkstk";
62200}
62201
62202unsigned
62203X86TargetLowering::getStackProbeSize(const MachineFunction &MF) const {
62204  // The default stack probe size is 4096 if the function has no stackprobesize
62205 // attribute.
62206 return MF.getFunction().getFnAttributeAsParsedInteger("stack-probe-size",
62207 4096);
62208}
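A hedged sketch of how the function attributes read by these hooks would be set programmatically ("probe-stack" selects inline probing or names the probe symbol; "stack-probe-size" overrides the 4096-byte default). The helper function and chosen values are illustrative assumptions.

// Illustrative sketch only.
#include "llvm/IR/Function.h"
static void requestStackProbes(llvm::Function &F) {
  F.addFnAttr("probe-stack", "inline-asm"); // or a symbol name, e.g. "__chkstk"
  F.addFnAttr("stack-probe-size", "8192");  // overrides the 4096-byte default
}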
62209
62210Align X86TargetLowering::getPrefLoopAlignment(MachineLoop *ML) const {
62211  if (ML && ML->isInnermost() &&
62212      ExperimentalPrefInnermostLoopAlignment.getNumOccurrences())
62213    return Align(1ULL << ExperimentalPrefInnermostLoopAlignment);
62214  return TargetLowering::getPrefLoopAlignment();
62215}
unsigned const MachineRegisterInfo * MRI
#define Success
static SDValue Widen(SelectionDAG *CurDAG, SDValue N)
return SDValue()
static AArch64CC::CondCode parseConstraintCode(llvm::StringRef Constraint)
static SDValue LowerADDRSPACECAST(SDValue Op, SelectionDAG &DAG)
static SDValue LowerFunnelShift(SDValue Op, SelectionDAG &DAG)
static SDValue getSETCC(AArch64CC::CondCode CC, SDValue NZCV, const SDLoc &DL, SelectionDAG &DAG)
Helper function to create 'CSET', which is equivalent to 'CSINC <Wd>, WZR, WZR, invert(<cond>)'.
static SDValue LowerPREFETCH(SDValue Op, SelectionDAG &DAG)
static SDValue LowerXALUO(SDValue Op, SelectionDAG &DAG)
static SDValue foldVectorXorShiftIntoCmp(SDNode *N, SelectionDAG &DAG, const AArch64Subtarget *Subtarget)
Turn vector tests of the signbit in the form of: xor (sra X, elt_size(X)-1), -1 into: cmge X,...
unsigned RegSize
assert(UImm &&(UImm !=~static_cast< T >(0)) &&"Invalid immediate!")
amdgpu aa AMDGPU Address space based Alias Analysis Wrapper
static msgpack::DocNode getNode(msgpack::DocNode DN, msgpack::Type Type, MCValue Val)
#define NODE_NAME_CASE(node)
constexpr LLT F64
constexpr LLT S1
AMDGPU Register Bank Select
static SDValue LowerShift(SDNode *N, SelectionDAG &DAG, const ARMSubtarget *ST)
static SDValue LowerCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG, const ARMSubtarget *ST)
static SDValue LowerEXTRACT_SUBVECTOR(SDValue Op, SelectionDAG &DAG, const ARMSubtarget *ST)
static SDValue LowerATOMIC_FENCE(SDValue Op, SelectionDAG &DAG, const ARMSubtarget *Subtarget)
static SDValue LowerCTTZ(SDNode *N, SelectionDAG &DAG, const ARMSubtarget *ST)
static SDValue getZeroVector(EVT VT, SelectionDAG &DAG, const SDLoc &dl)
getZeroVector - Returns a vector of specified type with all zero elements.
static SDValue LowerMUL(SDValue Op, SelectionDAG &DAG)
static SDValue LowerMLOAD(SDValue Op, SelectionDAG &DAG)
static SDValue LowerVSETCC(SDValue Op, SelectionDAG &DAG, const ARMSubtarget *ST)
MachineBasicBlock & MBB
MachineBasicBlock MachineBasicBlock::iterator DebugLoc DL
MachineBasicBlock MachineBasicBlock::iterator MBBI
#define EXPAND(Op)
Function Alias Analysis Results
BitTracker BT
static GCRegistry::Add< ErlangGC > A("erlang", "erlang-compatible garbage collector")
static GCRegistry::Add< StatepointGC > D("statepoint-example", "an example strategy for statepoint")
static GCRegistry::Add< CoreCLRGC > E("coreclr", "CoreCLR-compatible GC")
static GCRegistry::Add< OcamlGC > B("ocaml", "ocaml 3.10-compatible GC")
#define LLVM_ATTRIBUTE_UNUSED
Definition Compiler.h:298
This file contains the declarations for the subclasses of Constant, which represent the different fla...
static bool isSigned(unsigned int Opcode)
Hexagon Common GEP
const HexagonInstrInfo * TII
IRTranslator LLVM IR MI
std::pair< Value *, Value * > ShuffleOps
We are building a shuffle to create V, which is a sequence of insertelement, extractelement pairs.
static int matchShuffleAsBitRotate(ArrayRef< int > Mask, int NumSubElts)
Try to lower a vector shuffle as a bit rotation.
static std::pair< Value *, APInt > getMask(Value *WideMask, unsigned Factor, ElementCount LeafValueEC)
static Value * LowerCTLZ(LLVMContext &Context, Value *V, Instruction *IP)
Emit the code to lower ctlz of V before the specified instruction IP.
static Value * LowerCTPOP(LLVMContext &Context, Value *V, Instruction *IP)
Emit the code to lower ctpop of V before the specified instruction IP.
const size_t AbstractManglingParser< Derived, Alloc >::NumOps
const AbstractManglingParser< Derived, Alloc >::OperatorInfo AbstractManglingParser< Derived, Alloc >::Ops[]
#define RegName(no)
static LVOptions Options
Definition LVOptions.cpp:25
static bool isZero(Value *V, const DataLayout &DL, DominatorTree *DT, AssumptionCache *AC)
Definition Lint.cpp:546
This file implements the LivePhysRegs utility for tracking liveness of physical registers.
Live Register Matrix
static SDValue lower128BitShuffle(const SDLoc &DL, ArrayRef< int > Mask, MVT VT, SDValue V1, SDValue V2, SelectionDAG &DAG, const LoongArchSubtarget &Subtarget)
Dispatching routine to lower various 128-bit LoongArch vector shuffles.
static bool checkBitcastSrcVectorSize(SDValue Src, unsigned Size, unsigned Depth)
static SDValue signExtendBitcastSrcVector(SelectionDAG &DAG, EVT SExtVT, SDValue Src, const SDLoc &DL)
static SDValue lower256BitShuffle(const SDLoc &DL, ArrayRef< int > Mask, MVT VT, SDValue V1, SDValue V2, SelectionDAG &DAG, const LoongArchSubtarget &Subtarget)
Dispatching routine to lower various 256-bit LoongArch vector shuffles.
static void computeZeroableShuffleElements(ArrayRef< int > Mask, SDValue V1, SDValue V2, APInt &KnownUndef, APInt &KnownZero)
Compute whether each element of a shuffle is zeroable.
static int matchShuffleAsByteRotate(MVT VT, SDValue &V1, SDValue &V2, ArrayRef< int > Mask)
Attempts to match vector shuffle as byte rotation.
static int matchShuffleAsShift(MVT &ShiftVT, unsigned &Opcode, unsigned ScalarSizeInBits, ArrayRef< int > Mask, int MaskOffset, const APInt &Zeroable)
Attempts to match a shuffle mask against the VBSLL, VBSRL, VSLLI and VSRLI instruction.
static bool isRepeatedShuffleMask(unsigned LaneSizeInBits, MVT VT, ArrayRef< int > Mask, SmallVectorImpl< int > &RepeatedMask)
Test whether a shuffle mask is equivalent within each sub-lane.
static bool isAndOrOfSetCCs(SDValue Op, unsigned &Opc)
Return true if node is an ISD::AND or ISD::OR of two M68k::SETcc nodes each of which has no other use...
static bool hasNonFlagsUse(SDValue Op)
return true if Op has a use that doesn't just read flags.
static bool isCMOVPseudo(MachineInstr &MI)
static SDValue combineCarryThroughADD(SDValue CCR)
static bool isTruncWithZeroHighBitsInput(SDValue V, SelectionDAG &DAG)
#define F(x, y, z)
Definition MD5.cpp:55
#define I(x, y, z)
Definition MD5.cpp:58
#define G(x, y, z)
Definition MD5.cpp:56
Machine Check Debug Module
static bool isUndef(const MachineInstr &MI)
Register Reg
Register const TargetRegisterInfo * TRI
#define R2(n)
Promote Memory to Register
Definition Mem2Reg.cpp:110
#define T
#define T1
MachineInstr unsigned OpIdx
uint64_t High
uint64_t IntrinsicInst * II
#define P(N)
static CodeModel::Model getCodeModel(const PPCSubtarget &S, const TargetMachine &TM, const MachineOperand &MO)
PowerPC Reduce CR logical Operation
PowerPC TLS Dynamic Call Fixup
if(PassOpts->AAPipeline)
static constexpr MCPhysReg SPReg
static SDValue lowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG, const RISCVSubtarget &Subtarget)
static unsigned negateFMAOpcode(unsigned Opcode, bool NegMul, bool NegAcc)
static SDValue combineVectorSizedSetCCEquality(EVT VT, SDValue X, SDValue Y, ISD::CondCode CC, const SDLoc &DL, SelectionDAG &DAG, const RISCVSubtarget &Subtarget)
Try to map an integer comparison with size > XLEN to vector instructions before type legalization spl...
const SmallVectorImpl< MachineOperand > & Cond
static cl::opt< RegAllocEvictionAdvisorAnalysisLegacy::AdvisorMode > Mode("regalloc-enable-advisor", cl::Hidden, cl::init(RegAllocEvictionAdvisorAnalysisLegacy::AdvisorMode::Default), cl::desc("Enable regalloc advisor mode"), cl::values(clEnumValN(RegAllocEvictionAdvisorAnalysisLegacy::AdvisorMode::Default, "default", "Default"), clEnumValN(RegAllocEvictionAdvisorAnalysisLegacy::AdvisorMode::Release, "release", "precompiled"), clEnumValN(RegAllocEvictionAdvisorAnalysisLegacy::AdvisorMode::Development, "development", "for training")))
static bool isValid(const char C)
Returns true if C is a valid mangled character: <0-9a-zA-Z_>.
Contains matchers for matching SelectionDAG nodes and values.
static bool isSimple(Instruction *I)
static Type * getValueType(Value *V)
Returns the type of the given value/instruction V.
unsigned OpIndex
static bool contains(SmallPtrSetImpl< ConstantExpr * > &Cache, ConstantExpr *Expr, Constant *C)
Definition Value.cpp:480
This file implements the SmallBitVector class.
This file defines the SmallSet class.
This file defines the SmallVector class.
static SPCC::CondCodes GetOppositeBranchCondition(SPCC::CondCodes CC)
static bool Enabled
Definition Statistic.cpp:46
This file defines the 'Statistic' class, which is designed to be an easy way to expose various metric...
This file contains some functions that are useful when dealing with strings.
This file implements the StringSwitch template, which mimics a switch() statement whose cases are str...
#define LLVM_DEBUG(...)
Definition Debug.h:119
static TableGen::Emitter::Opt Y("gen-skeleton-entry", EmitSkeleton, "Generate example skeleton entry")
static TableGen::Emitter::OptClass< SkeletonEmitter > X("gen-skeleton-class", "Generate example skeleton class")
This file describes how to lower LLVM code to machine code.
static const char LUT[]
static llvm::Type * getVectorElementType(llvm::Type *Ty)
static std::optional< unsigned > getOpcode(ArrayRef< VPValue * > Values)
Returns the opcode of Values or ~0 if they do not all agree.
Definition VPlanSLP.cpp:247
static unsigned getBitWidth(Type *Ty, const DataLayout &DL)
Returns the bitwidth of the given scalar or pointer type.
static KnownBits computeKnownBitsForHorizontalOperation(const Operator *I, const APInt &DemandedElts, const SimplifyQuery &Q, unsigned Depth, const function_ref< KnownBits(const KnownBits &, const KnownBits &)> KnownBitsFunc)
static SDValue extractSubVector(SDValue Vec, unsigned IdxVal, SelectionDAG &DAG, const SDLoc &DL, unsigned VectorWidth)
static bool is64Bit(const char *name)
#define GET_EGPR_IF_ENABLED(OPC)
static unsigned getSUBriOpcode(bool IsLP64)
static SDValue convertIntLogicToFPLogic(unsigned Opc, const SDLoc &DL, EVT VT, SDValue N0, SDValue N1, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
If both input operands of a logic op are being cast from floating-point types or FP compares,...
static bool isNoopOrBroadcastShuffleMask(ArrayRef< int > Mask)
static bool matchLogicBlend(SDNode *N, SDValue &X, SDValue &Y, SDValue &Mask)
static MVT widenMaskVectorType(MVT VT, const X86Subtarget &Subtarget)
Widen a mask vector type to a minimum of v8i1/v16i1 to allow use of KSHIFT and bitcast with integer t...
static SDValue combineAndLoadToBZHI(SDNode *Node, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue combineOrCmpEqZeroToCtlzSrl(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
static SDValue combineAndnp(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
Do target-specific dag combines on X86ISD::ANDNP nodes.
static SDValue combineExtractWithShuffle(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
static SDValue combineLogicBlendIntoPBLENDV(SDNode *N, const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue LowerStore(SDValue Op, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static SDValue combineAddOrSubToADCOrSBB(bool IsSub, const SDLoc &DL, EVT VT, SDValue X, SDValue Y, SelectionDAG &DAG, bool ZeroSecondOpOnly=false)
If this is an add or subtract where one operand is produced by a cmp+setcc, then try to convert it to...
static bool matchScalarReduction(SDValue Op, ISD::NodeType BinOp, SmallVectorImpl< SDValue > &SrcOps, SmallVectorImpl< APInt > *SrcMask=nullptr)
Helper for matching BINOP(EXTRACTELT(X,0),BINOP(EXTRACTELT(X,1),...)) style scalarized (associative) ...
static unsigned translateX86FSETCC(ISD::CondCode SetCCOpcode, SDValue &Op0, SDValue &Op1, bool &IsAlwaysSignaling)
Turns an ISD::CondCode into a value suitable for SSE floating-point mask CMPs.
static SDValue detectPMADDUBSW(SDValue In, EVT VT, SelectionDAG &DAG, const X86Subtarget &Subtarget, const SDLoc &DL)
static SDValue checkBoolTestSetCCCombine(SDValue Cmp, X86::CondCode &CC)
static bool useEGPRInlineAsm(const X86Subtarget &Subtarget)
static SDValue getNullFPConstForNullVal(SDValue V, SelectionDAG &DAG, const X86Subtarget &Subtarget)
If a value is a scalar FP zero or a vector FP zero (potentially including undefined elements),...
static bool matchBinaryPermuteShuffle(MVT MaskVT, ArrayRef< int > Mask, const APInt &Zeroable, bool AllowFloatDomain, bool AllowIntDomain, SDValue &V1, SDValue &V2, const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget, unsigned &Shuffle, MVT &ShuffleVT, unsigned &PermuteImm)
static SDValue combineSub(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
static bool isGRClass(const TargetRegisterClass &RC)
Check if RC is a general purpose register class.
static bool getTargetShuffleMask(SDValue N, bool AllowSentinelZero, SmallVectorImpl< SDValue > &Ops, SmallVectorImpl< int > &Mask, bool &IsUnary)
Calculates the shuffle mask corresponding to the target-specific opcode.
static SDValue vectorizeExtractedCast(SDValue Cast, const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget)
Given a scalar cast operation that is extracted from a vector, try to vectorize the cast op followed ...
static SDValue combineShiftRightLogical(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
static SDValue lowerShuffleAsInsertPS(const SDLoc &DL, SDValue V1, SDValue V2, ArrayRef< int > Mask, const APInt &Zeroable, SelectionDAG &DAG)
static SDValue combineSubSetcc(SDNode *N, SelectionDAG &DAG)
static int match1BitShuffleAsKSHIFT(unsigned &Opcode, ArrayRef< int > Mask, int MaskOffset, const APInt &Zeroable)
static bool isHorizontalBinOpPart(const BuildVectorSDNode *N, unsigned Opcode, const SDLoc &DL, SelectionDAG &DAG, unsigned BaseIdx, unsigned LastIdx, SDValue &V0, SDValue &V1)
This is a helper function of LowerToHorizontalOp().
static SDValue SplitAndExtendv16i1(unsigned ExtOpc, MVT VT, SDValue In, const SDLoc &dl, SelectionDAG &DAG)
static SDValue getShuffleHalfVectors(const SDLoc &DL, SDValue V1, SDValue V2, ArrayRef< int > HalfMask, int HalfIdx1, int HalfIdx2, bool UndefLower, SelectionDAG &DAG, bool UseConcat=false)
Given the output values from getHalfShuffleMask(), create a half width shuffle of extracted vectors f...
static SDValue combineSignExtendInReg(SDNode *N, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue combineFPToSInt(SDNode *N, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue getTargetVShiftNode(unsigned Opc, const SDLoc &dl, MVT VT, SDValue SrcOp, SDValue ShAmt, int ShAmtIdx, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Handle vector element shifts by a splat shift amount.
@ ConstantBit
@ NotConstantBit
@ NotShiftBit
static SDValue combineZext(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
static SDValue incDecVectorConstant(SDValue V, SelectionDAG &DAG, bool IsInc, bool NSW)
Given a buildvector constant, return a new vector constant with each element incremented or decrement...
static bool cheapX86FSETCC_SSE(ISD::CondCode SetCCOpcode)
static SDValue lowerV4F32Shuffle(const SDLoc &DL, ArrayRef< int > Mask, const APInt &Zeroable, SDValue V1, SDValue V2, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Lower 4-lane 32-bit floating point shuffles.
static MachineBasicBlock * emitXBegin(MachineInstr &MI, MachineBasicBlock *MBB, const TargetInstrInfo *TII)
Utility function to emit xbegin specifying the start of an RTM region.
static SDValue EltsFromConsecutiveLoads(EVT VT, ArrayRef< SDValue > Elts, const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget, bool IsAfterLegalize)
Given the initializing elements 'Elts' of a vector of type 'VT', see if the elements can be replaced ...
static bool scaleShuffleElements(ArrayRef< int > Mask, unsigned NumDstElts, SmallVectorImpl< int > &ScaledMask)
static SDValue GetTLSADDR(SelectionDAG &DAG, GlobalAddressSDNode *GA, const EVT PtrVT, unsigned ReturnReg, unsigned char OperandFlags, bool LoadGlobalBaseReg=false, bool LocalDynamic=false)
static SDValue LowerANY_EXTEND(SDValue Op, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static cl::opt< int > BrMergingCcmpBias("x86-br-merging-ccmp-bias", cl::init(6), cl::desc("Increases 'x86-br-merging-base-cost' in cases that the target " "supports conditional compare instructions."), cl::Hidden)
static APInt getExtractedDemandedElts(SDNode *N)
static SDValue combineAndMaskToShift(SDNode *N, const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget)
If this is a zero/all-bits result that is bitwise-anded with a low bits mask.
static SDValue LowerADJUST_TRAMPOLINE(SDValue Op, SelectionDAG &DAG)
static SDValue lowerV8I32Shuffle(const SDLoc &DL, ArrayRef< int > Mask, const APInt &Zeroable, SDValue V1, SDValue V2, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Handle lowering of 8-lane 32-bit integer shuffles.
static SDValue LowerVectorCTLZInRegLUT(SDValue Op, const SDLoc &DL, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static SDValue combineX86ShufflesConstants(MVT VT, ArrayRef< SDValue > Ops, ArrayRef< int > Mask, ArrayRef< const SDNode * > SrcNodes, SelectionDAG &DAG, const SDLoc &DL, const X86Subtarget &Subtarget)
static SDValue combinePTESTCC(SDValue EFLAGS, X86::CondCode &CC, SelectionDAG &DAG, const X86Subtarget &Subtarget)
If we are inverting an PTEST/TESTP operand, attempt to adjust the CC to avoid the inversion.
static unsigned getAltBitOpcode(unsigned Opcode)
static Constant * getConstantVector(MVT VT, ArrayRef< APInt > Bits, const APInt &Undefs, LLVMContext &C)
static SDValue LowerABD(SDValue Op, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static SDValue LowerREADCYCLECOUNTER(SDValue Op, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static SDValue combineBitcast(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
static SDValue promoteXINT_TO_FP(SDValue Op, const SDLoc &dl, SelectionDAG &DAG)
static SDValue combineCastedMaskArithmetic(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
static SDValue insert1BitVector(SDValue Op, SelectionDAG &DAG, const X86Subtarget &Subtarget)
Insert i1-subvector to i1-vector.
static SDValue materializeVectorConstant(SDValue Op, const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget)
Create a vector constant without a load.
static SDValue lowerShuffleWithPSHUFB(const SDLoc &DL, MVT VT, ArrayRef< int > Mask, SDValue V1, SDValue V2, const APInt &Zeroable, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Try to lower a shuffle with a single PSHUFB of V1 or V2.
static SDValue combineFP16_TO_FP(SDNode *N, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue combineConcatVectorOps(const SDLoc &DL, MVT VT, ArrayRef< SDValue > Ops, SelectionDAG &DAG, const X86Subtarget &Subtarget, unsigned Depth=0)
Helper that combines an array of subvector ops as if they were the operands of a ISD::CONCAT_VECTORS ...
static SDValue combineBMILogicOp(SDNode *N, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue LowerUINT_TO_FP_i64(SDValue Op, const SDLoc &dl, SelectionDAG &DAG, const X86Subtarget &Subtarget)
64-bit unsigned integer to double expansion.
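As a scalar illustration (not this helper's actual code), one well-known form of the expansion builds two exact doubles from the 32-bit halves using exponent "magic" constants and adds them; the names below are illustrative:
#include <cstdint>
#include <cstring>
// Reinterpret a 64-bit pattern as a double (stand-in for a bitcast).
static double bitsToDouble(uint64_t Bits) {
  double D;
  std::memcpy(&D, &Bits, sizeof(D));
  return D;
}
// Classic two-half expansion: 0x433... is 2^52, 0x453... is 2^84. ORing a
// 32-bit half into the mantissa and subtracting the magic constant yields
// that half's exact value; the final add rounds once, like a real u64->f64
// conversion.
double u64ToDouble(uint64_t X) {
  const uint64_t LoMagic = 0x4330000000000000ULL; // 2^52
  const uint64_t HiMagic = 0x4530000000000000ULL; // 2^84
  double Lo = bitsToDouble(LoMagic | (X & 0xffffffffULL)) - bitsToDouble(LoMagic);
  double Hi = bitsToDouble(HiMagic | (X >> 32)) - bitsToDouble(HiMagic);
  return Lo + Hi;
}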
static bool useVectorCast(unsigned Opcode, MVT FromVT, MVT ToVT, const X86Subtarget &Subtarget)
static SDValue LowerFP_TO_FP16(SDValue Op, SelectionDAG &DAG)
static SDValue lowerV4X128Shuffle(const SDLoc &DL, MVT VT, ArrayRef< int > Mask, const APInt &Zeroable, SDValue V1, SDValue V2, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Try to lower a vector shuffle as 128-bit shuffles.
static SDValue LowerTruncateVecPackWithSignBits(MVT DstVT, SDValue In, const SDLoc &DL, const X86Subtarget &Subtarget, SelectionDAG &DAG, const SDNodeFlags Flags=SDNodeFlags())
This function lowers a vector truncation of 'extended sign-bits' or 'extended zero-bits' values.
static SDValue matchPMADDWD(SelectionDAG &DAG, SDNode *N, const SDLoc &DL, EVT VT, const X86Subtarget &Subtarget)
static SDValue combineSelect(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
Do target-specific dag combines on SELECT and VSELECT nodes.
static bool isUndefOrZeroInRange(ArrayRef< int > Mask, unsigned Pos, unsigned Size)
Return true if every element in Mask, beginning from position Pos and ending in Pos+Size is undef or ...
static SDValue combineToConsecutiveLoads(EVT VT, SDValue Op, const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget, bool IsAfterLegalize)
static SDValue getConstVector(ArrayRef< int > Values, MVT VT, SelectionDAG &DAG, const SDLoc &dl, bool IsMask=false)
static SDValue commuteSelect(SDNode *N, SelectionDAG &DAG, const SDLoc &DL, const X86Subtarget &Subtarget)
static MachineInstrBuilder createPHIsForCMOVsInSinkBB(MachineBasicBlock::iterator MIItBegin, MachineBasicBlock::iterator MIItEnd, MachineBasicBlock *TrueMBB, MachineBasicBlock *FalseMBB, MachineBasicBlock *SinkMBB)
static SDValue insert128BitVector(SDValue Result, SDValue Vec, unsigned IdxVal, SelectionDAG &DAG, const SDLoc &dl)
Generate a DAG to put 128-bits into a vector > 128 bits.
static bool onlyZeroFlagUsed(SDValue Flags)
static SDValue extract256BitVector(SDValue Vec, unsigned IdxVal, SelectionDAG &DAG, const SDLoc &dl)
Generate a DAG to grab 256-bits from a 512-bit vector.
static SDValue combineFAndFNotToFAndn(SDNode *N, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue combineGatherScatter(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI)
static SDValue combineMulToPMADDWD(SDNode *N, const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue LowerINTRINSIC_W_CHAIN(SDValue Op, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static bool isFreeToSplitVector(SDValue V, SelectionDAG &DAG)
static SDValue lowerShuffleAsLanePermuteAndShuffle(const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef< int > Mask, SelectionDAG &DAG, const X86Subtarget &Subtarget)
Lower a vector shuffle crossing multiple 128-bit lanes by shuffling one source with a lane permutatio...
static SDValue checkSignTestSetCCCombine(SDValue Cmp, X86::CondCode &CC, SelectionDAG &DAG)
static bool isFoldableUseOfShuffle(SDNode *N)
static bool getTargetShuffleInputs(SDValue Op, const APInt &DemandedElts, SmallVectorImpl< SDValue > &Inputs, SmallVectorImpl< int > &Mask, const SelectionDAG &DAG, unsigned Depth, bool ResolveKnownElts)
static SDValue getVectorMaskingNode(SDValue Op, SDValue Mask, SDValue PreservedSrc, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Return (and Op, Mask) for compare instructions or (vselect Mask, Op, PreservedSrc) for others along w...
static SDValue lowerAddSub(SDValue Op, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue truncateVectorWithPACKSS(EVT DstVT, SDValue In, const SDLoc &DL, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Truncate using inreg sign extension and X86ISD::PACKSS.
static SDValue combineMaskedLoadConstantMask(MaskedLoadSDNode *ML, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI)
static bool isShuffleMaskInputInPlace(int Input, ArrayRef< int > Mask)
Test whether the specified input (0 or 1) is in-place blended by the given mask.
static bool isMultiLaneShuffleMask(unsigned LaneSizeInBits, unsigned ScalarSizeInBits, ArrayRef< int > Mask)
Test whether elements in each LaneSizeInBits lane in this shuffle mask come from multiple lanes - thi...
static SDValue LowerVSETCCWithSUBUS(SDValue Op0, SDValue Op1, MVT VT, ISD::CondCode Cond, const SDLoc &dl, const X86Subtarget &Subtarget, SelectionDAG &DAG)
As another special case, use PSUBUS[BW] when it's profitable.
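The identity behind this special case, in scalar form (illustrative sketch assuming 8-bit lanes, not the lowering's code): a saturating unsigned subtract is zero exactly when the first operand is less than or equal to the second.
#include <cassert>
#include <cstdint>
// Scalar model of PSUBUSB: unsigned subtract, saturating at zero.
static uint8_t subus(uint8_t A, uint8_t B) {
  return A > B ? uint8_t(A - B) : uint8_t(0);
}
int main() {
  // subus(a, b) == 0 exactly when a <= b (unsigned), which is what lets
  // PSUBUS[BW] plus a compare-with-zero implement unsigned vector setcc.
  for (unsigned A = 0; A < 256; ++A)
    for (unsigned B = 0; B < 256; ++B)
      assert((subus(uint8_t(A), uint8_t(B)) == 0) == (A <= B));
  return 0;
}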
static SDValue combineFP_EXTEND(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
static APInt getBLENDIBlendMask(SDValue V)
Get the expanded blend mask from a BLENDI node.
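For reference, a BLENDI-style immediate selects per element between the two sources (a set bit takes the second source); a hedged sketch of expanding such an immediate into a shuffle-style mask, with illustrative names:
#include <cstdint>
#include <vector>
// Bit I clear keeps element I of the first source (index I); bit I set
// takes element I of the second source (index NumElts + I).
std::vector<int> expandBlendImm(uint8_t Imm, unsigned NumElts) {
  std::vector<int> Mask(NumElts);
  for (unsigned I = 0; I != NumElts; ++I)
    Mask[I] = (Imm & (1u << I)) ? int(NumElts + I) : int(I);
  return Mask;
}
// e.g. expandBlendImm(0b0101, 4) yields {4, 1, 6, 3}.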
static SDValue EmitTest(SDValue Op, X86::CondCode X86CC, const SDLoc &dl, SelectionDAG &DAG, const X86Subtarget &Subtarget)
Emit nodes that will be selected as "test Op0,Op0", or something equivalent.
static bool is128BitLaneRepeatedShuffleMask(MVT VT, ArrayRef< int > Mask, SmallVectorImpl< int > &RepeatedMask)
Test whether a shuffle mask is equivalent within each 128-bit lane.
static SDValue getPMOVMSKB(const SDLoc &DL, SDValue V, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue combineSCALAR_TO_VECTOR(SDNode *N, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue combineUIntToFP(SDNode *N, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static void getPackDemandedElts(EVT VT, const APInt &DemandedElts, APInt &DemandedLHS, APInt &DemandedRHS)
static SDValue LowerVectorCTPOPInRegLUT(SDValue Op, const SDLoc &DL, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static SDValue LowerSELECTWithCmpZero(SDValue CmpVal, SDValue LHS, SDValue RHS, unsigned X86CC, const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue combineADC(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI)
static std::optional< unsigned > CastIntSETCCtoFP(MVT VT, ISD::CondCode CC, unsigned NumSignificantBitsLHS, unsigned NumSignificantBitsRHS)
static SDValue lowerShuffleAsVTRUNCAndUnpack(const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef< int > Mask, const APInt &Zeroable, SelectionDAG &DAG)
static bool isShuffleFoldableLoad(SDValue)
Helper to test for a load that can be folded with x86 shuffles.
static SDValue narrowVectorSelect(SDNode *N, SelectionDAG &DAG, const SDLoc &DL, const X86Subtarget &Subtarget)
If both arms of a vector select are concatenated vectors, split the select, and concatenate the resul...
static SDValue lowerShuffleAsElementInsertion(const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef< int > Mask, const APInt &Zeroable, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Try to lower insertion of a single element into a zero vector.
static SDValue combineXor(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
static bool isUnpackWdShuffleMask(ArrayRef< int > Mask, MVT VT, const SelectionDAG &DAG)
static SDValue LowerTruncateVecPack(MVT DstVT, SDValue In, const SDLoc &DL, const X86Subtarget &Subtarget, SelectionDAG &DAG)
This function lowers a vector truncation from vXi32/vXi64 to vXi8/vXi16 into X86ISD::PACKUS/X86ISD::PAC...
static SDValue lowerShuffleAsRepeatedMaskAndLanePermute(const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef< int > Mask, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Handle case where shuffle sources are coming from the same 128-bit lane and every lane can be represe...
static SDValue getSHUFPDImmForMask(ArrayRef< int > Mask, const SDLoc &DL, SelectionDAG &DAG)
static void computeKnownBitsForPSADBW(SDValue LHS, SDValue RHS, KnownBits &Known, const APInt &DemandedElts, const SelectionDAG &DAG, unsigned Depth)
static int getSEHRegistrationNodeSize(const Function *Fn)
static SDValue getScalarMaskingNode(SDValue Op, SDValue Mask, SDValue PreservedSrc, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Creates an SDNode for a predicated scalar operation.
static SDValue buildFromShuffleMostly(SDValue Op, const SDLoc &DL, SelectionDAG &DAG)
static SDValue lowerBuildVectorToBitOp(BuildVectorSDNode *Op, const SDLoc &DL, const X86Subtarget &Subtarget, SelectionDAG &DAG)
If a BUILD_VECTOR's source elements all apply the same bit operation and one of their operands is con...
static SDValue LowerZERO_EXTEND(SDValue Op, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static SDValue isFNEG(SelectionDAG &DAG, SDNode *N, unsigned Depth=0)
Returns the negated value if the node N flips sign of FP value.
static SDValue lowerShuffleWithPERMV(const SDLoc &DL, MVT VT, ArrayRef< int > OriginalMask, SDValue V1, SDValue V2, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static SDValue lowerV16I16Shuffle(const SDLoc &DL, ArrayRef< int > Mask, const APInt &Zeroable, SDValue V1, SDValue V2, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Handle lowering of 16-lane 16-bit integer shuffles.
static SDValue lowerShuffleWithUndefHalf(const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef< int > Mask, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Lower shuffles where an entire half of a 256 or 512-bit vector is UNDEF.
static SDValue lowerAtomicArith(SDValue N, SelectionDAG &DAG, const X86Subtarget &Subtarget)
Lower atomic_load_ops into LOCK-prefixed operations.
static SDValue convertShiftLeftToScale(SDValue Amt, const SDLoc &dl, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static SDValue lowerV32I8Shuffle(const SDLoc &DL, ArrayRef< int > Mask, const APInt &Zeroable, SDValue V1, SDValue V2, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Handle lowering of 32-lane 8-bit integer shuffles.
static bool checkAndUpdateEFLAGSKill(MachineBasicBlock::iterator SelectItr, MachineBasicBlock *BB, const TargetRegisterInfo *TRI)
static void computeKnownBitsForPMADDWD(SDValue LHS, SDValue RHS, KnownBits &Known, const APInt &DemandedElts, const SelectionDAG &DAG, unsigned Depth)
static SDValue LowerBITCAST(SDValue Op, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static SDValue LowerEXTRACT_VECTOR_ELT_SSE4(SDValue Op, SelectionDAG &DAG)
static SDValue lowerShuffleAsTruncBroadcast(const SDLoc &DL, MVT VT, SDValue V0, int BroadcastIdx, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Try to lower broadcast of a single - truncated - integer element, coming from a scalar_to_vector/buil...
static bool isAddSubOrSubAdd(const BuildVectorSDNode *BV, const X86Subtarget &Subtarget, SelectionDAG &DAG, SDValue &Opnd0, SDValue &Opnd1, unsigned &NumExtracts, bool &IsSubAdd, bool &HasAllowContract)
Returns true iff BV builds a vector with the result equivalent to the result of ADDSUB/SUBADD operati...
static SDValue ExpandHorizontalBinOp(const SDValue &V0, const SDValue &V1, const SDLoc &DL, SelectionDAG &DAG, unsigned X86Opcode, bool Mode, bool isUndefLO, bool isUndefHI)
Emit a sequence of two 128-bit horizontal add/sub followed by a concat_vector.
static SDValue combineBitOpWithPACK(unsigned Opc, const SDLoc &DL, EVT VT, SDValue N0, SDValue N1, SelectionDAG &DAG)
static SDValue LowerToHorizontalOp(const BuildVectorSDNode *BV, const SDLoc &DL, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Lower BUILD_VECTOR to a horizontal add/sub operation if possible.
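A scalar model of the lane pattern that makes such a lowering possible (HADDPS-style pairing; illustrative sketch, not this routine's code):
#include <array>
// A 4-lane horizontal add pairs adjacent elements of A, then of B.
std::array<float, 4> hadd4(const std::array<float, 4> &A,
                           const std::array<float, 4> &B) {
  return {A[0] + A[1], A[2] + A[3], B[0] + B[1], B[2] + B[3]};
}
// A BUILD_VECTOR whose lanes are exactly such adjacent-pair sums (or
// differences) of one or two sources is a candidate for this lowering.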
SDValue getGFNICtrlMask(unsigned Opcode, SelectionDAG &DAG, const SDLoc &DL, MVT VT, unsigned Amt=0)
static SDValue combineFP_ROUND(SDNode *N, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue combineAndShuffleNot(SDNode *N, SelectionDAG &DAG, const X86Subtarget &Subtarget)
Try to fold: and (vector_shuffle<Z,...,Z> (insert_vector_elt undef, (xor X, -1), Z),...
static SDValue lowerShuffleAsBitMask(const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef< int > Mask, const APInt &Zeroable, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Try to emit a bitmask instruction for a shuffle.
static SDValue lowerShuffleWithUNPCK256(const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef< int > Mask, SelectionDAG &DAG)
Check if the mask can be mapped to a preliminary shuffle (a 64-bit vperm) followed by a 256-bit unpack.
static bool is256BitLaneRepeatedShuffleMask(MVT VT, ArrayRef< int > Mask, SmallVectorImpl< int > &RepeatedMask)
Test whether a shuffle mask is equivalent within each 256-bit lane.
static SDValue LowerMSCATTER(SDValue Op, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static SDValue LowerShiftByScalarVariable(SDValue Op, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue LowerSIGN_EXTEND_Mask(SDValue Op, const SDLoc &dl, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static SDValue getVectorShuffle(SelectionDAG &DAG, EVT VT, const SDLoc &dl, SDValue V1, SDValue V2, ArrayRef< int > Mask)
static SDValue combineMaskedStore(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
static SDValue combineCommutableSHUFP(SDValue N, MVT VT, const SDLoc &DL, SelectionDAG &DAG)
static SDValue LowerUINT_TO_FP_i32(SDValue Op, const SDLoc &dl, SelectionDAG &DAG, const X86Subtarget &Subtarget)
32-bit unsigned integer to float expansion.
static SDValue combineAdd(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
static SDValue LowerTruncateVecI1(SDValue Op, const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue combineVTRUNC(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI)
static SDValue LowerLoad(SDValue Op, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static cl::opt< int > ExperimentalPrefInnermostLoopAlignment("x86-experimental-pref-innermost-loop-alignment", cl::init(4), cl::desc("Sets the preferable loop alignment for experiments (as log2 bytes) " "for innermost loops only. If specified, this option overrides " "alignment set by x86-experimental-pref-loop-alignment."), cl::Hidden)
static SDValue createVariablePermute(MVT VT, SDValue SrcVec, SDValue IndicesVec, const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget)
Look for opportunities to create a VPERMV/VPERMILPV/PSHUFB variable permute from a vector of source v...
static SDValue getHopForBuildVector(const BuildVectorSDNode *BV, const SDLoc &DL, SelectionDAG &DAG, unsigned HOpcode, SDValue V0, SDValue V1)
static SDValue LowerRotate(SDValue Op, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static bool needCarryOrOverflowFlag(SDValue Flags)
static SDValue combineCVTPH2PS(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI)
static SDValue getOnesVector(EVT VT, SelectionDAG &DAG, const SDLoc &dl)
Returns a vector of specified type with all bits set.
static SDValue combineMaskedLoad(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
static bool isUndefLowerHalf(ArrayRef< int > Mask)
Return true if the mask creates a vector whose lower half is undefined.
static SDValue LowerMULH(SDValue Op, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static SDValue combineOrXorWithSETCC(unsigned Opc, const SDLoc &DL, EVT VT, SDValue N0, SDValue N1, SelectionDAG &DAG)
static SDValue combineRedundantDWordShuffle(SDValue N, MutableArrayRef< int > Mask, const SDLoc &DL, SelectionDAG &DAG)
Search for a combinable shuffle across a chain ending in pshufd.
static SDValue getBMIMatchingOp(unsigned Opc, SelectionDAG &DAG, SDValue OpMustEq, SDValue Op, unsigned Depth)
static SDValue createPSADBW(SelectionDAG &DAG, SDValue N0, SDValue N1, const SDLoc &DL, const X86Subtarget &Subtarget)
static SDValue lowerBuildVectorAsBlend(BuildVectorSDNode *BVOp, SDLoc const &DL, X86Subtarget const &Subtarget, SelectionDAG &DAG)
Attempt to lower a BUILD_VECTOR of scalar values to a shuffle of splats representing a blend.
static SDValue getTargetVShiftByConstNode(unsigned Opc, const SDLoc &dl, MVT VT, SDValue SrcOp, uint64_t ShiftAmt, SelectionDAG &DAG)
Handle vector element shifts where the shift amount is a constant.
static SDValue getPack(SelectionDAG &DAG, const X86Subtarget &Subtarget, const SDLoc &dl, MVT VT, SDValue LHS, SDValue RHS, bool PackHiHalf=false)
Returns a node that packs the LHS + RHS nodes together at half width.
static SDValue combineMOVDQ2Q(SDNode *N, SelectionDAG &DAG)
static bool matchUnaryShuffle(MVT MaskVT, ArrayRef< int > Mask, bool AllowFloatDomain, bool AllowIntDomain, SDValue V1, const SelectionDAG &DAG, const X86Subtarget &Subtarget, unsigned &Shuffle, MVT &SrcVT, MVT &DstVT)
static bool isConstantPowerOf2(SDValue V, unsigned EltSizeInBIts, bool AllowUndefs)
static SDValue lowerFPToIntToFP(SDValue CastToFP, const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget)
Given a scalar cast to FP with a cast to integer operand (almost an ftrunc), try to vectorize the cas...
static SDValue combineAndXorSubWithBMI(SDNode *And, const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget)
Fold AND(Y, XOR(X, NEG(X))) -> ANDN(Y, BLSMSK(X)) if BMI is available.
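The bit identity behind the fold, checked in scalar form (illustrative sketch; blsmsk below models BMI's BLSMSK semantics, X ^ (X - 1)):
#include <cassert>
#include <cstdint>
// BLSMSK(X): mask up to and including the lowest set bit of X.
static uint32_t blsmsk(uint32_t X) { return X ^ (X - 1); }
int main() {
  // X ^ -X is the complement of BLSMSK(X), so Y & (X ^ -X) can be formed
  // from BLSMSK plus an and-not, saving the explicit negate and xor.
  for (uint32_t X : {0u, 1u, 2u, 6u, 0x80000000u, 0xdeadbeefu})
    for (uint32_t Y : {0u, 0xffffffffu, 0x12345678u})
      assert((Y & (X ^ (0u - X))) == (Y & ~blsmsk(X)));
  return 0;
}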
static SDValue combineX86SubCmpForFlags(SDNode *N, SDValue Flag, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &ST)
static SDValue LowerVectorCTLZ_GFNI(SDValue Op, const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static bool getHalfShuffleMask(ArrayRef< int > Mask, MutableArrayRef< int > HalfMask, int &HalfIdx1, int &HalfIdx2)
If the input shuffle mask results in a vector that is undefined in all upper or lower half elements a...
static cl::opt< int > BrMergingBaseCostThresh("x86-br-merging-base-cost", cl::init(2), cl::desc("Sets the cost threshold for when multiple conditionals will be merged " "into one branch versus be split in multiple branches. Merging " "conditionals saves branches at the cost of additional instructions. " "This value sets the instruction cost limit, below which conditionals " "will be merged, and above which conditionals will be split. Set to -1 " "to never merge branches."), cl::Hidden)
static bool getFauxShuffleMask(SDValue N, const APInt &DemandedElts, SmallVectorImpl< int > &Mask, SmallVectorImpl< SDValue > &Ops, const SelectionDAG &DAG, unsigned Depth, bool ResolveKnownElts)
static bool hasBZHI(const X86Subtarget &Subtarget, MVT VT)
static SDValue emitLockedStackOp(SelectionDAG &DAG, const X86Subtarget &Subtarget, SDValue Chain, const SDLoc &DL)
Emit a locked operation on a stack location which does not change any memory location,...
static bool matchShuffleWithSHUFPD(MVT VT, SDValue &V1, SDValue &V2, bool &ForceV1Zero, bool &ForceV2Zero, unsigned &ShuffleImm, ArrayRef< int > Mask, const APInt &Zeroable)
static SDValue lowerV8F16Shuffle(const SDLoc &DL, ArrayRef< int > Mask, const APInt &Zeroable, SDValue V1, SDValue V2, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Lower 8-lane 16-bit floating point shuffles.
static SDValue LowerFSINCOS(SDValue Op, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static SDValue lowerAtomicArithWithLOCK(SDValue N, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue lowerShuffleAsBitBlend(const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef< int > Mask, SelectionDAG &DAG)
Try to emit a blend instruction for a shuffle using bit math.
static SDValue reduceMaskedLoadToScalarLoad(MaskedLoadSDNode *ML, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
If exactly one element of the mask is set for a non-extending masked load, it is a scalar load and ve...
static SDValue lower1BitShuffleAsKSHIFTR(const SDLoc &DL, ArrayRef< int > Mask, MVT VT, SDValue V1, SDValue V2, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static SDValue combineVectorPack(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
static SDValue expandIntrinsicWChainHelper(SDNode *N, const SDLoc &DL, SelectionDAG &DAG, unsigned TargetOpcode, unsigned SrcReg, const X86Subtarget &Subtarget, SmallVectorImpl< SDValue > &Results)
Handles the lowering of builtin intrinsics with chain that return their value into registers EDX:EAX.
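For the EDX:EAX convention, the two 32-bit halves are glued back into one 64-bit value; a trivial scalar sketch with illustrative names:
#include <cstdint>
// EDX holds the high 32 bits of the result, EAX the low 32 bits
// (e.g. RDTSC-style intrinsics).
uint64_t combineEDXEAX(uint32_t Edx, uint32_t Eax) {
  return (uint64_t(Edx) << 32) | Eax;
}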
static bool matchUnaryPermuteShuffle(MVT MaskVT, ArrayRef< int > Mask, const APInt &Zeroable, bool AllowFloatDomain, bool AllowIntDomain, const SelectionDAG &DAG, const X86Subtarget &Subtarget, unsigned &Shuffle, MVT &ShuffleVT, unsigned &PermuteImm)
static bool shouldExpandCmpArithRMWInIR(AtomicRMWInst *AI)
static SDValue combineVSelectToBLENDV(SDNode *N, SelectionDAG &DAG, const SDLoc &DL, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
If this is a dynamic select (non-constant condition) and we can match this node with one of the varia...
static SDValue combineVectorCompareAndMaskUnaryOp(SDNode *N, SelectionDAG &DAG)
static SDValue LowerBuildVectorAsInsert(SDValue Op, const SDLoc &DL, const APInt &NonZeroMask, unsigned NumNonZero, unsigned NumZero, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static bool isRepeatedTargetShuffleMask(unsigned LaneSizeInBits, unsigned EltSizeInBits, ArrayRef< int > Mask, SmallVectorImpl< int > &RepeatedMask)
Test whether a target shuffle mask is equivalent within each sub-lane.
static const char * getIndirectThunkSymbol(const X86Subtarget &Subtarget, Register Reg)
static bool isLaneCrossingShuffleMask(unsigned LaneSizeInBits, unsigned ScalarSizeInBits, ArrayRef< int > Mask)
Test whether there are elements crossing LaneSizeInBits lanes in this shuffle mask.
static SDValue FixupMMXIntrinsicTypes(SDNode *N, SelectionDAG &DAG)
static bool isShuffleMaskInputBroadcastable(int Input, ArrayRef< int > Mask, int BroadcastableElement=0)
Test whether the specified input (0 or 1) is a broadcast/splat blended by the given mask.
static SDValue getScatterNode(unsigned Opc, SDValue Op, SelectionDAG &DAG, SDValue Src, SDValue Mask, SDValue Base, SDValue Index, SDValue ScaleOp, SDValue Chain, const X86Subtarget &Subtarget)
static SDValue LowerAndToBT(SDValue And, ISD::CondCode CC, const SDLoc &dl, SelectionDAG &DAG, X86::CondCode &X86CC)
Result of 'and' is compared against zero.
static SDValue combinePMULH(SDValue Src, EVT VT, const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue lowerShuffleAsZeroOrAnyExtend(const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef< int > Mask, const APInt &Zeroable, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Try to lower a vector shuffle as a zero extension on any microarch.
static SDValue combineBasicSADPattern(SDNode *Extract, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static bool supportedVectorShiftWithBaseAmnt(EVT VT, const X86Subtarget &Subtarget, unsigned Opcode)
static SDValue combineVPMADD(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI)
static SDValue LowerMULO(SDValue Op, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static SDValue lowerShuffleWithSHUFPD(const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef< int > Mask, const APInt &Zeroable, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static SDValue combineBitOpWithShift(unsigned Opc, const SDLoc &DL, EVT VT, SDValue N0, SDValue N1, SelectionDAG &DAG)
static SDValue LowerHorizontalByteSum(SDValue V, MVT VT, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Compute the horizontal sum of bytes in V for the elements of VT.
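One common ingredient here is the PSADBW trick: a sum of absolute differences against zero is just the sum of the bytes. A scalar model of that step (illustrative, per 64-bit half):
#include <cstdint>
// psadbw(half, 0): |byte - 0| == byte, so the SAD is the horizontal byte sum.
uint64_t byteSumViaSAD(uint64_t Half) {
  uint64_t Sum = 0;
  for (int I = 0; I != 8; ++I)
    Sum += (Half >> (8 * I)) & 0xff;
  return Sum;
}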
static SDValue LowerFP16_TO_FP(SDValue Op, SelectionDAG &DAG)
static SDValue lowerV32I16Shuffle(const SDLoc &DL, ArrayRef< int > Mask, const APInt &Zeroable, SDValue V1, SDValue V2, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Handle lowering of 32-lane 16-bit integer shuffles.
static SDValue combineBitcastToBoolVector(EVT VT, SDValue V, const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget, unsigned Depth=0)
static SDValue getPrefetchNode(unsigned Opc, SDValue Op, SelectionDAG &DAG, SDValue Mask, SDValue Base, SDValue Index, SDValue ScaleOp, SDValue Chain, const X86Subtarget &Subtarget)
static SDValue MarkEHGuard(SDValue Op, SelectionDAG &DAG)
static SDValue combineX86CloadCstore(SDNode *N, SelectionDAG &DAG)
static void growShuffleMask(ArrayRef< int > SrcMask, SmallVectorImpl< int > &DstMask, unsigned SrcSizeInBits, unsigned DstSizeInBits)
static SDValue lowerShuffleWithEXPAND(const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef< int > Mask, const APInt &Zeroable, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static void computeInLaneShuffleMask(const ArrayRef< int > &Mask, int LaneSize, SmallVector< int > &InLaneMask)
Helper to compute the in-lane shuffle mask for a complete shuffle mask.
static SDValue lowerX86CmpEqZeroToCtlzSrl(SDValue Op, SelectionDAG &DAG)
static SDValue combineVectorInsert(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
static bool isX86CCSigned(X86::CondCode X86CC)
Return true if the condition is a signed comparison operation.
static SDValue combineTESTP(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
static SDValue combineAnd(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
static SDValue getBROADCAST_LOAD(unsigned Opcode, const SDLoc &DL, EVT VT, EVT MemVT, MemSDNode *Mem, unsigned Offset, SelectionDAG &DAG)
static bool isUndefUpperHalf(ArrayRef< int > Mask)
Return true if the mask creates a vector whose upper half is undefined.
static SDValue LowerATOMIC_STORE(SDValue Op, SelectionDAG &DAG, const X86Subtarget &Subtarget)
uint64_t getGFNICtrlImm(unsigned Opcode, unsigned Amt=0)
static SDValue lowerShuffleWithPACK(const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef< int > Mask, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static SDValue lowerShuffleAsSpecificExtension(const SDLoc &DL, MVT VT, int Scale, int Offset, unsigned ExtOpc, SDValue InputV, ArrayRef< int > Mask, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Lower a vector shuffle as an any/signed/zero extension.
static SDValue LowerShiftParts(SDValue Op, SelectionDAG &DAG)
Lower SRA_PARTS and friends, which return two i32 values and take a 2 x i32 value to shift plus a shi...
static SDValue combineFMulcFCMulc(SDNode *N, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static bool canReduceVMulWidth(SDNode *N, SelectionDAG &DAG, ShrinkMode &Mode)
static std::pair< SDValue, SDValue > getX86XALUOOp(X86::CondCode &Cond, SDValue Op, SelectionDAG &DAG)
static SDValue combineVectorShiftVar(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
static SDValue LowerAVXCONCAT_VECTORS(SDValue Op, const SDLoc &dl, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue LowerAVG(SDValue Op, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static SDValue combineSetCC(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
static SDValue combineCompareEqual(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
Recognize the distinctive (AND (setcc ...) (setcc ..)) where both setccs reference the same FP CMP,...
static bool isVKClass(const TargetRegisterClass &RC)
Check if RC is a mask register class.
static int canLowerByDroppingElements(ArrayRef< int > Mask, bool MatchEven, bool IsSingleInput)
Check whether a compaction lowering can be done by dropping even/odd elements and compute how many ti...
static SDValue combineTruncatedArithmetic(SDNode *N, SelectionDAG &DAG, const X86Subtarget &Subtarget, const SDLoc &DL)
Attempt to pre-truncate inputs to arithmetic ops if it will simplify the codegen.
static SDValue lowerShuffleAsBroadcast(const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef< int > Mask, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Try to lower broadcast of a single element.
static SDValue combineCVTP2I_CVTTP2I(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI)
static void resolveTargetShuffleInputsAndMask(SmallVectorImpl< SDValue > &Inputs, SmallVectorImpl< int > &Mask)
Removes unused/repeated shuffle source inputs and adjusts the shuffle mask.
static SDValue lowerV64I8Shuffle(const SDLoc &DL, ArrayRef< int > Mask, const APInt &Zeroable, SDValue V1, SDValue V2, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Handle lowering of 64-lane 8-bit integer shuffles.
static SDValue combineBitOpWithMOVMSK(unsigned Opc, const SDLoc &DL, SDValue N0, SDValue N1, SelectionDAG &DAG)
static SDValue combineAndNotIntoANDNP(SDNode *N, const SDLoc &DL, SelectionDAG &DAG)
Try to fold: (and (xor X, -1), Y) -> (andnp X, Y).
static SDValue combineShuffleToAddSubOrFMAddSub(SDNode *N, const SDLoc &DL, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Try to combine a shuffle into a target-specific add-sub or mul-add-sub node.
static SDValue lowerShuffleAsLanePermuteAndPermute(const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef< int > Mask, SelectionDAG &DAG, const X86Subtarget &Subtarget)
Lower a vector shuffle crossing multiple 128-bit lanes as a lane permutation followed by a per-lane p...
static SDValue lowerV8I16Shuffle(const SDLoc &DL, ArrayRef< int > Mask, const APInt &Zeroable, SDValue V1, SDValue V2, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Generic lowering of 8-lane i16 shuffles.
static SDValue getEXTEND_VECTOR_INREG(unsigned Opcode, const SDLoc &DL, EVT VT, SDValue In, SelectionDAG &DAG)
static bool canonicalizeShuffleMaskWithCommute(ArrayRef< int > Mask)
Helper function that returns true if the shuffle mask should be commuted to improve canonicalization.
static SDValue lowerUINT_TO_FP_v2i32(SDValue Op, const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue getV4X86ShuffleImm8ForMask(ArrayRef< int > Mask, const SDLoc &DL, SelectionDAG &DAG)
static SDValue splitVSETCC(EVT VT, SDValue LHS, SDValue RHS, ISD::CondCode Cond, SelectionDAG &DAG, const SDLoc &dl)
Break a VSETCC 256/512-bit vector into two new 128/256 ones and then concatenate the result back.
static SDValue splitVectorStore(StoreSDNode *Store, SelectionDAG &DAG)
Change a vector store into a pair of half-size vector stores.
static SDValue widenSubVector(MVT VT, SDValue Vec, bool ZeroNewElements, const X86Subtarget &Subtarget, SelectionDAG &DAG, const SDLoc &dl)
Widen a vector to a larger size with the same scalar type, with the new elements either zero or undef...
static bool supportedVectorVarShift(EVT VT, const X86Subtarget &Subtarget, unsigned Opcode)
static bool isUndefInRange(ArrayRef< int > Mask, unsigned Pos, unsigned Size)
Return true if every element in Mask, beginning from position Pos and ending in Pos+Size is the undef...
static SDValue LowerToTLSGeneralDynamicModelX32(GlobalAddressSDNode *GA, SelectionDAG &DAG, const EVT PtrVT)
static SDValue combineFAndn(SDNode *N, SelectionDAG &DAG, const X86Subtarget &Subtarget)
Do target-specific dag combines on X86ISD::FANDN nodes.
static SDValue LowerToTLSExecModel(GlobalAddressSDNode *GA, SelectionDAG &DAG, const EVT PtrVT, TLSModel::Model model, bool is64Bit, bool isPIC)
static bool supportedVectorShiftWithImm(EVT VT, const X86Subtarget &Subtarget, unsigned Opcode)
static SDValue lower1BitShuffle(const SDLoc &DL, ArrayRef< int > Mask, MVT VT, SDValue V1, SDValue V2, const APInt &Zeroable, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static SDValue combineToExtendBoolVectorInReg(unsigned Opcode, const SDLoc &DL, EVT VT, SDValue N0, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
static SDValue splitVectorIntBinary(SDValue Op, SelectionDAG &DAG, const SDLoc &dl)
Break a binary integer operation into 2 half sized ops and then concatenate the result back.
static SDValue createMMXBuildVector(BuildVectorSDNode *BV, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static LLVM_ATTRIBUTE_UNUSED bool isBlendOrUndef(ArrayRef< int > Mask)
Return true if every element in Mask is an in-place blend/select mask or is undef.
static SDValue LowerADDSUBO_CARRY(SDValue Op, SelectionDAG &DAG)
static unsigned getV4X86ShuffleImm(ArrayRef< int > Mask)
Get a 4-lane 8-bit shuffle immediate for a mask.
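The immediate follows the two-bits-per-lane encoding used by PSHUFD/SHUFPS-style instructions; a sketch of that encoding (illustrative helper, not this function's code):
#include <cassert>
// Lane I of the mask occupies bits [2*I+1 : 2*I] of the imm8.
unsigned shuffleImm8(const int Mask[4]) {
  unsigned Imm = 0;
  for (int I = 0; I != 4; ++I) {
    assert(Mask[I] >= 0 && Mask[I] < 4 && "expected an in-range mask entry");
    Imm |= unsigned(Mask[I]) << (2 * I);
  }
  return Imm;
}
// e.g. the reversing mask {3, 2, 1, 0} encodes as 0x1B.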
static SDValue combineShiftLeft(SDNode *N, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static void resolveTargetShuffleFromZeroables(SmallVectorImpl< int > &Mask, const APInt &KnownUndef, const APInt &KnownZero, bool ResolveKnownZeros=true)
static SDValue LowerBUILD_VECTORvXi1(SDValue Op, const SDLoc &dl, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue InsertBitToMaskVector(SDValue Op, SelectionDAG &DAG, const X86Subtarget &Subtarget)
Insert one bit to mask vector, like v16i1 or v8i1.
static SDValue getGatherNode(SDValue Op, SelectionDAG &DAG, SDValue Src, SDValue Mask, SDValue Base, SDValue Index, SDValue ScaleOp, SDValue Chain, const X86Subtarget &Subtarget)
static SDValue lowerShuffleAsLanePermuteAndRepeatedMask(const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef< int > Mask, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Lower a vector shuffle by first fixing the 128-bit lanes and then shuffling each lane.
static bool isSoftF16(T VT, const X86Subtarget &Subtarget)
static SDValue lowerV16I32Shuffle(const SDLoc &DL, ArrayRef< int > Mask, const APInt &Zeroable, SDValue V1, SDValue V2, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Handle lowering of 16-lane 32-bit integer shuffles.
static SDValue combineExtractVectorElt(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
Detect vector gather/scatter index generation and convert it from being a bunch of shuffles and extra...
static bool isSingleSHUFPSMask(ArrayRef< int > Mask)
Test whether this can be lowered with a single SHUFPS instruction.
static SDValue LowerFCanonicalize(SDValue Op, SelectionDAG &DAG)
static bool checkBoolTestAndOrSetCCCombine(SDValue Cond, X86::CondCode &CC0, X86::CondCode &CC1, SDValue &Flags, bool &isAnd)
Check whether Cond is an AND/OR of SETCCs off of the same EFLAGS.
static bool isX86LogicalCmp(SDValue Op)
Return true if the opcode is an X86 logical comparison.
static bool isAnyInRange(ArrayRef< int > Mask, int Low, int Hi)
Return true if the value of any element in Mask falls within the specified range (L,...
static SDValue combineEXTRACT_SUBVECTOR(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
static cl::opt< bool > WidenShift("x86-widen-shift", cl::init(true), cl::desc("Replace narrow shifts with wider shifts."), cl::Hidden)
static SDValue combineSextInRegCmov(SDNode *N, SelectionDAG &DAG)
static SDValue detectSSatPattern(SDValue In, EVT VT, bool MatchPackUS=false)
Detect patterns of truncation with signed saturation: (truncate (smin ((smax (x, signed_min_of_dest_t...
const unsigned FPStateSize
static bool matchShuffleWithUNPCK(MVT VT, SDValue &V1, SDValue &V2, unsigned &UnpackOpcode, bool IsUnary, ArrayRef< int > TargetMask, const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue combineFneg(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
Do target-specific dag combines on floating point negations.
static SDValue combineLoad(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
static SDValue combineToHorizontalAddSub(SDNode *N, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue combineXorSubCTLZ(SDNode *N, const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue insertSubVector(SDValue Result, SDValue Vec, unsigned IdxVal, SelectionDAG &DAG, const SDLoc &dl, unsigned vectorWidth)
static bool isHopBuildVector(const BuildVectorSDNode *BV, SelectionDAG &DAG, unsigned &HOpcode, SDValue &V0, SDValue &V1)
static SDValue combineSelectOfTwoConstants(SDNode *N, SelectionDAG &DAG, const SDLoc &DL)
static SDValue combineFOr(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
Do target-specific dag combines on X86ISD::FOR and X86ISD::FXOR nodes.
static SDValue LowerINSERT_SUBVECTOR(SDValue Op, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static SDValue combineINTRINSIC_VOID(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI)
static bool createShuffleMaskFromVSELECT(SmallVectorImpl< int > &Mask, SDValue Cond, bool IsBLENDV=false)
static SDValue getMaskNode(SDValue Mask, MVT MaskVT, const X86Subtarget &Subtarget, SelectionDAG &DAG, const SDLoc &dl)
Return Mask with the necessary casting or extending for Mask according to MaskVT when lowering maskin...
static SDValue lowerV8F64Shuffle(const SDLoc &DL, ArrayRef< int > Mask, const APInt &Zeroable, SDValue V1, SDValue V2, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Handle lowering of 8-lane 64-bit floating point shuffles.
static bool shouldUseHorizontalOp(bool IsSingleSource, SelectionDAG &DAG, const X86Subtarget &Subtarget)
Horizontal vector math instructions may be slower than normal math with shuffles.
static bool isFRClass(const TargetRegisterClass &RC)
Check if RC is a vector register class.
static SDValue splitAndLowerShuffle(const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef< int > Mask, SelectionDAG &DAG, bool SimpleOnly)
Generic routine to split vector shuffle into half-sized shuffles.
static SDValue combineAVX512SetCCToKMOV(EVT VT, SDValue Op0, ISD::CondCode CC, const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue LowerToTLSGeneralDynamicModel64(GlobalAddressSDNode *GA, SelectionDAG &DAG, const EVT PtrVT)
static SDValue IsNOT(SDValue V, SelectionDAG &DAG)
static SDValue combineFMinFMax(SDNode *N, SelectionDAG &DAG)
Do target-specific dag combines on X86ISD::FMIN and X86ISD::FMAX nodes.
static SDValue combineOr(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
static SDValue LowerMGATHER(SDValue Op, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static SDValue getVShift(bool isLeft, EVT VT, SDValue SrcOp, unsigned NumBits, SelectionDAG &DAG, const TargetLowering &TLI, const SDLoc &dl)
Return a vector logical shift node.
static SDValue combineVPDPBUSDPattern(SDNode *Extract, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue LowerVACOPY(SDValue Op, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static SDValue combineINTRINSIC_WO_CHAIN(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI)
static SDValue lowerV4I32Shuffle(const SDLoc &DL, ArrayRef< int > Mask, const APInt &Zeroable, SDValue V1, SDValue V2, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Lower 4-lane i32 vector shuffles.
static SDValue widenMaskVector(SDValue Vec, bool ZeroNewElements, const X86Subtarget &Subtarget, SelectionDAG &DAG, const SDLoc &dl)
Widen a mask vector to a minimum of v8i1/v16i1 to allow use of KSHIFT and bitcast with integer types.
static SDValue LowerAsSplatVectorLoad(SDValue SrcOp, MVT VT, const SDLoc &dl, SelectionDAG &DAG)
static bool isInRange(int Val, int Low, int Hi)
Return true if Val falls within the specified range (L, H].
static SDValue combineKSHIFT(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI)
static SDValue combineTargetShuffle(SDValue N, const SDLoc &DL, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
Try to combine x86 target specific shuffles.
static SDValue LowerSIGN_EXTEND(SDValue Op, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static std::pair< SDValue, SDValue > splitVector(SDValue Op, SelectionDAG &DAG, const SDLoc &dl)
static SDValue getBT(SDValue Src, SDValue BitNo, const SDLoc &DL, SelectionDAG &DAG)
Helper for attempting to create an X86ISD::BT node.
static SDValue EmitTruncSStore(bool SignedSat, SDValue Chain, const SDLoc &DL, SDValue Val, SDValue Ptr, EVT MemVT, MachineMemOperand *MMO, SelectionDAG &DAG)
Emit Truncating Store with signed or unsigned saturation.
static SDValue ExtendToType(SDValue InOp, MVT NVT, SelectionDAG &DAG, bool FillWithZeroes=false)
Widen a vector input to a vector of NVT.
static void getHorizDemandedElts(EVT VT, const APInt &DemandedElts, APInt &DemandedLHS, APInt &DemandedRHS)
static SDValue LowerMSTORE(SDValue Op, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static SDValue combineFMA(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
static SDValue lowerShuffleAsBlendAndPermute(const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef< int > Mask, SelectionDAG &DAG, bool ImmBlends=false)
Try to lower as a blend of elements from two inputs followed by a single-input permutation.
static bool matchShuffleAsEXTRQ(MVT VT, SDValue &V1, SDValue &V2, ArrayRef< int > Mask, uint64_t &BitLen, uint64_t &BitIdx, const APInt &Zeroable)
const unsigned X87StateSize
static SDValue lowerV8I64Shuffle(const SDLoc &DL, ArrayRef< int > Mask, const APInt &Zeroable, SDValue V1, SDValue V2, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Handle lowering of 8-lane 64-bit integer shuffles.
static bool isUndefOrEqual(int Val, int CmpVal)
Val is the undef sentinel value or equal to the specified value.
static SDValue lowerShuffleAsVTRUNC(const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef< int > Mask, const APInt &Zeroable, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static bool isTargetShuffle(unsigned Opcode)
static bool isSingleElementRepeatedMask(ArrayRef< int > Mask)
Check if the Mask consists of the same element repeated multiple times.
static SDValue LowerCVTPS2PH(SDValue Op, SelectionDAG &DAG)
static SDValue combineShiftRightArithmetic(SDNode *N, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue combinePMULDQ(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
static SDValue combineX86ShufflesRecursively(ArrayRef< SDValue > SrcOps, int SrcOpIndex, unsigned RootOpc, MVT RootVT, ArrayRef< int > RootMask, ArrayRef< const SDNode * > SrcNodes, unsigned Depth, unsigned MaxDepth, bool AllowVariableCrossLaneMask, bool AllowVariablePerLaneMask, bool IsMaskedShuffle, SelectionDAG &DAG, const SDLoc &DL, const X86Subtarget &Subtarget)
Fully generic combining of x86 shuffle instructions.
static SDValue LowerIntVSETCC_AVX512(SDValue Op, const SDLoc &dl, SelectionDAG &DAG)
static SDValue lowerShuffleWithSSE4A(const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef< int > Mask, const APInt &Zeroable, SelectionDAG &DAG)
Try to lower a vector shuffle using SSE4a EXTRQ/INSERTQ.
static SDValue lowerShuffleOfExtractsAsVperm(const SDLoc &DL, SDValue N0, SDValue N1, ArrayRef< int > Mask, SelectionDAG &DAG)
If we are extracting two 128-bit halves of a vector and shuffling the result, match that to a 256-bit...
static SDValue lowerV4F64Shuffle(const SDLoc &DL, ArrayRef< int > Mask, const APInt &Zeroable, SDValue V1, SDValue V2, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Handle lowering of 4-lane 64-bit floating point shuffles.
static SDValue getAVX512Node(unsigned Opcode, const SDLoc &DL, MVT VT, ArrayRef< SDValue > Ops, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue lowerToAddSubOrFMAddSub(const BuildVectorSDNode *BV, const SDLoc &DL, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Try to fold a build_vector that performs an 'addsub' or 'fmaddsub' or 'fsubadd' operation accordingly...
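The lane pattern being matched, in scalar form (ADDSUBPS-style; illustrative sketch): even lanes subtract, odd lanes add.
#include <array>
// ADDSUB lane pattern: even lanes A - B, odd lanes A + B; SUBADD is the
// mirrored pattern.
std::array<float, 4> addsub4(const std::array<float, 4> &A,
                             const std::array<float, 4> &B) {
  return {A[0] - B[0], A[1] + B[1], A[2] - B[2], A[3] + B[3]};
}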
static SDValue lowerV8I16GeneralSingleInputShuffle(const SDLoc &DL, MVT VT, SDValue V, MutableArrayRef< int > Mask, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Lowering of single-input v8i16 shuffles is the cornerstone of SSE2 shuffle lowering,...
static SDValue lowerV2F64Shuffle(const SDLoc &DL, ArrayRef< int > Mask, const APInt &Zeroable, SDValue V1, SDValue V2, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Handle lowering of 2-lane 64-bit floating point shuffles.
static SDValue isUpperSubvectorUndef(SDValue V, const SDLoc &DL, SelectionDAG &DAG)
static cl::opt< int > BrMergingLikelyBias("x86-br-merging-likely-bias", cl::init(0), cl::desc("Increases 'x86-br-merging-base-cost' in cases that it is likely " "that all conditionals will be executed. For example for merging " "the conditionals (a == b && c > d), if its known that a == b is " "likely, then it is likely that if the conditionals are split " "both sides will be executed, so it may be desirable to increase " "the instruction cost threshold. Set to -1 to never merge likely " "branches."), cl::Hidden)
static SDValue getInvertedVectorForFMA(SDValue V, SelectionDAG &DAG)
static bool IsElementEquivalent(int MaskSize, SDValue Op, SDValue ExpectedOp, int Idx, int ExpectedIdx)
Checks whether the vector elements referenced by two shuffle masks are equivalent.
static int matchShuffleAsElementRotate(SDValue &V1, SDValue &V2, ArrayRef< int > Mask)
Try to match a vector shuffle as an element rotation.
static SDValue combineVEXTRACT_STORE(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
static bool isUndefOrZeroOrInRange(int Val, int Low, int Hi)
Return true if Val is undef, zero or if its value falls within the specified range (L,...
static const Constant * getTargetConstantFromBasePtr(SDValue Ptr)
static SDValue lowerShuffleAsBlend(const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef< int > Original, const APInt &Zeroable, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Try to emit a blend instruction for a shuffle.
static bool findEltLoadSrc(SDValue Elt, LoadSDNode *&Ld, int64_t &ByteOffset)
static bool isUndefOrInRange(int Val, int Low, int Hi)
Return true if Val is undef or if its value falls within the specified range (L, H].
static SDValue combineAddOfPMADDWD(SelectionDAG &DAG, SDValue N0, SDValue N1, const SDLoc &DL, EVT VT)
static bool collectConcatOps(SDNode *N, SmallVectorImpl< SDValue > &Ops, SelectionDAG &DAG)
static SDValue truncateVectorWithPACK(unsigned Opcode, EVT DstVT, SDValue In, const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget)
Helper to recursively truncate vector elements in half with PACKSS/PACKUS.
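A scalar model of one pack step, assuming i16 inputs packed down to 8 bits (illustrative, not this helper's code): the pack saturates, so when the inputs already fit the narrower type it degenerates to a plain truncation, and repeating the step halves the element width each time.
#include <algorithm>
#include <cstdint>
// PACKSS-style: signed saturation; PACKUS-style: unsigned saturation.
int8_t packss16to8(int16_t V) { return int8_t(std::clamp<int>(V, -128, 127)); }
uint8_t packus16to8(int16_t V) { return uint8_t(std::clamp<int>(V, 0, 255)); }
// If V is already in range (e.g. after an in-register sign/zero extension),
// the clamp is a no-op and the pack acts as a truncation.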
static SDValue LowerFGETSIGN(SDValue Op, SelectionDAG &DAG)
static SDValue combineSBB(SDNode *N, SelectionDAG &DAG)
static void computeKnownBitsForPMADDUBSW(SDValue LHS, SDValue RHS, KnownBits &Known, const APInt &DemandedElts, const SelectionDAG &DAG, unsigned Depth)
static SDValue combineSIntToFP(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
static std::pair< Value *, BitTestKind > FindSingleBitChange(Value *V)
static SDValue combineToFPTruncExtElt(SDNode *N, SelectionDAG &DAG)
If we are converting a value to floating-point, try to replace scalar truncate of an extracted vector...
static bool is128BitLaneCrossingShuffleMask(MVT VT, ArrayRef< int > Mask)
Test whether there are elements crossing 128-bit lanes in this shuffle mask.
static SDValue EmitCmp(SDValue Op0, SDValue Op1, X86::CondCode X86CC, const SDLoc &dl, SelectionDAG &DAG, const X86Subtarget &Subtarget)
Emit nodes that will be selected as "cmp Op0,Op1", or something equivalent.
static SDValue LowerI64IntToFP16(SDValue Op, const SDLoc &dl, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue lowerV4I64Shuffle(const SDLoc &DL, ArrayRef< int > Mask, const APInt &Zeroable, SDValue V1, SDValue V2, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Handle lowering of 4-lane 64-bit integer shuffles.
static SDValue combinevXi1ConstantToInteger(SDValue Op, SelectionDAG &DAG)
const unsigned FPStateSizeInBits
static SDValue reduceMaskedStoreToScalarStore(MaskedStoreSDNode *MS, SelectionDAG &DAG, const X86Subtarget &Subtarget)
If exactly one element of the mask is set for a non-truncating masked store, it is a vector extract a...
static unsigned convertIntLogicToFPLogicOpcode(unsigned Opcode)
static SDValue lowerX86FPLogicOp(SDNode *N, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue narrowExtractedVectorSelect(SDNode *Ext, const SDLoc &DL, SelectionDAG &DAG)
If we are extracting a subvector of a vector select and the select condition is composed of concatena...
static SDValue combineScalarAndWithMaskSetcc(SDNode *N, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue lowerShuffleAsLanePermuteAndSHUFP(const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef< int > Mask, SelectionDAG &DAG)
static bool isNoopShuffleMask(ArrayRef< int > Mask)
Tiny helper function to identify a no-op mask.
static SDValue getUnpackh(SelectionDAG &DAG, const SDLoc &dl, EVT VT, SDValue V1, SDValue V2)
Returns a vector_shuffle node for an unpackh operation.
static SDValue combineExtractFromVectorLoad(SDNode *N, EVT VecVT, SDValue SrcVec, uint64_t Idx, const SDLoc &dl, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI)
static SDValue lowerShuffleAsByteShiftMask(const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef< int > Mask, const APInt &Zeroable, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Try to lower a vector shuffle as a byte shift sequence.
static SDValue combineFP_TO_xINT_SAT(SDNode *N, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static bool isTargetShuffleVariableMask(unsigned Opcode)
static bool isLogicOp(unsigned Opcode)
static SDValue lowerShuffleAsShift(const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef< int > Mask, const APInt &Zeroable, const X86Subtarget &Subtarget, SelectionDAG &DAG, bool BitwiseOnly)
static SDValue LowerBuildVectorv8i16(SDValue Op, const SDLoc &DL, const APInt &NonZeroMask, unsigned NumNonZero, unsigned NumZero, SelectionDAG &DAG, const X86Subtarget &Subtarget)
Custom lower build_vector of v8i16.
static bool matchBinaryShuffle(MVT MaskVT, ArrayRef< int > Mask, bool AllowFloatDomain, bool AllowIntDomain, SDValue &V1, SDValue &V2, const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget, unsigned &Shuffle, MVT &SrcVT, MVT &DstVT, bool IsUnary)
static SDValue lowerShuffleAsUNPCKAndPermute(const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef< int > Mask, SelectionDAG &DAG)
Try to lower as an unpack of elements from two inputs followed by a single-input permutation.
static bool canScaleShuffleElements(ArrayRef< int > Mask, unsigned NumDstElts)
static SDValue LowerBITREVERSE_XOP(SDValue Op, SelectionDAG &DAG)
static SDValue getShuffleVectorZeroOrUndef(SDValue V2, int Idx, bool IsZero, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Return a vector_shuffle of the specified vector and a zero or undef vector.
static SDValue lowerBuildVectorAsBroadcast(BuildVectorSDNode *BVOp, const SDLoc &dl, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Attempt to use the vbroadcast instruction to generate a splat value from a splat BUILD_VECTOR which u...
static SDValue combineMulToPMULDQ(SDNode *N, const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue LowerPARITY(SDValue Op, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static SDValue lowerV16F32Shuffle(const SDLoc &DL, ArrayRef< int > Mask, const APInt &Zeroable, SDValue V1, SDValue V2, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Handle lowering of 16-lane 32-bit floating point shuffles.
static SDValue LowerMINMAX(SDValue Op, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static SDValue combineToExtendCMOV(SDNode *Extend, SelectionDAG &DAG)
static bool isHorizontalBinOp(unsigned HOpcode, SDValue &LHS, SDValue &RHS, SelectionDAG &DAG, const X86Subtarget &Subtarget, bool IsCommutative, SmallVectorImpl< int > &PostShuffleMask, bool ForceHorizOp)
Return 'true' if this vector operation is "horizontal" and return the operands for the horizontal ope...
static bool getTargetShuffleMaskIndices(SDValue MaskNode, unsigned MaskEltSizeInBits, SmallVectorImpl< uint64_t > &RawMask, APInt &UndefElts)
static SDValue promoteExtBeforeAdd(SDNode *Ext, SelectionDAG &DAG, const X86Subtarget &Subtarget)
sext(add_nsw(x, C)) --> add(sext(x), C_sext); zext(add_nuw(x, C)) --> add(zext(x), C_zext). Promoting a...
static const Constant * getTargetConstantFromNode(LoadSDNode *Load)
static SDValue canonicalizeBitSelect(SDNode *N, const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static bool canCombineAsMaskOperation(SDValue V, const X86Subtarget &Subtarget)
static SDValue lowerShuffleAsVALIGN(const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef< int > Mask, const APInt &Zeroable, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Try to lower a vector shuffle as a dword/qword rotation.
static SDValue lowerVECTOR_COMPRESS(SDValue Op, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static bool isProfitableToUseFlagOp(SDValue Op)
static SDValue LowerBITREVERSE(SDValue Op, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static SDValue LowerFROUND(SDValue Op, SelectionDAG &DAG)
ISD::FROUND is defined to round to nearest with ties rounding away from 0.
static SDValue detectUSatPattern(SDValue In, EVT VT, SelectionDAG &DAG, const SDLoc &DL)
Detect patterns of truncation with unsigned saturation:
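The scalar shape of the pattern (illustrative, i32 -> i8): the value is clamped to the destination's unsigned maximum before the truncate, so the truncate cannot wrap.
#include <algorithm>
#include <cstdint>
// truncate(umin(x, 255)) for an i32 -> i8 unsigned-saturating truncation.
uint8_t truncUSat(uint32_t X) { return uint8_t(std::min<uint32_t>(X, 255u)); }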
static SDValue narrowShuffle(ShuffleVectorSDNode *Shuf, SelectionDAG &DAG)
If we have a shuffle of AVX/AVX512 (256/512 bit) vectors that only uses the low half of each source v...
static X86::CondCode TranslateX86CC(ISD::CondCode SetCCOpcode, const SDLoc &DL, bool isFP, SDValue &LHS, SDValue &RHS, SelectionDAG &DAG)
Do a one-to-one translation of an ISD::CondCode to the X86-specific condition code,...
static SDValue getFlagsOfCmpZeroFori1(SelectionDAG &DAG, const SDLoc &DL, SDValue Mask)
static SDValue lower512BitShuffle(const SDLoc &DL, ArrayRef< int > Mask, MVT VT, SDValue V1, SDValue V2, const APInt &Zeroable, const X86Subtarget &Subtarget, SelectionDAG &DAG)
High-level routine to lower various 512-bit x86 vector shuffles.
static SDValue LowerBuildVectorv16i8(SDValue Op, const SDLoc &DL, const APInt &NonZeroMask, unsigned NumNonZero, unsigned NumZero, SelectionDAG &DAG, const X86Subtarget &Subtarget)
Custom lower build_vector of v16i8.
static bool getTargetConstantBitsFromNode(SDValue Op, unsigned EltSizeInBits, APInt &UndefElts, SmallVectorImpl< APInt > &EltBits, bool AllowWholeUndefs=true, bool AllowPartialUndefs=false)
static bool detectExtMul(SelectionDAG &DAG, const SDValue &Mul, SDValue &Op0, SDValue &Op1)
static SDValue combineMOVMSK(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
static SDValue combineLRINT_LLRINT(SDNode *N, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue lowerAddSubToHorizontalOp(SDValue Op, const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget)
Depending on uarch and/or optimizing for size, we might prefer to use a vector operation in place of ...
static SDValue combineShiftToPMULH(SDNode *N, SelectionDAG &DAG, const SDLoc &DL, const X86Subtarget &Subtarget)
static bool getParamsForOneTrueMaskedElt(MaskedLoadStoreSDNode *MaskedOp, SelectionDAG &DAG, SDValue &Addr, SDValue &Index, Align &Alignment, unsigned &Offset)
Given a masked memory load/store operation, return true if it has one mask bit set.
static SDValue reduceVMULWidth(SDNode *N, const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget)
When the operands of vector mul are extended from smaller size values, like i8 and i16,...
static X86::CondCode TranslateIntegerX86CC(ISD::CondCode SetCCOpcode)
static SDValue MarkEHRegistrationNode(SDValue Op, SelectionDAG &DAG)
static SDValue combineBROADCAST_LOAD(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI)
static SDValue combineFMinNumFMaxNum(SDNode *N, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue combineCMP(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
static SDValue combinei64TruncSrlConstant(SDValue N, EVT VT, SelectionDAG &DAG, const SDLoc &DL)
static bool isLegalConversion(MVT VT, MVT FloatVT, bool IsSigned, const X86Subtarget &Subtarget)
static SDValue combineX86AddSub(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &ST)
static SDValue LowerFCOPYSIGN(SDValue Op, SelectionDAG &DAG)
static SDValue createVPDPBUSD(SelectionDAG &DAG, SDValue LHS, SDValue RHS, unsigned &LogBias, const SDLoc &DL, const X86Subtarget &Subtarget)
static SDValue LowerFMINIMUM_FMAXIMUM(SDValue Op, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static SDValue lowerV2X128Shuffle(const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef< int > Mask, const APInt &Zeroable, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Handle lowering 2-lane 128-bit shuffles.
static SDValue lowerUINT_TO_FP_vec(SDValue Op, const SDLoc &dl, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue getSplitVectorSrc(SDValue LHS, SDValue RHS, bool AllowCommute)
static SDValue LowerFABSorFNEG(SDValue Op, SelectionDAG &DAG)
The only differences between FABS and FNEG are the mask and the logic op.
ShrinkMode
Different mul shrinking modes.
static SDValue combineVPMADD52LH(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI)
static SDValue concatSubVectors(SDValue V1, SDValue V2, SelectionDAG &DAG, const SDLoc &dl)
static SDValue combineINTRINSIC_W_CHAIN(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI)
static SDValue canonicalizeShuffleMaskWithHorizOp(MutableArrayRef< SDValue > Ops, MutableArrayRef< int > Mask, unsigned RootSizeInBits, const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue combineConstantPoolLoads(SDNode *N, const SDLoc &dl, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
static SDValue EmitMaskedTruncSStore(bool SignedSat, SDValue Chain, const SDLoc &DL, SDValue Val, SDValue Ptr, SDValue Mask, EVT MemVT, MachineMemOperand *MMO, SelectionDAG &DAG)
Emit Masked Truncating Store with signed or unsigned saturation.
static SDValue lowerVSELECTtoVectorShuffle(SDValue Op, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Try to lower a VSELECT instruction to a vector shuffle.
static bool matchShuffleAsBlend(MVT VT, SDValue V1, SDValue V2, MutableArrayRef< int > Mask, const APInt &Zeroable, bool &ForceV1Zero, bool &ForceV2Zero, uint64_t &BlendMask)
static SDValue adjustBitcastSrcVectorSSE1(SelectionDAG &DAG, SDValue Src, const SDLoc &DL)
static SDValue combineMulSpecial(uint64_t MulAmt, SDNode *N, SelectionDAG &DAG, EVT VT, const SDLoc &DL)
static SDValue combineBitcastvxi1(SelectionDAG &DAG, EVT VT, SDValue Src, const SDLoc &DL, const X86Subtarget &Subtarget)
static SDValue getUnpackl(SelectionDAG &DAG, const SDLoc &dl, EVT VT, SDValue V1, SDValue V2)
Returns a vector_shuffle node for an unpackl operation.
static SDValue getScalarValueForVectorElement(SDValue V, int Idx, SelectionDAG &DAG)
Try to get a scalar value for a specific element of a vector.
static SDValue LowerZERO_EXTEND_Mask(SDValue Op, const SDLoc &DL, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static unsigned getOpcodeForIndirectThunk(unsigned RPOpc)
static SDValue lowerV16I8Shuffle(const SDLoc &DL, ArrayRef< int > Mask, const APInt &Zeroable, SDValue V1, SDValue V2, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Generic lowering of v16i8 shuffles.
static SDValue matchTruncateWithPACK(unsigned &PackOpcode, EVT DstVT, SDValue In, const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget, const SDNodeFlags Flags=SDNodeFlags())
Helper to determine if In truncated to DstVT has the necessary signbits / leading zero bits to be tru...
static unsigned getSHUFPDImm(ArrayRef< int > Mask)
static bool isNullFPScalarOrVectorConst(SDValue V)
static bool hasIdenticalHalvesShuffleMask(ArrayRef< int > Mask)
Return true if a shuffle mask chooses elements identically in its top and bottom halves.
static bool matchShuffleWithPACK(MVT VT, MVT &SrcVT, SDValue &V1, SDValue &V2, unsigned &PackOpcode, ArrayRef< int > TargetMask, const SelectionDAG &DAG, const X86Subtarget &Subtarget, unsigned MaxStages=1)
static SDValue combineVectorCompare(SDNode *N, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static bool matchShuffleAsVTRUNC(MVT &SrcVT, MVT &DstVT, MVT VT, ArrayRef< int > Mask, const APInt &Zeroable, const X86Subtarget &Subtarget)
static SDValue combineBITREVERSE(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
static SDValue PromoteMaskArithmetic(SDValue N, const SDLoc &DL, EVT VT, SelectionDAG &DAG, const X86Subtarget &Subtarget, unsigned Depth)
static SDValue combineArithReduction(SDNode *ExtElt, SelectionDAG &DAG, const X86Subtarget &Subtarget)
Try to convert a vector reduction sequence composed of binops and shuffles into horizontal ops.
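For a concrete picture, a 4 x float add reduction in both shapes using SSE intrinsics; the function names are illustrative, and whether the horizontal form is actually preferable depends on the subtarget, which is what this combine weighs:

#include <immintrin.h>

// Shuffle+add reduction tree (the usual generic lowering). Requires SSE.
float reduceShuffleAdd(__m128 v) {
  __m128 hi  = _mm_movehl_ps(v, v);            // [v2, v3, v2, v3]
  __m128 sum = _mm_add_ps(v, hi);              // [v0+v2, v1+v3, ...]
  __m128 odd = _mm_shuffle_ps(sum, sum, 1);    // lane 1 moved into lane 0
  return _mm_cvtss_f32(_mm_add_ss(sum, odd));  // (v0+v2)+(v1+v3)
}

// Same reduction written with horizontal adds (SSE3 HADDPS).
float reduceHorizontal(__m128 v) {
  __m128 h = _mm_hadd_ps(v, v);   // [v0+v1, v2+v3, v0+v1, v2+v3]
  h = _mm_hadd_ps(h, h);          // [v0+v1+v2+v3, ...]
  return _mm_cvtss_f32(h);
}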
static SDValue combineINSERT_SUBVECTOR(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
static SDValue lowerShuffleAsBitRotate(const SDLoc &DL, MVT VT, SDValue V1, ArrayRef< int > Mask, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Lower shuffle using X86ISD::VROTLI rotations.
static SDValue lowerShuffleAsDecomposedShuffleMerge(const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef< int > Mask, const APInt &Zeroable, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Generic routine to decompose a shuffle and blend into independent blends and permutes.
static SDValue combineEXTEND_VECTOR_INREG(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
static bool useVPTERNLOG(const X86Subtarget &Subtarget, MVT VT)
static SDValue combineBlendOfPermutes(MVT VT, SDValue N0, SDValue N1, ArrayRef< int > BlendMask, const APInt &DemandedElts, SelectionDAG &DAG, const X86Subtarget &Subtarget, const SDLoc &DL)
static SDValue combineSetCCAtomicArith(SDValue Cmp, X86::CondCode &CC, SelectionDAG &DAG, const X86Subtarget &Subtarget)
Combine: (brcond/cmov/setcc .., (cmp (atomic_load_add x, 1), 0), COND_S) to: (brcond/cmov/setcc ....
static SDValue combineSetCCEFLAGS(SDValue EFLAGS, X86::CondCode &CC, SelectionDAG &DAG, const X86Subtarget &Subtarget)
Optimize an EFLAGS definition used according to the condition code CC into a simpler EFLAGS value,...
static bool isBroadcastShuffleMask(ArrayRef< int > Mask)
static SDValue LowerEXTEND_VECTOR_INREG(SDValue Op, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static SDValue combinePDEP(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI)
static SDValue combineExtSetcc(SDNode *N, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue canonicalizeShuffleWithOp(SDValue N, SelectionDAG &DAG, const SDLoc &DL)
static SDValue matchPMADDWD_2(SelectionDAG &DAG, SDNode *N, const SDLoc &DL, EVT VT, const X86Subtarget &Subtarget)
static SDValue LowerVectorCTLZ(SDValue Op, const SDLoc &DL, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static SDValue foldXor1SetCC(SDNode *N, const SDLoc &DL, SelectionDAG &DAG)
Fold a xor(setcc cond, val), 1 --> setcc (inverted(cond), val)
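At the source level the fold is the boolean identity below (illustrative function names):

// xor(setcc(lt, a, b), 1)  ==>  setcc(ge, a, b)
bool xorOfSetcc(int a, int b)    { return (a < b) ^ 1; }
bool invertedSetcc(int a, int b) { return a >= b; }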
static SDValue MatchVectorAllEqualTest(SDValue OrigLHS, SDValue OrigRHS, ISD::CondCode CC, const SDLoc &DL, const X86Subtarget &Subtarget, SelectionDAG &DAG, X86::CondCode &X86CC)
static SDValue lowerShuffleAsByteRotate(const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef< int > Mask, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static StringRef getInstrStrFromOpNo(const SmallVectorImpl< StringRef > &AsmStrs, unsigned OpNo)
static bool isSequentialOrUndefOrZeroInRange(ArrayRef< int > Mask, unsigned Pos, unsigned Size, int Low, int Step=1)
Return true if every element in Mask, beginning from position Pos and ending in Pos+Size,...
static SDValue lowerShuffleAsSplitOrBlend(const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef< int > Mask, const APInt &Zeroable, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Either split a vector in halves or decompose the shuffles and the blend/unpack.
static SDValue widenBuildVec(SDNode *Extend, SelectionDAG &DAG)
static bool canWidenShuffleElements(ArrayRef< int > Mask, SmallVectorImpl< int > &WidenedMask)
Helper function to test whether a shuffle mask could be simplified by widening the elements being shu...
static SDValue splitVectorIntUnary(SDValue Op, SelectionDAG &DAG, const SDLoc &dl)
Break an unary integer operation into 2 half sized ops and then concatenate the result back.
static SDValue combineSext(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
static SDValue lowerV2I64Shuffle(const SDLoc &DL, ArrayRef< int > Mask, const APInt &Zeroable, SDValue V1, SDValue V2, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Handle lowering of 2-lane 64-bit integer shuffles.
static SDValue LowerADDSAT_SUBSAT(SDValue Op, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue combineLogicBlendIntoConditionalNegate(EVT VT, SDValue Mask, SDValue X, SDValue Y, const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue getShuffleScalarElt(SDValue Op, unsigned Index, SelectionDAG &DAG, unsigned Depth)
Returns the scalar element that will make up the i'th element of the result of the vector shuffle.
static unsigned getTargetVShiftUniformOpcode(unsigned Opc, bool IsVariable)
static bool matchShuffleAsInsertPS(SDValue &V1, SDValue &V2, unsigned &InsertPSMask, const APInt &Zeroable, ArrayRef< int > Mask, SelectionDAG &DAG)
static bool isNonZeroElementsInOrder(const APInt &Zeroable, ArrayRef< int > Mask, const EVT &VectorType, bool &IsZeroSideLeft)
static SDValue lowerUINT_TO_FP_vXi32(SDValue Op, const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue combineMul(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
static SDValue emitOrXorXorTree(SDValue X, const SDLoc &DL, SelectionDAG &DAG, EVT VecVT, EVT CmpVT, bool HasPT, F SToV)
Recursive helper for combineVectorSizedSetCCEquality() to emit the memcmp expansion.
static SDValue truncateAVX512SetCCNoBWI(EVT VT, EVT OpVT, SDValue LHS, SDValue RHS, ISD::CondCode CC, const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget)
If we have AVX512, but not BWI and this is a vXi16/vXi8 setcc, just pre-promote its result type since...
static SDValue lowerShuffleAsPermuteAndUnpack(const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef< int > Mask, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Try to lower a shuffle as a permute of the inputs followed by an UNPCK instruction.
static SDValue combineAndOrForCcmpCtest(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &ST)
static SDValue narrowLoadToVZLoad(LoadSDNode *LN, MVT MemVT, MVT VT, SelectionDAG &DAG)
static SDValue scalarizeExtEltFP(SDNode *ExtElt, SelectionDAG &DAG, const X86Subtarget &Subtarget, TargetLowering::DAGCombinerInfo &DCI)
Extracting a scalar FP value from vector element 0 is free, so extract each operand first,...
static SDValue combineSetCCMOVMSK(SDValue EFLAGS, X86::CondCode &CC, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static bool isAddSubOrSubAddMask(ArrayRef< int > Mask, bool &Op0Even)
Checks if the shuffle mask takes subsequent elements alternately from two vectors.
static bool isCompletePermute(ArrayRef< int > Mask)
Return true if every element of a single input is referenced by the shuffle mask.
static SDValue recoverFramePointer(SelectionDAG &DAG, const Function *Fn, SDValue EntryEBP)
When the MSVC runtime transfers control to us, either to an outlined function or when returning to a ...
static SDValue combineX86INT_TO_FP(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI)
static void getReadTimeStampCounter(SDNode *N, const SDLoc &DL, unsigned Opcode, SelectionDAG &DAG, const X86Subtarget &Subtarget, SmallVectorImpl< SDValue > &Results)
Handles the lowering of builtin intrinsics that read the time stamp counter (x86_rdtsc and x86_rdtscp...
static SDValue LowerShiftByScalarImmediate(SDValue Op, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue LowerVectorAllEqual(const SDLoc &DL, SDValue LHS, SDValue RHS, ISD::CondCode CC, const APInt &OriginalMask, const X86Subtarget &Subtarget, SelectionDAG &DAG, X86::CondCode &X86CC)
static bool is128BitUnpackShuffleMask(ArrayRef< int > Mask, const SelectionDAG &DAG)
static bool isOrXorXorTree(SDValue X, bool Root=true)
Recursive helper for combineVectorSizedSetCCEquality() to see if we have a recognizable memcmp expans...
static SDValue LowerAVXExtend(SDValue Op, const SDLoc &dl, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue LowerSCALAR_TO_VECTOR(SDValue Op, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static SDValue combineTruncate(SDNode *N, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue combineFAnd(SDNode *N, SelectionDAG &DAG, const X86Subtarget &Subtarget)
Do target-specific dag combines on X86ISD::FAND nodes.
static SDValue combineFaddCFmul(SDNode *N, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue combineCONCAT_VECTORS(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
static ConstantPoolSDNode * getTargetConstantPoolFromBasePtr(SDValue Ptr)
static SDValue canonicalizeLaneShuffleWithRepeatedOps(SDValue V, SelectionDAG &DAG, const SDLoc &DL)
Attempt to fold vpermf128(op(),op()) -> op(vpermf128(),vpermf128()).
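A sketch of the same fold with AVX intrinsics: VPERM2F128 only moves whole 128-bit lanes, so applying it before or after a lane-wise op such as ADDPS gives the same result. The immediate 0x31 (take the high lane of each source) is just an example selection and the function names are made up:

#include <immintrin.h>

__m256 permuteOfAdds(__m256 a, __m256 b, __m256 c, __m256 d) {
  return _mm256_permute2f128_ps(_mm256_add_ps(a, b), _mm256_add_ps(c, d), 0x31);
}

__m256 addOfPermutes(__m256 a, __m256 b, __m256 c, __m256 d) {
  return _mm256_add_ps(_mm256_permute2f128_ps(a, c, 0x31),
                       _mm256_permute2f128_ps(b, d, 0x31));
}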
static bool isShuffleEquivalent(ArrayRef< int > Mask, ArrayRef< int > ExpectedMask, SDValue V1=SDValue(), SDValue V2=SDValue())
Checks whether a shuffle mask is equivalent to an explicit list of arguments.
static SDValue lowerV8F32Shuffle(const SDLoc &DL, ArrayRef< int > Mask, const APInt &Zeroable, SDValue V1, SDValue V2, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Handle lowering of 8-lane 32-bit floating point shuffles.
static SDValue LowerI64IntToFP_AVX512DQ(SDValue Op, const SDLoc &dl, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue LowerBUILD_VECTORAsVariablePermute(SDValue V, const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue lowerShuffleAsByteRotateAndPermute(const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef< int > Mask, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Helper to form a PALIGNR-based rotate+permute, merging 2 inputs and then permuting the elements of th...
static SDValue combineX86GatherScatter(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI)
static SDValue combineVectorHADDSUB(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
static SDValue LowerVectorCTPOP(SDValue Op, const SDLoc &DL, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static SDValue combineX86ShuffleChain(ArrayRef< SDValue > Inputs, unsigned RootOpc, MVT RootVT, ArrayRef< int > BaseMask, int Depth, ArrayRef< const SDNode * > SrcNodes, bool AllowVariableCrossLaneMask, bool AllowVariablePerLaneMask, bool IsMaskedShuffle, SelectionDAG &DAG, const SDLoc &DL, const X86Subtarget &Subtarget)
Combine an arbitrary chain of shuffles into a single instruction if possible.
static SDValue getAVX512TruncNode(const SDLoc &DL, MVT DstVT, SDValue Src, const X86Subtarget &Subtarget, SelectionDAG &DAG, bool ZeroUppers)
static bool isFMAddSubOrFMSubAdd(const X86Subtarget &Subtarget, SelectionDAG &DAG, SDValue &Opnd0, SDValue &Opnd1, SDValue &Opnd2, unsigned ExpectedUses, bool AllowSubAddOrAddSubContract)
Returns true if it is possible to fold MUL and an idiom that has already been recognized as ADDSUB/SUBAD...
Returns true if it is possible to fold MUL and an idiom that has already been recognized as ADDSUB/SUBAD...
static void createPackShuffleMask(MVT VT, SmallVectorImpl< int > &Mask, bool Unary, unsigned NumStages=1)
Create a shuffle mask that matches the PACKSS/PACKUS truncation.
static bool isUndefOrEqualInRange(ArrayRef< int > Mask, int CmpVal, unsigned Pos, unsigned Size)
Return true if every element in Mask, beginning from position Pos and ending in Pos+Size is the undef...
static SDValue combineFaddFsub(SDNode *N, SelectionDAG &DAG, const X86Subtarget &Subtarget)
Do target-specific dag combines on floating-point adds/subs.
static SDValue LowerToTLSGeneralDynamicModel32(GlobalAddressSDNode *GA, SelectionDAG &DAG, const EVT PtrVT)
static SDValue splitVectorOp(SDValue Op, SelectionDAG &DAG, const SDLoc &dl)
Break an operation into 2 half sized ops and then concatenate the results.
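The same splitting idea at the intrinsics level, assuming an AVX2 target for the unsplit 256-bit integer add (function names are illustrative):

#include <immintrin.h>

// Whole-width 256-bit add.
__m256i addWide(__m256i a, __m256i b) { return _mm256_add_epi32(a, b); }

// Split into two 128-bit halves, operate on each, then concatenate the results.
__m256i addSplit(__m256i a, __m256i b) {
  __m128i lo = _mm_add_epi32(_mm256_castsi256_si128(a), _mm256_castsi256_si128(b));
  __m128i hi = _mm_add_epi32(_mm256_extractf128_si256(a, 1),
                             _mm256_extractf128_si256(b, 1));
  return _mm256_set_m128i(hi, lo);
}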
static cl::opt< bool > MulConstantOptimization("mul-constant-optimization", cl::init(true), cl::desc("Replace 'mul x, Const' with more effective instructions like " "SHIFT, LEA, etc."), cl::Hidden)
static SDValue getIndexFromUnindexedLoad(LoadSDNode *Ld)
static bool isAnyZero(ArrayRef< int > Mask)
Return true if the value of any element in Mask is the zero sentinel value.
static SDValue combineShuffle(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
static SDValue truncateVectorWithPACKUS(EVT DstVT, SDValue In, const SDLoc &DL, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Truncate using inreg zero extension (AND mask) and X86ISD::PACKUS.
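A 128-bit sketch of that technique with SSE2 intrinsics, truncating v8i16 to v16i8: mask each word down to its low byte (the "inreg zero extension") so that PACKUSWB's saturation becomes a plain truncation. The function name is illustrative:

#include <immintrin.h>

__m128i truncViaPackus(__m128i lo, __m128i hi) {   // two v8i16 inputs -> one v16i8
  const __m128i ByteMask = _mm_set1_epi16(0x00FF);
  return _mm_packus_epi16(_mm_and_si128(lo, ByteMask), _mm_and_si128(hi, ByteMask));
}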
static SDValue lowerINT_TO_FP_vXi64(SDValue Op, const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static bool isMaskableNode(SDValue V, const X86Subtarget &Subtarget)
static void resolveZeroablesFromTargetShuffle(const SmallVectorImpl< int > &Mask, APInt &KnownUndef, APInt &KnownZero)
static SDValue rebuildGatherScatter(MaskedGatherScatterSDNode *GorS, SDValue Index, SDValue Base, SDValue Scale, SelectionDAG &DAG)
static SDValue combineSubABS(EVT VT, const SDLoc &DL, SDValue N0, SDValue N1, SelectionDAG &DAG)
static SmallVector< int, 4 > getPSHUFShuffleMask(SDValue N)
Get the PSHUF-style mask from PSHUF node.
static SDValue scalarizeVectorStore(StoreSDNode *Store, MVT StoreVT, SelectionDAG &DAG)
Scalarize a vector store, bitcasting to TargetVT to determine the scalar type.
static SDValue LowerBUILD_VECTORvXbf16(SDValue Op, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue combineShuffleToFMAddSub(SDNode *N, const SDLoc &DL, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Combine shuffle of two fma nodes into FMAddSub or FMSubAdd.
static SDValue getAVX2GatherNode(unsigned Opc, SDValue Op, SelectionDAG &DAG, SDValue Src, SDValue Mask, SDValue Base, SDValue Index, SDValue ScaleOp, SDValue Chain, const X86Subtarget &Subtarget)
static SDValue lowerShufflePairAsUNPCKAndPermute(const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef< int > Mask, SelectionDAG &DAG)
static bool isUndefOrZero(int Val)
Val is either the undef or zero sentinel value.
SDValue SplitOpsAndApply(SelectionDAG &DAG, const X86Subtarget &Subtarget, const SDLoc &DL, EVT VT, ArrayRef< SDValue > Ops, F Builder, bool CheckBWI=true)
static SDValue combineAndNotOrIntoAndNotAnd(SDNode *N, const SDLoc &DL, SelectionDAG &DAG)
Folds (and X, (or Y, ~Z)) --> (and X, ~(and ~Y, Z)) This undoes the inverse fold performed in InstCom...
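The identity behind this fold is just De Morgan's law; an illustrative scalar form:

#include <cstdint>

// (and X, (or Y, ~Z)) == (and X, ~(and ~Y, Z)), since ~(~Y & Z) == Y | ~Z.
uint32_t andOrNot(uint32_t x, uint32_t y, uint32_t z)  { return x & (y | ~z); }
uint32_t andNotAnd(uint32_t x, uint32_t y, uint32_t z) { return x & ~(~y & z); }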
static SDValue combineBEXTR(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
static SDValue combineFMADDSUB(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI)
static SDValue combineCMov(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
Optimize X86ISD::CMOV [LHS, RHS, CONDCODE (e.g. X86::COND_NE), CONDVAL].
static SDValue extract128BitVector(SDValue Vec, unsigned IdxVal, SelectionDAG &DAG, const SDLoc &dl)
Generate a DAG to grab 128-bits from a vector > 128 bits.
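At the intrinsics level this corresponds to pulling one 128-bit lane out of a wider vector, e.g. (illustrative name; the lane index must be a compile-time constant):

#include <immintrin.h>

__m128 upperLane(__m256 v) { return _mm256_extractf128_ps(v, 1); }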
static SDValue EmitAVX512Test(SDValue Op0, SDValue Op1, ISD::CondCode CC, const SDLoc &dl, SelectionDAG &DAG, const X86Subtarget &Subtarget, SDValue &X86CC)
static SDValue lowerShuffleWithSHUFPS(const SDLoc &DL, MVT VT, ArrayRef< int > Mask, SDValue V1, SDValue V2, SelectionDAG &DAG)
Lower a vector shuffle using the SHUFPS instruction.
static SDValue combineStore(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
static SDValue combineX86ShuffleChainWithExtract(ArrayRef< SDValue > Inputs, unsigned RootOpcode, MVT RootVT, ArrayRef< int > BaseMask, int Depth, ArrayRef< const SDNode * > SrcNodes, bool AllowVariableCrossLaneMask, bool AllowVariablePerLaneMask, bool IsMaskedShuffle, SelectionDAG &DAG, const SDLoc &DL, const X86Subtarget &Subtarget)
static SDValue combineX86SetCC(SDNode *N, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue combineMinMaxReduction(SDNode *Extract, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static LLVM_ATTRIBUTE_UNUSED bool isHorizOp(unsigned Opcode)
static SDValue combineHorizOpWithShuffle(SDNode *N, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue LowerVectorCTLZ_AVX512CDI(SDValue Op, SelectionDAG &DAG, const X86Subtarget &Subtarget)
Lower a vector CTLZ using native supported vector CTLZ instruction.
static SDValue ExtractBitFromMaskVector(SDValue Op, SelectionDAG &DAG, const X86Subtarget &Subtarget)
Extract one bit from mask vector, like v16i1 or v8i1.
static SDValue LowervXi8MulWithUNPCK(SDValue A, SDValue B, const SDLoc &dl, MVT VT, bool IsSigned, const X86Subtarget &Subtarget, SelectionDAG &DAG, SDValue *Low=nullptr)
static SDValue lowerShuffleAsBlendOfPSHUFBs(const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef< int > Mask, const APInt &Zeroable, SelectionDAG &DAG, bool &V1InUse, bool &V2InUse)
Helper to form a PSHUFB-based shuffle+blend, opportunistically avoiding the blend if only one input i...
static bool matchShuffleAsINSERTQ(MVT VT, SDValue &V1, SDValue &V2, ArrayRef< int > Mask, uint64_t &BitLen, uint64_t &BitIdx)
static SDValue getBitSelect(const SDLoc &DL, MVT VT, SDValue LHS, SDValue RHS, SDValue Mask, SelectionDAG &DAG)
static SDValue combineAVG(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
static bool isSequentialOrUndefInRange(ArrayRef< int > Mask, unsigned Pos, unsigned Size, int Low, int Step=1)
Return true if every element in Mask, beginning from position Pos and ending in Pos + Size,...
static cl::opt< int > BrMergingUnlikelyBias("x86-br-merging-unlikely-bias", cl::init(-1), cl::desc("Decreases 'x86-br-merging-base-cost' in cases that it is unlikely " "that all conditionals will be executed. For example for merging " "the conditionals (a == b && c > d), if its known that a == b is " "unlikely, then it is unlikely that if the conditionals are split " "both sides will be executed, so it may be desirable to decrease " "the instruction cost threshold. Set to -1 to never merge unlikely " "branches."), cl::Hidden)
static SDValue createSetFPEnvNodes(SDValue Ptr, SDValue Chain, const SDLoc &DL, EVT MemVT, MachineMemOperand *MMO, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static bool getTargetShuffleAndZeroables(SDValue N, SmallVectorImpl< int > &Mask, SmallVectorImpl< SDValue > &Ops, APInt &KnownUndef, APInt &KnownZero)
Decode a target shuffle mask and inputs and see if any values are known to be undef or zero from thei...
static SDValue combinePredicateReduction(SDNode *Extract, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue LowerBuildVectorv4x32(SDValue Op, const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget)
Custom lower build_vector of v4i32 or v4f32.
static bool isTargetShuffleEquivalent(MVT VT, ArrayRef< int > Mask, ArrayRef< int > ExpectedMask, const SelectionDAG &DAG, SDValue V1=SDValue(), SDValue V2=SDValue())
Checks whether a target shuffle mask is equivalent to an explicit pattern.
static SDValue LowerABS(SDValue Op, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static SDValue pushAddIntoCmovOfConsts(SDNode *N, const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget)
CMOV of constants requires materializing constant operands in registers.
static SDValue LowerCMP_SWAP(SDValue Op, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static SDValue LowerToTLSLocalDynamicModel(GlobalAddressSDNode *GA, SelectionDAG &DAG, const EVT PtrVT, bool Is64Bit, bool Is64BitLP64)
static SDValue combineTruncateWithSat(SDValue In, EVT VT, const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue combineBT(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI)
static SDValue expandFP_TO_UINT_SSE(MVT VT, SDValue Src, const SDLoc &dl, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static int getUnderlyingExtractedFromVec(SDValue &ExtractedFromVec, SDValue ExtIdx)
For an EXTRACT_VECTOR_ELT with a constant index return the real underlying vector and index.
static SDValue combineVectorShiftImm(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
static bool isUnaryOp(unsigned Opcode)
static SDValue LowerCONCAT_VECTORSvXi1(SDValue Op, const SDLoc &dl, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static SDValue combineBrCond(SDNode *N, SelectionDAG &DAG, const X86Subtarget &Subtarget)
Optimize branch condition evaluation.
static bool hasFPCMov(unsigned X86CC)
Is there a floating point cmov for the specific X86 condition code?
static int getOneTrueElt(SDValue V)
If V is a build vector of boolean constants and exactly one of those constants is true,...
static SDValue lowerShuffleWithVPMOV(const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef< int > Mask, const APInt &Zeroable, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static SDValue foldXorTruncShiftIntoCmp(SDNode *N, const SDLoc &DL, SelectionDAG &DAG)
Try to turn tests against the signbit in the form of: XOR(TRUNCATE(SRL(X, size(X)-1)),...
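The recognized sign-bit test looks like this at the source level (64-bit width and function names chosen for illustration):

#include <cstdint>

// xor(trunc(srl(x, 63)), 1) extracts the sign bit and inverts it, i.e. x >= 0.
bool viaShift(int64_t x)   { return (static_cast<uint64_t>(x) >> 63) ^ 1; }
bool viaCompare(int64_t x) { return x >= 0; }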
static SDValue lowerShuffleWithUNPCK(const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef< int > Mask, SelectionDAG &DAG)
static constexpr int Concat[]
Value * RHS
Value * LHS
BinaryOperator * Mul
auto IsFreeTruncation
static const unsigned FramePtr
The Input class is used to parse a yaml document into in-memory structs and vectors.
LLVM_ABI opStatus convert(const fltSemantics &ToSemantics, roundingMode RM, bool *losesInfo)
Definition APFloat.cpp:6057
static LLVM_ABI APFloat getAllOnesValue(const fltSemantics &Semantics)
Returns a float which is bitcasted from an all one value int.
Definition APFloat.cpp:6082
void clearSign()
Definition APFloat.h:1298
opStatus next(bool nextDown)
Definition APFloat.h:1254
void changeSign()
Definition APFloat.h:1297
static APFloat getZero(const fltSemantics &Sem, bool Negative=false)
Factory for Positive and Negative Zero.
Definition APFloat.h:1079
Class for arbitrary precision integers.
Definition APInt.h:78
static APInt getAllOnes(unsigned numBits)
Return an APInt of a specified width with all bits set.
Definition APInt.h:234
void clearBit(unsigned BitPosition)
Set a given bit to 0.
Definition APInt.h:1406
bool isNegatedPowerOf2() const
Check if this APInt's negated value is a power of two greater than zero.
Definition APInt.h:449
LLVM_ABI APInt zext(unsigned width) const
Zero extend to a new width.
Definition APInt.cpp:1012
static APInt getSignMask(unsigned BitWidth)
Get the SignMask for a specific bit width.
Definition APInt.h:229
bool isMinSignedValue() const
Determine if this is the smallest signed value.
Definition APInt.h:423
uint64_t getZExtValue() const
Get zero extended value.
Definition APInt.h:1540
void setHighBits(unsigned hiBits)
Set the top hiBits bits.
Definition APInt.h:1391
unsigned popcount() const
Count the number of bits set.
Definition APInt.h:1670
void setBitsFrom(unsigned loBit)
Set the top bits starting from loBit.
Definition APInt.h:1385
LLVM_ABI uint64_t extractBitsAsZExtValue(unsigned numBits, unsigned bitPosition) const
Definition APInt.cpp:520
LLVM_ABI APInt zextOrTrunc(unsigned width) const
Zero extend or truncate to width.
Definition APInt.cpp:1033
unsigned getActiveBits() const
Compute the number of active bits in the value.
Definition APInt.h:1512
LLVM_ABI APInt trunc(unsigned width) const
Truncate to new width.
Definition APInt.cpp:936
static APInt getMaxValue(unsigned numBits)
Gets maximum unsigned value of APInt for specific bit width.
Definition APInt.h:206
void setBit(unsigned BitPosition)
Set the given bit to 1 whose position is given as "bitPosition".
Definition APInt.h:1330
APInt abs() const
Get the absolute value.
Definition APInt.h:1795
bool isAllOnes() const
Determine if all bits are set. This is true for zero-width values.
Definition APInt.h:371
static APInt getBitsSet(unsigned numBits, unsigned loBit, unsigned hiBit)
Get a value with a block of bits set.
Definition APInt.h:258
bool isZero() const
Determine if this value is zero, i.e. all bits are clear.
Definition APInt.h:380
bool isSignMask() const
Check if the APInt's value is returned by getSignMask.
Definition APInt.h:466
LLVM_ABI APInt urem(const APInt &RHS) const
Unsigned remainder operation.
Definition APInt.cpp:1666
void setSignBit()
Set the sign bit to 1.
Definition APInt.h:1340
unsigned getBitWidth() const
Return the number of bits in the APInt.
Definition APInt.h:1488
bool ult(const APInt &RHS) const
Unsigned less than comparison.
Definition APInt.h:1111
static APInt getSignedMaxValue(unsigned numBits)
Gets maximum signed value of APInt for a specific bit width.
Definition APInt.h:209
static APInt getMinValue(unsigned numBits)
Gets minimum unsigned value of APInt for a specific bit width.
Definition APInt.h:216
bool isNegative() const
Determine sign of this APInt.
Definition APInt.h:329
bool intersects(const APInt &RHS) const
This operation tests if there are any pairs of corresponding bits between this APInt and RHS that are...
Definition APInt.h:1249
bool eq(const APInt &RHS) const
Equality comparison.
Definition APInt.h:1079
int32_t exactLogBase2() const
Definition APInt.h:1783
void clearAllBits()
Set every bit to 0.
Definition APInt.h:1396
void ashrInPlace(unsigned ShiftAmt)
Arithmetic right-shift this APInt by ShiftAmt in place.
Definition APInt.h:834
unsigned countr_zero() const
Count the number of trailing zero bits.
Definition APInt.h:1639
bool isSignedIntN(unsigned N) const
Check if this APInt has an N-bits signed integer value.
Definition APInt.h:435
unsigned getNumSignBits() const
Computes the number of leading bits of this APInt that are equal to its sign bit.
Definition APInt.h:1628
unsigned countl_zero() const
The APInt version of std::countl_zero.
Definition APInt.h:1598
static LLVM_ABI APInt getSplat(unsigned NewLen, const APInt &V)
Return a value containing V broadcasted over NewLen bits.
Definition APInt.cpp:651
static APInt getSignedMinValue(unsigned numBits)
Gets minimum signed value of APInt for a specific bit width.
Definition APInt.h:219
unsigned countTrailingZeros() const
Definition APInt.h:1647
unsigned getSignificantBits() const
Get the minimum bit size for this signed APInt.
Definition APInt.h:1531
void flipAllBits()
Toggle every bit to its opposite value.
Definition APInt.h:1452
unsigned countl_one() const
Count the number of leading one bits.
Definition APInt.h:1615
LLVM_ABI void insertBits(const APInt &SubBits, unsigned bitPosition)
Insert the bits from a smaller APInt starting at bitPosition.
Definition APInt.cpp:397
void clearLowBits(unsigned loBits)
Set bottom loBits bits to 0.
Definition APInt.h:1435
unsigned logBase2() const
Definition APInt.h:1761
APInt ashr(unsigned ShiftAmt) const
Arithmetic right-shift function.
Definition APInt.h:827
void setAllBits()
Set every bit to 1.
Definition APInt.h:1319
bool getBoolValue() const
Convert APInt to a boolean value.
Definition APInt.h:471
bool isMask(unsigned numBits) const
Definition APInt.h:488
bool isMaxSignedValue() const
Determine if this is the largest signed value.
Definition APInt.h:405
bool isNonNegative() const
Determine if this APInt Value is non-negative (>= 0)
Definition APInt.h:334
bool ule(const APInt &RHS) const
Unsigned less or equal comparison.
Definition APInt.h:1150
LLVM_ABI APInt sext(unsigned width) const
Sign extend to a new width.
Definition APInt.cpp:985
void setBits(unsigned loBit, unsigned hiBit)
Set the bits from loBit (inclusive) to hiBit (exclusive) to 1.
Definition APInt.h:1367
APInt shl(unsigned shiftAmt) const
Left-shift function.
Definition APInt.h:873
bool isSubsetOf(const APInt &RHS) const
This operation checks that all bits set in this APInt are also set in RHS.
Definition APInt.h:1257
bool isPowerOf2() const
Check if this APInt's value is a power of two greater than zero.
Definition APInt.h:440
static APInt getLowBitsSet(unsigned numBits, unsigned loBitsSet)
Constructs an APInt value that has the bottom loBitsSet bits set.
Definition APInt.h:306
bool isSignBitSet() const
Determine if sign bit of this APInt is set.
Definition APInt.h:341
static APInt getHighBitsSet(unsigned numBits, unsigned hiBitsSet)
Constructs an APInt value that has the top hiBitsSet bits set.
Definition APInt.h:296
static APInt getZero(unsigned numBits)
Get the '0' value for the specified bit-width.
Definition APInt.h:200
void setLowBits(unsigned loBits)
Set the bottom loBits bits.
Definition APInt.h:1388
LLVM_ABI APInt extractBits(unsigned numBits, unsigned bitPosition) const
Return an APInt with the extracted bits [bitPosition,bitPosition+numBits).
Definition APInt.cpp:482
bool isIntN(unsigned N) const
Check if this APInt has an N-bits unsigned integer value.
Definition APInt.h:432
bool isOne() const
Determine if this is a value of 1.
Definition APInt.h:389
static APInt getBitsSetFrom(unsigned numBits, unsigned loBit)
Constructs an APInt value that has a contiguous range of bits set.
Definition APInt.h:286
static APInt getOneBitSet(unsigned numBits, unsigned BitNo)
Return an APInt with exactly one bit set in the result.
Definition APInt.h:239
void lshrInPlace(unsigned ShiftAmt)
Logical right-shift this APInt by ShiftAmt in place.
Definition APInt.h:858
APInt lshr(unsigned shiftAmt) const
Logical right-shift function.
Definition APInt.h:851
unsigned countr_one() const
Count the number of trailing one bits.
Definition APInt.h:1656
bool uge(const APInt &RHS) const
Unsigned greater or equal comparison.
Definition APInt.h:1221
bool isMaxValue() const
Determine if this is the largest unsigned value.
Definition APInt.h:399
LLVM_ABI APInt truncSSat(unsigned width) const
Truncate to new width with signed saturation.
Definition APInt.cpp:973
ArrayRef - Represent a constant reference to an array (0 or more elements consecutively in memory),...
Definition ArrayRef.h:41
bool equals(ArrayRef RHS) const
equals - Check for element-wise equality.
Definition ArrayRef.h:183
iterator end() const
Definition ArrayRef.h:136
size_t size() const
size - Get the array size.
Definition ArrayRef.h:147
ArrayRef< T > drop_back(size_t N=1) const
Drop the last N elements of the array.
Definition ArrayRef.h:206
iterator begin() const
Definition ArrayRef.h:135
bool empty() const
empty - Check if the array is empty.
Definition ArrayRef.h:142
ArrayRef< T > slice(size_t N, size_t M) const
slice(n, m) - Chop off the first N elements of the array, and keep M elements in the array.
Definition ArrayRef.h:191
static LLVM_ABI ArrayType * get(Type *ElementType, uint64_t NumElements)
This static method is the primary way to construct an ArrayType.
static AtomicOrdering getStrongestFailureOrdering(AtomicOrdering SuccessOrdering)
Returns the strongest permitted ordering on failure, given the desired ordering on success.
an instruction that atomically reads a memory location, combines it with another value,...
Align getAlign() const
Return the alignment of the memory that is being allocated by the instruction.
BinOp
This enumeration lists the possible modifications atomicrmw can make.
@ Add
*p = old + v
@ FAdd
*p = old + v
@ USubCond
Subtract only if no unsigned overflow.
@ Min
*p = old <signed v ? old : v
@ Sub
*p = old - v
@ And
*p = old & v
@ Xor
*p = old ^ v
@ USubSat
*p = usub.sat(old, v) usub.sat matches the behavior of llvm.usub.sat.
@ FSub
*p = old - v
@ UIncWrap
Increment one up to a maximum value.
@ Max
*p = old >signed v ? old : v
@ UMin
*p = old <unsigned v ? old : v
@ FMin
*p = minnum(old, v) minnum matches the behavior of llvm.minnum.
@ UMax
*p = old >unsigned v ? old : v
@ FMax
*p = maxnum(old, v) maxnum matches the behavior of llvm.maxnum.
@ UDecWrap
Decrement one until a minimum value or zero.
@ Nand
*p = ~(old & v)
Value * getPointerOperand()
BinOp getOperation() const
SyncScope::ID getSyncScopeID() const
Returns the synchronization scope ID of this rmw instruction.
AtomicOrdering getOrdering() const
Returns the ordering constraint of this rmw instruction.
This is an SDNode representing atomic operations.
LLVM_ABI StringRef getValueAsString() const
Return the attribute's value as a string.
LLVM Basic Block Representation.
Definition BasicBlock.h:62
size_type count() const
count - Returns the number of bits which are set.
Definition BitVector.h:162
bool any() const
any - Returns true if any bit is set.
Definition BitVector.h:170
bool none() const
none - Returns true if none of the bits are set.
Definition BitVector.h:188
A "pseudo-class" with methods for operating on BUILD_VECTORs.
LLVM_ABI bool getRepeatedSequence(const APInt &DemandedElts, SmallVectorImpl< SDValue > &Sequence, BitVector *UndefElements=nullptr) const
Find the shortest repeating sequence of values in the build vector.
LLVM_ABI SDValue getSplatValue(const APInt &DemandedElts, BitVector *UndefElements=nullptr) const
Returns the demanded splatted value or a null value if this is not a splat.
LLVM_ABI bool isConstantSplat(APInt &SplatValue, APInt &SplatUndef, unsigned &SplatBitSize, bool &HasAnyUndefs, unsigned MinSplatBits=0, bool isBigEndian=false) const
Check if this is a constant splat, and if so, find the smallest element size that splats the vector.
LLVM_ABI bool isConstant() const
This class represents a function call, abstracting a target machine's calling convention.
Predicate
This enumeration lists the possible predicates for CmpInst subclasses.
Definition InstrTypes.h:678
@ ICMP_SLT
signed less than
Definition InstrTypes.h:707
@ ICMP_SGT
signed greater than
Definition InstrTypes.h:705
@ ICMP_NE
not equal
Definition InstrTypes.h:700
Predicate getPredicate() const
Return the predicate for this instruction.
Definition InstrTypes.h:767
An abstraction over a floating-point predicate, and a pack of an integer predicate with samesign info...
static LLVM_ABI Constant * get(ArrayType *T, ArrayRef< Constant * > V)
static LLVM_ABI Constant * get(LLVMContext &Context, ArrayRef< uint8_t > Elts)
get() constructors - Return a constant with vector type with an element count and element type matchi...
This is the shared class of boolean and integer constants.
Definition Constants.h:87
static LLVM_ABI bool isValueValidForType(Type *Ty, uint64_t V)
This static method returns true if the type Ty is big enough to represent the value V.
const Constant * getConstVal() const
uint64_t getZExtValue() const
const APInt & getAPIntValue() const
static LLVM_ABI Constant * get(ArrayRef< Constant * > V)
This is an important base class in LLVM.
Definition Constant.h:43
static LLVM_ABI Constant * getIntegerValue(Type *Ty, const APInt &V)
Return the value for an integer or pointer constant, or a vector thereof, with the given scalar value...
static LLVM_ABI Constant * getNullValue(Type *Ty)
Constructor to create a '0' constant of arbitrary type.
LLVM_ABI Constant * getAggregateElement(unsigned Elt) const
For aggregates (struct/array/vector) return the constant that corresponds to the specified element if...
A parsed version of the target data layout string in and methods for querying it.
Definition DataLayout.h:63
TypeSize getTypeAllocSize(Type *Ty) const
Returns the offset in bytes between successive objects of the specified type, including alignment pad...
A debug info location.
Definition DebugLoc.h:124
iterator find(const_arg_type_t< KeyT > Val)
Definition DenseMap.h:165
std::pair< iterator, bool > try_emplace(KeyT &&Key, Ts &&...Args)
Definition DenseMap.h:229
unsigned size() const
Definition DenseMap.h:108
bool empty() const
Definition DenseMap.h:107
iterator begin()
Definition DenseMap.h:78
iterator end()
Definition DenseMap.h:81
std::pair< iterator, bool > insert(const std::pair< KeyT, ValueT > &KV)
Definition DenseMap.h:214
Tagged union holding either a T or a Error.
Definition Error.h:485
This is a fast-path instruction selection class that generates poor code and doesn't support illegal ...
Definition FastISel.h:66
static LLVM_ABI FixedVectorType * get(Type *ElementType, unsigned NumElts)
Definition Type.cpp:803
FunctionLoweringInfo - This contains information that is global to a function that is used when lower...
Type::subtype_iterator param_iterator
bool hasOptSize() const
Optimize this function for size (-Os) or minimum size (-Oz).
Definition Function.h:706
Attribute getFnAttribute(Attribute::AttrKind Kind) const
Return the attribute for the given attribute kind.
Definition Function.cpp:762
uint64_t getFnAttributeAsParsedInteger(StringRef Kind, uint64_t Default=0) const
For a string attribute Kind, parse attribute as an integer.
Definition Function.cpp:774
bool hasMinSize() const
Optimize this function for minimum size (-Oz).
Definition Function.h:703
CallingConv::ID getCallingConv() const
getCallingConv()/setCallingConv(CC) - These method get and set the calling convention of this functio...
Definition Function.h:270
bool hasPersonalityFn() const
Check whether this function has a personality function.
Definition Function.h:903
Constant * getPersonalityFn() const
Get the personality function associated with this function.
AttributeList getAttributes() const
Return the attribute list for this Function.
Definition Function.h:352
bool hasFnAttribute(Attribute::AttrKind Kind) const
Return true if the function has the attribute.
Definition Function.cpp:727
const GlobalValue * getGlobal() const
static StringRef dropLLVMManglingEscape(StringRef Name)
If the given string begins with the GlobalValue name mangling escape character '\1',...
LLVM_ABI bool isAbsoluteSymbolRef() const
Returns whether this is a reference to an absolute symbol.
Definition Globals.cpp:424
ThreadLocalMode getThreadLocalMode() const
Module * getParent()
Get the module that this global value is contained inside of...
This class is used to form a handle around another node that is persistent and is updated across invo...
LLVM_ABI InstListType::iterator eraseFromParent()
This method unlinks 'this' from the containing basic block and deletes it.
Instruction * user_back()
Specialize the methods defined in Value, as we know that an instruction can only be used by other ins...
LLVM_ABI const Function * getFunction() const
Return the function this instruction belongs to.
Class to represent integer types.
This is an important class for using LLVM in a threaded context.
Definition LLVMContext.h:68
bool isIndexed() const
Return true if this is a pre/post inc/dec load/store.
An instruction for reading from memory.
void setAtomic(AtomicOrdering Ordering, SyncScope::ID SSID=SyncScope::System)
Sets the ordering constraint and the synchronization scope ID of this load instruction.
This class is used to represent ISD::LOAD nodes.
const SDValue & getBasePtr() const
bool usesWindowsCFI() const
Definition MCAsmInfo.h:652
LLVM_ABI MCSymbol * getOrCreateParentFrameOffsetSymbol(const Twine &FuncName)
LLVM_ABI MCSymbol * getOrCreateLSDASymbol(const Twine &FuncName)
MCSymbol - Instances of this class represent a symbol name in the MC file, and MCSymbols are created ...
Definition MCSymbol.h:42
Set of metadata that should be preserved when using BuildMI().
Machine Value Type.
static MVT getFloatingPointVT(unsigned BitWidth)
bool is128BitVector() const
Return true if this is a 128-bit vector type.
@ INVALID_SIMPLE_VALUE_TYPE
SimpleValueType SimpleTy
uint64_t getScalarSizeInBits() const
MVT changeVectorElementType(MVT EltVT) const
Return a VT for a vector type whose attributes match ourselves with the exception of the element type...
bool bitsLE(MVT VT) const
Return true if this has no more bits than VT.
unsigned getVectorNumElements() const
bool isVector() const
Return true if this is a vector value type.
bool isInteger() const
Return true if this is an integer or a vector integer type.
bool isScalableVector() const
Return true if this is a vector value type where the runtime length is machine dependent.
bool is32BitVector() const
Return true if this is a 32-bit vector type.
MVT changeTypeToInteger()
Return the type converted to an equivalently sized integer or vector with integer element type.
static LLVM_ABI MVT getVT(Type *Ty, bool HandleUnknown=false)
Return the value type corresponding to the specified type.
bool bitsLT(MVT VT) const
Return true if this has less bits than VT.
bool is512BitVector() const
Return true if this is a 512-bit vector type.
static auto integer_valuetypes()
TypeSize getSizeInBits() const
Returns the size of the specified MVT in bits.
static auto fixedlen_vector_valuetypes()
uint64_t getFixedSizeInBits() const
Return the size of the specified fixed width value type in bits.
LLVM_ABI const fltSemantics & getFltSemantics() const
Returns an APFloat semantics tag appropriate for the value type.
bool bitsGT(MVT VT) const
Return true if this has more bits than VT.
bool is256BitVector() const
Return true if this is a 256-bit vector type.
TypeSize getStoreSize() const
Return the number of bytes overwritten by a store of the specified value type.
bool bitsGE(MVT VT) const
Return true if this has no less bits than VT.
bool isScalarInteger() const
Return true if this is an integer, not including vectors.
static MVT getVectorVT(MVT VT, unsigned NumElements)
MVT getVectorElementType() const
bool isFloatingPoint() const
Return true if this is a FP or a vector FP type.
static MVT getIntegerVT(unsigned BitWidth)
MVT getDoubleNumVectorElementsVT() const
MVT getHalfNumVectorElementsVT() const
Return a VT for a vector type with the same element type but half the number of elements.
MVT getScalarType() const
If this is a vector, return the element type, otherwise return this.
bool is64BitVector() const
Return true if this is a 64-bit vector type.
MVT changeVectorElementTypeToInteger() const
Return a vector with the same number of elements as this vector, but with the element type converted ...
LLVM_ABI void transferSuccessorsAndUpdatePHIs(MachineBasicBlock *FromMBB)
Transfers all the successors, as in transferSuccessors, and updates PHI operands in the successor bloc...
bool isEHPad() const
Returns true if the block is a landing pad.
LLVM_ABI instr_iterator insert(instr_iterator I, MachineInstr *M)
Insert MI into the instruction list before I, possibly inside a bundle.
void push_back(MachineInstr *MI)
void setCallFrameSize(unsigned N)
Set the call frame size on entry to this basic block.
const BasicBlock * getBasicBlock() const
Return the LLVM basic block that this instance corresponded to originally.
LLVM_ABI void addSuccessor(MachineBasicBlock *Succ, BranchProbability Prob=BranchProbability::getUnknown())
Add Succ as a successor of this MachineBasicBlock.
LLVM_ABI void removeSuccessor(MachineBasicBlock *Succ, bool NormalizeSuccProbs=false)
Remove successor from the successors list of this MachineBasicBlock.
Instructions::iterator instr_iterator
MachineInstrBundleIterator< MachineInstr, true > reverse_iterator
succ_reverse_iterator succ_rbegin()
void addLiveIn(MCRegister PhysReg, LaneBitmask LaneMask=LaneBitmask::getAll())
Adds the specified register as a live in.
const MachineFunction * getParent() const
Return the MachineFunction containing this basic block.
LLVM_ABI instr_iterator erase(instr_iterator I)
Remove an instruction from the instruction list and delete it.
iterator insertAfter(iterator I, MachineInstr *MI)
Insert MI into the instruction list after I.
void splice(iterator Where, MachineBasicBlock *Other, iterator From)
Take an instruction from MBB 'Other' at the position From, and insert it into this MBB right before '...
MachineInstrBundleIterator< MachineInstr > iterator
succ_reverse_iterator succ_rend()
void setMachineBlockAddressTaken()
Set this block to indicate that its address is used as something other than the target of a terminato...
LLVM_ABI bool isLiveIn(MCRegister Reg, LaneBitmask LaneMask=LaneBitmask::getAll()) const
Return true if the specified register is in the live in set.
void setIsEHPad(bool V=true)
Indicates the block is a landing pad.
The MachineFrameInfo class represents an abstract stack frame until prolog/epilog code is inserted.
LLVM_ABI int CreateFixedObject(uint64_t Size, int64_t SPOffset, bool IsImmutable, bool isAliased=false)
Create a new object at a fixed location on the stack.
LLVM_ABI int CreateStackObject(uint64_t Size, Align Alignment, bool isSpillSlot, const AllocaInst *Alloca=nullptr, uint8_t ID=0)
Create a new statically sized stack object, returning a nonnegative identifier to represent it.
void setFrameAddressIsTaken(bool T)
void setReturnAddressIsTaken(bool s)
void setHasCopyImplyingStackAdjustment(bool B)
bool isFixedObjectIndex(int ObjectIdx) const
Returns true if the specified index corresponds to a fixed stack object.
void setObjectAlignment(int ObjectIdx, Align Alignment)
setObjectAlignment - Change the alignment of the specified stack object.
int getFunctionContextIndex() const
Return the index for the function context object.
const WinEHFuncInfo * getWinEHFuncInfo() const
getWinEHFuncInfo - Return information about how the current function uses Windows exception handling.
void moveAdditionalCallInfo(const MachineInstr *Old, const MachineInstr *New)
Move the call site info from Old to New call site info.
unsigned getFunctionNumber() const
getFunctionNumber - Return a unique ID for the current function.
const TargetSubtargetInfo & getSubtarget() const
getSubtarget - Return the subtarget for which this machine code is being compiled.
MachineMemOperand * getMachineMemOperand(MachinePointerInfo PtrInfo, MachineMemOperand::Flags f, LLT MemTy, Align base_alignment, const AAMDNodes &AAInfo=AAMDNodes(), const MDNode *Ranges=nullptr, SyncScope::ID SSID=SyncScope::System, AtomicOrdering Ordering=AtomicOrdering::NotAtomic, AtomicOrdering FailureOrdering=AtomicOrdering::NotAtomic)
getMachineMemOperand - Allocate a new MachineMemOperand.
MachineFrameInfo & getFrameInfo()
getFrameInfo - Return the frame info object for the current function.
void push_back(MachineBasicBlock *MBB)
MCContext & getContext() const
MachineRegisterInfo & getRegInfo()
getRegInfo - Return information about the registers currently in use.
const DataLayout & getDataLayout() const
Return the DataLayout attached to the Module associated to this MF.
Function & getFunction()
Return the LLVM function that this machine code represents.
BasicBlockListType::iterator iterator
bool shouldSplitStack() const
Should we be emitting segmented stack stuff for the function.
Ty * getInfo()
getInfo - Keep track of various per-function pieces of information for backends that would like to do...
MachineBasicBlock * CreateMachineBasicBlock(const BasicBlock *BB=nullptr, std::optional< UniqueBBID > BBID=std::nullopt)
CreateMachineBasicBlock - Allocate a new MachineBasicBlock.
void insert(iterator MBBI, MachineBasicBlock *MBB)
const TargetMachine & getTarget() const
getTarget - Return the target machine this machine code is compiled with
const MachineInstrBuilder & setMemRefs(ArrayRef< MachineMemOperand * > MMOs) const
const MachineInstrBuilder & addExternalSymbol(const char *FnName, unsigned TargetFlags=0) const
const MachineInstrBuilder & setMIFlag(MachineInstr::MIFlag Flag) const
const MachineInstrBuilder & addImm(int64_t Val) const
Add a new immediate operand.
const MachineInstrBuilder & add(const MachineOperand &MO) const
const MachineInstrBuilder & addRegMask(const uint32_t *Mask) const
const MachineInstrBuilder & addGlobalAddress(const GlobalValue *GV, int64_t Offset=0, unsigned TargetFlags=0) const
const MachineInstrBuilder & addDisp(const MachineOperand &Disp, int64_t off, unsigned char TargetFlags=0) const
const MachineInstrBuilder & addReg(Register RegNo, unsigned flags=0, unsigned SubReg=0) const
Add a new virtual register operand.
const MachineInstrBuilder & addJumpTableIndex(unsigned Idx, unsigned TargetFlags=0) const
const MachineInstrBuilder & addMBB(MachineBasicBlock *MBB, unsigned TargetFlags=0) const
const MachineInstrBuilder & addUse(Register RegNo, unsigned Flags=0, unsigned SubReg=0) const
Add a virtual register use operand.
MachineInstr * getInstr() const
If conversion operators fail, use this method to get the MachineInstr explicitly.
const MachineInstrBuilder & addDef(Register RegNo, unsigned Flags=0, unsigned SubReg=0) const
Add a virtual register definition operand.
Representation of each machine instruction.
bool killsRegister(Register Reg, const TargetRegisterInfo *TRI) const
Return true if the MachineInstr kills the specified register.
LLVM_ABI void eraseFromParent()
Unlink 'this' from the containing basic block and delete it.
const MachineOperand & getOperand(unsigned i) const
LLVM_ABI unsigned createJumpTableIndex(const std::vector< MachineBasicBlock * > &DestBBs)
createJumpTableIndex - Create a new jump table.
@ EK_LabelDifference32
EK_LabelDifference32 - Each entry is the address of the block minus the address of the jump table.
@ EK_BlockAddress
EK_BlockAddress - Each entry is a plain address of block, e.g.: .word LBB123.
A description of a memory reference used in the backend.
Flags
Flags values. These may be or'd together.
@ MOVolatile
The memory access is volatile.
@ MOLoad
The memory access reads data.
@ MOStore
The memory access writes data.
Flags getFlags() const
Return the raw flags of the source value,.
MachineOperand class - Representation of each machine instruction operand.
int64_t getImm() const
bool isReg() const
isReg - Tests if this is a MO_Register operand.
Register getReg() const
getReg - Returns the register number.
MachineRegisterInfo - Keep track of information for virtual and physical registers,...
LLVM_ABI Register createVirtualRegister(const TargetRegisterClass *RegClass, StringRef Name="")
createVirtualRegister - Create and return a new virtual register in the function with the specified r...
An SDNode that represents everything that will be needed to construct a MachineInstr.
This class is used to represent an MGATHER node.
This is a base class used to represent MGATHER and MSCATTER nodes.
This class is used to represent an MLOAD node.
This base class is used to represent MLOAD and MSTORE nodes.
const SDValue & getMask() const
ISD::MemIndexedMode getAddressingMode() const
Return the addressing mode for this load or store: unindexed, pre-inc, pre-dec, post-inc,...
This class is used to represent an MSCATTER node.
This class is used to represent an MSTORE node.
bool isCompressingStore() const
Returns true if the op does a compression to the vector before storing.
const SDValue & getOffset() const
const SDValue & getBasePtr() const
const SDValue & getMask() const
const SDValue & getValue() const
bool isTruncatingStore() const
Return true if the op does a truncation before store.
This SDNode is used for target intrinsics that touch memory and need an associated MachineMemOperand.
This is an abstract virtual class for memory operations.
unsigned getAddressSpace() const
Return the address space for the associated pointer.
Align getBaseAlign() const
Returns alignment and volatility of the memory access.
Align getAlign() const
bool isVolatile() const
AAMDNodes getAAInfo() const
Returns the AA info that describes the dereference.
SyncScope::ID getSyncScopeID() const
Returns the synchronization scope ID for this memory operation.
bool isSimple() const
Returns true if the memory operation is neither atomic nor volatile.
AtomicOrdering getSuccessOrdering() const
Return the atomic ordering requirements for this memory operation.
MachineMemOperand * getMemOperand() const
Return a MachineMemOperand object describing the memory reference performed by operation.
const SDValue & getBasePtr() const
const MachinePointerInfo & getPointerInfo() const
const SDValue & getChain() const
bool isNonTemporal() const
EVT getMemoryVT() const
Return the type of the in-memory value.
Root of the metadata hierarchy.
Definition Metadata.h:63
A Module instance is used to store all the information related to an LLVM module.
Definition Module.h:67
Metadata * getModuleFlag(StringRef Key) const
Return the corresponding value if Key appears in module flags, otherwise return null.
Definition Module.cpp:353
MutableArrayRef - Represent a mutable reference to an array (0 or more elements consecutively in memo...
Definition ArrayRef.h:303
static PointerType * getUnqual(Type *ElementType)
This constructs a pointer to an object of the specified type in the default address space (address sp...
static LLVM_ABI PointerType * get(Type *ElementType, unsigned AddressSpace)
This constructs a pointer to an object of the specified type in a numbered address space.
Wrapper class representing virtual and physical registers.
Definition Register.h:19
constexpr bool isValid() const
Definition Register.h:107
Wrapper class for IR location info (IR ordering and DebugLoc) to be passed into SDNode creation funct...
Represents one node in the SelectionDAG.
bool isStrictFPOpcode()
Test if this node is a strict floating point pseudo-op.
ArrayRef< SDUse > ops() const
const APInt & getAsAPIntVal() const
Helper method returns the APInt value of a ConstantSDNode.
unsigned getOpcode() const
Return the SelectionDAG opcode value for this node.
SDNode * getGluedUser() const
If this node has a glue value with a user, return the user (there is at most one).
bool hasOneUse() const
Return true if there is exactly one use of this node.
LLVM_ABI bool isOnlyUserOf(const SDNode *N) const
Return true if this node is the only use of N.
iterator_range< value_op_iterator > op_values() const
SDNodeFlags getFlags() const
TypeSize getValueSizeInBits(unsigned ResNo) const
Returns MVT::getSizeInBits(getValueType(ResNo)).
MVT getSimpleValueType(unsigned ResNo) const
Return the type of a specified result as a simple type.
uint64_t getAsZExtVal() const
Helper method returns the zero-extended integer value of a ConstantSDNode.
unsigned getNumValues() const
Return the number of values defined/returned by this operator.
unsigned getNumOperands() const
Return the number of values used by this operation.
SDVTList getVTList() const
const SDValue & getOperand(unsigned Num) const
uint64_t getConstantOperandVal(unsigned Num) const
Helper method returns the integer value of a ConstantSDNode operand.
static LLVM_ABI bool areOnlyUsersOf(ArrayRef< const SDNode * > Nodes, const SDNode *N)
Return true if all the users of N are contained in Nodes.
bool hasNUsesOfValue(unsigned NUses, unsigned Value) const
Return true if there are exactly NUSES uses of the indicated value.
LLVM_ABI bool hasAnyUseOfValue(unsigned Value) const
Return true if there is any use of the indicated value.
EVT getValueType(unsigned ResNo) const
Return the type of a specified result.
bool isUndef() const
Returns true if the node type is UNDEF or POISON.
iterator_range< user_iterator > users()
void setFlags(SDNodeFlags NewFlags)
user_iterator user_begin() const
Provide iteration support to walk over all users of an SDNode.
op_iterator op_end() const
op_iterator op_begin() const
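As a brief illustration of the SDNode accessors listed above, here is a minimal sketch in the style of the combines in this file; the node N is assumed to be in scope inside a combine routine, and the shift check is illustrative rather than taken from this file.

// Sketch: read an immediate shift amount from operand 1 of a single-use SHL.
if (N->getOpcode() == ISD::SHL && N->hasOneUse()) {
  if (auto *Amt = dyn_cast<ConstantSDNode>(N->getOperand(1))) {
    uint64_t ShiftImm = Amt->getAsZExtVal(); // zero-extended immediate
    (void)ShiftImm; // a real combine would act on the immediate here
  }
}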
Represents a use of a SDNode.
Unlike LLVM values, Selection DAG nodes may return multiple values as the result of a computation.
bool isUndef() const
SDNode * getNode() const
get the SDNode which holds the desired result
bool hasOneUse() const
Return true if there is exactly one node using value ResNo of Node.
SDValue getValue(unsigned R) const
EVT getValueType() const
Return the ValueType of the referenced return value.
TypeSize getValueSizeInBits() const
Returns the size of the value in bits.
const SDValue & getOperand(unsigned i) const
const APInt & getConstantOperandAPInt(unsigned i) const
uint64_t getScalarValueSizeInBits() const
unsigned getResNo() const
get the index which selects a specific result in the SDNode
uint64_t getConstantOperandVal(unsigned i) const
MVT getSimpleValueType() const
Return the simple ValueType of the referenced return value.
unsigned getOpcode() const
unsigned getNumOperands() const
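A small sketch of how the SDValue accessors above are typically chained; V is an assumed SDValue already in scope, and the loop simply peels single-use bitcasts.

// Sketch: look through a chain of single-use bitcasts to the source value.
SDValue Src = V;
while (Src.getOpcode() == ISD::BITCAST && Src.hasOneUse())
  Src = Src.getOperand(0);
// Src now refers to the first non-bitcast value in the chain.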
Targets can subclass this to parameterize the SelectionDAG lowering and instruction selection process...
virtual bool isTargetStrictFPOpcode(unsigned Opcode) const
Returns true if a node with the given target-specific opcode has strict floating-point semantics.
Help to insert SDNodeFlags automatically in transforming.
This is used to represent a portion of an LLVM function in a low-level Data Dependence DAG representa...
bool willNotOverflowAdd(bool IsSigned, SDValue N0, SDValue N1) const
Determine if the result of the addition of 2 nodes can never overflow.
static unsigned getOpcode_EXTEND_VECTOR_INREG(unsigned Opcode)
Convert *_EXTEND to *_EXTEND_VECTOR_INREG opcode.
LLVM_ABI SDValue getShiftAmountOperand(EVT LHSTy, SDValue Op)
Return the specified value casted to the target's desired shift amount type.
LLVM_ABI SDValue getExtLoad(ISD::LoadExtType ExtType, const SDLoc &dl, EVT VT, SDValue Chain, SDValue Ptr, MachinePointerInfo PtrInfo, EVT MemVT, MaybeAlign Alignment=MaybeAlign(), MachineMemOperand::Flags MMOFlags=MachineMemOperand::MONone, const AAMDNodes &AAInfo=AAMDNodes())
SDValue getTargetGlobalAddress(const GlobalValue *GV, const SDLoc &DL, EVT VT, int64_t offset=0, unsigned TargetFlags=0)
SDValue getExtOrTrunc(SDValue Op, const SDLoc &DL, EVT VT, unsigned Opcode)
Convert Op, which must be of integer type, to the integer type VT, by either any/sign/zero-extending ...
SDValue getExtractVectorElt(const SDLoc &DL, EVT VT, SDValue Vec, unsigned Idx)
Extract element at Idx from Vec.
LLVM_ABI SDValue getSplatSourceVector(SDValue V, int &SplatIndex)
If V is a splatted value, return the source vector and its splat index.
LLVM_ABI unsigned ComputeMaxSignificantBits(SDValue Op, unsigned Depth=0) const
Get the upper bound on bit size for this Value Op as a signed integer.
LLVM_ABI SDValue getMaskedGather(SDVTList VTs, EVT MemVT, const SDLoc &dl, ArrayRef< SDValue > Ops, MachineMemOperand *MMO, ISD::MemIndexType IndexType, ISD::LoadExtType ExtTy)
LLVM_ABI SDValue getAddrSpaceCast(const SDLoc &dl, EVT VT, SDValue Ptr, unsigned SrcAS, unsigned DestAS)
Return an AddrSpaceCastSDNode.
const TargetSubtargetInfo & getSubtarget() const
SDValue getCopyToReg(SDValue Chain, const SDLoc &dl, Register Reg, SDValue N)
LLVM_ABI SDValue getMergeValues(ArrayRef< SDValue > Ops, const SDLoc &dl)
Create a MERGE_VALUES node from the given operands.
LLVM_ABI SDVTList getVTList(EVT VT)
Return an SDVTList that represents the list of values specified.
LLVM_ABI SDValue getShiftAmountConstant(uint64_t Val, EVT VT, const SDLoc &DL)
LLVM_ABI SDValue getAllOnesConstant(const SDLoc &DL, EVT VT, bool IsTarget=false, bool IsOpaque=false)
LLVM_ABI MachineSDNode * getMachineNode(unsigned Opcode, const SDLoc &dl, EVT VT)
These are used for target selectors to create a new node with specified return type(s),...
LLVM_ABI SDValue getFreeze(SDValue V)
Return a freeze using the SDLoc of the value operand.
LLVM_ABI SDValue getConstantPool(const Constant *C, EVT VT, MaybeAlign Align=std::nullopt, int Offs=0, bool isT=false, unsigned TargetFlags=0)
LLVM_ABI SDValue makeEquivalentMemoryOrdering(SDValue OldChain, SDValue NewMemOpChain)
If an existing load has uses of its chain, create a token factor node with that chain and the new mem...
LLVM_ABI bool isConstantIntBuildVectorOrConstantInt(SDValue N, bool AllowOpaques=true) const
Test whether the given value is a constant int or similar node.
LLVM_ABI SDValue getJumpTableDebugInfo(int JTI, SDValue Chain, const SDLoc &DL)
SDValue getSetCC(const SDLoc &DL, EVT VT, SDValue LHS, SDValue RHS, ISD::CondCode Cond, SDValue Chain=SDValue(), bool IsSignaling=false)
Helper function to make it easier to build SetCC's if you just have an ISD::CondCode instead of an SD...
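A minimal sketch of the helper above, assuming DAG, DL, LHS, RHS, the operand type VT, and a boolean result type SetCCVT (e.g. from getSetCCResultType) are in scope; the predicate is illustrative.

// Sketch: build an equality comparison directly from an ISD::CondCode.
SDValue IsEqual = DAG.getSetCC(DL, SetCCVT, LHS, RHS, ISD::SETEQ);
// The result can feed a select built with the getSelect helper listed below.
SDValue Picked = DAG.getSelect(DL, VT, IsEqual, LHS, RHS);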
LLVM_ABI SDValue getConstantFP(double Val, const SDLoc &DL, EVT VT, bool isTarget=false)
Create a ConstantFPSDNode wrapping a constant value.
SDValue getExtractSubvector(const SDLoc &DL, EVT VT, SDValue Vec, unsigned Idx)
Return the VT typed sub-vector of Vec at Idx.
LLVM_ABI SDValue getRegister(Register Reg, EVT VT)
LLVM_ABI SDValue getLoad(EVT VT, const SDLoc &dl, SDValue Chain, SDValue Ptr, MachinePointerInfo PtrInfo, MaybeAlign Alignment=MaybeAlign(), MachineMemOperand::Flags MMOFlags=MachineMemOperand::MONone, const AAMDNodes &AAInfo=AAMDNodes(), const MDNode *Ranges=nullptr)
Loads are not normal binary operators: their result type is not determined by their operands,...
LLVM_ABI SDValue getMemIntrinsicNode(unsigned Opcode, const SDLoc &dl, SDVTList VTList, ArrayRef< SDValue > Ops, EVT MemVT, MachinePointerInfo PtrInfo, Align Alignment, MachineMemOperand::Flags Flags=MachineMemOperand::MOLoad|MachineMemOperand::MOStore, LocationSize Size=LocationSize::precise(0), const AAMDNodes &AAInfo=AAMDNodes())
Creates a MemIntrinsicNode that may produce a result and takes a list of operands.
SDValue getInsertSubvector(const SDLoc &DL, SDValue Vec, SDValue SubVec, unsigned Idx)
Insert SubVec at the Idx element of Vec.
LLVM_ABI SDValue getAtomic(unsigned Opcode, const SDLoc &dl, EVT MemVT, SDValue Chain, SDValue Ptr, SDValue Val, MachineMemOperand *MMO)
Gets a node for an atomic op, produces result (if relevant) and chain and takes 2 operands.
LLVM_ABI SDValue getMemcpy(SDValue Chain, const SDLoc &dl, SDValue Dst, SDValue Src, SDValue Size, Align Alignment, bool isVol, bool AlwaysInline, const CallInst *CI, std::optional< bool > OverrideTailCall, MachinePointerInfo DstPtrInfo, MachinePointerInfo SrcPtrInfo, const AAMDNodes &AAInfo=AAMDNodes(), BatchAAResults *BatchAA=nullptr)
LLVM_ABI bool shouldOptForSize() const
std::pair< SDValue, SDValue > SplitVectorOperand(const SDNode *N, unsigned OpNo)
Split the node's operand with EXTRACT_SUBVECTOR and return the low/high part.
LLVM_ABI SDValue getNOT(const SDLoc &DL, SDValue Val, EVT VT)
Create a bitwise NOT operation as (XOR Val, -1).
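The equivalence described above can be made concrete with a short sketch (DAG, DL, Val, and VT assumed in scope):

// Sketch: getNOT is shorthand for XOR'ing with an all-ones constant.
SDValue NotA = DAG.getNOT(DL, Val, VT);
SDValue NotB = DAG.getNode(ISD::XOR, DL, VT, Val,
                           DAG.getAllOnesConstant(DL, VT));
// Both forms describe the same (XOR Val, -1) computation.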
const TargetLowering & getTargetLoweringInfo() const
LLVM_ABI bool isEqualTo(SDValue A, SDValue B) const
Test whether two SDValues are known to compare equal.
static constexpr unsigned MaxRecursionDepth
LLVM_ABI SDValue expandVACopy(SDNode *Node)
Expand the specified ISD::VACOPY node as the Legalize pass would.
LLVM_ABI std::pair< EVT, EVT > GetSplitDestVTs(const EVT &VT) const
Compute the VTs needed for the low/hi parts of a type which is split (or expanded) into two not neces...
SDValue getTargetJumpTable(int JTI, EVT VT, unsigned TargetFlags=0)
SDValue getUNDEF(EVT VT)
Return an UNDEF node. UNDEF does not have a useful SDLoc.
SDValue getCALLSEQ_END(SDValue Chain, SDValue Op1, SDValue Op2, SDValue InGlue, const SDLoc &DL)
Return a new CALLSEQ_END node, which always must have a glue result (to ensure it's not CSE'd).
SDValue getBuildVector(EVT VT, const SDLoc &DL, ArrayRef< SDValue > Ops)
Return an ISD::BUILD_VECTOR node.
LLVM_ABI bool isSplatValue(SDValue V, const APInt &DemandedElts, APInt &UndefElts, unsigned Depth=0) const
Test whether V has a splatted value for all the demanded elements.
LLVM_ABI SDValue getBitcast(EVT VT, SDValue V)
Return a bitcast using the SDLoc of the value operand, and casting to the provided type.
SDValue getCopyFromReg(SDValue Chain, const SDLoc &dl, Register Reg, EVT VT)
SDValue getSelect(const SDLoc &DL, EVT VT, SDValue Cond, SDValue LHS, SDValue RHS, SDNodeFlags Flags=SDNodeFlags())
Helper function to make it easier to build Select's if you just have operands and don't want to check...
LLVM_ABI SDValue getNegative(SDValue Val, const SDLoc &DL, EVT VT)
Create negative operation as (SUB 0, Val).
LLVM_ABI std::optional< unsigned > getValidShiftAmount(SDValue V, const APInt &DemandedElts, unsigned Depth=0) const
If a SHL/SRA/SRL node V has a uniform shift amount that is less than the element bit-width of the shi...
LLVM_ABI SDValue simplifySelect(SDValue Cond, SDValue TVal, SDValue FVal)
Try to simplify a select/vselect into 1 of its operands or a constant.
LLVM_ABI SDValue getZeroExtendInReg(SDValue Op, const SDLoc &DL, EVT VT)
Return the expression required to zero extend the Op value assuming it was the smaller SrcTy value.
const DataLayout & getDataLayout() const
LLVM_ABI SDValue expandVAArg(SDNode *Node)
Expand the specified ISD::VAARG node as the Legalize pass would.
LLVM_ABI bool doesNodeExist(unsigned Opcode, SDVTList VTList, ArrayRef< SDValue > Ops)
Check if a node exists without modifying its flags.
const SelectionDAGTargetInfo & getSelectionDAGInfo() const
LLVM_ABI bool areNonVolatileConsecutiveLoads(LoadSDNode *LD, LoadSDNode *Base, unsigned Bytes, int Dist) const
Return true if loads are next to each other and can be merged.
LLVM_ABI SDValue getConstant(uint64_t Val, const SDLoc &DL, EVT VT, bool isTarget=false, bool isOpaque=false)
Create a ConstantSDNode wrapping a constant value.
LLVM_ABI SDValue getMemBasePlusOffset(SDValue Base, TypeSize Offset, const SDLoc &DL, const SDNodeFlags Flags=SDNodeFlags())
Returns the sum of the base pointer and the offset.
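A one-line sketch of the pointer-arithmetic helper above, in the style this file uses when splitting a wide memory access into halves (Ptr, DAG, and DL assumed in scope; the 16-byte offset is illustrative):

// Sketch: compute the address of the upper 128-bit half of a 256-bit access.
SDValue HiPtr = DAG.getMemBasePlusOffset(Ptr, TypeSize::getFixed(16), DL);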
LLVM_ABI SDValue getTruncStore(SDValue Chain, const SDLoc &dl, SDValue Val, SDValue Ptr, MachinePointerInfo PtrInfo, EVT SVT, Align Alignment, MachineMemOperand::Flags MMOFlags=MachineMemOperand::MONone, const AAMDNodes &AAInfo=AAMDNodes())
LLVM_ABI void ReplaceAllUsesWith(SDValue From, SDValue To)
Modify anything using 'From' to use 'To' instead.
LLVM_ABI SDValue getCommutedVectorShuffle(const ShuffleVectorSDNode &SV)
Returns an ISD::VECTOR_SHUFFLE node semantically equivalent to the shuffle node in input but with swa...
LLVM_ABI std::pair< SDValue, SDValue > SplitVector(const SDValue &N, const SDLoc &DL, const EVT &LoVT, const EVT &HiVT)
Split the vector with EXTRACT_SUBVECTOR using the provided VTs and return the low/high part.
LLVM_ABI bool isGuaranteedNotToBeUndefOrPoison(SDValue Op, bool PoisonOnly=false, unsigned Depth=0) const
Return true if this function can prove that Op is never poison and, if PoisonOnly is false,...
LLVM_ABI SDValue getStore(SDValue Chain, const SDLoc &dl, SDValue Val, SDValue Ptr, MachinePointerInfo PtrInfo, Align Alignment, MachineMemOperand::Flags MMOFlags=MachineMemOperand::MONone, const AAMDNodes &AAInfo=AAMDNodes())
Helper function to build ISD::STORE nodes.
LLVM_ABI SDValue getSignedConstant(int64_t Val, const SDLoc &DL, EVT VT, bool isTarget=false, bool isOpaque=false)
LLVM_ABI MaybeAlign InferPtrAlign(SDValue Ptr) const
Infer alignment of a load / store address.
SDValue getCALLSEQ_START(SDValue Chain, uint64_t InSize, uint64_t OutSize, const SDLoc &DL)
Return a new CALLSEQ_START node, which starts a new call frame in which InSize bytes are set up inside ...
LLVM_ABI bool SignBitIsZero(SDValue Op, unsigned Depth=0) const
Return true if the sign bit of Op is known to be zero.
LLVM_ABI SDValue getTargetExtractSubreg(int SRIdx, const SDLoc &DL, EVT VT, SDValue Operand)
A convenience function for creating TargetInstrInfo::EXTRACT_SUBREG nodes.
SDValue getSelectCC(const SDLoc &DL, SDValue LHS, SDValue RHS, SDValue True, SDValue False, ISD::CondCode Cond, SDNodeFlags Flags=SDNodeFlags())
Helper function to make it easier to build SelectCC's if you just have an ISD::CondCode instead of an...
LLVM_ABI SDValue getSExtOrTrunc(SDValue Op, const SDLoc &DL, EVT VT)
Convert Op, which must be of integer type, to the integer type VT, by either sign-extending or trunca...
LLVM_ABI bool isKnownNeverZero(SDValue Op, unsigned Depth=0) const
Test whether the given SDValue is known to contain non-zero value(s).
LLVM_ABI SDValue FoldConstantArithmetic(unsigned Opcode, const SDLoc &DL, EVT VT, ArrayRef< SDValue > Ops, SDNodeFlags Flags=SDNodeFlags())
LLVM_ABI std::optional< unsigned > getValidMinimumShiftAmount(SDValue V, const APInt &DemandedElts, unsigned Depth=0) const
If a SHL/SRA/SRL node V has shift amounts that are all less than the element bit-width of the shift n...
LLVM_ABI SDValue getMaskedStore(SDValue Chain, const SDLoc &dl, SDValue Val, SDValue Base, SDValue Offset, SDValue Mask, EVT MemVT, MachineMemOperand *MMO, ISD::MemIndexedMode AM, bool IsTruncating=false, bool IsCompressing=false)
LLVM_ABI SDValue getExternalSymbol(const char *Sym, EVT VT)
const TargetMachine & getTarget() const
LLVM_ABI std::pair< SDValue, SDValue > getStrictFPExtendOrRound(SDValue Op, SDValue Chain, const SDLoc &DL, EVT VT)
Convert Op, which must be a STRICT operation of float type, to the float type VT, by either extending...
LLVM_ABI SDValue getAnyExtOrTrunc(SDValue Op, const SDLoc &DL, EVT VT)
Convert Op, which must be of integer type, to the integer type VT, by either any-extending or truncat...
LLVM_ABI bool isKnownNeverZeroFloat(SDValue Op) const
Test whether the given floating point SDValue is known to never be positive or negative zero.
LLVM_ABI SDValue getIntPtrConstant(uint64_t Val, const SDLoc &DL, bool isTarget=false)
LLVM_ABI SDValue getValueType(EVT)
LLVM_ABI SDValue getNode(unsigned Opcode, const SDLoc &DL, EVT VT, ArrayRef< SDUse > Ops)
Gets or creates the specified node.
LLVM_ABI SDValue getFPExtendOrRound(SDValue Op, const SDLoc &DL, EVT VT)
Convert Op, which must be of float type, to the float type VT, by either extending or rounding (by tr...
LLVM_ABI bool isKnownNeverNaN(SDValue Op, const APInt &DemandedElts, bool SNaN=false, unsigned Depth=0) const
Test whether the given SDValue (or all elements of it, if it is a vector) is known to never be NaN in...
SDValue getTargetConstant(uint64_t Val, const SDLoc &DL, EVT VT, bool isOpaque=false)
LLVM_ABI unsigned ComputeNumSignBits(SDValue Op, unsigned Depth=0) const
Return the number of times the sign bit of the register is replicated into the other bits.
LLVM_ABI bool MaskedVectorIsZero(SDValue Op, const APInt &DemandedElts, unsigned Depth=0) const
Return true if 'Op' is known to be zero in DemandedElts.
SDValue getTargetBlockAddress(const BlockAddress *BA, EVT VT, int64_t Offset=0, unsigned TargetFlags=0)
LLVM_ABI bool isBaseWithConstantOffset(SDValue Op) const
Return true if the specified operand is an ISD::ADD with a ConstantSDNode on the right-hand side,...
LLVM_ABI SDValue getVectorIdxConstant(uint64_t Val, const SDLoc &DL, bool isTarget=false)
LLVM_ABI void ReplaceAllUsesOfValueWith(SDValue From, SDValue To)
Replace any uses of From with To, leaving uses of other values produced by From.getNode() alone.
MachineFunction & getMachineFunction() const
SDValue getSplatBuildVector(EVT VT, const SDLoc &DL, SDValue Op)
Return a splat ISD::BUILD_VECTOR node, consisting of Op splatted to all elements.
LLVM_ABI SDValue getFrameIndex(int FI, EVT VT, bool isTarget=false)
LLVM_ABI KnownBits computeKnownBits(SDValue Op, unsigned Depth=0) const
Determine which bits of Op are known to be either zero or one and return them in Known.
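A short sketch of how the known-bits query above is commonly used by combines (DAG and an SDValue Op assumed in scope; the width check is illustrative):

// Sketch: prove the upper 32 bits of a 64-bit value are zero before
// narrowing an operation to 32 bits.
KnownBits Known = DAG.computeKnownBits(Op);
if (Known.countMinLeadingZeros() >= 32) {
  // The value fits in the low 32 bits; a narrower operation is safe.
}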
LLVM_ABI SDValue getZExtOrTrunc(SDValue Op, const SDLoc &DL, EVT VT)
Convert Op, which must be of integer type, to the integer type VT, by either zero-extending or trunca...
LLVM_ABI SDValue getCondCode(ISD::CondCode Cond)
LLVM_ABI bool MaskedValueIsZero(SDValue Op, const APInt &Mask, unsigned Depth=0) const
Return true if 'Op & Mask' is known to be zero.
LLVMContext * getContext() const
LLVM_ABI SDValue getTargetExternalSymbol(const char *Sym, EVT VT, unsigned TargetFlags=0)
LLVM_ABI SDValue getMCSymbol(MCSymbol *Sym, EVT VT)
LLVM_ABI SDValue CreateStackTemporary(TypeSize Bytes, Align Alignment)
Create a stack temporary based on the size in bytes and the alignment.
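A sketch combining CreateStackTemporary with the getStore/getLoad helpers listed earlier, roughly the shape of the stack-slot round-trips this file performs (DAG, DL, and an i64 SDValue Val assumed in scope; the size, alignment, and simplified MachinePointerInfo are illustrative):

// Sketch: spill Val to a fresh 8-byte stack slot and reload it.
SDValue Slot = DAG.CreateStackTemporary(TypeSize::getFixed(8), Align(8));
SDValue Store = DAG.getStore(DAG.getEntryNode(), DL, Val, Slot,
                             MachinePointerInfo(), Align(8));
SDValue Reload = DAG.getLoad(MVT::i64, DL, Store, Slot, MachinePointerInfo());
// The reload is chained on the store so the two accesses stay ordered.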
LLVM_ABI SDNode * UpdateNodeOperands(SDNode *N, SDValue Op)
Mutate the specified node in-place to have the specified operands.
LLVM_ABI SDNode * getNodeIfExists(unsigned Opcode, SDVTList VTList, ArrayRef< SDValue > Ops, const SDNodeFlags Flags)
Get the specified node if it's already available, or else return NULL.
SDValue getTargetConstantPool(const Constant *C, EVT VT, MaybeAlign Align=std::nullopt, int Offset=0, unsigned TargetFlags=0)
SDValue getEntryNode() const
Return the token chain corresponding to the entry of the function.
LLVM_ABI SDValue getMaskedLoad(EVT VT, const SDLoc &dl, SDValue Chain, SDValue Base, SDValue Offset, SDValue Mask, SDValue Src0, EVT MemVT, MachineMemOperand *MMO, ISD::MemIndexedMode AM, ISD::LoadExtType, bool IsExpanding=false)
SDValue getSplat(EVT VT, const SDLoc &DL, SDValue Op)
Returns a node representing a splat of one value into all lanes of the provided vector type.
LLVM_ABI std::pair< SDValue, SDValue > SplitScalar(const SDValue &N, const SDLoc &DL, const EVT &LoVT, const EVT &HiVT)
Split the scalar node with EXTRACT_ELEMENT using the provided VTs and return the low/high part.
static unsigned getOpcode_EXTEND(unsigned Opcode)
Convert *_EXTEND_VECTOR_INREG to *_EXTEND opcode.
LLVM_ABI SDValue matchBinOpReduction(SDNode *Extract, ISD::NodeType &BinOp, ArrayRef< ISD::NodeType > CandidateBinOps, bool AllowPartials=false)
Match a binop + shuffle pyramid that represents a horizontal reduction over the elements of a vector ...
LLVM_ABI SDValue getVectorShuffle(EVT VT, const SDLoc &dl, SDValue N1, SDValue N2, ArrayRef< int > Mask)
Return an ISD::VECTOR_SHUFFLE node.
LLVM_ABI SDValue getMaskedScatter(SDVTList VTs, EVT MemVT, const SDLoc &dl, ArrayRef< SDValue > Ops, MachineMemOperand *MMO, ISD::MemIndexType IndexType, bool IsTruncating=false)
static LLVM_ABI bool isBitRotateMask(ArrayRef< int > Mask, unsigned EltSizeInBits, unsigned MinSubElts, unsigned MaxSubElts, unsigned &NumSubElts, unsigned &RotateAmt)
Checks if the shuffle is a bit rotation of the first operand across multiple subelements,...
This SDNode is used to implement the code generator support for the llvm IR shufflevector instruction...
int getMaskElt(unsigned Idx) const
static int getSplatMaskIndex(ArrayRef< int > Mask)
ArrayRef< int > getMask() const
static void commuteMask(MutableArrayRef< int > Mask)
Change values in a shuffle permute mask assuming the two vector operands have swapped position.
static LLVM_ABI bool isSplatMask(ArrayRef< int > Mask)
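A sketch of how commuteMask pairs with getVectorShuffle, mirroring what getCommutedVectorShuffle (listed earlier) does internally; SVN is an assumed ShuffleVectorSDNode*, with DAG, DL, and VT in scope.

// Sketch: swap the shuffle's inputs and fix the mask up to match.
SmallVector<int, 16> Mask(SVN->getMask().begin(), SVN->getMask().end());
ShuffleVectorSDNode::commuteMask(Mask);
SDValue Commuted = DAG.getVectorShuffle(VT, DL, SVN->getOperand(1),
                                        SVN->getOperand(0), Mask);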
This is a 'bitvector' (really, a variable-sized bit array), optimized for the case when the array is ...
void resize(unsigned N, bool t=false)
Grow or shrink the bitvector.
void insert_range(Range &&R)
std::pair< iterator, bool > insert(PtrType Ptr)
Inserts Ptr if and only if there is no element in the container equal to Ptr.
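A sketch of the insert-returns-a-pair idiom above, used to visit each user of a node exactly once (N is an assumed SDNode*):

// Sketch: deduplicate users with a SmallPtrSet; insert(...).second is true
// only the first time a pointer is added.
SmallPtrSet<SDNode *, 8> Visited;
for (SDNode *User : N->users())
  if (Visited.insert(User).second) {
    // First visit of this user; inspect or combine it here.
  }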
SmallSet - This maintains a set of unique values, optimizing for the case when the set is small (less...
Definition SmallSet.h:133
std::pair< const_iterator, bool > insert(const T &V)
insert - Insert an element into the set if it isn't already there.
Definition SmallSet.h:181
size_type size() const
Definition SmallSet.h:170
This class consists of common code factored out of the SmallVector class to reduce code duplication b...
void assign(size_type NumElts, ValueParamT Elt)
iterator erase(const_iterator CI)
typename SuperClass::const_iterator const_iterator
void append(ItTy in_start, ItTy in_end)
Add the specified range to the end of the SmallVector.
void resize(size_type N)
void push_back(const T &Elt)
pointer data()
Return a pointer to the vector's buffer, even if empty().
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
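A sketch of the push_back/ArrayRef idiom above, as used when assembling BUILD_VECTOR operands (DAG and DL assumed in scope; the element values are illustrative):

// Sketch: collect scalar operands, then hand them to a DAG helper as an
// implicit ArrayRef<SDValue>.
SmallVector<SDValue, 8> Ops;
for (unsigned I = 0; I != 8; ++I)
  Ops.push_back(DAG.getConstant(I, DL, MVT::i32));
SDValue BV = DAG.getBuildVector(MVT::v8i32, DL, Ops);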
An instruction for storing to memory.
This class is used to represent ISD::STORE nodes.
const SDValue & getBasePtr() const
const SDValue & getValue() const
bool isTruncatingStore() const
Return true if the op does a truncation before store.
StringRef - Represent a constant reference to a string, i.e.
Definition StringRef.h:55
constexpr bool empty() const
empty - Check if the string is empty.
Definition StringRef.h:151
constexpr size_t size() const
size - Get the string size.
Definition StringRef.h:154
bool ends_with(StringRef Suffix) const
Check if this string ends with the given Suffix.
Definition StringRef.h:281
static constexpr size_t npos
Definition StringRef.h:57
bool equals_insensitive(StringRef RHS) const
Check for string equality, ignoring case.
Definition StringRef.h:180
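A small sketch of the StringRef predicates above, in the spirit of the symbol-name checks in this file (the literal is illustrative):

// Sketch: cheap, non-owning string checks with StringRef.
StringRef Name = "__stack_chk_fail";
if (!Name.empty() && Name.ends_with("_fail") &&
    !Name.equals_insensitive("none")) {
  // All three predicates operate on the borrowed bytes; nothing is copied.
}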
A switch()-like statement whose cases are string literals.
StringSwitch & Case(StringLiteral S, T Value)
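A sketch of the StringSwitch pattern above, similar in shape to how constraint strings get classified; the enum and mappings are hypothetical, not the actual constraint handling.

// Sketch: map a constraint string to a hypothetical category.
enum class DemoConstraintKind { Register, Memory, Immediate, Unknown };

static DemoConstraintKind classifyDemoConstraint(StringRef C) {
  return StringSwitch<DemoConstraintKind>(C)
      .Case("r", DemoConstraintKind::Register)
      .Case("m", DemoConstraintKind::Memory)
      .Case("i", DemoConstraintKind::Immediate)
      .Default(DemoConstraintKind::Unknown);
}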
static LLVM_ABI StructType * get(LLVMContext &Context, ArrayRef< Type * > Elements, bool isPacked=false)
This static method is the primary way to create a literal StructType.
Definition Type.cpp:414
Information about stack frame layout on the target.
bool hasFP(const MachineFunction &MF) const
hasFP - Return true if the specified function should have a dedicated frame pointer register.
Align getStackAlign() const
getStackAlignment - This method returns the number of bytes to which the stack pointer must be aligne...
TargetInstrInfo - Interface to description of machine instruction set.
Provides information about what library functions are available for the current target.
bool isOperationExpand(unsigned Op, EVT VT) const
Return true if the specified operation is illegal on this target or unlikely to be made legal with custom lowering.
void setBooleanVectorContents(BooleanContent Ty)
Specify how the target extends the result of a vector boolean value from a vector of i1 to a wider ty...
void setOperationAction(unsigned Op, MVT VT, LegalizeAction Action)
Indicate that the specified operation does not work with the specified type and indicate what to do a...
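A sketch of how setOperationAction is called from a TargetLowering constructor; the opcode/type/action combinations below are illustrative only and do not reproduce the actual X86 configuration.

// Sketch (inside a TargetLowering subclass constructor):
setOperationAction(ISD::CTPOP,   MVT::i32, Custom);  // route to LowerOperation
setOperationAction(ISD::SDIVREM, MVT::i32, Expand);  // let the legalizer expand it
setOperationAction(ISD::SELECT,  MVT::f64, Legal);   // natively supported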
void setMaxDivRemBitWidthSupported(unsigned SizeInBits)
Set the size in bits of the maximum div/rem the backend supports.
virtual bool hasAndNot(SDValue X) const
Return true if the target has a bitwise and-not operation: X = ~A & B This can be used to simplify se...
bool PredictableSelectIsExpensive
Tells the code generator that select is more expensive than a branch if the branch is usually predict...
EVT getValueType(const DataLayout &DL, Type *Ty, bool AllowUnknown=false) const
Return the EVT corresponding to this LLVM type.
CallingConv::ID getLibcallCallingConv(RTLIB::Libcall Call) const
Get the CallingConv that should be used for the specified libcall.
unsigned MaxStoresPerMemcpyOptSize
Likewise for functions with the OptSize attribute.
MachineBasicBlock * emitPatchPoint(MachineInstr &MI, MachineBasicBlock *MBB) const
Replace/modify any TargetFrameIndex operands with a target-dependent sequence of memory operands that...
virtual const TargetRegisterClass * getRegClassFor(MVT VT, bool isDivergent=false) const
Return the register class that should be used for the specified value type.
ShiftLegalizationStrategy
Return the preferred strategy to legalize this SHIFT instruction, with ExpansionFactor being the recu...
const TargetMachine & getTargetMachine() const
unsigned MaxLoadsPerMemcmp
Specify maximum number of load instructions per memcmp call.
void setOperationPromotedToType(unsigned Opc, MVT OrigVT, MVT DestVT)
Convenience method to set an operation to Promote and specify the type in a single call.
LegalizeTypeAction
This enum indicates whether a type is legal for a target, and if not, what action should be used to...
virtual bool areJTsAllowed(const Function *Fn) const
Return true if lowering to a jump table is allowed.
bool isOperationLegalOrPromote(unsigned Op, EVT VT, bool LegalOnly=false) const
Return true if the specified operation is legal on this target or can be made legal using promotion.
void addBypassSlowDiv(unsigned int SlowBitWidth, unsigned int FastBitWidth)
Tells the code generator which bitwidths to bypass.
void setMaxLargeFPConvertBitWidthSupported(unsigned SizeInBits)
Set the size in bits of the maximum fp to/from int conversion the backend supports.
bool isTruncStoreLegal(EVT ValVT, EVT MemVT) const
Return true if the specified store with truncation is legal on this target.
virtual bool isLoadBitCastBeneficial(EVT LoadVT, EVT BitcastVT, const SelectionDAG &DAG, const MachineMemOperand &MMO) const
Return true if the following transform is beneficial: fold (conv (load x)) -> (load (conv*)x) On arch...
void setPrefLoopAlignment(Align Alignment)
Set the target's preferred loop alignment.
virtual bool isCommutativeBinOp(unsigned Opcode) const
Returns true if the opcode is a commutative binary operation.
void setMaxAtomicSizeInBitsSupported(unsigned SizeInBits)
Set the maximum atomic operation size supported by the backend.
virtual TargetLoweringBase::LegalizeTypeAction getPreferredVectorAction(MVT VT) const
Return the preferred vector type legalization action.
Register getStackPointerRegisterToSaveRestore() const
If a physical register, this specifies the register that llvm.savestack/llvm.restorestack should save...
bool isOperationCustom(unsigned Op, EVT VT) const
Return true if the operation uses custom lowering, regardless of whether the type is legal or not.
unsigned MaxStoresPerMemsetOptSize
Likewise for functions with the OptSize attribute.
void setBooleanContents(BooleanContent Ty)
Specify how the target extends the result of integer and floating point boolean values from i1 to a w...
unsigned MaxStoresPerMemmove
Specify maximum number of store instructions per memmove call.
virtual Align getPrefLoopAlignment(MachineLoop *ML=nullptr) const
Return the preferred loop alignment.
void computeRegisterProperties(const TargetRegisterInfo *TRI)
Once all of the register classes are added, this allows us to compute derived properties we expose.
virtual bool shouldFoldConstantShiftPairToMask(const SDNode *N, CombineLevel Level) const
Return true if it is profitable to fold a pair of shifts into a mask.
virtual EVT getSetCCResultType(const DataLayout &DL, LLVMContext &Context, EVT VT) const
Return the ValueType of the result of SETCC operations.
virtual EVT getTypeToTransformTo(LLVMContext &Context, EVT VT) const
For types supported by the target, this is an identity function.
unsigned MaxStoresPerMemmoveOptSize
Likewise for functions with the OptSize attribute.
BooleanContent getBooleanContents(bool isVec, bool isFloat) const
For targets without i1 registers, this gives the nature of the high-bits of boolean values held in ty...
virtual MVT getPreferredSwitchConditionType(LLVMContext &Context, EVT ConditionVT) const
Returns preferred type for switch condition.
void addRegisterClass(MVT VT, const TargetRegisterClass *RC)
Add the specified register class as an available regclass for the specified value type.
bool isTypeLegal(EVT VT) const
Return true if the target has native support for the specified value type.
virtual MVT getPointerTy(const DataLayout &DL, uint32_t AS=0) const
Return the pointer type for the given address space, defaults to the pointer type from the data layou...
void setPrefFunctionAlignment(Align Alignment)
Set the target's preferred function alignment.
bool isOperationLegal(unsigned Op, EVT VT) const
Return true if the specified operation is legal on this target.
unsigned MaxStoresPerMemset
Specify maximum number of store instructions per memset call.
virtual bool shouldProduceAndByConstByHoistingConstFromShiftsLHSOfAnd(SDValue X, ConstantSDNode *XC, ConstantSDNode *CC, SDValue Y, unsigned OldShiftOpcode, unsigned NewShiftOpcode, SelectionDAG &DAG) const
Given the pattern (X & (C l>>/<< Y)) ==/!= 0 return true if it should be transformed into: ((X <</l>>...
void setTruncStoreAction(MVT ValVT, MVT MemVT, LegalizeAction Action)
Indicate that the specified truncating store does not work with the specified type and indicate what ...
BooleanContent
Enum that describes how the target represents true/false values.
virtual ShiftLegalizationStrategy preferredShiftLegalizationStrategy(SelectionDAG &DAG, SDNode *N, unsigned ExpansionFactor) const
bool isOperationLegalOrCustom(unsigned Op, EVT VT, bool LegalOnly=false) const
Return true if the specified operation is legal on this target or can be made legal with custom lower...
virtual bool allowsMemoryAccess(LLVMContext &Context, const DataLayout &DL, EVT VT, unsigned AddrSpace=0, Align Alignment=Align(1), MachineMemOperand::Flags Flags=MachineMemOperand::MONone, unsigned *Fast=nullptr) const
Return true if the target supports a memory access of this type for the given address space and align...
unsigned MaxLoadsPerMemcmpOptSize
Likewise for functions with the OptSize attribute.
virtual bool isBinOp(unsigned Opcode) const
Return true if the node is a math/logic binary operator.
void setStackPointerRegisterToSaveRestore(Register R)
If set to a physical register, this specifies the register that llvm.savestack/llvm....
bool isLoadExtLegal(unsigned ExtType, EVT ValVT, EVT MemVT) const
Return true if the specified load with extension is legal on this target.
AtomicExpansionKind
Enum that specifies what an atomic load/AtomicRMWInst is expanded to, if at all.
void setCondCodeAction(ArrayRef< ISD::CondCode > CCs, MVT VT, LegalizeAction Action)
Indicate that the specified condition code is or isn't supported on the target and indicate what to d...
AndOrSETCCFoldKind
Enum of different potentially desirable ways to fold (and/or (setcc ...), (setcc ....
void setTargetDAGCombine(ArrayRef< ISD::NodeType > NTs)
Targets should invoke this method for each target independent node that they want to provide a custom...
void setLoadExtAction(unsigned ExtType, MVT ValVT, MVT MemVT, LegalizeAction Action)
Indicate that the specified load with extension does not work with the specified type and indicate wh...
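A companion sketch for setLoadExtAction and the setTruncStoreAction entry above, again assumed to run inside a TargetLowering constructor; the specific types and actions are illustrative, not X86's real settings.

// Sketch: declare that a sign-extending v8i8->v8i32 load and the matching
// truncating store both need to be expanded by the legalizer.
setLoadExtAction(ISD::SEXTLOAD, MVT::v8i32, MVT::v8i8, Expand);
setTruncStoreAction(MVT::v8i32, MVT::v8i8, Expand);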
NegatibleCost
Enum that specifies when a float negation is beneficial.
LegalizeTypeAction getTypeAction(LLVMContext &Context, EVT VT) const
Return how we should legalize values of this type, either it is already legal (return 'Legal') or we ...
const char * getLibcallName(RTLIB::Libcall Call) const
Get the libcall routine name for the specified libcall.
std::vector< ArgListEntry > ArgListTy
unsigned MaxStoresPerMemcpy
Specify maximum number of store instructions per memcpy call.
void setSchedulingPreference(Sched::Preference Pref)
Specify the target scheduling preference.
virtual bool shouldConvertPhiType(Type *From, Type *To) const
Given a set of interconnected phis of type 'From' that are loaded/stored or bitcast to type 'To',...
bool isOperationLegalOrCustomOrPromote(unsigned Op, EVT VT, bool LegalOnly=false) const
Return true if the specified operation is legal on this target or can be made legal with custom lower...
static ISD::NodeType getExtendForContent(BooleanContent Content)
This class defines information used to lower LLVM code to legal SelectionDAG operators that the targe...
SDValue buildSDIVPow2WithCMov(SDNode *N, const APInt &Divisor, SelectionDAG &DAG, SmallVectorImpl< SDNode * > &Created) const
Build sdiv by power-of-2 with conditional move instructions. Ref: "Hacker's Delight" by Henry Warren 1...
bool SimplifyDemandedVectorElts(SDValue Op, const APInt &DemandedEltMask, APInt &KnownUndef, APInt &KnownZero, TargetLoweringOpt &TLO, unsigned Depth=0, bool AssumeSingleUse=false) const
Look at Vector Op.
void softenSetCCOperands(SelectionDAG &DAG, EVT VT, SDValue &NewLHS, SDValue &NewRHS, ISD::CondCode &CCCode, const SDLoc &DL, const SDValue OldLHS, const SDValue OldRHS) const
Soften the operands of a comparison.
std::pair< SDValue, SDValue > makeLibCall(SelectionDAG &DAG, RTLIB::Libcall LC, EVT RetVT, ArrayRef< SDValue > Ops, MakeLibCallOptions CallOptions, const SDLoc &dl, SDValue Chain=SDValue()) const
Returns a pair of (return value, chain).
virtual SDValue expandIndirectJTBranch(const SDLoc &dl, SDValue Value, SDValue Addr, int JTI, SelectionDAG &DAG) const
Expands target specific indirect branch for the case of JumpTable expansion.
SDValue getCheaperNegatedExpression(SDValue Op, SelectionDAG &DAG, bool LegalOps, bool OptForSize, unsigned Depth=0) const
This is the helper function to return the newly negated expression only when the cost is cheaper.
SDValue SimplifyMultipleUseDemandedBits(SDValue Op, const APInt &DemandedBits, const APInt &DemandedElts, SelectionDAG &DAG, unsigned Depth=0) const
More limited version of SimplifyDemandedBits that can be used to "lookthrough" ops that don't contrib...
SDValue SimplifyMultipleUseDemandedVectorElts(SDValue Op, const APInt &DemandedElts, SelectionDAG &DAG, unsigned Depth=0) const
Helper wrapper around SimplifyMultipleUseDemandedBits, demanding all bits from only some vector eleme...
virtual ConstraintType getConstraintType(StringRef Constraint) const
Given a constraint, return the type of constraint it is for this target.
bool ShrinkDemandedConstant(SDValue Op, const APInt &DemandedBits, const APInt &DemandedElts, TargetLoweringOpt &TLO) const
Check to see if the specified operand of the specified instruction is a constant integer.
virtual SDValue LowerToTLSEmulatedModel(const GlobalAddressSDNode *GA, SelectionDAG &DAG) const
Lower TLS global address SDNode for target independent emulated TLS model.
virtual const char * LowerXConstraint(EVT ConstraintVT) const
Try to replace an X constraint, which matches anything, with another that has more specific requireme...
std::pair< SDValue, SDValue > LowerCallTo(CallLoweringInfo &CLI) const
This function lowers an abstract call to a function into an actual call.
bool expandDIVREMByConstant(SDNode *N, SmallVectorImpl< SDValue > &Result, EVT HiLoVT, SelectionDAG &DAG, SDValue LL=SDValue(), SDValue LH=SDValue()) const
Attempt to expand an n-bit div/rem/divrem by constant using an n/2-bit urem by constant and other arit...
bool isPositionIndependent() const
virtual SDValue getNegatedExpression(SDValue Op, SelectionDAG &DAG, bool LegalOps, bool OptForSize, NegatibleCost &Cost, unsigned Depth=0) const
Return the newly negated expression if the cost is not expensive and set the cost in Cost to indicate...
virtual ConstraintWeight getSingleConstraintMatchWeight(AsmOperandInfo &info, const char *constraint) const
Examine constraint string and operand type and determine a weight value.
virtual SDValue SimplifyMultipleUseDemandedBitsForTargetNode(SDValue Op, const APInt &DemandedBits, const APInt &DemandedElts, SelectionDAG &DAG, unsigned Depth) const
More limited version of SimplifyDemandedBits that can be used to "lookthrough" ops that don't contrib...
virtual std::pair< unsigned, const TargetRegisterClass * > getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI, StringRef Constraint, MVT VT) const
Given a physical register constraint (e.g.
bool SimplifyDemandedBits(SDValue Op, const APInt &DemandedBits, const APInt &DemandedElts, KnownBits &Known, TargetLoweringOpt &TLO, unsigned Depth=0, bool AssumeSingleUse=false) const
Look at Op.
virtual bool SimplifyDemandedBitsForTargetNode(SDValue Op, const APInt &DemandedBits, const APInt &DemandedElts, KnownBits &Known, TargetLoweringOpt &TLO, unsigned Depth=0) const
Attempt to simplify any target nodes based on the demanded bits/elts, returning true on success.
TargetLowering(const TargetLowering &)=delete
virtual bool isSplatValueForTargetNode(SDValue Op, const APInt &DemandedElts, APInt &UndefElts, const SelectionDAG &DAG, unsigned Depth=0) const
Return true if vector Op has the same value across all DemandedElts, indicating any elements which ma...
SDValue getVectorElementPointer(SelectionDAG &DAG, SDValue VecPtr, EVT VecVT, SDValue Index) const
Get a pointer to vector element Idx located in memory for a vector of type VecVT starting at a base a...
virtual unsigned combineRepeatedFPDivisors() const
Indicate whether this target prefers to combine FDIVs with the same divisor.
virtual void LowerAsmOperandForConstraint(SDValue Op, StringRef Constraint, std::vector< SDValue > &Ops, SelectionDAG &DAG) const
Lower the specified operand into the Ops vector.
virtual bool isGuaranteedNotToBeUndefOrPoisonForTargetNode(SDValue Op, const APInt &DemandedElts, const SelectionDAG &DAG, bool PoisonOnly, unsigned Depth) const
Return true if this function can prove that Op is never poison and, if PoisonOnly is false,...
void expandShiftParts(SDNode *N, SDValue &Lo, SDValue &Hi, SelectionDAG &DAG) const
Expand shift-by-parts.
virtual bool canCreateUndefOrPoisonForTargetNode(SDValue Op, const APInt &DemandedElts, const SelectionDAG &DAG, bool PoisonOnly, bool ConsiderFlags, unsigned Depth) const
Return true if Op can create undef or poison from non-undef & non-poison operands.
Primary interface to the complete machine description for the target machine.
TLSModel::Model getTLSModel(const GlobalValue *GV) const
Returns the TLS model which should be used for the given global variable.
const Triple & getTargetTriple() const
bool useTLSDESC() const
Returns true if this target uses TLS Descriptors.
bool useEmulatedTLS() const
Returns true if this target uses emulated TLS.
TargetOptions Options
CodeModel::Model getCodeModel() const
Returns the code model.
const MCAsmInfo * getMCAsmInfo() const
Return target specific asm information.
unsigned NoSignedZerosFPMath
NoSignedZerosFPMath - This flag is enabled when the -enable-no-signed-zeros-fp-math is specified on t...
unsigned NoNaNsFPMath
NoNaNsFPMath - This flag is enabled when the -enable-no-nans-fp-math flag is specified on the command...
FPOpFusion::FPOpFusionMode AllowFPOpFusion
AllowFPOpFusion - This flag is set by the -fp-contract=xxx option.
bool contains(Register Reg) const
Return true if the specified register is included in this register class.
bool hasSuperClassEq(const TargetRegisterClass *RC) const
Returns true if RC is a super-class of or equal to this class.
TargetRegisterInfo base class - We assume that the target defines a static array of TargetRegisterDes...
virtual const TargetInstrInfo * getInstrInfo() const
Target - Wrapper for Target specific information.
bool isOSBinFormatCOFF() const
Tests whether the OS uses the COFF binary format.
Definition Triple.h:774
static constexpr TypeSize getFixed(ScalarTy ExactSize)
Definition TypeSize.h:343
The instances of the Type class are immutable: once they are created, they are never changed.
Definition Type.h:45
bool isVectorTy() const
True if this is an instance of VectorType.
Definition Type.h:273
static LLVM_ABI IntegerType * getInt32Ty(LLVMContext &C)
Definition Type.cpp:297
bool isFloatTy() const
Return true if this is 'float', a 32-bit IEEE fp type.
Definition Type.h:153
LLVM_ABI TypeSize getPrimitiveSizeInBits() const LLVM_READONLY
Return the basic size of this type if it is a primitive type.
Definition Type.cpp:198
bool isHalfTy() const
Return true if this is 'half', a 16-bit IEEE fp type.
Definition Type.h:142
LLVM_ABI unsigned getScalarSizeInBits() const LLVM_READONLY
If this is a vector type, return the getPrimitiveSizeInBits value for the element type.
Definition Type.cpp:231
bool isDoubleTy() const
Return true if this is 'double', a 64-bit IEEE fp type.
Definition Type.h:156
static LLVM_ABI IntegerType * getInt1Ty(LLVMContext &C)
Definition Type.cpp:294
bool isIntegerTy() const
True if this is an instance of IntegerType.
Definition Type.h:240
static LLVM_ABI IntegerType * getIntNTy(LLVMContext &C, unsigned N)
Definition Type.cpp:301
static LLVM_ABI UndefValue * get(Type *T)
Static factory methods - Return an 'undef' object of the specified type.
A Use represents the edge between a Value definition and its users.
Definition Use.h:35
User * getUser() const
Returns the User that contains this Use.
Definition Use.h:61
LLVM_ABI unsigned getOperandNo() const
Return the operand # of this use in its User.
Definition Use.cpp:35
Value * getOperand(unsigned i) const
Definition User.h:232
LLVM Value Representation.
Definition Value.h:75
Type * getType() const
All values are typed, get the type of this value.
Definition Value.h:256
user_iterator user_begin()
Definition Value.h:402
bool hasOneUse() const
Return true if there is exactly one use of this value.
Definition Value.h:439
LLVM_ABI void replaceAllUsesWith(Value *V)
Change all uses of this to point to a new Value.
Definition Value.cpp:546
iterator_range< user_iterator > users()
Definition Value.h:426
use_iterator use_begin()
Definition Value.h:364
bool use_empty() const
Definition Value.h:346
LLVM_ABI LLVMContext & getContext() const
All values hold a context through their type.
Definition Value.cpp:1101
iterator_range< use_iterator > uses()
Definition Value.h:380
LLVM_ABI StringRef getName() const
Return a constant reference to the value's name.
Definition Value.cpp:322
static LLVM_ABI bool isValidElementType(Type *ElemTy)
Return true if the specified type is valid as an element type.
bool has128ByteRedZone(const MachineFunction &MF) const
Return true if the function has a redzone (accessible bytes past the frame of the top of stack functi...
bool Uses64BitFramePtr
True if the 64-bit frame or stack pointer should be used.
Register getGlobalBaseReg(MachineFunction *MF) const
getGlobalBaseReg - Return a virtual register initialized with the global base register value.
X86MachineFunctionInfo - This class is derived from MachineFunction and contains private X86 target-s...
void setAMXProgModel(AMXProgModelEnum Model)
ArrayRef< size_t > getPreallocatedArgOffsets(const size_t Id)
void setRestoreBasePointer(const MachineFunction *MF)
size_t getPreallocatedStackSize(const size_t Id)
bool hasBasePointer(const MachineFunction &MF) const
Register getPtrSizedFrameRegister(const MachineFunction &MF) const
Register getFrameRegister(const MachineFunction &MF) const override
Register getPtrSizedStackRegister(const MachineFunction &MF) const
Register getStackRegister() const
unsigned getSlotSize() const
Register getBaseRegister() const
const uint32_t * getNoPreservedMask() const override
bool canExtendTo512BW() const
bool hasAnyFMA() const
bool hasSSE1() const
bool avoidMFence() const
Avoid use of mfence for fence seq_cst, and instead use lock or.
bool hasBitScanPassThrough() const
bool hasSSE42() const
const X86TargetLowering * getTargetLowering() const override
bool hasMFence() const
Use mfence if we have SSE2 or we're on x86-64 (even if we asked for no-sse2).
bool canUseCMOV() const
bool isTargetDarwin() const
bool isTarget64BitLP64() const
Is this x86_64 with the LP64 programming model (standard AMD64, no x32)?
const X86InstrInfo * getInstrInfo() const override
bool useAVX512Regs() const
bool hasSSE3() const
bool isCallingConvWin64(CallingConv::ID CC) const
bool hasAVX512() const
bool canExtendTo512DQ() const
bool hasSSE41() const
bool hasSSE2() const
bool hasSSSE3() const
bool hasInt256() const
const X86RegisterInfo * getRegisterInfo() const override
bool hasAVX() const
unsigned getPreferVectorWidth() const
const X86FrameLowering * getFrameLowering() const override
bool useBWIRegs() const
bool hasAVX2() const
bool shouldFormOverflowOp(unsigned Opcode, EVT VT, bool MathUsed) const override
Overflow nodes should get combined/lowered to optimal instructions (they should allow eliminating exp...
Align getPrefLoopAlignment(MachineLoop *ML) const override
Return the preferred loop alignment.
bool isLegalAddImmediate(int64_t Imm) const override
Return true if the specified immediate is a legal add immediate, that is, the target has add instruction...
bool preferSextInRegOfTruncate(EVT TruncVT, EVT VT, EVT ExtVT) const override
SDValue PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const override
This method will be invoked for all target nodes and for any target-independent nodes that the target...
bool preferABDSToABSWithNSW(EVT VT) const override
bool isCheapToSpeculateCtlz(Type *Ty) const override
Return true if it is cheap to speculate a call to intrinsic ctlz.
unsigned getJumpTableEncoding() const override
Return the entry encoding for a jump table in the current function.
std::pair< SDValue, SDValue > BuildFILD(EVT DstVT, EVT SrcVT, const SDLoc &DL, SDValue Chain, SDValue Pointer, MachinePointerInfo PtrInfo, Align Alignment, SelectionDAG &DAG) const
bool SimplifyDemandedVectorEltsForTargetNode(SDValue Op, const APInt &DemandedElts, APInt &KnownUndef, APInt &KnownZero, TargetLoweringOpt &TLO, unsigned Depth) const override
Attempt to simplify any target nodes based on the demanded vector elements, returning true on success...
SDValue LowerAsmOutputForConstraint(SDValue &Chain, SDValue &Flag, const SDLoc &DL, const AsmOperandInfo &Constraint, SelectionDAG &DAG) const override
Handle Lowering flag assembly outputs.
const char * LowerXConstraint(EVT ConstraintVT) const override
Try to replace an X constraint, which matches anything, with another that has more specific requireme...
SDValue SimplifyMultipleUseDemandedBitsForTargetNode(SDValue Op, const APInt &DemandedBits, const APInt &DemandedElts, SelectionDAG &DAG, unsigned Depth) const override
More limited version of SimplifyDemandedBits that can be used to "lookthrough" ops that don't contrib...
bool useLoadStackGuardNode(const Module &M) const override
If this function returns true, SelectionDAGBuilder emits a LOAD_STACK_GUARD node when it is lowering ...
bool isSplatValueForTargetNode(SDValue Op, const APInt &DemandedElts, APInt &UndefElts, const SelectionDAG &DAG, unsigned Depth) const override
Return true if vector Op has the same value across all DemandedElts, indicating any elements which ma...
bool convertSelectOfConstantsToMath(EVT VT) const override
Return true if a select of constants (select Cond, C1, C2) should be transformed into simple math ops...
ConstraintType getConstraintType(StringRef Constraint) const override
Given a constraint letter, return the type of constraint for this target.
Register getExceptionSelectorRegister(const Constant *PersonalityFn) const override
If a physical register, this returns the register that receives the exception typeid on entry to a la...
ShiftLegalizationStrategy preferredShiftLegalizationStrategy(SelectionDAG &DAG, SDNode *N, unsigned ExpansionFactor) const override
SDValue LowerOperation(SDValue Op, SelectionDAG &DAG) const override
Provide custom lowering hooks for some operations.
bool isLegalStoreImmediate(int64_t Imm) const override
Return true if the specified immediate is legal for the value input of a store instruction.
SDValue visitMaskedStore(SelectionDAG &DAG, const SDLoc &DL, SDValue Chain, MachineMemOperand *MMO, SDValue Ptr, SDValue Val, SDValue Mask) const override
SDValue getNegatedExpression(SDValue Op, SelectionDAG &DAG, bool LegalOperations, bool ForCodeSize, NegatibleCost &Cost, unsigned Depth) const override
Return the newly negated expression if the cost is not expensive and set the cost in Cost to indicate...
bool isTypeDesirableForOp(unsigned Opc, EVT VT) const override
Return true if the target has native support for the specified value type and it is 'desirable' to us...
bool isCtlzFast() const override
Return true if ctlz instruction is fast.
Register getRegisterByName(const char *RegName, LLT VT, const MachineFunction &MF) const override
Return the register ID of the name passed in.
bool shouldProduceAndByConstByHoistingConstFromShiftsLHSOfAnd(SDValue X, ConstantSDNode *XC, ConstantSDNode *CC, SDValue Y, unsigned OldShiftOpcode, unsigned NewShiftOpcode, SelectionDAG &DAG) const override
Given the pattern (X & (C l>>/<< Y)) ==/!= 0 return true if it should be transformed into: ((X <</l>>...
bool supportSwiftError() const override
Return true if the target supports swifterror attribute.
bool isCheapToSpeculateCttz(Type *Ty) const override
Return true if it is cheap to speculate a call to intrinsic cttz.
bool shouldSplatInsEltVarIndex(EVT VT) const override
Return true if inserting a scalar into a variable element of an undef vector is more efficiently hand...
bool isInlineAsmTargetBranch(const SmallVectorImpl< StringRef > &AsmStrs, unsigned OpNo) const override
On x86, return true if the operand with index OpNo is a CALL or JUMP instruction, which can use eithe...
MVT hasFastEqualityCompare(unsigned NumBits) const override
Vector-sized comparisons are fast using PCMPEQ + PMOVMSK or PTEST.
bool SimplifyDemandedVectorEltsForTargetShuffle(SDValue Op, const APInt &DemandedElts, unsigned MaskIndex, TargetLoweringOpt &TLO, unsigned Depth) const
bool isLegalICmpImmediate(int64_t Imm) const override
Return true if the specified immediate is a legal icmp immediate, that is, the target has icmp instructi...
bool hasInlineStackProbe(const MachineFunction &MF) const override
Returns true if stack probing through inline assembly is requested.
MachineBasicBlock * EmitInstrWithCustomInserter(MachineInstr &MI, MachineBasicBlock *MBB) const override
This method should be implemented by targets that mark instructions with the 'usesCustomInserter' fla...
unsigned preferedOpcodeForCmpEqPiecesOfOperand(EVT VT, unsigned ShiftOpc, bool MayTransformRotate, const APInt &ShiftOrRotateAmt, const std::optional< APInt > &AndMask) const override
bool isXAndYEqZeroPreferableToXAndYEqY(ISD::CondCode Cond, EVT VT) const override
bool canMergeStoresTo(unsigned AddressSpace, EVT MemVT, const MachineFunction &MF) const override
Returns true if it's reasonable to merge stores to MemVT size.
bool hasAndNot(SDValue Y) const override
Return true if the target has a bitwise and-not operation: X = ~A & B This can be used to simplify se...
bool SimplifyDemandedBitsForTargetNode(SDValue Op, const APInt &DemandedBits, const APInt &DemandedElts, KnownBits &Known, TargetLoweringOpt &TLO, unsigned Depth) const override
Attempt to simplify any target nodes based on the demanded bits/elts, returning true on success.
bool shouldConvertConstantLoadToIntImm(const APInt &Imm, Type *Ty) const override
Returns true if it is beneficial to convert a load of a constant to just the constant itself.
bool shouldReduceLoadWidth(SDNode *Load, ISD::LoadExtType ExtTy, EVT NewVT, std::optional< unsigned > ByteOffset) const override
Return true if we believe it is correct and profitable to reduce the load node to a smaller type.
bool preferScalarizeSplat(SDNode *N) const override
std::pair< unsigned, const TargetRegisterClass * > getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI, StringRef Constraint, MVT VT) const override
Given a physical register constraint (e.g.
bool isNarrowingProfitable(SDNode *N, EVT SrcVT, EVT DestVT) const override
Return true if it's profitable to narrow operations of type SrcVT to DestVT.
bool isFPImmLegal(const APFloat &Imm, EVT VT, bool ForCodeSize) const override
Returns true if the target can instruction select the specified FP immediate natively.
bool shouldFoldConstantShiftPairToMask(const SDNode *N, CombineLevel Level) const override
Return true if it is profitable to fold a pair of shifts into a mask.
MachineInstr * EmitKCFICheck(MachineBasicBlock &MBB, MachineBasicBlock::instr_iterator &MBBI, const TargetInstrInfo *TII) const override
bool isLoadBitCastBeneficial(EVT LoadVT, EVT BitcastVT, const SelectionDAG &DAG, const MachineMemOperand &MMO) const override
Return true if the following transform is beneficial: fold (conv (load x)) -> (load (conv*)x) On arch...
bool hasAndNotCompare(SDValue Y) const override
Return true if the target should transform: (X & Y) == Y ---> (~X & Y) == 0 (X & Y) !...
bool reduceSelectOfFPConstantLoads(EVT CmpOpVT) const override
Return true if it is profitable to convert a select of FP constants into a constant pool load whose a...
StringRef getStackProbeSymbolName(const MachineFunction &MF) const override
Returns the name of the symbol used to emit stack probes or the empty string if not applicable.
bool hasBitTest(SDValue X, SDValue Y) const override
Return true if the target has a bit-test instruction: (X & (1 << Y)) ==/!= 0 This knowledge can be us...
bool allowTruncateForTailCall(Type *Ty1, Type *Ty2) const override
Return true if a truncation from FromTy to ToTy is permitted when deciding whether a call is in tail ...
bool isShuffleMaskLegal(ArrayRef< int > Mask, EVT VT) const override
Targets can use this to indicate that they only support some VECTOR_SHUFFLE operations,...
bool useStackGuardXorFP() const override
If this function returns true, stack protection checks should XOR the frame pointer (or whichever poi...
unsigned ComputeNumSignBitsForTargetNode(SDValue Op, const APInt &DemandedElts, const SelectionDAG &DAG, unsigned Depth) const override
Determine the number of bits in the operation that are sign bits.
bool shouldScalarizeBinop(SDValue) const override
Scalar ops always have equal or better analysis/performance/power than the vector equivalent,...
bool isTruncateFree(Type *Ty1, Type *Ty2) const override
Return true if it's free to truncate a value of type Ty1 to type Ty2.
bool decomposeMulByConstant(LLVMContext &Context, EVT VT, SDValue C) const override
Return true if it is profitable to transform an integer multiplication-by-constant into simpler opera...
bool areJTsAllowed(const Function *Fn) const override
Returns true if lowering to a jump table is allowed.
bool isCommutativeBinOp(unsigned Opcode) const override
Returns true if the opcode is a commutative binary operation.
bool isScalarFPTypeInSSEReg(EVT VT) const
Return true if the specified scalar FP type is computed in an SSE register, not on the X87 floating p...
const char * getTargetNodeName(unsigned Opcode) const override
This method returns the name of a target specific DAG node.
MVT getPreferredSwitchConditionType(LLVMContext &Context, EVT ConditionVT) const override
Returns preferred type for switch condition.
SDValue visitMaskedLoad(SelectionDAG &DAG, const SDLoc &DL, SDValue Chain, MachineMemOperand *MMO, SDValue &NewLoad, SDValue Ptr, SDValue PassThru, SDValue Mask) const override
bool isFMAFasterThanFMulAndFAdd(const MachineFunction &MF, EVT VT) const override
Return true if an FMA operation is faster than a pair of fmul and fadd instructions.
bool isExtractSubvectorCheap(EVT ResVT, EVT SrcVT, unsigned Index) const override
Return true if EXTRACT_SUBVECTOR is cheap for this result type with this index.
bool isVectorClearMaskLegal(ArrayRef< int > Mask, EVT VT) const override
Similar to isShuffleMaskLegal.
ConstraintWeight getSingleConstraintMatchWeight(AsmOperandInfo &Info, const char *Constraint) const override
Examine constraint string and operand type and determine a weight value.
bool isIntDivCheap(EVT VT, AttributeList Attr) const override
Return true if integer divide is usually cheaper than a sequence of several shifts,...
LegalizeTypeAction getPreferredVectorAction(MVT VT) const override
Customize the preferred legalization strategy for certain types.
bool shouldConvertPhiType(Type *From, Type *To) const override
Given a set of interconnected phis of type 'From' that are loaded/stored or bitcast to type 'To',...
bool hasStackProbeSymbol(const MachineFunction &MF) const override
Returns true if stack probing through a function call is requested.
bool isZExtFree(Type *Ty1, Type *Ty2) const override
Return true if any actual instruction that defines a value of type Ty1 implicit zero-extends the valu...
bool allowsMemoryAccess(LLVMContext &Context, const DataLayout &DL, EVT VT, unsigned AddrSpace, Align Alignment, MachineMemOperand::Flags Flags=MachineMemOperand::MONone, unsigned *Fast=nullptr) const override
This function returns true if the memory access is aligned or if the target allows this specific unal...
bool isLegalAddressingMode(const DataLayout &DL, const AddrMode &AM, Type *Ty, unsigned AS, Instruction *I=nullptr) const override
Return true if the addressing mode represented by AM is legal for this target, for a load/store of th...
SDValue emitStackGuardXorFP(SelectionDAG &DAG, SDValue Val, const SDLoc &DL) const override
TargetLowering::AndOrSETCCFoldKind isDesirableToCombineLogicOpOfSETCC(const SDNode *LogicOp, const SDNode *SETCC0, const SDNode *SETCC1) const override
Return the preferred fold type: Abs if this is a vector, AddAnd if it is an integer, None otherwise.
bool shouldFoldMaskToVariableShiftPair(SDValue Y) const override
There are two ways to clear extreme bits (either low or high): Mask: x & (-1 << y) (the instcombine c...
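A minimal C++ sketch (not code from this file) of the two equivalent ways of clearing the low Y bits that this hook chooses between.
// Sketch only: mask form (the instcombine canonical form) vs. shift-pair form.
static unsigned clearLowMask(unsigned X, unsigned Y) { return X & (-1u << Y); }
static unsigned clearLowShifts(unsigned X, unsigned Y) { return (X >> Y) << Y; }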
bool shouldFoldSelectWithIdentityConstant(unsigned BinOpcode, EVT VT, unsigned SelectOpcode, SDValue X, SDValue Y) const override
Return true if pulling a binary operation into a select with an identity constant is profitable.
bool addressingModeSupportsTLS(const GlobalValue &GV) const override
Returns true if the target's addressing mode can target thread local storage (TLS).
SDValue getReturnAddressFrameIndex(SelectionDAG &DAG) const
bool targetShrinkDemandedConstant(SDValue Op, const APInt &DemandedBits, const APInt &DemandedElts, TargetLoweringOpt &TLO) const override
Register getExceptionPointerRegister(const Constant *PersonalityFn) const override
If a physical register, this returns the register that receives the exception address on entry to an ...
SDValue expandIndirectJTBranch(const SDLoc &dl, SDValue Value, SDValue Addr, int JTI, SelectionDAG &DAG) const override
Expands target specific indirect branch for the case of JumpTable expansion.
FastISel * createFastISel(FunctionLoweringInfo &funcInfo, const TargetLibraryInfo *libInfo) const override
This method returns a target specific FastISel object, or null if the target does not support "fast" ...
void computeKnownBitsForTargetNode(const SDValue Op, KnownBits &Known, const APInt &DemandedElts, const SelectionDAG &DAG, unsigned Depth=0) const override
Determine which of the bits specified in Mask are known to be either zero or one and return them in t...
bool isBinOp(unsigned Opcode) const override
Add x86-specific opcodes to the default list.
bool isGuaranteedNotToBeUndefOrPoisonForTargetNode(SDValue Op, const APInt &DemandedElts, const SelectionDAG &DAG, bool PoisonOnly, unsigned Depth) const override
Return true if this function can prove that Op is never poison and, if PoisonOnly is false,...
bool IsDesirableToPromoteOp(SDValue Op, EVT &PVT) const override
Return true if the target has native support for the specified value type and it is 'desirable' to us...
SDValue unwrapAddress(SDValue N) const override
CondMergingParams getJumpConditionMergingParams(Instruction::BinaryOps Opc, const Value *Lhs, const Value *Rhs) const override
EVT getSetCCResultType(const DataLayout &DL, LLVMContext &Context, EVT VT) const override
Return the value type to use for ISD::SETCC.
X86TargetLowering(const X86TargetMachine &TM, const X86Subtarget &STI)
bool isTargetCanonicalSelect(SDNode *N) const override
Return true if the given select/vselect should be considered canonical and not be transformed.
bool isVectorLoadExtDesirable(SDValue) const override
Return true if folding a vector load into ExtVal (a sign, zero, or any extend node) is profitable.
bool getTgtMemIntrinsic(IntrinsicInfo &Info, const CallInst &I, MachineFunction &MF, unsigned Intrinsic) const override
Given an intrinsic, checks if on the target the intrinsic will need to map to a MemIntrinsicNode (tou...
const Constant * getTargetConstantFromLoad(LoadSDNode *LD) const override
This method returns the constant pool value that will be loaded by LD.
EVT getTypeToTransformTo(LLVMContext &Context, EVT VT) const override
For types supported by the target, this is an identity function.
bool canCreateUndefOrPoisonForTargetNode(SDValue Op, const APInt &DemandedElts, const SelectionDAG &DAG, bool PoisonOnly, bool ConsiderFlags, unsigned Depth) const override
Return true if Op can create undef or poison from non-undef & non-poison operands.
unsigned getStackProbeSize(const MachineFunction &MF) const
bool ShouldShrinkFPConstant(EVT VT) const override
If true, then instruction selection should seek to shrink the FP constant of the specified type to a ...
void ReplaceNodeResults(SDNode *N, SmallVectorImpl< SDValue > &Results, SelectionDAG &DAG) const override
Replace the results of node with an illegal result type with new values built out of custom code.
bool isMaskAndCmp0FoldingBeneficial(const Instruction &AndI) const override
Return true if the target supports combining a chain like:
void LowerAsmOperandForConstraint(SDValue Op, StringRef Constraint, std::vector< SDValue > &Ops, SelectionDAG &DAG) const override
Lower the specified operand into the Ops vector.
bool needsFixedCatchObjects() const override
std::pair< iterator, bool > insert(const ValueT &V)
Definition DenseSet.h:194
bool contains(const_arg_type_t< ValueT > V) const
Check if the set contains the given element.
Definition DenseSet.h:169
constexpr ScalarTy getFixedValue() const
Definition TypeSize.h:200
An efficient, type-erasing, non-owning reference to a callable.
const ParentTy * getParent() const
Definition ilist_node.h:34
self_iterator getIterator()
Definition ilist_node.h:134
CallInst * Call
#define INT64_MIN
Definition DataTypes.h:74
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
constexpr char Align[]
Key for Kernel::Arg::Metadata::mAlign.
constexpr char Args[]
Key for Kernel::Metadata::mArgs.
constexpr char Attrs[]
Key for Kernel::Metadata::mAttrs.
LLVM_ABI APInt ScaleBitMask(const APInt &A, unsigned NewBitWidth, bool MatchAllBits=false)
Splat/Merge neighboring bits to widen/narrow the bitmask represented by A to NewBitWidth bits.
Definition APInt.cpp:3009
@ COND_NE
Not equal.
constexpr std::underlying_type_t< E > Mask()
Get a bitmask with 1s in all places up to the high-order bit of E's largest value.
@ Entry
Definition COFF.h:862
unsigned ID
LLVM IR allows the use of arbitrary numbers as calling convention identifiers.
Definition CallingConv.h:24
@ X86_ThisCall
Similar to X86_StdCall.
@ X86_StdCall
stdcall is mostly used by the Win32 API.
Definition CallingConv.h:99
@ Fast
Attempts to make calls as fast as possible (e.g.
Definition CallingConv.h:41
@ Tail
Attempts to make calls as fast as possible while guaranteeing that tail call optimization can always b...
Definition CallingConv.h:76
@ SwiftTail
This follows the Swift calling convention in how arguments are passed but guarantees tail calls will ...
Definition CallingConv.h:87
@ C
The default llvm calling convention, compatible with C.
Definition CallingConv.h:34
@ X86_FastCall
'fast' analog of X86_StdCall.
LLVM_ABI bool isConstantSplatVectorAllOnes(const SDNode *N, bool BuildVectorOnly=false)
Return true if the specified node is a BUILD_VECTOR or SPLAT_VECTOR where all of the elements are ~0 ...
bool isNON_EXTLoad(const SDNode *N)
Returns true if the specified node is a non-extending load.
NodeType
ISD::NodeType enum - This enum defines the target-independent operators for a SelectionDAG.
Definition ISDOpcodes.h:41
@ SETCC
SetCC operator - This evaluates to a true value iff the condition is true.
Definition ISDOpcodes.h:801
@ MERGE_VALUES
MERGE_VALUES - This node takes multiple discrete operands and returns them all as its individual resu...
Definition ISDOpcodes.h:256
@ CTLZ_ZERO_UNDEF
Definition ISDOpcodes.h:774
@ STRICT_FSETCC
STRICT_FSETCC/STRICT_FSETCCS - Constrained versions of SETCC, used for floating-point operands only.
Definition ISDOpcodes.h:504
@ DELETED_NODE
DELETED_NODE - This is an illegal value that is used to catch errors.
Definition ISDOpcodes.h:45
@ EH_SJLJ_LONGJMP
OUTCHAIN = EH_SJLJ_LONGJMP(INCHAIN, buffer) This corresponds to the eh.sjlj.longjmp intrinsic.
Definition ISDOpcodes.h:163
@ FGETSIGN
INT = FGETSIGN(FP) - Return the sign bit of the specified floating point value as an integer 0/1 valu...
Definition ISDOpcodes.h:525
@ SMUL_LOHI
SMUL_LOHI/UMUL_LOHI - Multiply two integers of type iN, producing a signed/unsigned value of type i[2...
Definition ISDOpcodes.h:270
@ INSERT_SUBVECTOR
INSERT_SUBVECTOR(VECTOR1, VECTOR2, IDX) - Returns a vector with VECTOR2 inserted into VECTOR1.
Definition ISDOpcodes.h:587
@ BSWAP
Byte Swap and Counting operators.
Definition ISDOpcodes.h:765
@ FRAME_TO_ARGS_OFFSET
FRAME_TO_ARGS_OFFSET - This node represents offset from frame pointer to first (possible) on-stack ar...
Definition ISDOpcodes.h:140
@ FMAD
FMAD - Perform a * b + c, while getting the same result as the separately rounded operations.
Definition ISDOpcodes.h:515
@ ADD
Simple integer binary arithmetic operators.
Definition ISDOpcodes.h:259
@ ANY_EXTEND
ANY_EXTEND - Used for integer types. The high bits are undefined.
Definition ISDOpcodes.h:835
@ FMA
FMA - Perform a * b + c with no intermediate rounding step.
Definition ISDOpcodes.h:511
@ INTRINSIC_VOID
OUTCHAIN = INTRINSIC_VOID(INCHAIN, INTRINSICID, arg1, arg2, ...) This node represents a target intrin...
Definition ISDOpcodes.h:215
@ EH_SJLJ_SETUP_DISPATCH
OUTCHAIN = EH_SJLJ_SETUP_DISPATCH(INCHAIN) The target initializes the dispatch table here.
Definition ISDOpcodes.h:167
@ GlobalAddress
Definition ISDOpcodes.h:88
@ STRICT_FMINIMUM
Definition ISDOpcodes.h:464
@ SINT_TO_FP
[SU]INT_TO_FP - These operators convert integers (whose interpreted sign depends on the first letter)...
Definition ISDOpcodes.h:862
@ CONCAT_VECTORS
CONCAT_VECTORS(VECTOR0, VECTOR1, ...) - Given a number of values of vector type with the same length ...
Definition ISDOpcodes.h:571
@ FADD
Simple binary floating point operators.
Definition ISDOpcodes.h:410
@ ABS
ABS - Determine the unsigned absolute value of a signed integer value of the same bitwidth.
Definition ISDOpcodes.h:738
@ SIGN_EXTEND_VECTOR_INREG
SIGN_EXTEND_VECTOR_INREG(Vector) - This operator represents an in-register sign-extension of the low ...
Definition ISDOpcodes.h:892
@ SDIVREM
SDIVREM/UDIVREM - Divide two integers and produce both a quotient and remainder result.
Definition ISDOpcodes.h:275
@ BUILD_PAIR
BUILD_PAIR - This is the opposite of EXTRACT_ELEMENT in some ways.
Definition ISDOpcodes.h:249
@ STRICT_FSQRT
Constrained versions of libm-equivalent floating point intrinsics.
Definition ISDOpcodes.h:431
@ BUILTIN_OP_END
BUILTIN_OP_END - This must be the last enum value in this list.
@ GlobalTLSAddress
Definition ISDOpcodes.h:89
@ EH_RETURN
OUTCHAIN = EH_RETURN(INCHAIN, OFFSET, HANDLER) - This node represents 'eh_return' gcc dwarf builtin,...
Definition ISDOpcodes.h:151
@ SIGN_EXTEND
Conversion operators.
Definition ISDOpcodes.h:826
@ AVGCEILS
AVGCEILS/AVGCEILU - Rounding averaging add - Add two integers using an integer of type i[N+2],...
Definition ISDOpcodes.h:706
@ STRICT_UINT_TO_FP
Definition ISDOpcodes.h:478
@ SCALAR_TO_VECTOR
SCALAR_TO_VECTOR(VAL) - This represents the operation of loading a scalar value into element 0 of the...
Definition ISDOpcodes.h:656
@ ADDROFRETURNADDR
ADDROFRETURNADDR - Represents the llvm.addressofreturnaddress intrinsic.
Definition ISDOpcodes.h:117
@ CTTZ_ZERO_UNDEF
Bit counting operators with an undefined result for zero inputs.
Definition ISDOpcodes.h:773
@ SETCCCARRY
Like SetCC, ops #0 and #1 are the LHS and RHS operands to compare, but op #2 is a boolean indicating ...
Definition ISDOpcodes.h:809
@ SSUBO
Same for subtraction.
Definition ISDOpcodes.h:347
@ FCANONICALIZE
Returns platform specific canonical encoding of a floating point number.
Definition ISDOpcodes.h:528
@ SSUBSAT
RESULT = [US]SUBSAT(LHS, RHS) - Perform saturation subtraction on 2 integers with the same bit width ...
Definition ISDOpcodes.h:369
@ SELECT
Select(COND, TRUEVAL, FALSEVAL).
Definition ISDOpcodes.h:778
@ UNDEF
UNDEF - An undefined node.
Definition ISDOpcodes.h:228
@ SPLAT_VECTOR
SPLAT_VECTOR(VAL) - Returns a vector with the scalar value VAL duplicated in all lanes.
Definition ISDOpcodes.h:663
@ BasicBlock
Various leaf nodes.
Definition ISDOpcodes.h:81
@ CopyFromReg
CopyFromReg - This node indicates that the input value is a virtual or physical register that is defi...
Definition ISDOpcodes.h:225
@ SADDO
RESULT, BOOL = [SU]ADDO(LHS, RHS) - Overflow-aware nodes for addition.
Definition ISDOpcodes.h:343
@ GET_ROUNDING
Returns the current rounding mode: -1 Undefined; 0 Round to 0; 1 Round to nearest, ties to even; 2 Round to ...
Definition ISDOpcodes.h:952
@ MULHU
MULHU/MULHS - Multiply high - Multiply two integers of type iN, producing an unsigned/signed value of...
Definition ISDOpcodes.h:695
@ SHL
Shift and rotation operations.
Definition ISDOpcodes.h:756
@ VECTOR_SHUFFLE
VECTOR_SHUFFLE(VEC1, VEC2) - Returns a vector, of the same type as VEC1/VEC2.
Definition ISDOpcodes.h:636
@ EXTRACT_SUBVECTOR
EXTRACT_SUBVECTOR(VECTOR, IDX) - Returns a subvector from VECTOR.
Definition ISDOpcodes.h:601
@ STRICT_FMAXIMUM
Definition ISDOpcodes.h:463
@ EXTRACT_VECTOR_ELT
EXTRACT_VECTOR_ELT(VECTOR, IDX) - Returns a single element from VECTOR identified by the (potentially...
Definition ISDOpcodes.h:563
@ CopyToReg
CopyToReg - This node has three operands: a chain, a register number to set to this value,...
Definition ISDOpcodes.h:219
@ ZERO_EXTEND
ZERO_EXTEND - Used for integer types, zeroing the new bits.
Definition ISDOpcodes.h:832
@ SELECT_CC
Select with condition operator - This selects between a true value and a false value (ops #2 and #3) ...
Definition ISDOpcodes.h:793
@ LOCAL_RECOVER
LOCAL_RECOVER - Represents the llvm.localrecover intrinsic.
Definition ISDOpcodes.h:130
@ SMULO
Same for multiplication.
Definition ISDOpcodes.h:351
@ ANY_EXTEND_VECTOR_INREG
ANY_EXTEND_VECTOR_INREG(Vector) - This operator represents an in-register any-extension of the low la...
Definition ISDOpcodes.h:881
@ SIGN_EXTEND_INREG
SIGN_EXTEND_INREG - This operator atomically performs a SHL/SRA pair to sign extend a small value in ...
Definition ISDOpcodes.h:870
@ SMIN
[US]{MIN/MAX} - Binary minimum or maximum of signed or unsigned integers.
Definition ISDOpcodes.h:718
@ VSELECT
Select with a vector condition (op #0) and two vector operands (ops #1 and #2), returning a vector re...
Definition ISDOpcodes.h:787
@ UADDO_CARRY
Carry-using nodes for multiple precision addition and subtraction.
Definition ISDOpcodes.h:323
@ STRICT_SINT_TO_FP
STRICT_[US]INT_TO_FP - Convert a signed or unsigned integer to a floating point value.
Definition ISDOpcodes.h:477
@ STRICT_FROUNDEVEN
Definition ISDOpcodes.h:457
@ FRAMEADDR
FRAMEADDR, RETURNADDR - These nodes represent llvm.frameaddress and llvm.returnaddress on the DAG.
Definition ISDOpcodes.h:110
@ STRICT_FP_TO_UINT
Definition ISDOpcodes.h:471
@ STRICT_FP_ROUND
X = STRICT_FP_ROUND(Y, TRUNC) - Rounding 'Y' from a larger floating point type down to the precision ...
Definition ISDOpcodes.h:493
@ STRICT_FP_TO_SINT
STRICT_FP_TO_[US]INT - Convert a floating point value to a signed or unsigned integer.
Definition ISDOpcodes.h:470
@ FP_TO_SINT
FP_TO_[US]INT - Convert a floating point value to a signed or unsigned integer.
Definition ISDOpcodes.h:908
@ TargetConstant
TargetConstant* - Like Constant*, but the DAG does not do any folding, simplification,...
Definition ISDOpcodes.h:174
@ STRICT_FP_EXTEND
X = STRICT_FP_EXTEND(Y) - Extend a smaller FP type into a larger FP type.
Definition ISDOpcodes.h:498
@ AND
Bitwise operators - logical and, logical or, logical xor.
Definition ISDOpcodes.h:730
@ INTRINSIC_WO_CHAIN
RESULT = INTRINSIC_WO_CHAIN(INTRINSICID, arg1, arg2, ...) This node represents a target intrinsic fun...
Definition ISDOpcodes.h:200
@ AVGFLOORS
AVGFLOORS/AVGFLOORU - Averaging add - Add two integers using an integer of type i[N+1],...
Definition ISDOpcodes.h:701
@ STRICT_FADD
Constrained versions of the binary floating point operators.
Definition ISDOpcodes.h:420
@ FREEZE
FREEZE - FREEZE(VAL) returns an arbitrary value if VAL is UNDEF (or is evaluated to UNDEF),...
Definition ISDOpcodes.h:236
@ INSERT_VECTOR_ELT
INSERT_VECTOR_ELT(VECTOR, VAL, IDX) - Returns VECTOR with the element at IDX replaced with VAL.
Definition ISDOpcodes.h:552
@ TokenFactor
TokenFactor - This node takes multiple tokens as input and produces a single token result.
Definition ISDOpcodes.h:53
@ ExternalSymbol
Definition ISDOpcodes.h:93
@ FP_ROUND
X = FP_ROUND(Y, TRUNC) - Rounding 'Y' from a larger floating point type down to the precision of the ...
Definition ISDOpcodes.h:941
@ VECTOR_COMPRESS
VECTOR_COMPRESS(Vec, Mask, Passthru) consecutively places vector elements based on mask e....
Definition ISDOpcodes.h:690
@ ZERO_EXTEND_VECTOR_INREG
ZERO_EXTEND_VECTOR_INREG(Vector) - This operator represents an in-register zero-extension of the low ...
Definition ISDOpcodes.h:903
@ STRICT_FNEARBYINT
Definition ISDOpcodes.h:451
@ FP_TO_SINT_SAT
FP_TO_[US]INT_SAT - Convert floating point value in operand 0 to a signed or unsigned scalar integer ...
Definition ISDOpcodes.h:927
@ EH_SJLJ_SETJMP
RESULT, OUTCHAIN = EH_SJLJ_SETJMP(INCHAIN, buffer) This corresponds to the eh.sjlj....
Definition ISDOpcodes.h:157
@ TRUNCATE
TRUNCATE - Completely drop the high bits.
Definition ISDOpcodes.h:838
@ SHL_PARTS
SHL_PARTS/SRA_PARTS/SRL_PARTS - These operators are used for expanded integer shift operations.
Definition ISDOpcodes.h:815
@ AssertSext
AssertSext, AssertZext - These nodes record if a register contains a value that has already been zero...
Definition ISDOpcodes.h:62
@ FCOPYSIGN
FCOPYSIGN(X, Y) - Return the value of X with the sign of Y.
Definition ISDOpcodes.h:521
@ SADDSAT
RESULT = [US]ADDSAT(LHS, RHS) - Perform saturation addition on 2 integers with the same bit width (W)...
Definition ISDOpcodes.h:360
@ ABDS
ABDS/ABDU - Absolute difference - Return the absolute difference between two numbers interpreted as s...
Definition ISDOpcodes.h:713
@ SADDO_CARRY
Carry-using overflow-aware nodes for multiple precision addition and subtraction.
Definition ISDOpcodes.h:333
@ INTRINSIC_W_CHAIN
RESULT,OUTCHAIN = INTRINSIC_W_CHAIN(INCHAIN, INTRINSICID, arg1, ...) This node represents a target in...
Definition ISDOpcodes.h:208
@ BUILD_VECTOR
BUILD_VECTOR(ELT0, ELT1, ELT2, ELT3,...) - Return a fixed-width vector with the specified,...
Definition ISDOpcodes.h:543
bool isExtVecInRegOpcode(unsigned Opcode)
bool isOverflowIntrOpRes(SDValue Op)
Returns true if the specified value is the overflow result from one of the overflow intrinsic nodes.
LLVM_ABI bool isBuildVectorOfConstantSDNodes(const SDNode *N)
Return true if the specified node is a BUILD_VECTOR node of all ConstantSDNode or undef.
bool isNormalStore(const SDNode *N)
Returns true if the specified node is a non-truncating and unindexed store.
bool isExtOpcode(unsigned Opcode)
LLVM_ABI bool isConstantSplatVectorAllZeros(const SDNode *N, bool BuildVectorOnly=false)
Return true if the specified node is a BUILD_VECTOR or SPLAT_VECTOR where all of the elements are 0 o...
LLVM_ABI CondCode getSetCCInverse(CondCode Operation, EVT Type)
Return the operation corresponding to !(X op Y), where 'op' is a valid SetCC operation.
bool isBitwiseLogicOp(unsigned Opcode)
Whether this is bitwise logic opcode.
bool isTrueWhenEqual(CondCode Cond)
Return true if the specified condition returns true if the two operands to the condition are equal.
bool isUNINDEXEDLoad(const SDNode *N)
Returns true if the specified node is an unindexed load.
bool isEXTLoad(const SDNode *N)
Returns true if the specified node is an EXTLOAD.
LLVM_ABI bool isFreezeUndef(const SDNode *N)
Return true if the specified node is FREEZE(UNDEF).
LLVM_ABI CondCode getSetCCSwappedOperands(CondCode Operation)
Return the operation corresponding to (Y op X) when given the operation for (X op Y).
LLVM_ABI bool isBuildVectorAllZeros(const SDNode *N)
Return true if the specified node is a BUILD_VECTOR where all of the elements are 0 or undef.
bool isSignedIntSetCC(CondCode Code)
Return true if this is a setcc instruction that performs a signed comparison when used with integer o...
LLVM_ABI bool isConstantSplatVector(const SDNode *N, APInt &SplatValue)
Node predicates.
bool matchUnaryPredicate(SDValue Op, std::function< bool(ConstantSDNode *)> Match, bool AllowUndefs=false, bool AllowTruncation=false)
Hook for matching ConstantSDNode predicate.
LLVM_ABI bool isBuildVectorOfConstantFPSDNodes(const SDNode *N)
Return true if the specified node is a BUILD_VECTOR node of all ConstantFPSDNode or undef.
CondCode
ISD::CondCode enum - These are ordered carefully to make the bitfields below work out,...
LLVM_ABI bool isBuildVectorAllOnes(const SDNode *N)
Return true if the specified node is a BUILD_VECTOR where all of the elements are ~0 or undef.
LoadExtType
LoadExtType enum - This enum defines the three variants of LOADEXT (load with extension).
bool isUnsignedIntSetCC(CondCode Code)
Return true if this is a setcc instruction that performs an unsigned comparison when used with intege...
bool isNormalLoad(const SDNode *N)
Returns true if the specified node is a non-extending and unindexed load.
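A minimal C++ sketch (not code from this file) combining the ISD::isNormalLoad predicate above with peekThroughBitcasts, which is listed further down in this index.
#include "llvm/CodeGen/SelectionDAG.h"
using namespace llvm;
// Sketch only: look through bitcasts, then require a plain unindexed,
// non-extending load.
static bool isSimpleLoadSource(SDValue V) {
  SDValue Src = peekThroughBitcasts(V);
  return ISD::isNormalLoad(Src.getNode());
}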
This namespace contains an enum with a value for every intrinsic/builtin function known by LLVM.
SpecificConstantMatch m_ZeroInt()
Convenience matchers for specific integer values.
BinaryOp_match< SpecificConstantMatch, SrcTy, TargetOpcode::G_SUB > m_Neg(const SrcTy &&Src)
Matches a register negated by a G_SUB.
BinaryOp_match< SrcTy, SpecificConstantMatch, TargetOpcode::G_XOR, true > m_Not(const SrcTy &&Src)
Matches a register not-ed by a G_XOR.
OneUse_match< SubPat > m_OneUse(const SubPat &SP)
cst_pred_ty< is_all_ones > m_AllOnes()
Match an integer or vector with all bits set.
BinaryOp_match< LHS, RHS, Instruction::And > m_And(const LHS &L, const RHS &R)
BinaryOp_match< LHS, RHS, Instruction::Add > m_Add(const LHS &L, const RHS &R)
class_match< BinaryOperator > m_BinOp()
Match an arbitrary binary operation and ignore it.
cst_pred_ty< is_sign_mask > m_SignMask()
Match an integer or vector with only the sign bit(s) set.
BinaryOp_match< LHS, RHS, Instruction::And, true > m_c_And(const LHS &L, const RHS &R)
Matches an And with LHS and RHS in either order.
BinaryOp_match< LHS, RHS, Instruction::Xor > m_Xor(const LHS &L, const RHS &R)
specific_intval< false > m_SpecificInt(const APInt &V)
Match a specific integer value or vector with all elements equal to the value.
bool match(Val *V, const Pattern &P)
specificval_ty m_Specific(const Value *V)
Match if we have a specific specified value.
CmpClass_match< LHS, RHS, ICmpInst, true > m_c_ICmp(CmpPredicate &Pred, const LHS &L, const RHS &R)
Matches an ICmp with a predicate over LHS and RHS in either order.
TwoOps_match< Val_t, Idx_t, Instruction::ExtractElement > m_ExtractElt(const Val_t &Val, const Idx_t &Idx)
Matches ExtractElementInst.
MaxMin_match< ICmpInst, LHS, RHS, smin_pred_ty > m_SMin(const LHS &L, const RHS &R)
BinaryOp_match< LHS, RHS, Instruction::Xor, true > m_c_Xor(const LHS &L, const RHS &R)
Matches an Xor with LHS and RHS in either order.
BinaryOp_match< LHS, RHS, Instruction::Mul > m_Mul(const LHS &L, const RHS &R)
deferredval_ty< Value > m_Deferred(Value *const &V)
Like m_Specific(), but works if the specific value to match is determined as part of the same match()...
SpecificCmpClass_match< LHS, RHS, ICmpInst > m_SpecificICmp(CmpPredicate MatchPred, const LHS &L, const RHS &R)
CastInst_match< OpTy, ZExtInst > m_ZExt(const OpTy &Op)
Matches ZExt.
MaxMin_match< ICmpInst, LHS, RHS, umax_pred_ty > m_UMax(const LHS &L, const RHS &R)
BinaryOp_match< LHS, RHS, Instruction::Add, true > m_c_Add(const LHS &L, const RHS &R)
Matches an Add with LHS and RHS in either order.
CastOperator_match< OpTy, Instruction::BitCast > m_BitCast(const OpTy &Op)
Matches BitCast.
MaxMin_match< ICmpInst, LHS, RHS, smax_pred_ty > m_SMax(const LHS &L, const RHS &R)
class_match< Value > m_Value()
Match an arbitrary value and ignore it.
AnyBinaryOp_match< LHS, RHS, true > m_c_BinOp(const LHS &L, const RHS &R)
Matches a BinaryOperator with LHS and RHS in either order.
CmpClass_match< LHS, RHS, ICmpInst > m_ICmp(CmpPredicate &Pred, const LHS &L, const RHS &R)
BinaryOp_match< LHS, RHS, Instruction::Shl > m_Shl(const LHS &L, const RHS &R)
auto m_Undef()
Match an arbitrary undef constant.
BinaryOp_match< LHS, RHS, Instruction::Or > m_Or(const LHS &L, const RHS &R)
CastInst_match< OpTy, SExtInst > m_SExt(const OpTy &Op)
Matches SExt.
is_zero m_Zero()
Match any null constant or a vector with all elements equal to 0.
BinaryOp_match< LHS, RHS, Instruction::Or, true > m_c_Or(const LHS &L, const RHS &R)
Matches an Or with LHS and RHS in either order.
BinOpPred_match< LHS, RHS, is_bitwiselogic_op > m_BitwiseLogic(const LHS &L, const RHS &R)
Matches bitwise logic operations.
BinaryOp_match< LHS, RHS, Instruction::Sub > m_Sub(const LHS &L, const RHS &R)
MaxMin_match< ICmpInst, LHS, RHS, umin_pred_ty > m_UMin(const LHS &L, const RHS &R)
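A minimal C++ sketch (not code from this file) using the IR-level PatternMatch helpers listed above (match, m_c_Xor, m_Value, m_AllOnes) to recognize a bitwise NOT written as (X ^ -1).
#include "llvm/IR/PatternMatch.h"
using namespace llvm;
using namespace llvm::PatternMatch;
// Sketch only: m_c_Xor accepts its operands in either order, m_AllOnes matches
// the all-ones constant, and m_Value binds the remaining operand to X.
static bool matchBitwiseNot(Value *V, Value *&X) {
  return match(V, m_c_Xor(m_Value(X), m_AllOnes()));
}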
LLVM_ABI Libcall getSINTTOFP(EVT OpVT, EVT RetVT)
getSINTTOFP - Return the SINTTOFP_*_* value for the given types, or UNKNOWN_LIBCALL if there is none.
LLVM_ABI Libcall getUINTTOFP(EVT OpVT, EVT RetVT)
getUINTTOFP - Return the UINTTOFP_*_* value for the given types, or UNKNOWN_LIBCALL if there is none.
LLVM_ABI Libcall getFPTOUINT(EVT OpVT, EVT RetVT)
getFPTOUINT - Return the FPTOUINT_*_* value for the given types, or UNKNOWN_LIBCALL if there is none.
LLVM_ABI Libcall getFPTOSINT(EVT OpVT, EVT RetVT)
getFPTOSINT - Return the FPTOSINT_*_* value for the given types, or UNKNOWN_LIBCALL if there is none.
LLVM_ABI Libcall getFPROUND(EVT OpVT, EVT RetVT)
getFPROUND - Return the FPROUND_*_* value for the given types, or UNKNOWN_LIBCALL if there is none.
@ Implicit
Not emitted register (e.g. carry, or temporary result).
@ Dead
Unused definition.
@ Define
Register definition.
@ Kill
The last use of a register.
@ Undef
Value of the register doesn't matter.
Opcode_match m_Opc(unsigned Opcode)
BinaryOpc_match< LHS, RHS > m_Srl(const LHS &L, const RHS &R)
auto m_SpecificVT(EVT RefVT, const Pattern &P)
Match a specific ValueType.
TernaryOpc_match< LHS, RHS, IDX > m_InsertSubvector(const LHS &Base, const RHS &Sub, const IDX &Idx)
UnaryOpc_match< Opnd > m_Abs(const Opnd &Op)
Or< Preds... > m_AnyOf(const Preds &...preds)
And< Preds... > m_AllOf(const Preds &...preds)
TernaryOpc_match< T0_P, T1_P, T2_P > m_SetCC(const T0_P &LHS, const T1_P &RHS, const T2_P &CC)
UnaryOpc_match< Opnd > m_AnyExt(const Opnd &Op)
auto m_Node(unsigned Opcode, const OpndPreds &...preds)
TernaryOpc_match< T0_P, T1_P, T2_P > m_VSelect(const T0_P &Cond, const T1_P &T, const T2_P &F)
bool sd_match(SDNode *N, const SelectionDAG *DAG, Pattern &&P)
CondCode_match m_SpecificCondCode(ISD::CondCode CC)
Match a conditional code SDNode with a specific ISD::CondCode.
auto m_SpecificVectorElementVT(EVT RefVT, const Pattern &P)
Match a vector ValueType.
CondCode_match m_CondCode()
Match any conditional code SDNode.
ConstantInt_match m_ConstInt()
Match any integer constants or splat of an integer constant.
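A minimal C++ sketch (not code from this file) of the SelectionDAG matchers listed above; it assumes the llvm::SDPatternMatch namespace and its operand-binding m_Value() helper.
#include "llvm/CodeGen/SDPatternMatch.h"
using namespace llvm;
using namespace llvm::SDPatternMatch;
// Sketch only: match (setcc LHS, RHS, seteq) on N and bind the two operands.
static bool matchEqualitySetCC(SDNode *N, const SelectionDAG &DAG,
                               SDValue &LHS, SDValue &RHS) {
  return sd_match(N, &DAG,
                  m_SetCC(m_Value(LHS), m_Value(RHS),
                          m_SpecificCondCode(ISD::SETEQ)));
}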
@ System
Synchronized with respect to all concurrently executing threads.
Definition LLVMContext.h:58
Invariant opcodes: All instruction sets have these as their low opcodes.
@ X86
Windows x64, Windows Itanium (IA-64)
Definition MCAsmInfo.h:50
@ PTR32_UPTR
Definition X86.h:217
@ PTR64
Definition X86.h:218
@ PTR32_SPTR
Definition X86.h:216
@ MO_TLSLD
MO_TLSLD - On a symbol operand this indicates that the immediate is the offset of the GOT entry with ...
@ MO_GOTPCREL_NORELAX
MO_GOTPCREL_NORELAX - Same as MO_GOTPCREL except that R_X86_64_GOTPCREL relocations are guaranteed to...
@ MO_COFFSTUB
MO_COFFSTUB - On a symbol operand "FOO", this indicates that the reference is actually to the "....
@ MO_NTPOFF
MO_NTPOFF - On a symbol operand this indicates that the immediate is the negative thread-pointer offs...
@ MO_INDNTPOFF
MO_INDNTPOFF - On a symbol operand this indicates that the immediate is the absolute address of the G...
@ MO_GOTNTPOFF
MO_GOTNTPOFF - On a symbol operand this indicates that the immediate is the offset of the GOT entry w...
@ MO_TPOFF
MO_TPOFF - On a symbol operand this indicates that the immediate is the thread-pointer offset for the...
@ MO_TLVP_PIC_BASE
MO_TLVP_PIC_BASE - On a symbol operand this indicates that the immediate is some TLS offset from the ...
@ MO_TLSGD
MO_TLSGD - On a symbol operand this indicates that the immediate is the offset of the GOT entry with ...
@ MO_NO_FLAG
MO_NO_FLAG - No flag for the operand.
@ MO_TLVP
MO_TLVP - On a symbol operand this indicates that the immediate is some TLS offset.
@ MO_DLLIMPORT
MO_DLLIMPORT - On a symbol operand "FOO", this indicates that the reference is actually to the "__imp...
@ MO_GOTTPOFF
MO_GOTTPOFF - On a symbol operand this indicates that the immediate is the offset of the GOT entry wi...
@ MO_SECREL
MO_SECREL - On a symbol operand this indicates that the immediate is the offset from beginning of sec...
@ MO_DTPOFF
MO_DTPOFF - On a symbol operand this indicates that the immediate is the offset of the GOT entry with...
@ MO_TLSLDM
MO_TLSLDM - On a symbol operand this indicates that the immediate is the offset of the GOT entry with...
@ MO_GOTPCREL
MO_GOTPCREL - On a symbol operand this indicates that the immediate is offset to the GOT entry for th...
@ FST
This instruction implements a truncating store from FP stack slots.
@ CMPM
Vector comparison generating mask bits for fp and integer signed and unsigned data types.
@ FMAX
Floating point max and min.
@ BT
X86 bit-test instructions.
@ HADD
Integer horizontal add/sub.
@ MOVQ2DQ
Copies a 64-bit value from an MMX vector to the low word of an XMM vector, with the high word zero fi...
@ BLENDI
Blend where the selector is an immediate.
@ CMP
X86 compare and logical compare instructions.
@ BLENDV
Dynamic (non-constant condition) vector blend where only the sign bits of the condition elements are ...
@ ADDSUB
Combined add and sub on an FP vector.
@ STRICT_FMAX
Floating point max and min.
@ STRICT_CMPM
Vector comparison generating mask bits for fp and integer signed and unsigned data types.
@ FHADD
Floating point horizontal add/sub.
@ BSR
Bit scan reverse.
@ SETCC
X86 SetCC.
@ NT_BRIND
BRIND node with NoTrack prefix.
@ SELECTS
X86 Select.
@ FSETCCM
X86 FP SETCC, similar to above, but with output as an i1 mask and a version with SAE.
@ PEXTRB
Extract an 8-bit value from a vector and zero extend it to i32, corresponds to X86::PEXTRB.
@ FXOR
Bitwise logical XOR of floating point values.
@ BRCOND
X86 conditional branches.
@ FSETCC
X86 FP SETCC, implemented with CMP{cc}SS/CMP{cc}SD.
@ PINSRB
Insert the lower 8-bits of a 32-bit value to a vector, corresponds to X86::PINSRB.
@ INSERTPS
Insert any element of a 4 x float vector into any element of a destination 4 x float vector.
@ PSHUFB
Shuffle 16 8-bit values within a vector.
@ PEXTRW
Extract a 16-bit value from a vector and zero extend it to i32, corresponds to X86::PEXTRW.
@ AADD
RAO arithmetic instructions.
@ FANDN
Bitwise logical ANDNOT of floating point values.
@ GlobalBaseReg
On Darwin, this node represents the result of the popl at function entry, used for PIC code.
@ FMAXC
Commutative FMIN and FMAX.
@ EXTRQI
SSE4A Extraction and Insertion.
@ FLD
This instruction implements an extending load to FP stack slots.
@ PSADBW
Compute Sum of Absolute Differences.
@ FOR
Bitwise logical OR of floating point values.
@ FIST
This instruction implements a fp->int store from FP stack slots.
@ FP_TO_INT_IN_MEM
This instruction implements FP_TO_SINT with the integer destination in memory and a FP reg source.
@ LADD
LOCK-prefixed arithmetic read-modify-write instructions.
@ MMX_MOVW2D
Copies a GPR into the low 32-bit word of an MMX vector and zeroes out the high word.
@ Wrapper
A wrapper node for TargetConstantPool, TargetJumpTable, TargetExternalSymbol, TargetGlobalAddress,...
@ PINSRW
Insert the lower 16-bits of a 32-bit value to a vector, corresponds to X86::PINSRW.
@ CMPCCXADD
Compare and Add if Condition is Met.
@ MMX_MOVD2W
Copies a 32-bit value from the low word of an MMX vector to a GPR.
@ FILD
This instruction implements SINT_TO_FP with the integer source in memory and FP reg result.
@ MOVDQ2Q
Copies a 64-bit value from the low word of an XMM vector to an MMX vector.
@ ANDNP
Bitwise Logical AND NOT of Packed FP values.
@ BSF
Bit scan forward.
@ VAARG_64
These instructions grab the address of the next argument from a va_list.
@ FAND
Bitwise logical AND of floating point values.
@ CMOV
X86 conditional moves.
@ WrapperRIP
Special wrapper used under X86-64 PIC mode for RIP relative displacements.
@ FSHL
X86 funnel/double shift i16 instructions.
@ FRSQRT
Floating point reciprocal-sqrt and reciprocal approximation.
Define some predicates that are used for node matching.
@ AddrNumOperands
Definition X86BaseInfo.h:36
bool mayFoldLoadIntoBroadcastFromMem(SDValue Op, MVT EltVT, const X86Subtarget &Subtarget, bool AssumeSingleUse=false)
Check if Op is a load operation that could be folded into a vector splat instruction as a memory oper...
bool isZeroNode(SDValue Elt)
Returns true if Elt is a constant zero or floating point constant +0.0.
CondCode GetOppositeBranchCondition(CondCode CC)
GetOppositeBranchCondition - Return the inverse of the specified cond, e.g.
bool mayFoldIntoZeroExtend(SDValue Op)
Check if Op is an operation that could be folded into a zero extend x86 instruction.
bool mayFoldIntoStore(SDValue Op)
Check if Op is a value that could be used to fold a store into some other x86 instruction as a memory...
bool isExtendedSwiftAsyncFrameSupported(const X86Subtarget &Subtarget, const MachineFunction &MF)
True if the target supports the extended frame for async Swift functions.
int getCCMPCondFlagsFromCondCode(CondCode CC)
bool mayFoldLoad(SDValue Op, const X86Subtarget &Subtarget, bool AssumeSingleUse=false)
Check if Op is a load operation that could be folded into some other x86 instruction as a memory oper...
FastISel * createFastISel(FunctionLoweringInfo &funcInfo, const TargetLibraryInfo *libInfo)
bool isOffsetSuitableForCodeModel(int64_t Offset, CodeModel::Model M, bool hasSymbolicDisplacement)
Returns true if the given offset can fit into the displacement field of the instruction.
bool isConstantSplat(SDValue Op, APInt &SplatVal, bool AllowPartialUndefs)
If Op is a constant whose elements are all the same constant or undefined, return true and return the...
initializer< Ty > init(const Ty &Val)
constexpr double e
Definition MathExtras.h:47
@ User
could "use" a pointer
NodeAddr< NodeBase * > Node
Definition RDFGraph.h:381
NodeAddr< FuncNode * > Func
Definition RDFGraph.h:393
friend class Instruction
Iterator for Instructions in a BasicBlock.
Definition BasicBlock.h:73
This is an optimization pass for GlobalISel generic memory operations.
void DecodeZeroExtendMask(unsigned SrcScalarBits, unsigned DstScalarBits, unsigned NumDstElts, bool IsAnyExtend, SmallVectorImpl< int > &ShuffleMask)
Decode a zero extension instruction as a shuffle mask.
IterT next_nodbg(IterT It, IterT End, bool SkipPseudoOp=true)
Increment It, then continue incrementing it while it points to a debug instruction.
static bool isGlobalStubReference(unsigned char TargetFlag)
isGlobalStubReference - Return true if the specified TargetFlag operand is a reference to a stub for ...
auto drop_begin(T &&RangeOrContainer, size_t N=1)
Return a range covering RangeOrContainer with the first N elements excluded.
Definition STLExtras.h:330
@ Low
Lower the current thread's priority such that it does not affect foreground tasks significantly.
Definition Threading.h:262
@ Offset
Definition DWP.cpp:477
@ Length
Definition DWP.cpp:477
FunctionAddr VTableAddr Value
Definition InstrProf.h:137
void DecodeMOVHLPSMask(unsigned NElts, SmallVectorImpl< int > &ShuffleMask)
Decode a MOVHLPS instruction as a v2f64/v4f32 shuffle mask.
auto find(R &&Range, const T &Val)
Provide wrappers to std::find which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1753
static bool isGlobalRelativeToPICBase(unsigned char TargetFlag)
isGlobalRelativeToPICBase - Return true if the specified global value reference is relative to a 32-b...
bool all_of(R &&range, UnaryPredicate P)
Provide wrappers to std::all_of which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1727
int popcount(T Value) noexcept
Count the number of set bits in a value.
Definition bit.h:307
auto size(R &&Range, std::enable_if_t< std::is_base_of< std::random_access_iterator_tag, typename std::iterator_traits< decltype(Range.begin())>::iterator_category >::value, void > *=nullptr)
Get the size of a range.
Definition STLExtras.h:1685
void DecodeZeroMoveLowMask(unsigned NumElts, SmallVectorImpl< int > &ShuffleMask)
Decode a move lower and zero upper instruction as a shuffle mask.
MachineInstrBuilder BuildMI(MachineFunction &MF, const MIMetadata &MIMD, const MCInstrDesc &MCID)
Builder interface. Specify how to create the initial instruction itself.
InstructionCost Cost
void DecodeVPERMILPMask(unsigned NumElts, unsigned ScalarBits, ArrayRef< uint64_t > RawMask, const APInt &UndefElts, SmallVectorImpl< int > &ShuffleMask)
Decode a VPERMILPD/VPERMILPS variable mask from a raw array of constants.
constexpr bool isInt(int64_t x)
Checks if an integer fits into the given bit width.
Definition MathExtras.h:174
LLVM_ABI bool isNullConstant(SDValue V)
Returns true if V is a constant integer zero.
LLVM_ABI bool isAllOnesOrAllOnesSplat(const MachineInstr &MI, const MachineRegisterInfo &MRI, bool AllowUndefs=false)
Return true if the value is a constant -1 integer or a splatted vector of a constant -1 integer (with...
Definition Utils.cpp:1605
void DecodePSHUFLWMask(unsigned NumElts, unsigned Imm, SmallVectorImpl< int > &ShuffleMask)
Decodes the shuffle masks for pshuflw.
static const IntrinsicData * getIntrinsicWithChain(unsigned IntNo)
LLVM_ABI SDValue peekThroughBitcasts(SDValue V)
Return the non-bitcasted source operand of V if it exists.
auto enumerate(FirstRange &&First, RestRanges &&...Rest)
Given two or more input ranges, returns a new range whose values are tuples (A, B,...
Definition STLExtras.h:2474
decltype(auto) dyn_cast(const From &Val)
dyn_cast<X> - Return the argument parameter cast to the specified type.
Definition Casting.h:649
unsigned Log2_64_Ceil(uint64_t Value)
Return the ceil log base 2 of the specified value, 64 if the value is zero.
Definition MathExtras.h:355
MCRegister getX86SubSuperRegister(MCRegister Reg, unsigned Size, bool High=false)
@ SjLj
setjmp/longjmp based exceptions
Definition CodeGen.h:56
bool isIntOrFPConstant(SDValue V)
Return true if V is either an integer or FP constant.
void DecodeVPERMV3Mask(ArrayRef< uint64_t > RawMask, const APInt &UndefElts, SmallVectorImpl< int > &ShuffleMask)
Decode a VPERMT2 W/D/Q/PS/PD mask from a raw array of constants.
iterator_range< T > make_range(T x, T y)
Convenience function for iterating over sub-ranges.
int bit_width(T Value)
Returns the number of bits needed to represent Value if Value is nonzero.
Definition bit.h:270
iterator_range< early_inc_iterator_impl< detail::IterOfRange< RangeT > > > make_early_inc_range(RangeT &&Range)
Make a range that does early increment to allow mutation of the underlying range without disrupting i...
Definition STLExtras.h:646
void DecodeBLENDMask(unsigned NumElts, unsigned Imm, SmallVectorImpl< int > &ShuffleMask)
Decode a BLEND immediate mask into a shuffle mask.
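A minimal C++ sketch (not code from this file) of what DecodeBLENDMask produces for a four-element blend; the header path and the convention that mask indices >= NumElts select from the second source are assumptions based on the descriptions in this index.
#include "MCTargetDesc/X86ShuffleDecode.h" // header path assumed
#include "llvm/ADT/SmallVector.h"
// Sketch only: an immediate bit i that is set means lane i comes from source 2.
static void decodeBlendExample() {
  llvm::SmallVector<int, 4> Mask;
  llvm::DecodeBLENDMask(/*NumElts=*/4, /*Imm=*/0b0101, Mask);
  // Expected mask under that convention: {4, 1, 6, 3}.
}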
void decodeVSHUF64x2FamilyMask(unsigned NumElts, unsigned ScalarSize, unsigned Imm, SmallVectorImpl< int > &ShuffleMask)
Decode a shuffle of packed values at 128-bit granularity (SHUFF32x4/SHUFF64x2/SHUFI32x4/SHUFI64x2) immed...
void DecodeVPERMMask(unsigned NumElts, unsigned Imm, SmallVectorImpl< int > &ShuffleMask)
Decodes the shuffle masks for VPERMQ/VPERMPD.
static const MachineInstrBuilder & addFrameReference(const MachineInstrBuilder &MIB, int FI, int Offset=0, bool mem=true)
addFrameReference - This function is used to add a reference to the base of an abstract object on the...
void DecodeEXTRQIMask(unsigned NumElts, unsigned EltSize, int Len, int Idx, SmallVectorImpl< int > &ShuffleMask)
Decode a SSE4A EXTRQ instruction as a shuffle mask.
static const MachineInstrBuilder & addFullAddress(const MachineInstrBuilder &MIB, const X86AddressMode &AM)
constexpr bool isPowerOf2_64(uint64_t Value)
Return true if the argument is a power of two > 0 (64-bit edition).
Definition MathExtras.h:293
static const IntrinsicData * getIntrinsicWithoutChain(unsigned IntNo)
auto unique(Range &&R, Predicate P)
Definition STLExtras.h:2078
LLVM_ABI bool isNullOrNullSplat(const MachineInstr &MI, const MachineRegisterInfo &MRI, bool AllowUndefs=false)
Return true if the value is a constant 0 integer or a splatted vector of a constant 0 integer (with n...
Definition Utils.cpp:1587
void DecodePSRLDQMask(unsigned NumElts, unsigned Imm, SmallVectorImpl< int > &ShuffleMask)
T bit_ceil(T Value)
Returns the smallest integral power of two no smaller than Value if Value is nonzero.
Definition bit.h:295
OutputIt copy_if(R &&Range, OutputIt Out, UnaryPredicate P)
Provide wrappers to std::copy_if which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1779
unsigned Log2_64(uint64_t Value)
Return the floor log base 2 of the specified value, -1 if the value is zero.
Definition MathExtras.h:342
LLVM_ABI bool isMinSignedConstant(SDValue V)
Returns true if V is a constant min signed integer value.
LLVM_ABI ConstantFPSDNode * isConstOrConstSplatFP(SDValue N, bool AllowUndefs=false)
Returns the SDNode if it is a constant splat BuildVector or constant float.
void DecodeINSERTPSMask(unsigned Imm, SmallVectorImpl< int > &ShuffleMask, bool SrcIsMem)
Decode a 128-bit INSERTPS instruction as a v4f32 shuffle mask.
void DecodeVPERM2X128Mask(unsigned NumElts, unsigned Imm, SmallVectorImpl< int > &ShuffleMask)
static void setDirectAddressInInstr(MachineInstr *MI, unsigned Operand, Register Reg)
Replace the address used in the instruction with the direct memory reference.
int countr_zero(T Val)
Count the number of 0's from the least significant bit to the most significant bit, stopping at the first 1.
Definition bit.h:157
unsigned M1(unsigned Val)
Definition VE.h:377
auto dyn_cast_or_null(const Y &Val)
Definition Casting.h:759
void DecodeVPERMIL2PMask(unsigned NumElts, unsigned ScalarBits, unsigned M2Z, ArrayRef< uint64_t > RawMask, const APInt &UndefElts, SmallVectorImpl< int > &ShuffleMask)
Decode a VPERMIL2PD/VPERMIL2PS variable mask from a raw array of constants.
constexpr bool has_single_bit(T Value) noexcept
Definition bit.h:147
bool any_of(R &&range, UnaryPredicate P)
Provide wrappers to std::any_of which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1734
void DecodeMOVLHPSMask(unsigned NElts, SmallVectorImpl< int > &ShuffleMask)
Decode a MOVLHPS instruction as a v2f64/v4f32 shuffle mask.
LLVM_ABI bool getShuffleDemandedElts(int SrcWidth, ArrayRef< int > Mask, const APInt &DemandedElts, APInt &DemandedLHS, APInt &DemandedRHS, bool AllowUndefElts=false)
Transform a shuffle mask's output demanded element mask into demanded element masks for the 2 operand...
unsigned Log2_32(uint32_t Value)
Return the floor log base 2 of the specified value, -1 if the value is zero.
Definition MathExtras.h:336
int countl_zero(T Val)
Count the number of 0's from the most significant bit to the least significant bit, stopping at the first 1.
Definition bit.h:203
bool isAlpha(char C)
Checks if character C is a valid letter as classified by the "C" locale.
LLVM_ABI bool isBitwiseNot(SDValue V, bool AllowUndefs=false)
Returns true if V is a bitwise not operation.
auto reverse(ContainerTy &&C)
Definition STLExtras.h:420
constexpr bool isPowerOf2_32(uint32_t Value)
Return true if the argument is a power of two > 0.
Definition MathExtras.h:288
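A minimal C++ sketch (not code from this file) exercising a few of the bit-manipulation helpers listed in this index.
#include "llvm/ADT/bit.h"
#include "llvm/Support/MathExtras.h"
#include <cassert>
static void bitHelperExamples() {
  assert(llvm::isPowerOf2_32(64u));          // power of two > 0
  assert(llvm::Log2_32(64u) == 6);           // floor log base 2
  assert(llvm::countr_zero(0x8u) == 3);      // zeros counted from the low end
  assert(llvm::isInt<8>(127) && !llvm::isInt<8>(128)); // signed 8-bit range check
}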
LLVM_ABI void getHorizDemandedEltsForFirstOperand(unsigned VectorBitWidth, const APInt &DemandedElts, APInt &DemandedLHS, APInt &DemandedRHS)
Compute the demanded elements mask of horizontal binary operations.
decltype(auto) get(const PointerIntPair< PointerTy, IntBits, IntType, PtrTraits, Info > &Pair)
void createUnpackShuffleMask(EVT VT, SmallVectorImpl< int > &Mask, bool Lo, bool Unary)
Generate unpacklo/unpackhi shuffle mask.
constexpr uint32_t Hi_32(uint64_t Value)
Return the high 32 bits of a 64 bit value.
Definition MathExtras.h:159
LLVM_ABI raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
Definition Debug.cpp:207
LLVM_ABI SDValue peekThroughTruncates(SDValue V)
Return the non-truncated source operand of V if it exists.
bool none_of(R &&Range, UnaryPredicate P)
Provide wrappers to std::none_of which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1741
LLVM_ABI void report_fatal_error(Error Err, bool gen_crash_diag=true)
Definition Error.cpp:167
FunctionAddr VTableAddr Count
Definition InstrProf.h:139
void DecodeINSERTQIMask(unsigned NumElts, unsigned EltSize, int Len, int Idx, SmallVectorImpl< int > &ShuffleMask)
Decode a SSE4A INSERTQ instruction as a shuffle mask.
constexpr bool isUInt(uint64_t x)
Checks if an unsigned integer fits into the given bit width.
Definition MathExtras.h:198
LLVM_ABI SDValue peekThroughOneUseBitcasts(SDValue V)
Return the non-bitcasted and one-use source operand of V if it exists.
LLVM_ABI EHPersonality classifyEHPersonality(const Value *Pers)
See if the given exception handling personality function is one that we understand.
@ Default
-O2, -Os
Definition CodeGen.h:85
class LLVM_GSL_OWNER SmallVector
Forward declaration of SmallVector so that calculateSmallVectorDefaultInlinedElements can reference s...
constexpr uint32_t Lo_32(uint64_t Value)
Return the low 32 bits of a 64 bit value.
Definition MathExtras.h:164
bool isa(const From &Val)
isa<X> - Return true if the parameter to the template is an instance of one of the template type argu...
Definition Casting.h:548
@ Success
The lock was released successfully.
void DecodeVPERMVMask(ArrayRef< uint64_t > RawMask, const APInt &UndefElts, SmallVectorImpl< int > &ShuffleMask)
Decode a VPERM W/D/Q/PS/PD mask from a raw array of constants.
static void verifyIntrinsicTables()
AtomicOrdering
Atomic ordering for LLVM's memory model.
@ Mod
The access may modify the value stored in memory.
Definition ModRef.h:34
void createSplat2ShuffleMask(MVT VT, SmallVectorImpl< int > &Mask, bool Lo)
Similar to unpacklo/unpackhi, but without the 128-bit lane limitation imposed by AVX and specific to ...
@ First
Helpers to iterate all locations in the MemoryEffectsBase class.
Definition ModRef.h:71
bool isFuncletEHPersonality(EHPersonality Pers)
Returns true if this is a personality function that invokes handler funclets (which must return to it...
IRBuilder(LLVMContext &, FolderTy, InserterTy, MDNode *, ArrayRef< OperandBundleDef >) -> IRBuilder< FolderTy, InserterTy >
void DecodeVALIGNMask(unsigned NumElts, unsigned Imm, SmallVectorImpl< int > &ShuffleMask)
CombineLevel
Definition DAGCombine.h:15
auto lower_bound(R &&Range, T &&Value)
Provide wrappers to std::lower_bound which take ranges instead of having to pass begin/end explicitly...
Definition STLExtras.h:1996
LLVM_ABI void narrowShuffleMaskElts(int Scale, ArrayRef< int > Mask, SmallVectorImpl< int > &ScaledMask)
Replace each shuffle mask index with the scaled sequential indices for an equivalent mask of narrowed...
To bit_cast(const From &from) noexcept
Definition bit.h:90
void replace(R &&Range, const T &OldValue, const T &NewValue)
Provide wrappers to std::replace which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1862
@ UMin
Unsigned integer min implemented in terms of select(cmp()).
@ Xor
Bitwise or logical XOR of integers.
@ SMax
Signed integer max implemented in terms of select(cmp()).
@ And
Bitwise or logical AND of integers.
@ SMin
Signed integer min implemented in terms of select(cmp()).
@ Sub
Subtraction of integers.
@ Add
Sum of integers.
uint16_t MCPhysReg
An unsigned integer type large enough to represent all physical registers, but not necessarily virtua...
Definition MCRegister.h:21
void DecodeScalarMoveMask(unsigned NumElts, bool IsLoad, SmallVectorImpl< int > &ShuffleMask)
Decode a scalar float move instruction as a shuffle mask.
LLVM_ABI bool isNullConstantOrUndef(SDValue V)
Returns true if V is a constant integer zero or an UNDEF node.
auto count(R &&Range, const E &Element)
Wrapper function around std::count to count the number of times an element Element occurs in the give...
Definition STLExtras.h:1956
static X86AddressMode getAddressFromInstr(const MachineInstr *MI, unsigned Operand)
Compute the addressing mode from a machine instruction starting with the given operand.
void DecodeVPPERMMask(ArrayRef< uint64_t > RawMask, const APInt &UndefElts, SmallVectorImpl< int > &ShuffleMask)
Decode a VPPERM mask from a raw array of constants such as from BUILD_VECTOR.
DWARFExpression::Operation Op
void DecodePALIGNRMask(unsigned NumElts, unsigned Imm, SmallVectorImpl< int > &ShuffleMask)
bool isPhysRegUsedAfter(Register Reg, MachineBasicBlock::iterator MBI)
Check if physical register Reg is used after MBI.
void DecodeMOVSLDUPMask(unsigned NumElts, SmallVectorImpl< int > &ShuffleMask)
RoundingMode
Rounding mode.
@ TowardZero
roundTowardZero.
@ NearestTiesToEven
roundTiesToEven.
@ TowardPositive
roundTowardPositive.
@ TowardNegative
roundTowardNegative.
unsigned M0(unsigned Val)
Definition VE.h:376
ArrayRef(const T &OneElt) -> ArrayRef< T >
bool isAsynchronousEHPersonality(EHPersonality Pers)
Returns true if this personality function catches asynchronous exceptions.
std::string toString(const APInt &I, unsigned Radix, bool Signed, bool formatAsCLiteral=false, bool UpperCase=true, bool InsertSeparators=false)
LLVM_ABI ConstantSDNode * isConstOrConstSplat(SDValue N, bool AllowUndefs=false, bool AllowTruncation=false)
Returns the SDNode if it is a constant splat BuildVector or constant int.
OutputIt copy(R &&Range, OutputIt Out)
Definition STLExtras.h:1837
constexpr unsigned BitWidth
void DecodeUNPCKLMask(unsigned NumElts, unsigned ScalarBits, SmallVectorImpl< int > &ShuffleMask)
Decodes the shuffle masks for unpcklps/unpcklpd and punpckl*.
void DecodePSLLDQMask(unsigned NumElts, unsigned Imm, SmallVectorImpl< int > &ShuffleMask)
auto count_if(R &&Range, UnaryPredicate P)
Wrapper function around std::count_if to count the number of times an element satisfying a given pred...
Definition STLExtras.h:1963
void DecodeUNPCKHMask(unsigned NumElts, unsigned ScalarBits, SmallVectorImpl< int > &ShuffleMask)
Decodes the shuffle masks for unpckhps/unpckhpd and punpckh*.
decltype(auto) cast(const From &Val)
cast<X> - Return the argument parameter cast to the specified type.
Definition Casting.h:565
auto find_if(R &&Range, UnaryPredicate P)
Provide wrappers to std::find_if which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1760
LLVM_ABI bool isOneConstant(SDValue V)
Returns true if V is a constant integer one.
static const MachineInstrBuilder & addDirectMem(const MachineInstrBuilder &MIB, Register Reg)
addDirectMem - This function is used to add a direct memory reference to the current instruction – th...
static uint32_t extractBits(uint64_t Val, uint32_t Hi, uint32_t Lo)
bool is_contained(R &&Range, const E &Element)
Returns true if Element is found in Range.
Definition STLExtras.h:1899
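A minimal C++ sketch (not code from this file) of the range wrappers listed in this index, applied to a small shuffle-mask vector.
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SmallVector.h"
static void rangeHelperExamples() {
  llvm::SmallVector<int, 4> Mask = {0, 1, -1, 3};
  bool HasSentinel = llvm::any_of(Mask, [](int M) { return M < 0; });    // true
  bool AllSmall = llvm::all_of(Mask, [](int M) { return M < 4; });       // true
  bool HasThree = llvm::is_contained(Mask, 3);                           // true
  auto NumSentinels = llvm::count_if(Mask, [](int M) { return M < 0; }); // 1
  (void)HasSentinel; (void)AllSmall; (void)HasThree; (void)NumSentinels;
}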
Align commonAlignment(Align A, uint64_t Offset)
Returns the alignment that satisfies both alignments.
Definition Alignment.h:212
LLVM_ABI bool isNullFPConstant(SDValue V)
Returns true if V is an FP constant with a value of positive zero.
bool all_equal(std::initializer_list< T > Values)
Returns true if all Values in the initializer lists are equal or the list is empty.
Definition STLExtras.h:2110
@ TRUNCATE_TO_MEM_VI16
@ INTR_TYPE_SCALAR_MASK_SAE
@ INTR_TYPE_1OP_SAE
@ TRUNCATE_TO_MEM_VI32
@ INTR_TYPE_2OP_SAE
@ INTR_TYPE_3OP_SCALAR_MASK_SAE
@ INTR_TYPE_3OP_MASK_SAE
@ INTR_TYPE_2OP_MASK
@ TRUNCATE_TO_MEM_VI8
@ CVTNEPS2BF16_MASK
@ CMP_MASK_SCALAR_CC
@ INTR_TYPE_1OP_MASK_SAE
@ INTR_TYPE_SCALAR_MASK
@ INTR_TYPE_3OP_IMM8
@ INTR_TYPE_2OP_MASK_SAE
@ INTR_TYPE_SCALAR_MASK_RND
@ INTR_TYPE_1OP_MASK
@ COMPRESS_EXPAND_IN_REG
@ INTR_TYPE_CAST_MMX
@ INTR_TYPE_4OP_IMM8
void DecodePSHUFMask(unsigned NumElts, unsigned ScalarBits, unsigned Imm, SmallVectorImpl< int > &ShuffleMask)
Decodes the shuffle masks for pshufd/pshufw/vpermilpd/vpermilps.
auto seq(T Begin, T End)
Iterate over an integral type from Begin up to - but not including - End.
Definition Sequence.h:305
void DecodeMOVDDUPMask(unsigned NumElts, SmallVectorImpl< int > &ShuffleMask)
void array_pod_sort(IteratorTy Start, IteratorTy End)
array_pod_sort - This sorts an array with the specified start and end extent.
Definition STLExtras.h:1612
void DecodeVectorBroadcast(unsigned NumElts, SmallVectorImpl< int > &ShuffleMask)
Decodes a broadcast of the first element of a vector.
T bit_floor(T Value)
Returns the largest integral power of two no greater than Value if Value is nonzero.
Definition bit.h:280
void DecodeSHUFPMask(unsigned NumElts, unsigned ScalarBits, unsigned Imm, SmallVectorImpl< int > &ShuffleMask)
Decodes the shuffle masks for shufp*.
void DecodePSHUFHWMask(unsigned NumElts, unsigned Imm, SmallVectorImpl< int > &ShuffleMask)
Decodes the shuffle masks for pshufhw.
static const MachineInstrBuilder & addRegOffset(const MachineInstrBuilder &MIB, Register Reg, bool isKill, int Offset)
addRegOffset - This function is used to add a memory reference of the form [Reg + Offset],...
void DecodeMOVSHDUPMask(unsigned NumElts, SmallVectorImpl< int > &ShuffleMask)
LLVM_ABI bool isAllOnesConstant(SDValue V)
Returns true if V is an integer constant with all bits set.
@ SM_SentinelUndef
@ SM_SentinelZero
LLVM_ABI bool scaleShuffleMaskElts(unsigned NumDstElts, ArrayRef< int > Mask, SmallVectorImpl< int > &ScaledMask)
Attempt to narrow/widen the Mask shuffle mask to the NumDstElts target width.
void DecodePSHUFBMask(ArrayRef< uint64_t > RawMask, const APInt &UndefElts, SmallVectorImpl< int > &ShuffleMask)
Decode a PSHUFB mask from a raw array of constants such as from BUILD_VECTOR.
LLVM_ABI int getSplatIndex(ArrayRef< int > Mask)
If all non-negative Mask elements are the same value, return that value.
void swap(llvm::BitVector &LHS, llvm::BitVector &RHS)
Implement std::swap in terms of BitVector swap.
Definition BitVector.h:853
#define N
#define EQ(a, b)
Definition regexec.c:65
static LLVM_ABI const fltSemantics & IEEEsingle() LLVM_READNONE
Definition APFloat.cpp:266
static constexpr roundingMode rmNearestTiesToEven
Definition APFloat.h:304
static constexpr roundingMode rmTowardZero
Definition APFloat.h:308
static LLVM_ABI const fltSemantics & x87DoubleExtended() LLVM_READNONE
Definition APFloat.cpp:289
static LLVM_ABI const fltSemantics & IEEEquad() LLVM_READNONE
Definition APFloat.cpp:268
static LLVM_ABI unsigned int semanticsPrecision(const fltSemantics &)
Definition APFloat.cpp:324
static LLVM_ABI const fltSemantics & IEEEdouble() LLVM_READNONE
Definition APFloat.cpp:267
static LLVM_ABI const fltSemantics & IEEEhalf() LLVM_READNONE
Definition APFloat.cpp:264
static LLVM_ABI const fltSemantics & BFloat() LLVM_READNONE
Definition APFloat.cpp:265
opStatus
IEEE-754R 7: Default exception handling.
Definition APFloat.h:320
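A minimal C++ sketch (not code from this file) of converting between the float semantics listed above with an explicit rounding mode; the returned opStatus reports inexact or overflow conditions.
#include "llvm/ADT/APFloat.h"
static void apfloatExample() {
  llvm::APFloat V(llvm::APFloat::IEEEdouble(), "1.5");
  bool LosesInfo = false;
  llvm::APFloat::opStatus St = V.convert(llvm::APFloat::IEEEsingle(),
                                         llvm::APFloat::rmNearestTiesToEven,
                                         &LosesInfo);
  (void)St; // 1.5 is exactly representable in single precision, so nothing is lost.
}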
This struct is a compact representation of a valid (non-zero power of two) alignment.
Definition Alignment.h:39
uint64_t value() const
Return the alignment as a raw uint64_t; this is a hole in the type system and should not be abused.
Definition Alignment.h:85
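A minimal illustration of the Align wrapper:
Align StackAlign(16);                      // must be a non-zero power of two
uint64_t Bytes = StackAlign.value();       // 16
uint64_t Padded = alignTo(40, StackAlign); // 48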
Extended Value Type.
Definition ValueTypes.h:35
EVT changeVectorElementTypeToInteger() const
Return a vector with the same number of elements as this vector, but with the element type converted to an integer type of the same bit width.
Definition ValueTypes.h:94
TypeSize getStoreSize() const
Return the number of bytes overwritten by a store of the specified value type.
Definition ValueTypes.h:390
bool isSimple() const
Test if the given EVT is simple (as opposed to being extended).
Definition ValueTypes.h:137
static EVT getVectorVT(LLVMContext &Context, EVT VT, unsigned NumElements, bool IsScalable=false)
Returns the EVT that represents a vector NumElements in length, where each element is of type VT.
Definition ValueTypes.h:74
EVT changeTypeToInteger() const
Return the type converted to an equivalently sized integer or vector with integer element type.
Definition ValueTypes.h:121
bool bitsGT(EVT VT) const
Return true if this has more bits than VT.
Definition ValueTypes.h:279
bool bitsLT(EVT VT) const
Return true if this has less bits than VT.
Definition ValueTypes.h:295
bool isFloatingPoint() const
Return true if this is a FP or a vector FP type.
Definition ValueTypes.h:147
ElementCount getVectorElementCount() const
Definition ValueTypes.h:345
EVT getDoubleNumVectorElementsVT(LLVMContext &Context) const
Definition ValueTypes.h:458
TypeSize getSizeInBits() const
Return the size of the specified value type in bits.
Definition ValueTypes.h:368
bool isByteSized() const
Return true if the bit size is a multiple of 8.
Definition ValueTypes.h:238
unsigned getVectorMinNumElements() const
Given a vector type, return the minimum number of elements it contains.
Definition ValueTypes.h:354
uint64_t getScalarSizeInBits() const
Definition ValueTypes.h:380
static LLVM_ABI EVT getEVT(Type *Ty, bool HandleUnknown=false)
Return the value type corresponding to the specified type.
TypeSize getStoreSizeInBits() const
Return the number of bits overwritten by a store of the specified value type.
Definition ValueTypes.h:407
MVT getSimpleVT() const
Return the SimpleValueType held in the specified simple EVT.
Definition ValueTypes.h:311
bool is128BitVector() const
Return true if this is a 128-bit vector type.
Definition ValueTypes.h:207
static EVT getIntegerVT(LLVMContext &Context, unsigned BitWidth)
Returns the EVT that represents an integer with the given number of bits.
Definition ValueTypes.h:65
uint64_t getFixedSizeInBits() const
Return the size of the specified fixed width value type in bits.
Definition ValueTypes.h:376
bool is512BitVector() const
Return true if this is a 512-bit vector type.
Definition ValueTypes.h:217
static EVT getFloatingPointVT(unsigned BitWidth)
Returns the EVT that represents a floating-point type with the given number of bits.
Definition ValueTypes.h:59
bool isVector() const
Return true if this is a vector value type.
Definition ValueTypes.h:168
EVT getScalarType() const
If this is a vector type, return the element type, otherwise return this.
Definition ValueTypes.h:318
bool is256BitVector() const
Return true if this is a 256-bit vector type.
Definition ValueTypes.h:212
LLVM_ABI Type * getTypeForEVT(LLVMContext &Context) const
This method returns an LLVM type corresponding to the specified EVT.
EVT getVectorElementType() const
Given a vector type, return the type of each element.
Definition ValueTypes.h:323
bool isScalarInteger() const
Return true if this is an integer, but not a vector.
Definition ValueTypes.h:157
EVT changeVectorElementType(EVT EltVT) const
Return a VT for a vector type whose attributes match ourselves with the exception of the element type...
Definition ValueTypes.h:102
LLVM_ABI const fltSemantics & getFltSemantics() const
Returns an APFloat semantics tag appropriate for the value type.
unsigned getVectorNumElements() const
Given a vector type, return the number of elements it contains.
Definition ValueTypes.h:331
EVT getHalfNumVectorElementsVT(LLVMContext &Context) const
Definition ValueTypes.h:448
bool isInteger() const
Return true if this is an integer or a vector integer type.
Definition ValueTypes.h:152
bool is64BitVector() const
Return true if this is a 64-bit vector type.
Definition ValueTypes.h:202
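A sketch tying a few of these EVT queries together (Ctx is an assumed LLVMContext in scope):
EVT EltVT = EVT::getIntegerVT(Ctx, 32);
EVT VecVT = EVT::getVectorVT(Ctx, EltVT, 4);        // a 4 x i32 vector
// VecVT.is128BitVector() and VecVT.isInteger() are both true.
// VecVT.getVectorNumElements() == 4, VecVT.getScalarSizeInBits() == 32.
EVT HalfVT = VecVT.getHalfNumVectorElementsVT(Ctx); // a 2 x i32 vector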
static KnownBits makeConstant(const APInt &C)
Create known bits from a known constant.
Definition KnownBits.h:294
static LLVM_ABI KnownBits sadd_sat(const KnownBits &LHS, const KnownBits &RHS)
Compute known bits resulting from llvm.sadd.sat(LHS, RHS).
static LLVM_ABI std::optional< bool > eq(const KnownBits &LHS, const KnownBits &RHS)
Determine if these known bits always give the same ICMP_EQ result.
KnownBits anyextOrTrunc(unsigned BitWidth) const
Return known bits for an "any" extension or truncation of the value we're tracking.
Definition KnownBits.h:179
static LLVM_ABI KnownBits mulhu(const KnownBits &LHS, const KnownBits &RHS)
Compute known bits from zero-extended multiply-hi.
bool isNonNegative() const
Returns true if this value is known to be non-negative.
Definition KnownBits.h:101
bool isZero() const
Returns true if value is all zero.
Definition KnownBits.h:80
unsigned countMinTrailingZeros() const
Returns the minimum number of trailing zero bits.
Definition KnownBits.h:235
bool isUnknown() const
Returns true if we don't know any bits.
Definition KnownBits.h:66
unsigned countMaxTrailingZeros() const
Returns the maximum number of trailing zero bits possible.
Definition KnownBits.h:267
KnownBits trunc(unsigned BitWidth) const
Return known bits for a truncation of the value we're tracking.
Definition KnownBits.h:154
unsigned countMaxPopulation() const
Returns the maximum number of bits that could be one.
Definition KnownBits.h:282
void setAllZero()
Make all bits known to be zero and discard any previous information.
Definition KnownBits.h:86
unsigned getBitWidth() const
Get the bit width of this value.
Definition KnownBits.h:44
KnownBits zext(unsigned BitWidth) const
Return known bits for a zero extension of the value we're tracking.
Definition KnownBits.h:165
bool isConstant() const
Returns true if we know the value of all bits.
Definition KnownBits.h:54
void resetAll()
Resets the known state of all bits.
Definition KnownBits.h:74
bool isNonZero() const
Returns true if this value is known to be non-zero.
Definition KnownBits.h:104
static LLVM_ABI KnownBits abdu(const KnownBits &LHS, const KnownBits &RHS)
Compute known bits for abdu(LHS, RHS).
KnownBits extractBits(unsigned NumBits, unsigned BitPosition) const
Return a subset of the known bits from [bitPosition,bitPosition+numBits).
Definition KnownBits.h:218
unsigned countMaxActiveBits() const
Returns the maximum number of bits needed to represent all possible unsigned values with these known bits.
Definition KnownBits.h:289
KnownBits intersectWith(const KnownBits &RHS) const
Returns KnownBits information that is known to be true for both this and RHS.
Definition KnownBits.h:304
KnownBits sext(unsigned BitWidth) const
Return known bits for a sign extension of the value we're tracking.
Definition KnownBits.h:173
static KnownBits add(const KnownBits &LHS, const KnownBits &RHS, bool NSW=false, bool NUW=false)
Compute known bits resulting from the addition of LHS and RHS.
Definition KnownBits.h:340
KnownBits zextOrTrunc(unsigned BitWidth) const
Return known bits for a zero extension or truncation of the value we're tracking.
Definition KnownBits.h:189
unsigned countMinLeadingZeros() const
Returns the minimum number of leading zero bits.
Definition KnownBits.h:241
APInt getMaxValue() const
Return the maximal unsigned value possible given these KnownBits.
Definition KnownBits.h:138
static LLVM_ABI KnownBits computeForAddSub(bool Add, bool NSW, bool NUW, const KnownBits &LHS, const KnownBits &RHS)
Compute known bits resulting from adding or subtracting LHS and RHS, as selected by Add.
Definition KnownBits.cpp:60
bool isNegative() const
Returns true if this value is known to be negative.
Definition KnownBits.h:98
void setAllOnes()
Make all bits known to be one and discard any previous information.
Definition KnownBits.h:92
static LLVM_ABI KnownBits mul(const KnownBits &LHS, const KnownBits &RHS, bool NoUndefSelfMultiply=false)
Compute known bits resulting from multiplying LHS and RHS.
static LLVM_ABI std::optional< bool > sgt(const KnownBits &LHS, const KnownBits &RHS)
Determine if these known bits always give the same ICMP_SGT result.
bool isAllOnes() const
Returns true if value is all one bits.
Definition KnownBits.h:83
const APInt & getConstant() const
Returns the value when all bits have a known value.
Definition KnownBits.h:60
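A small sketch of how these KnownBits queries compose (8-bit values, chosen for illustration):
KnownBits K(8);
K.Zero.setLowBits(2);                 // low two bits known to be zero
// K.countMinTrailingZeros() == 2, so the value is a multiple of 4.
KnownBits C = KnownBits::makeConstant(APInt(8, 12));
KnownBits Sum = KnownBits::add(K, C); // known bits of K + C
if (Sum.isNonZero()) {
  // At least one bit of the sum is known to be one.
}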
Matching combinators.
This class contains a discriminated union of information about pointers in memory operands,...
LLVM_ABI bool isDereferenceable(unsigned Size, LLVMContext &C, const DataLayout &DL) const
Return true if memory region [V, V+Offset+Size) is known to be dereferenceable.
static LLVM_ABI MachinePointerInfo getConstantPool(MachineFunction &MF)
Return a MachinePointerInfo record that refers to the constant pool.
MachinePointerInfo getWithOffset(int64_t O) const
static LLVM_ABI MachinePointerInfo getGOT(MachineFunction &MF)
Return a MachinePointerInfo record that refers to a GOT entry.
static LLVM_ABI MachinePointerInfo getFixedStack(MachineFunction &MF, int FI, int64_t Offset=0)
Return a MachinePointerInfo record that refers to the specified FrameIndex.
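A sketch of describing a spill slot, assuming MF is a MachineFunction and FI a frame index already in scope:
MachinePointerInfo SlotInfo = MachinePointerInfo::getFixedStack(MF, FI);
MachinePointerInfo HighHalf = SlotInfo.getWithOffset(8); // [slot + 8]
// Such records are typically passed to DAG.getLoad / DAG.getStore to describe memory.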
This struct is a compact representation of a valid (power of two) or undefined (0) alignment.
Definition Alignment.h:117
These are IR-level optimization flags that may be propagated to SDNodes.
bool hasAllowContract() const
bool hasNoSignedZeros() const
void setNoSignedWrap(bool b)
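A minimal sketch of querying and setting these flags on a node being built:
SDNodeFlags Flags;
Flags.setNoSignedWrap(true); // mark the operation as 'nsw'
if (Flags.hasAllowContract()) {
  // 'contract' would permit fusing a multiply and an add into an FMA here.
}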
This represents a list of ValueTypes that has been interned by a SelectionDAG.
unsigned int NumVTs
This represents an addressing mode of: BaseGV + BaseOffs + BaseReg + Scale*ScaleReg + ScalableOffset*...
This contains information for each constraint that we are lowering.
This structure contains all information that is necessary for lowering calls.
CallLoweringInfo & setLibCallee(CallingConv::ID CC, Type *ResultType, SDValue Target, ArgListTy &&ArgsList)
CallLoweringInfo & setDebugLoc(const SDLoc &dl)
CallLoweringInfo & setChain(SDValue InChain)
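A hedged sketch of the builder pattern these setters support when lowering to a runtime library call; DAG, dl, Chain, RetTy, Callee, Args, and TLI are assumed to already be in scope:
TargetLowering::CallLoweringInfo CLI(DAG);
CLI.setDebugLoc(dl)
    .setChain(Chain)
    .setLibCallee(CallingConv::C, RetTy, Callee, std::move(Args));
std::pair<SDValue, SDValue> CallResult = TLI.LowerCallTo(CLI);
// CallResult.first is the returned value, CallResult.second the output chain.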
LLVM_ABI void AddToWorklist(SDNode *N)
LLVM_ABI bool recursivelyDeleteUnusedNodes(SDNode *N)
LLVM_ABI SDValue CombineTo(SDNode *N, ArrayRef< SDValue > To, bool AddTo=true)
LLVM_ABI void CommitTargetLoweringOpt(const TargetLoweringOpt &TLO)
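A hypothetical helper showing how a target combine might use these DAGCombinerInfo hooks; the fold itself is made up for illustration:
static SDValue replaceWithFirstOperand(SDNode *N,
                                       TargetLowering::DAGCombinerInfo &DCI) {
  SDValue Repl = N->getOperand(0);
  DCI.AddToWorklist(Repl.getNode());
  // CombineTo replaces all uses of N with Repl and returns the replacement.
  return DCI.CombineTo(N, Repl);
}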
This structure is used to pass arguments to the makeLibCall function.
A convenience struct that encapsulates a DAG, and two SDValues for returning information from TargetL...
X86AddressMode - This struct holds a generalized full x86 address mode.