1//===-- X86ISelLowering.cpp - X86 DAG Lowering Implementation -------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// This file defines the interfaces that X86 uses to lower LLVM code into a
10// selection DAG.
11//
12//===----------------------------------------------------------------------===//
13
14#include "X86ISelLowering.h"
16#include "X86.h"
17#include "X86FrameLowering.h"
18#include "X86InstrBuilder.h"
19#include "X86IntrinsicsInfo.h"
21#include "X86TargetMachine.h"
23#include "llvm/ADT/SmallSet.h"
25#include "llvm/ADT/Statistic.h"
44#include "llvm/IR/CallingConv.h"
45#include "llvm/IR/Constants.h"
48#include "llvm/IR/Function.h"
49#include "llvm/IR/GlobalAlias.h"
51#include "llvm/IR/IRBuilder.h"
53#include "llvm/IR/Intrinsics.h"
55#include "llvm/MC/MCAsmInfo.h"
56#include "llvm/MC/MCContext.h"
57#include "llvm/MC/MCExpr.h"
58#include "llvm/MC/MCSymbol.h"
60#include "llvm/Support/Debug.h"
65#include <algorithm>
66#include <bitset>
67#include <cctype>
68#include <numeric>
69using namespace llvm;
70
71#define DEBUG_TYPE "x86-isel"
72
74 "x86-experimental-pref-innermost-loop-alignment", cl::init(4),
76 "Sets the preferable loop alignment for experiments (as log2 bytes) "
77 "for innermost loops only. If specified, this option overrides "
78 "alignment set by x86-experimental-pref-loop-alignment."),
80
82 "x86-br-merging-base-cost", cl::init(2),
84 "Sets the cost threshold for when multiple conditionals will be merged "
85 "into one branch versus be split in multiple branches. Merging "
86 "conditionals saves branches at the cost of additional instructions. "
87 "This value sets the instruction cost limit, below which conditionals "
88 "will be merged, and above which conditionals will be split. Set to -1 "
89 "to never merge branches."),
91
93 "x86-br-merging-ccmp-bias", cl::init(6),
94 cl::desc("Increases 'x86-br-merging-base-cost' in cases that the target "
95 "supports conditional compare instructions."),
97
98static cl::opt<bool>
99 WidenShift("x86-widen-shift", cl::init(true),
100 cl::desc("Replace narrow shifts with wider shifts."),
101 cl::Hidden);
102
104 "x86-br-merging-likely-bias", cl::init(0),
105 cl::desc("Increases 'x86-br-merging-base-cost' in cases that it is likely "
106 "that all conditionals will be executed. For example for merging "
107 "the conditionals (a == b && c > d), if its known that a == b is "
108 "likely, then it is likely that if the conditionals are split "
109 "both sides will be executed, so it may be desirable to increase "
110 "the instruction cost threshold. Set to -1 to never merge likely "
111 "branches."),
112 cl::Hidden);
113
115 "x86-br-merging-unlikely-bias", cl::init(-1),
116 cl::desc(
117 "Decreases 'x86-br-merging-base-cost' in cases that it is unlikely "
118 "that all conditionals will be executed. For example for merging "
119 "the conditionals (a == b && c > d), if its known that a == b is "
120 "unlikely, then it is unlikely that if the conditionals are split "
121 "both sides will be executed, so it may be desirable to decrease "
122 "the instruction cost threshold. Set to -1 to never merge unlikely "
123 "branches."),
124 cl::Hidden);
125
127 "mul-constant-optimization", cl::init(true),
128 cl::desc("Replace 'mul x, Const' with more effective instructions like "
129 "SHIFT, LEA, etc."),
130 cl::Hidden);
131
132X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
133 const X86Subtarget &STI)
134 : TargetLowering(TM), Subtarget(STI) {
135 bool UseX87 = !Subtarget.useSoftFloat() && Subtarget.hasX87();
136 MVT PtrVT = MVT::getIntegerVT(TM.getPointerSizeInBits(0));
137
138 // Set up the TargetLowering object.
139
140 // X86 is weird. It always uses i8 for shift amounts and setcc results.
141 setBooleanContents(ZeroOrOneBooleanContent);
142 // X86-SSE is even stranger. It uses -1 or 0 for vector masks.
143 setBooleanVectorContents(ZeroOrNegativeOneBooleanContent);
144
145 // X86 instruction cache is coherent with its data cache so we can use the
146 // default expansion to a no-op.
147 setOperationAction(ISD::CLEAR_CACHE, MVT::Other, Expand);
148
149 // For 64-bit, since we have so many registers, use the ILP scheduler.
150 // For 32-bit, use the register pressure specific scheduling.
151 // For Atom, always use ILP scheduling.
152 if (Subtarget.isAtom())
153 setSchedulingPreference(Sched::ILP);
154 else if (Subtarget.is64Bit())
155 setSchedulingPreference(Sched::ILP);
156 else
157 setSchedulingPreference(Sched::RegPressure);
158 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
159 setStackPointerRegisterToSaveRestore(RegInfo->getStackRegister());
160
161 // Bypass expensive divides and use cheaper ones.
162 if (TM.getOptLevel() >= CodeGenOptLevel::Default) {
163 if (Subtarget.hasSlowDivide32())
164 addBypassSlowDiv(32, 8);
165 if (Subtarget.hasSlowDivide64() && Subtarget.is64Bit())
166 addBypassSlowDiv(64, 32);
167 }
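// Note: addBypassSlowDiv installs a runtime bypass, e.g. with (32, 8) a 32-bit
// divide whose operands both happen to fit in 8 bits is dispatched to the much
// cheaper 8-bit DIV at run time, falling back to the full-width divide
// otherwise (a sketch of the intent; the rewrite itself is done by the shared
// BypassSlowDivision utility invoked from CodeGenPrepare).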
168
169 if (Subtarget.canUseCMPXCHG16B())
170 setMaxAtomicSizeInBitsSupported(128);
171 else if (Subtarget.canUseCMPXCHG8B())
172 setMaxAtomicSizeInBitsSupported(64);
173 else
174 setMaxAtomicSizeInBitsSupported(32);
175
176 setMaxDivRemBitWidthSupported(Subtarget.is64Bit() ? 128 : 64);
177
179
180 // Set up the register classes.
181 addRegisterClass(MVT::i8, &X86::GR8RegClass);
182 addRegisterClass(MVT::i16, &X86::GR16RegClass);
183 addRegisterClass(MVT::i32, &X86::GR32RegClass);
184 if (Subtarget.is64Bit())
185 addRegisterClass(MVT::i64, &X86::GR64RegClass);
186
187 for (MVT VT : MVT::integer_valuetypes())
188 setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i1, Promote);
189
190 // We don't accept any truncstore of integer registers.
191 setTruncStoreAction(MVT::i64, MVT::i32, Expand);
192 setTruncStoreAction(MVT::i64, MVT::i16, Expand);
193 setTruncStoreAction(MVT::i64, MVT::i8 , Expand);
194 setTruncStoreAction(MVT::i32, MVT::i16, Expand);
195 setTruncStoreAction(MVT::i32, MVT::i8 , Expand);
196 setTruncStoreAction(MVT::i16, MVT::i8, Expand);
197
198 setTruncStoreAction(MVT::f64, MVT::f32, Expand);
199
200 // SETOEQ and SETUNE require checking two conditions.
201 for (auto VT : {MVT::f32, MVT::f64, MVT::f80}) {
202 setCondCodeAction(ISD::SETOEQ, VT, Expand);
203 setCondCodeAction(ISD::SETUNE, VT, Expand);
204 }
205
206 // Integer absolute.
207 if (Subtarget.canUseCMOV()) {
208 setOperationAction(ISD::ABS , MVT::i16 , Custom);
209 setOperationAction(ISD::ABS , MVT::i32 , Custom);
210 if (Subtarget.is64Bit())
211 setOperationAction(ISD::ABS , MVT::i64 , Custom);
212 }
213
214 // Absolute difference.
215 for (auto Op : {ISD::ABDS, ISD::ABDU}) {
216 setOperationAction(Op , MVT::i8 , Custom);
217 setOperationAction(Op , MVT::i16 , Custom);
218 setOperationAction(Op , MVT::i32 , Custom);
219 if (Subtarget.is64Bit())
220 setOperationAction(Op , MVT::i64 , Custom);
221 }
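// ABDS/ABDU have no direct scalar x86 instruction; the Custom lowering
// typically produces a subtract/compare plus CMOV style sequence (an
// illustrative description of the intent, not a guarantee of the exact
// instruction pattern chosen).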
222
223 // Signed saturation subtraction.
224 setOperationAction(ISD::SSUBSAT , MVT::i8 , Custom);
225 setOperationAction(ISD::SSUBSAT , MVT::i16 , Custom);
226 setOperationAction(ISD::SSUBSAT , MVT::i32 , Custom);
227 if (Subtarget.is64Bit())
228 setOperationAction(ISD::SSUBSAT , MVT::i64 , Custom);
229
230 // Funnel shifts.
231 for (auto ShiftOp : {ISD::FSHL, ISD::FSHR}) {
232 // For slow shld targets we only lower for code size.
233 LegalizeAction ShiftDoubleAction = Subtarget.isSHLDSlow() ? Custom : Legal;
234
235 setOperationAction(ShiftOp , MVT::i8 , Custom);
236 setOperationAction(ShiftOp , MVT::i16 , Custom);
237 setOperationAction(ShiftOp , MVT::i32 , ShiftDoubleAction);
238 if (Subtarget.is64Bit())
239 setOperationAction(ShiftOp , MVT::i64 , ShiftDoubleAction);
240 }
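// Illustratively, i32/i64 funnel shifts map well onto the double-shift
// instructions, e.g. fshl(x, y, c) -> SHLD x, y, cl; on subtargets where
// SHLD/SHRD are slow the action above only forms them when optimizing for
// size, as the comment notes.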
241
242 if (!Subtarget.useSoftFloat()) {
243 // Promote all UINT_TO_FP to larger SINT_TO_FP's, as X86 doesn't have this
244 // operation.
249 // We have an algorithm for SSE2, and we turn this into a 64-bit
250 // FILD or VCVTUSI2SS/SD for other targets.
253 // We have an algorithm for SSE2->double, and we turn this into a
254 // 64-bit FILD followed by conditional FADD for other targets.
257
258 // Promote i8 SINT_TO_FP to larger SINT_TO_FP's, as X86 doesn't have
259 // this operation.
262 // SSE has no i16 to fp conversion, only i32. We promote in the handler
263 // to allow f80 to use i16 and f64 to use i16 with sse1 only
266 // f32 and f64 cases are Legal with SSE1/SSE2, f80 case is not
269 // In 32-bit mode these are custom lowered. In 64-bit mode F32 and F64
270 // are Legal, f80 is custom lowered.
273
274 // Promote i8 FP_TO_SINT to larger FP_TO_SINTS's, as X86 doesn't have
275 // this operation.
277 // FIXME: This doesn't generate invalid exception when it should. PR44019.
283 // In 32-bit mode these are custom lowered. In 64-bit mode F32 and F64
284 // are Legal, f80 is custom lowered.
287
288 // Handle FP_TO_UINT by promoting the destination to a larger signed
289 // conversion.
291 // FIXME: This doesn't generate invalid exception when it should. PR44019.
294 // FIXME: This doesn't generate invalid exception when it should. PR44019.
300
301 setOperationAction(ISD::LRINT, MVT::f32, Custom);
302 setOperationAction(ISD::LRINT, MVT::f64, Custom);
303 setOperationAction(ISD::LLRINT, MVT::f32, Custom);
304 setOperationAction(ISD::LLRINT, MVT::f64, Custom);
305
306 if (!Subtarget.is64Bit()) {
307 setOperationAction(ISD::LRINT, MVT::i64, Custom);
308 setOperationAction(ISD::LLRINT, MVT::i64, Custom);
309 }
310 }
311
312 if (Subtarget.hasSSE2()) {
313 // Custom lowering for saturating float to int conversions.
314 // We handle promotion to larger result types manually.
315 for (MVT VT : { MVT::i8, MVT::i16, MVT::i32 }) {
318 }
321 if (Subtarget.is64Bit()) {
324 }
325 }
326 if (Subtarget.hasAVX10_2()) {
331 for (MVT VT : {MVT::i32, MVT::v4i32, MVT::v8i32, MVT::v16i32, MVT::v2i64,
332 MVT::v4i64}) {
335 }
336 if (Subtarget.is64Bit()) {
339 }
340 }
341
342 // Handle address space casts between mixed sized pointers.
343 setOperationAction(ISD::ADDRSPACECAST, MVT::i32, Custom);
344 setOperationAction(ISD::ADDRSPACECAST, MVT::i64, Custom);
345
346 // TODO: when we have SSE, these could be more efficient, by using movd/movq.
347 if (!Subtarget.hasSSE2()) {
348 setOperationAction(ISD::BITCAST , MVT::f32 , Expand);
349 setOperationAction(ISD::BITCAST , MVT::i32 , Expand);
352 if (Subtarget.is64Bit()) {
353 setOperationAction(ISD::BITCAST , MVT::f64 , Expand);
354 // Without SSE, i64->f64 goes through memory.
355 setOperationAction(ISD::BITCAST , MVT::i64 , Expand);
356 }
357 } else if (!Subtarget.is64Bit())
358 setOperationAction(ISD::BITCAST , MVT::i64 , Custom);
359
360 // Scalar integer divide and remainder are lowered to use operations that
361 // produce two results, to match the available instructions. This exposes
362 // the two-result form to trivial CSE, which is able to combine x/y and x%y
363 // into a single instruction.
364 //
365 // Scalar integer multiply-high is also lowered to use two-result
366 // operations, to match the available instructions. However, plain multiply
367 // (low) operations are left as Legal, as there are single-result
368 // instructions for this in x86. Using the two-result multiply instructions
369 // when both high and low results are needed must be arranged by dagcombine.
370 for (auto VT : { MVT::i8, MVT::i16, MVT::i32, MVT::i64 }) {
371 setOperationAction(ISD::MULHS, VT, Expand);
372 setOperationAction(ISD::MULHU, VT, Expand);
373 setOperationAction(ISD::SDIV, VT, Expand);
374 setOperationAction(ISD::UDIV, VT, Expand);
375 setOperationAction(ISD::SREM, VT, Expand);
376 setOperationAction(ISD::UREM, VT, Expand);
377 }
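// Example of the two-result form: for "q = a / b; r = a % b;" DAG combine can
// share a single ISD::SDIVREM/UDIVREM node, which selects to one (I)DIV
// instruction (quotient in EAX, remainder in EDX for the 32-bit case).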
378
379 setOperationAction(ISD::BR_JT , MVT::Other, Expand);
380 setOperationAction(ISD::BRCOND , MVT::Other, Custom);
381 for (auto VT : { MVT::f32, MVT::f64, MVT::f80, MVT::f128,
382 MVT::i8, MVT::i16, MVT::i32, MVT::i64 }) {
383 setOperationAction(ISD::BR_CC, VT, Expand);
384 setOperationAction(ISD::SELECT_CC, VT, Expand);
385 }
386 if (Subtarget.is64Bit())
391
392 setOperationAction(ISD::FREM , MVT::f32 , Expand);
393 setOperationAction(ISD::FREM , MVT::f64 , Expand);
394 setOperationAction(ISD::FREM , MVT::f80 , Expand);
395 setOperationAction(ISD::FREM , MVT::f128 , Expand);
396
397 if (!Subtarget.useSoftFloat() && Subtarget.hasX87()) {
398 setOperationAction(ISD::GET_ROUNDING , MVT::i32 , Custom);
399 setOperationAction(ISD::SET_ROUNDING , MVT::Other, Custom);
400 setOperationAction(ISD::GET_FPENV_MEM , MVT::Other, Custom);
401 setOperationAction(ISD::SET_FPENV_MEM , MVT::Other, Custom);
402 setOperationAction(ISD::RESET_FPENV , MVT::Other, Custom);
403 }
404
405 // Promote the i8 variants and force them on up to i32 which has a shorter
406 // encoding.
407 setOperationPromotedToType(ISD::CTTZ , MVT::i8 , MVT::i32);
408 setOperationPromotedToType(ISD::CTTZ_ZERO_UNDEF, MVT::i8 , MVT::i32);
409 // Promoted i16. tzcntw has a false dependency on Intel CPUs. For BSF, we emit
410 // a REP prefix to encode it as TZCNT for modern CPUs so it makes sense to
411 // promote that too.
412 setOperationPromotedToType(ISD::CTTZ , MVT::i16 , MVT::i32);
413 setOperationPromotedToType(ISD::CTTZ_ZERO_UNDEF, MVT::i16 , MVT::i32);
414
415 if (!Subtarget.hasBMI()) {
416 setOperationAction(ISD::CTTZ , MVT::i32 , Custom);
418 if (Subtarget.is64Bit()) {
419 setOperationAction(ISD::CTTZ , MVT::i64 , Custom);
421 }
422 }
423
424 if (Subtarget.hasLZCNT()) {
425 // When promoting the i8 variants, force them to i32 for a shorter
426 // encoding.
427 setOperationPromotedToType(ISD::CTLZ , MVT::i8 , MVT::i32);
429 } else {
430 for (auto VT : {MVT::i8, MVT::i16, MVT::i32, MVT::i64}) {
431 if (VT == MVT::i64 && !Subtarget.is64Bit())
432 continue;
435 }
436 }
437
438 for (auto Op : {ISD::FP16_TO_FP, ISD::STRICT_FP16_TO_FP, ISD::FP_TO_FP16,
439 ISD::STRICT_FP_TO_FP16}) {
440 // Special handling for half-precision floating point conversions.
441 // If we don't have F16C support, then lower half float conversions
442 // into library calls.
444 Op, MVT::f32,
445 (!Subtarget.useSoftFloat() && Subtarget.hasF16C()) ? Custom : Expand);
446 // There's never any support for operations beyond MVT::f32.
447 setOperationAction(Op, MVT::f64, Expand);
448 setOperationAction(Op, MVT::f80, Expand);
449 setOperationAction(Op, MVT::f128, Expand);
450 }
451
452 for (auto VT : {MVT::f32, MVT::f64, MVT::f80, MVT::f128}) {
453 setOperationAction(ISD::STRICT_FP_TO_BF16, VT, Expand);
454 setOperationAction(ISD::STRICT_BF16_TO_FP, VT, Expand);
455 }
456
457 for (MVT VT : {MVT::f32, MVT::f64, MVT::f80, MVT::f128}) {
458 setLoadExtAction(ISD::EXTLOAD, VT, MVT::f16, Expand);
459 setLoadExtAction(ISD::EXTLOAD, VT, MVT::bf16, Expand);
460 setTruncStoreAction(VT, MVT::f16, Expand);
461 setTruncStoreAction(VT, MVT::bf16, Expand);
462
463 setOperationAction(ISD::BF16_TO_FP, VT, Expand);
464 setOperationAction(ISD::FP_TO_BF16, VT, Custom);
465 }
466
470 if (Subtarget.is64Bit())
472 if (Subtarget.hasPOPCNT()) {
473 setOperationPromotedToType(ISD::CTPOP, MVT::i8, MVT::i32);
474 // popcntw is longer to encode than popcntl and also has a false dependency
475 // on the dest that popcntl hasn't had since Cannon Lake.
476 setOperationPromotedToType(ISD::CTPOP, MVT::i16, MVT::i32);
477 } else {
482 }
483
484 setOperationAction(ISD::READCYCLECOUNTER , MVT::i64 , Custom);
485
486 if (!Subtarget.hasMOVBE())
488
489 // X86 wants to expand cmov itself.
490 for (auto VT : { MVT::f32, MVT::f64, MVT::f80, MVT::f128 }) {
495 }
496 for (auto VT : { MVT::i8, MVT::i16, MVT::i32, MVT::i64 }) {
497 if (VT == MVT::i64 && !Subtarget.is64Bit())
498 continue;
501 }
502
503 // Custom action for SELECT MMX and expand action for SELECT_CC MMX
506
508 // NOTE: EH_SJLJ_SETJMP/_LONGJMP are not recommended, since
509 // LLVM/Clang supports zero-cost DWARF and SEH exception handling.
513
514 // Darwin ABI issue.
515 for (auto VT : { MVT::i32, MVT::i64 }) {
516 if (VT == MVT::i64 && !Subtarget.is64Bit())
517 continue;
524 }
525
526 // 64-bit shl, sra, srl (iff 32-bit x86)
527 for (auto VT : { MVT::i32, MVT::i64 }) {
528 if (VT == MVT::i64 && !Subtarget.is64Bit())
529 continue;
533 }
534
535 if (Subtarget.hasSSEPrefetch())
536 setOperationAction(ISD::PREFETCH , MVT::Other, Custom);
537
538 setOperationAction(ISD::ATOMIC_FENCE , MVT::Other, Custom);
539
540 // Expand certain atomics
541 for (auto VT : { MVT::i8, MVT::i16, MVT::i32, MVT::i64 }) {
542 setOperationAction(ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS, VT, Custom);
543 setOperationAction(ISD::ATOMIC_LOAD_SUB, VT, Custom);
544 setOperationAction(ISD::ATOMIC_LOAD_ADD, VT, Custom);
545 setOperationAction(ISD::ATOMIC_LOAD_OR, VT, Custom);
546 setOperationAction(ISD::ATOMIC_LOAD_XOR, VT, Custom);
547 setOperationAction(ISD::ATOMIC_LOAD_AND, VT, Custom);
548 setOperationAction(ISD::ATOMIC_STORE, VT, Custom);
549 }
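// The Custom handlers here mostly pattern-match to LOCK-prefixed forms, e.g.
// an atomicrmw sub whose result is unused can become LOCK SUB, and the
// cmpxchg-with-success node folds its success flag directly from CMPXCHG's ZF
// (a rough description of the intent, not an exhaustive list).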
550
551 if (!Subtarget.is64Bit())
552 setOperationAction(ISD::ATOMIC_LOAD, MVT::i64, Custom);
553
554 if (Subtarget.is64Bit() && Subtarget.hasAVX()) {
555 // All CPUs supporting AVX will atomically load/store aligned 128-bit
556 // values, so we can emit [V]MOVAPS/[V]MOVDQA.
557 setOperationAction(ISD::ATOMIC_LOAD, MVT::i128, Custom);
558 setOperationAction(ISD::ATOMIC_STORE, MVT::i128, Custom);
559 }
560
561 if (Subtarget.canUseCMPXCHG16B())
562 setOperationAction(ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS, MVT::i128, Custom);
563
564 // FIXME - use subtarget debug flags
565 if (!Subtarget.isTargetDarwin() && !Subtarget.isTargetELF() &&
566 !Subtarget.isTargetCygMing() && !Subtarget.isTargetWin64() &&
567 TM.Options.ExceptionModel != ExceptionHandling::SjLj) {
568 setOperationAction(ISD::EH_LABEL, MVT::Other, Expand);
569 }
570
573
574 setOperationAction(ISD::INIT_TRAMPOLINE, MVT::Other, Custom);
575 setOperationAction(ISD::ADJUST_TRAMPOLINE, MVT::Other, Custom);
576
577 setOperationAction(ISD::TRAP, MVT::Other, Legal);
578 setOperationAction(ISD::DEBUGTRAP, MVT::Other, Legal);
579 if (Subtarget.isTargetPS())
580 setOperationAction(ISD::UBSANTRAP, MVT::Other, Expand);
581 else
582 setOperationAction(ISD::UBSANTRAP, MVT::Other, Legal);
583
584 // VASTART needs to be custom lowered to use the VarArgsFrameIndex
585 setOperationAction(ISD::VASTART , MVT::Other, Custom);
586 setOperationAction(ISD::VAEND , MVT::Other, Expand);
587 bool Is64Bit = Subtarget.is64Bit();
588 setOperationAction(ISD::VAARG, MVT::Other, Is64Bit ? Custom : Expand);
589 setOperationAction(ISD::VACOPY, MVT::Other, Is64Bit ? Custom : Expand);
590
591 setOperationAction(ISD::STACKSAVE, MVT::Other, Expand);
592 setOperationAction(ISD::STACKRESTORE, MVT::Other, Expand);
593
594 setOperationAction(ISD::DYNAMIC_STACKALLOC, PtrVT, Custom);
595
596 // GC_TRANSITION_START and GC_TRANSITION_END need custom lowering.
597 setOperationAction(ISD::GC_TRANSITION_START, MVT::Other, Custom);
598 setOperationAction(ISD::GC_TRANSITION_END, MVT::Other, Custom);
599
601
602 auto setF16Action = [&] (MVT VT, LegalizeAction Action) {
603 setOperationAction(ISD::FABS, VT, Action);
604 setOperationAction(ISD::FNEG, VT, Action);
606 setOperationAction(ISD::FREM, VT, Action);
607 setOperationAction(ISD::FMA, VT, Action);
608 setOperationAction(ISD::FMINNUM, VT, Action);
609 setOperationAction(ISD::FMAXNUM, VT, Action);
610 setOperationAction(ISD::FMINIMUM, VT, Action);
611 setOperationAction(ISD::FMAXIMUM, VT, Action);
612 setOperationAction(ISD::FMINIMUMNUM, VT, Action);
613 setOperationAction(ISD::FMAXIMUMNUM, VT, Action);
614 setOperationAction(ISD::FSIN, VT, Action);
615 setOperationAction(ISD::FCOS, VT, Action);
616 setOperationAction(ISD::FSINCOS, VT, Action);
617 setOperationAction(ISD::FTAN, VT, Action);
618 setOperationAction(ISD::FSQRT, VT, Action);
619 setOperationAction(ISD::FPOW, VT, Action);
620 setOperationAction(ISD::FPOWI, VT, Action);
621 setOperationAction(ISD::FLOG, VT, Action);
622 setOperationAction(ISD::FLOG2, VT, Action);
623 setOperationAction(ISD::FLOG10, VT, Action);
624 setOperationAction(ISD::FEXP, VT, Action);
625 setOperationAction(ISD::FEXP2, VT, Action);
626 setOperationAction(ISD::FEXP10, VT, Action);
627 setOperationAction(ISD::FCEIL, VT, Action);
628 setOperationAction(ISD::FFLOOR, VT, Action);
629 setOperationAction(ISD::FNEARBYINT, VT, Action);
630 setOperationAction(ISD::FRINT, VT, Action);
631 setOperationAction(ISD::BR_CC, VT, Action);
632 setOperationAction(ISD::SETCC, VT, Action);
635 setOperationAction(ISD::FROUND, VT, Action);
636 setOperationAction(ISD::FROUNDEVEN, VT, Action);
637 setOperationAction(ISD::FTRUNC, VT, Action);
638 setOperationAction(ISD::FLDEXP, VT, Action);
639 };
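// Helper used below to apply one LegalizeAction to the long tail of FP
// operations on an f16 type in a single place: setF16Action(MVT::f16, Promote)
// routes all of these through f32, while setF16Action on the vector f16 types
// with Expand splits/scalarizes them.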
640
641 if (!Subtarget.useSoftFloat() && Subtarget.hasSSE2()) {
642 // f16, f32 and f64 use SSE.
643 // Set up the FP register classes.
644 addRegisterClass(MVT::f16, Subtarget.hasAVX512() ? &X86::FR16XRegClass
645 : &X86::FR16RegClass);
646 addRegisterClass(MVT::f32, Subtarget.hasAVX512() ? &X86::FR32XRegClass
647 : &X86::FR32RegClass);
648 addRegisterClass(MVT::f64, Subtarget.hasAVX512() ? &X86::FR64XRegClass
649 : &X86::FR64RegClass);
650
651 // Disable f32->f64 extload as we can only generate this in one instruction
652 // under optsize. So it's easier to pattern match (fpext (load)) for that
653 // case instead of needing to emit 2 instructions for extload in the
654 // non-optsize case.
655 setLoadExtAction(ISD::EXTLOAD, MVT::f64, MVT::f32, Expand);
656
657 for (auto VT : { MVT::f32, MVT::f64 }) {
658 // Use ANDPD to simulate FABS.
659 setOperationAction(ISD::FABS, VT, Custom);
660
661 // Use XORP to simulate FNEG.
662 setOperationAction(ISD::FNEG, VT, Custom);
663
664 // Use ANDPD and ORPD to simulate FCOPYSIGN.
666
667 // These might be better off as horizontal vector ops.
670
671 // We don't support sin/cos/fmod
672 setOperationAction(ISD::FSIN , VT, Expand);
673 setOperationAction(ISD::FCOS , VT, Expand);
674 setOperationAction(ISD::FSINCOS, VT, Expand);
675 }
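// e.g. FABS on f32/f64 becomes an ANDPS/ANDPD with a constant that clears the
// sign bit, and FNEG becomes an XORPS/XORPD with a sign-bit mask, as the
// comments above describe.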
676
677 // Half type will be promoted by default.
678 setF16Action(MVT::f16, Promote);
683 setOperationAction(ISD::FABS, MVT::f16, Custom);
684 setOperationAction(ISD::FNEG, MVT::f16, Custom);
687 setOperationAction(ISD::FP_EXTEND, MVT::f32, Custom);
688 setOperationAction(ISD::FP_EXTEND, MVT::f64, Custom);
689
720 setOperationAction(ISD::LRINT, MVT::f16, Expand);
721 setOperationAction(ISD::LLRINT, MVT::f16, Expand);
722
723 // Lower this to MOVMSK plus an AND.
726
727 } else if (!Subtarget.useSoftFloat() && Subtarget.hasSSE1() &&
728 (UseX87 || Is64Bit)) {
729 // Use SSE for f32, x87 for f64.
730 // Set up the FP register classes.
731 addRegisterClass(MVT::f32, &X86::FR32RegClass);
732 if (UseX87)
733 addRegisterClass(MVT::f64, &X86::RFP64RegClass);
734
735 // Use ANDPS to simulate FABS.
736 setOperationAction(ISD::FABS , MVT::f32, Custom);
737
738 // Use XORP to simulate FNEG.
739 setOperationAction(ISD::FNEG , MVT::f32, Custom);
740
741 if (UseX87)
743
744 // Use ANDPS and ORPS to simulate FCOPYSIGN.
745 if (UseX87)
748
749 // We don't support sin/cos/fmod
750 setOperationAction(ISD::FSIN , MVT::f32, Expand);
751 setOperationAction(ISD::FCOS , MVT::f32, Expand);
752 setOperationAction(ISD::FSINCOS, MVT::f32, Expand);
753
754 if (UseX87) {
755 // Always expand sin/cos functions even though x87 has an instruction.
756 setOperationAction(ISD::FSIN, MVT::f64, Expand);
757 setOperationAction(ISD::FCOS, MVT::f64, Expand);
758 setOperationAction(ISD::FSINCOS, MVT::f64, Expand);
759 }
760 } else if (UseX87) {
761 // f32 and f64 in x87.
762 // Set up the FP register classes.
763 addRegisterClass(MVT::f64, &X86::RFP64RegClass);
764 addRegisterClass(MVT::f32, &X86::RFP32RegClass);
765
766 for (auto VT : { MVT::f32, MVT::f64 }) {
769
770 // Always expand sin/cos functions even though x87 has an instruction.
771 setOperationAction(ISD::FSIN , VT, Expand);
772 setOperationAction(ISD::FCOS , VT, Expand);
773 setOperationAction(ISD::FSINCOS, VT, Expand);
774 }
775 }
776
777 // Expand FP32 immediates into loads from the stack, save special cases.
778 if (isTypeLegal(MVT::f32)) {
779 if (UseX87 && (getRegClassFor(MVT::f32) == &X86::RFP32RegClass)) {
780 addLegalFPImmediate(APFloat(+0.0f)); // FLD0
781 addLegalFPImmediate(APFloat(+1.0f)); // FLD1
782 addLegalFPImmediate(APFloat(-0.0f)); // FLD0/FCHS
783 addLegalFPImmediate(APFloat(-1.0f)); // FLD1/FCHS
784 } else // SSE immediates.
785 addLegalFPImmediate(APFloat(+0.0f)); // xorps
786 }
787 // Expand FP64 immediates into loads from the stack, save special cases.
788 if (isTypeLegal(MVT::f64)) {
789 if (UseX87 && getRegClassFor(MVT::f64) == &X86::RFP64RegClass) {
790 addLegalFPImmediate(APFloat(+0.0)); // FLD0
791 addLegalFPImmediate(APFloat(+1.0)); // FLD1
792 addLegalFPImmediate(APFloat(-0.0)); // FLD0/FCHS
793 addLegalFPImmediate(APFloat(-1.0)); // FLD1/FCHS
794 } else // SSE immediates.
795 addLegalFPImmediate(APFloat(+0.0)); // xorpd
796 }
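// addLegalFPImmediate marks constants the selector can materialize without a
// constant-pool load (FLD0/FLD1 on x87, an XORPS/XORPD zero idiom on SSE).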
797 // Support fp16 0 immediate.
798 if (isTypeLegal(MVT::f16))
799 addLegalFPImmediate(APFloat::getZero(APFloat::IEEEhalf()));
800
801 // Handle constrained floating-point operations of scalar.
814
815 // We don't support FMA.
818
819 // f80 always uses X87.
820 if (UseX87) {
821 addRegisterClass(MVT::f80, &X86::RFP80RegClass);
824 {
826 addLegalFPImmediate(TmpFlt); // FLD0
827 TmpFlt.changeSign();
828 addLegalFPImmediate(TmpFlt); // FLD0/FCHS
829
830 bool ignored;
831 APFloat TmpFlt2(+1.0);
833 &ignored);
834 addLegalFPImmediate(TmpFlt2); // FLD1
835 TmpFlt2.changeSign();
836 addLegalFPImmediate(TmpFlt2); // FLD1/FCHS
837 }
838
839 // Always expand sin/cos functions even though x87 has an instruction.
840 // clang-format off
841 setOperationAction(ISD::FSIN , MVT::f80, Expand);
842 setOperationAction(ISD::FCOS , MVT::f80, Expand);
843 setOperationAction(ISD::FSINCOS, MVT::f80, Expand);
844 setOperationAction(ISD::FTAN , MVT::f80, Expand);
845 setOperationAction(ISD::FASIN , MVT::f80, Expand);
846 setOperationAction(ISD::FACOS , MVT::f80, Expand);
847 setOperationAction(ISD::FATAN , MVT::f80, Expand);
848 setOperationAction(ISD::FATAN2 , MVT::f80, Expand);
849 setOperationAction(ISD::FSINH , MVT::f80, Expand);
850 setOperationAction(ISD::FCOSH , MVT::f80, Expand);
851 setOperationAction(ISD::FTANH , MVT::f80, Expand);
852 // clang-format on
853
854 setOperationAction(ISD::FFLOOR, MVT::f80, Expand);
855 setOperationAction(ISD::FCEIL, MVT::f80, Expand);
856 setOperationAction(ISD::FTRUNC, MVT::f80, Expand);
857 setOperationAction(ISD::FRINT, MVT::f80, Expand);
858 setOperationAction(ISD::FNEARBYINT, MVT::f80, Expand);
859 setOperationAction(ISD::FROUNDEVEN, MVT::f80, Expand);
861 setOperationAction(ISD::LROUND, MVT::f80, LibCall);
862 setOperationAction(ISD::LLROUND, MVT::f80, LibCall);
863 setOperationAction(ISD::LRINT, MVT::f80, Custom);
864 setOperationAction(ISD::LLRINT, MVT::f80, Custom);
865
866 // Handle constrained floating-point operations of scalar.
873 if (isTypeLegal(MVT::f16)) {
874 setOperationAction(ISD::FP_EXTEND, MVT::f80, Custom);
876 } else {
878 }
879 // FIXME: When the target is 64-bit, STRICT_FP_ROUND will be overwritten
880 // as Custom.
882 }
883
884 // f128 uses xmm registers, but most operations require libcalls.
885 if (!Subtarget.useSoftFloat() && Subtarget.is64Bit() && Subtarget.hasSSE1()) {
886 addRegisterClass(MVT::f128, Subtarget.hasVLX() ? &X86::VR128XRegClass
887 : &X86::VR128RegClass);
888
889 addLegalFPImmediate(APFloat::getZero(APFloat::IEEEquad())); // xorps
890
901
902 setOperationAction(ISD::FABS, MVT::f128, Custom);
903 setOperationAction(ISD::FNEG, MVT::f128, Custom);
905
906 // clang-format off
907 setOperationAction(ISD::FSIN, MVT::f128, LibCall);
909 setOperationAction(ISD::FCOS, MVT::f128, LibCall);
911 setOperationAction(ISD::FSINCOS, MVT::f128, LibCall);
912 setOperationAction(ISD::FTAN, MVT::f128, LibCall);
914 // clang-format on
915 // No STRICT_FSINCOS
916 setOperationAction(ISD::FSQRT, MVT::f128, LibCall);
918
919 setOperationAction(ISD::FP_EXTEND, MVT::f128, Custom);
921 // We need to custom handle any FP_ROUND with an f128 input, but
922 // LegalizeDAG uses the result type to know when to run a custom handler.
923 // So we have to list all legal floating point result types here.
924 if (isTypeLegal(MVT::f32)) {
927 }
928 if (isTypeLegal(MVT::f64)) {
931 }
932 if (isTypeLegal(MVT::f80)) {
936 }
937
939
940 setLoadExtAction(ISD::EXTLOAD, MVT::f128, MVT::f32, Expand);
941 setLoadExtAction(ISD::EXTLOAD, MVT::f128, MVT::f64, Expand);
942 setLoadExtAction(ISD::EXTLOAD, MVT::f128, MVT::f80, Expand);
943 setTruncStoreAction(MVT::f128, MVT::f32, Expand);
944 setTruncStoreAction(MVT::f128, MVT::f64, Expand);
945 setTruncStoreAction(MVT::f128, MVT::f80, Expand);
946 }
947
948 // Always use a library call for pow.
949 setOperationAction(ISD::FPOW , MVT::f32 , Expand);
950 setOperationAction(ISD::FPOW , MVT::f64 , Expand);
951 setOperationAction(ISD::FPOW , MVT::f80 , Expand);
952 setOperationAction(ISD::FPOW , MVT::f128 , Expand);
953
954 setOperationAction(ISD::FLOG, MVT::f80, Expand);
955 setOperationAction(ISD::FLOG2, MVT::f80, Expand);
956 setOperationAction(ISD::FLOG10, MVT::f80, Expand);
957 setOperationAction(ISD::FEXP, MVT::f80, Expand);
958 setOperationAction(ISD::FEXP2, MVT::f80, Expand);
959 setOperationAction(ISD::FEXP10, MVT::f80, Expand);
960 setOperationAction(ISD::FMINNUM, MVT::f80, Expand);
961 setOperationAction(ISD::FMAXNUM, MVT::f80, Expand);
962
963 // Some FP actions are always expanded for vector types.
964 for (auto VT : { MVT::v8f16, MVT::v16f16, MVT::v32f16,
965 MVT::v4f32, MVT::v8f32, MVT::v16f32,
966 MVT::v2f64, MVT::v4f64, MVT::v8f64 }) {
967 // clang-format off
968 setOperationAction(ISD::FSIN, VT, Expand);
969 setOperationAction(ISD::FSINCOS, VT, Expand);
970 setOperationAction(ISD::FCOS, VT, Expand);
971 setOperationAction(ISD::FTAN, VT, Expand);
974 setOperationAction(ISD::FPOW, VT, Expand);
975 setOperationAction(ISD::FLOG, VT, Expand);
976 setOperationAction(ISD::FLOG2, VT, Expand);
977 setOperationAction(ISD::FLOG10, VT, Expand);
978 setOperationAction(ISD::FEXP, VT, Expand);
979 setOperationAction(ISD::FEXP2, VT, Expand);
980 setOperationAction(ISD::FEXP10, VT, Expand);
981 // clang-format on
982 }
983
984 // First set operation action for all vector types to either promote
985 // (for widening) or expand (for scalarization). Then we will selectively
986 // turn on ones that can be effectively codegen'd.
997 setOperationAction(ISD::FFLOOR, VT, Expand);
998 setOperationAction(ISD::FCEIL, VT, Expand);
999 setOperationAction(ISD::FTRUNC, VT, Expand);
1000 setOperationAction(ISD::FRINT, VT, Expand);
1001 setOperationAction(ISD::FNEARBYINT, VT, Expand);
1002 setOperationAction(ISD::FROUNDEVEN, VT, Expand);
1026 for (MVT InnerVT : MVT::fixedlen_vector_valuetypes()) {
1027 setTruncStoreAction(InnerVT, VT, Expand);
1028
1029 setLoadExtAction(ISD::SEXTLOAD, InnerVT, VT, Expand);
1030 setLoadExtAction(ISD::ZEXTLOAD, InnerVT, VT, Expand);
1031
1032 // N.b. ISD::EXTLOAD legality is basically ignored except for i1-like
1033 // types, we have to deal with them whether we ask for Expansion or not.
1034 // Setting Expand causes its own optimisation problems though, so leave
1035 // them legal.
1036 if (VT.getVectorElementType() == MVT::i1)
1037 setLoadExtAction(ISD::EXTLOAD, InnerVT, VT, Expand);
1038
1039 // EXTLOAD for MVT::f16 vectors is not legal because f16 vectors are
1040 // split/scalarized right now.
1041 if (VT.getVectorElementType() == MVT::f16 ||
1042 VT.getVectorElementType() == MVT::bf16)
1043 setLoadExtAction(ISD::EXTLOAD, InnerVT, VT, Expand);
1044 }
1045 }
1046
1047 // FIXME: In order to prevent SSE instructions being expanded to MMX ones
1048 // with -msoft-float, disable use of MMX as well.
1049 if (!Subtarget.useSoftFloat() && Subtarget.hasMMX()) {
1050 addRegisterClass(MVT::x86mmx, &X86::VR64RegClass);
1051 // No operations on x86mmx supported, everything uses intrinsics.
1052 }
1053
1054 if (!Subtarget.useSoftFloat() && Subtarget.hasSSE1()) {
1055 addRegisterClass(MVT::v4f32, Subtarget.hasVLX() ? &X86::VR128XRegClass
1056 : &X86::VR128RegClass);
1057
1058 setOperationAction(ISD::FMAXIMUM, MVT::f32, Custom);
1059 setOperationAction(ISD::FMINIMUM, MVT::f32, Custom);
1060 setOperationAction(ISD::FMAXIMUMNUM, MVT::f32, Custom);
1061 setOperationAction(ISD::FMINIMUMNUM, MVT::f32, Custom);
1062
1063 setOperationAction(ISD::FNEG, MVT::v4f32, Custom);
1064 setOperationAction(ISD::FABS, MVT::v4f32, Custom);
1072
1073 setOperationAction(ISD::LOAD, MVT::v2f32, Custom);
1074 setOperationAction(ISD::STORE, MVT::v2f32, Custom);
1076
1082 }
1083
1084 if (!Subtarget.useSoftFloat() && Subtarget.hasSSE2()) {
1085 addRegisterClass(MVT::v2f64, Subtarget.hasVLX() ? &X86::VR128XRegClass
1086 : &X86::VR128RegClass);
1087
1088 // FIXME: Unfortunately, -soft-float and -no-implicit-float mean XMM
1089 // registers cannot be used even for integer operations.
1090 addRegisterClass(MVT::v16i8, Subtarget.hasVLX() ? &X86::VR128XRegClass
1091 : &X86::VR128RegClass);
1092 addRegisterClass(MVT::v8i16, Subtarget.hasVLX() ? &X86::VR128XRegClass
1093 : &X86::VR128RegClass);
1094 addRegisterClass(MVT::v8f16, Subtarget.hasVLX() ? &X86::VR128XRegClass
1095 : &X86::VR128RegClass);
1096 addRegisterClass(MVT::v4i32, Subtarget.hasVLX() ? &X86::VR128XRegClass
1097 : &X86::VR128RegClass);
1098 addRegisterClass(MVT::v2i64, Subtarget.hasVLX() ? &X86::VR128XRegClass
1099 : &X86::VR128RegClass);
1100
1101 for (auto VT : { MVT::f64, MVT::v4f32, MVT::v2f64 }) {
1102 setOperationAction(ISD::FMAXIMUM, VT, Custom);
1103 setOperationAction(ISD::FMINIMUM, VT, Custom);
1104 setOperationAction(ISD::FMAXIMUMNUM, VT, Custom);
1105 setOperationAction(ISD::FMINIMUMNUM, VT, Custom);
1106 }
1107
1108 for (auto VT : { MVT::v2i8, MVT::v4i8, MVT::v8i8,
1109 MVT::v2i16, MVT::v4i16, MVT::v2i32 }) {
1114 }
1115
1116 setOperationAction(ISD::MUL, MVT::v2i8, Custom);
1117 setOperationAction(ISD::MUL, MVT::v4i8, Custom);
1118 setOperationAction(ISD::MUL, MVT::v8i8, Custom);
1119
1120 setOperationAction(ISD::MUL, MVT::v16i8, Custom);
1121 setOperationAction(ISD::MUL, MVT::v4i32, Custom);
1122 setOperationAction(ISD::MUL, MVT::v2i64, Custom);
1123 setOperationAction(ISD::MULHU, MVT::v4i32, Custom);
1124 setOperationAction(ISD::MULHS, MVT::v4i32, Custom);
1125 setOperationAction(ISD::MULHU, MVT::v16i8, Custom);
1126 setOperationAction(ISD::MULHS, MVT::v16i8, Custom);
1127 setOperationAction(ISD::MULHU, MVT::v8i16, Legal);
1128 setOperationAction(ISD::MULHS, MVT::v8i16, Legal);
1129 setOperationAction(ISD::MUL, MVT::v8i16, Legal);
1132
1133 setOperationAction(ISD::SMULO, MVT::v16i8, Custom);
1134 setOperationAction(ISD::UMULO, MVT::v16i8, Custom);
1135 setOperationAction(ISD::UMULO, MVT::v2i32, Custom);
1136
1137 setOperationAction(ISD::FNEG, MVT::v2f64, Custom);
1139 setOperationAction(ISD::FABS, MVT::v2f64, Custom);
1141
1142 setOperationAction(ISD::LRINT, MVT::v4f32, Custom);
1143 setOperationAction(ISD::LRINT, MVT::v2i32, Custom);
1144
1145 for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64 }) {
1146 setOperationAction(ISD::SMAX, VT, VT == MVT::v8i16 ? Legal : Custom);
1147 setOperationAction(ISD::SMIN, VT, VT == MVT::v8i16 ? Legal : Custom);
1148 setOperationAction(ISD::UMAX, VT, VT == MVT::v16i8 ? Legal : Custom);
1149 setOperationAction(ISD::UMIN, VT, VT == MVT::v16i8 ? Legal : Custom);
1150 }
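// SSE2 only provides signed min/max for i16 (PMINSW/PMAXSW) and unsigned
// min/max for i8 (PMINUB/PMAXUB); the remaining element-type combinations are
// emulated by the Custom lowering, e.g. via compare + blend style sequences.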
1151
1162
1167
1168 for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64 }) {
1174
1175 // The condition codes aren't legal in SSE/AVX and under AVX512 we use
1176 // setcc all the way to isel and prefer SETGT in some isel patterns.
1179 }
1180
1181 setOperationAction(ISD::SETCC, MVT::v2f64, Custom);
1182 setOperationAction(ISD::SETCC, MVT::v4f32, Custom);
1187
1188 for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32 }) {
1194 }
1195
1196 for (auto VT : { MVT::v8f16, MVT::v2f64, MVT::v2i64 }) {
1200
1201 if (VT == MVT::v2i64 && !Subtarget.is64Bit())
1202 continue;
1203
1206 }
1207 setF16Action(MVT::v8f16, Expand);
1208 setOperationAction(ISD::FADD, MVT::v8f16, Expand);
1209 setOperationAction(ISD::FSUB, MVT::v8f16, Expand);
1210 setOperationAction(ISD::FMUL, MVT::v8f16, Expand);
1211 setOperationAction(ISD::FDIV, MVT::v8f16, Expand);
1212 setOperationAction(ISD::FNEG, MVT::v8f16, Custom);
1213 setOperationAction(ISD::FABS, MVT::v8f16, Custom);
1215
1216 // Custom lower v2i64 and v2f64 selects.
1223
1230
1231 // Custom legalize these to avoid over promotion or custom promotion.
1232 for (auto VT : {MVT::v2i8, MVT::v4i8, MVT::v8i8, MVT::v2i16, MVT::v4i16}) {
1237 }
1238
1243
1246
1249
1250 // Fast v2f32 UINT_TO_FP( v2i32 ) custom conversion.
1255
1256 setOperationAction(ISD::FP_EXTEND, MVT::v2f32, Custom);
1260
1261 // We want to legalize this to an f64 load rather than an i64 load on
1262 // 64-bit targets and two 32-bit loads on a 32-bit target. Similar for
1263 // store.
1264 setOperationAction(ISD::LOAD, MVT::v2i32, Custom);
1265 setOperationAction(ISD::LOAD, MVT::v4i16, Custom);
1266 setOperationAction(ISD::LOAD, MVT::v8i8, Custom);
1267 setOperationAction(ISD::STORE, MVT::v2i32, Custom);
1268 setOperationAction(ISD::STORE, MVT::v4i16, Custom);
1269 setOperationAction(ISD::STORE, MVT::v8i8, Custom);
1270
1271 // Add 32-bit vector stores to help vectorization opportunities.
1272 setOperationAction(ISD::STORE, MVT::v2i16, Custom);
1273 setOperationAction(ISD::STORE, MVT::v4i8, Custom);
1274
1275 setOperationAction(ISD::BITCAST, MVT::v2i32, Custom);
1276 setOperationAction(ISD::BITCAST, MVT::v4i16, Custom);
1277 setOperationAction(ISD::BITCAST, MVT::v8i8, Custom);
1278 if (!Subtarget.hasAVX512())
1279 setOperationAction(ISD::BITCAST, MVT::v16i1, Custom);
1280
1284
1286
1303
1304 // In the customized shift lowering, the legal v4i32/v2i64 cases
1305 // in AVX2 will be recognized.
1306 for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64 }) {
1310 if (VT == MVT::v2i64) continue;
1315 }
1316
1322 }
1323
1324 if (!Subtarget.useSoftFloat() && Subtarget.hasGFNI()) {
1329
1330 for (auto VT : {MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64}) {
1332 }
1333 }
1334
1335 if (!Subtarget.useSoftFloat() && Subtarget.hasSSSE3()) {
1336 setOperationAction(ISD::ABS, MVT::v16i8, Legal);
1337 setOperationAction(ISD::ABS, MVT::v8i16, Legal);
1338 setOperationAction(ISD::ABS, MVT::v4i32, Legal);
1339
1340 for (auto VT : {MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64}) {
1343 }
1344
1345 // These might be better off as horizontal vector ops.
1350 }
1351
1352 if (!Subtarget.useSoftFloat() && Subtarget.hasSSE41()) {
1353 for (MVT RoundedTy : {MVT::f32, MVT::f64, MVT::v4f32, MVT::v2f64}) {
1354 setOperationAction(ISD::FFLOOR, RoundedTy, Legal);
1356 setOperationAction(ISD::FCEIL, RoundedTy, Legal);
1358 setOperationAction(ISD::FTRUNC, RoundedTy, Legal);
1360 setOperationAction(ISD::FRINT, RoundedTy, Legal);
1362 setOperationAction(ISD::FNEARBYINT, RoundedTy, Legal);
1364 setOperationAction(ISD::FROUNDEVEN, RoundedTy, Legal);
1366
1367 setOperationAction(ISD::FROUND, RoundedTy, Custom);
1368 }
1369
1370 setOperationAction(ISD::SMAX, MVT::v16i8, Legal);
1371 setOperationAction(ISD::SMAX, MVT::v4i32, Legal);
1372 setOperationAction(ISD::UMAX, MVT::v8i16, Legal);
1373 setOperationAction(ISD::UMAX, MVT::v4i32, Legal);
1374 setOperationAction(ISD::SMIN, MVT::v16i8, Legal);
1375 setOperationAction(ISD::SMIN, MVT::v4i32, Legal);
1376 setOperationAction(ISD::UMIN, MVT::v8i16, Legal);
1377 setOperationAction(ISD::UMIN, MVT::v4i32, Legal);
1378
1382
1383 // FIXME: Do we need to handle scalar-to-vector here?
1384 setOperationAction(ISD::MUL, MVT::v4i32, Legal);
1385 setOperationAction(ISD::SMULO, MVT::v2i32, Custom);
1386
1387 // We directly match byte blends in the backend as they match the VSELECT
1388 // condition form.
1390
1391 // SSE41 brings specific instructions for doing vector sign extend even in
1392 // cases where we don't have SRA.
1393 for (auto VT : { MVT::v8i16, MVT::v4i32, MVT::v2i64 }) {
1396 }
1397
1398 // SSE41 also has vector sign/zero extending loads, PMOV[SZ]X
1399 for (auto LoadExtOp : { ISD::SEXTLOAD, ISD::ZEXTLOAD }) {
1400 setLoadExtAction(LoadExtOp, MVT::v8i16, MVT::v8i8, Legal);
1401 setLoadExtAction(LoadExtOp, MVT::v4i32, MVT::v4i8, Legal);
1402 setLoadExtAction(LoadExtOp, MVT::v2i64, MVT::v2i8, Legal);
1403 setLoadExtAction(LoadExtOp, MVT::v4i32, MVT::v4i16, Legal);
1404 setLoadExtAction(LoadExtOp, MVT::v2i64, MVT::v2i16, Legal);
1405 setLoadExtAction(LoadExtOp, MVT::v2i64, MVT::v2i32, Legal);
1406 }
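// e.g. a sign-extending load of <8 x i8> into <8 x i16> selects to PMOVSXBW,
// and the zero-extending form to PMOVZXBW, avoiding a separate extend node.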
1407
1408 if (Subtarget.is64Bit() && !Subtarget.hasAVX512()) {
1409 // We need to scalarize v4i64->v4f32 uint_to_fp using cvtsi2ss, but we can
1410 // do the pre and post work in the vector domain.
1413 // We need to mark SINT_TO_FP as Custom even though we want to expand it
1414 // so that DAG combine doesn't try to turn it into uint_to_fp.
1417 }
1418 }
1419
1420 if (!Subtarget.useSoftFloat() && Subtarget.hasSSE42()) {
1422 }
1423
1424 if (!Subtarget.useSoftFloat() && Subtarget.hasXOP()) {
1425 for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64,
1426 MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64 }) {
1429 }
1430
1431 // XOP can efficiently perform BITREVERSE with VPPERM.
1432 for (auto VT : { MVT::i8, MVT::i16, MVT::i32, MVT::i64 })
1434 }
1435
1436 if (!Subtarget.useSoftFloat() && Subtarget.hasAVX()) {
1437 bool HasInt256 = Subtarget.hasInt256();
1438
1439 addRegisterClass(MVT::v32i8, Subtarget.hasVLX() ? &X86::VR256XRegClass
1440 : &X86::VR256RegClass);
1441 addRegisterClass(MVT::v16i16, Subtarget.hasVLX() ? &X86::VR256XRegClass
1442 : &X86::VR256RegClass);
1443 addRegisterClass(MVT::v16f16, Subtarget.hasVLX() ? &X86::VR256XRegClass
1444 : &X86::VR256RegClass);
1445 addRegisterClass(MVT::v8i32, Subtarget.hasVLX() ? &X86::VR256XRegClass
1446 : &X86::VR256RegClass);
1447 addRegisterClass(MVT::v8f32, Subtarget.hasVLX() ? &X86::VR256XRegClass
1448 : &X86::VR256RegClass);
1449 addRegisterClass(MVT::v4i64, Subtarget.hasVLX() ? &X86::VR256XRegClass
1450 : &X86::VR256RegClass);
1451 addRegisterClass(MVT::v4f64, Subtarget.hasVLX() ? &X86::VR256XRegClass
1452 : &X86::VR256RegClass);
1453
1454 for (auto VT : { MVT::v8f32, MVT::v4f64 }) {
1455 setOperationAction(ISD::FFLOOR, VT, Legal);
1457 setOperationAction(ISD::FCEIL, VT, Legal);
1459 setOperationAction(ISD::FTRUNC, VT, Legal);
1461 setOperationAction(ISD::FRINT, VT, Legal);
1463 setOperationAction(ISD::FNEARBYINT, VT, Legal);
1465 setOperationAction(ISD::FROUNDEVEN, VT, Legal);
1467
1468 setOperationAction(ISD::FROUND, VT, Custom);
1469
1470 setOperationAction(ISD::FNEG, VT, Custom);
1471 setOperationAction(ISD::FABS, VT, Custom);
1473
1474 setOperationAction(ISD::FMAXIMUM, VT, Custom);
1475 setOperationAction(ISD::FMINIMUM, VT, Custom);
1476 setOperationAction(ISD::FMAXIMUMNUM, VT, Custom);
1477 setOperationAction(ISD::FMINIMUMNUM, VT, Custom);
1479 }
1480
1481 setOperationAction(ISD::LRINT, MVT::v8f32, Custom);
1482 setOperationAction(ISD::LRINT, MVT::v4f64, Custom);
1483
1484 // (fp_to_int:v8i16 (v8f32 ..)) requires the result type to be promoted
1485 // even though v8i16 is a legal type.
1486 setOperationPromotedToType(ISD::FP_TO_SINT, MVT::v8i16, MVT::v8i32);
1487 setOperationPromotedToType(ISD::FP_TO_UINT, MVT::v8i16, MVT::v8i32);
1488 setOperationPromotedToType(ISD::STRICT_FP_TO_SINT, MVT::v8i16, MVT::v8i32);
1489 setOperationPromotedToType(ISD::STRICT_FP_TO_UINT, MVT::v8i16, MVT::v8i32);
1493
1496 setOperationAction(ISD::FP_EXTEND, MVT::v8f32, Expand);
1498 setOperationAction(ISD::FP_EXTEND, MVT::v4f64, Custom);
1500
1512
1513 if (!Subtarget.hasAVX512())
1514 setOperationAction(ISD::BITCAST, MVT::v32i1, Custom);
1515
1516 // In the customized shift lowering, the legal v8i32/v4i64 cases
1517 // in AVX2 will be recognized.
1518 for (auto VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64 }) {
1524 if (VT == MVT::v4i64) continue;
1529 }
1530
1531 // These types need custom splitting if their input is a 128-bit vector.
1536
1540 setOperationAction(ISD::SELECT, MVT::v16i16, Custom);
1541 setOperationAction(ISD::SELECT, MVT::v16f16, Custom);
1544
1545 for (auto VT : { MVT::v16i16, MVT::v8i32, MVT::v4i64 }) {
1549 }
1550
1555
1556 for (auto VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64 }) {
1561
1562 // The condition codes aren't legal in SSE/AVX and under AVX512 we use
1563 // setcc all the way to isel and prefer SETGT in some isel patterns.
1566 }
1567
1568 setOperationAction(ISD::SETCC, MVT::v4f64, Custom);
1569 setOperationAction(ISD::SETCC, MVT::v8f32, Custom);
1574
1575 if (Subtarget.hasAnyFMA()) {
1576 for (auto VT : { MVT::f32, MVT::f64, MVT::v4f32, MVT::v8f32,
1577 MVT::v2f64, MVT::v4f64 }) {
1580 }
1581 }
1582
1583 for (auto VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64 }) {
1584 setOperationAction(ISD::ADD, VT, HasInt256 ? Legal : Custom);
1585 setOperationAction(ISD::SUB, VT, HasInt256 ? Legal : Custom);
1586 }
1587
1588 setOperationAction(ISD::MUL, MVT::v4i64, Custom);
1589 setOperationAction(ISD::MUL, MVT::v8i32, HasInt256 ? Legal : Custom);
1590 setOperationAction(ISD::MUL, MVT::v16i16, HasInt256 ? Legal : Custom);
1591 setOperationAction(ISD::MUL, MVT::v32i8, Custom);
1592
1593 setOperationAction(ISD::MULHU, MVT::v8i32, Custom);
1594 setOperationAction(ISD::MULHS, MVT::v8i32, Custom);
1595 setOperationAction(ISD::MULHU, MVT::v16i16, HasInt256 ? Legal : Custom);
1596 setOperationAction(ISD::MULHS, MVT::v16i16, HasInt256 ? Legal : Custom);
1597 setOperationAction(ISD::MULHU, MVT::v32i8, Custom);
1598 setOperationAction(ISD::MULHS, MVT::v32i8, Custom);
1599 setOperationAction(ISD::AVGCEILU, MVT::v16i16, HasInt256 ? Legal : Custom);
1600 setOperationAction(ISD::AVGCEILU, MVT::v32i8, HasInt256 ? Legal : Custom);
1601
1602 setOperationAction(ISD::SMULO, MVT::v32i8, Custom);
1603 setOperationAction(ISD::UMULO, MVT::v32i8, Custom);
1604
1605 setOperationAction(ISD::ABS, MVT::v4i64, Custom);
1606 setOperationAction(ISD::SMAX, MVT::v4i64, Custom);
1607 setOperationAction(ISD::UMAX, MVT::v4i64, Custom);
1608 setOperationAction(ISD::SMIN, MVT::v4i64, Custom);
1609 setOperationAction(ISD::UMIN, MVT::v4i64, Custom);
1610
1611 setOperationAction(ISD::UADDSAT, MVT::v32i8, HasInt256 ? Legal : Custom);
1612 setOperationAction(ISD::SADDSAT, MVT::v32i8, HasInt256 ? Legal : Custom);
1613 setOperationAction(ISD::USUBSAT, MVT::v32i8, HasInt256 ? Legal : Custom);
1614 setOperationAction(ISD::SSUBSAT, MVT::v32i8, HasInt256 ? Legal : Custom);
1615 setOperationAction(ISD::UADDSAT, MVT::v16i16, HasInt256 ? Legal : Custom);
1616 setOperationAction(ISD::SADDSAT, MVT::v16i16, HasInt256 ? Legal : Custom);
1617 setOperationAction(ISD::USUBSAT, MVT::v16i16, HasInt256 ? Legal : Custom);
1618 setOperationAction(ISD::SSUBSAT, MVT::v16i16, HasInt256 ? Legal : Custom);
1623
1624 for (auto VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32 }) {
1625 setOperationAction(ISD::ABS, VT, HasInt256 ? Legal : Custom);
1626 setOperationAction(ISD::SMAX, VT, HasInt256 ? Legal : Custom);
1627 setOperationAction(ISD::UMAX, VT, HasInt256 ? Legal : Custom);
1628 setOperationAction(ISD::SMIN, VT, HasInt256 ? Legal : Custom);
1629 setOperationAction(ISD::UMIN, VT, HasInt256 ? Legal : Custom);
1630 }
1631
1632 for (auto VT : {MVT::v16i16, MVT::v8i32, MVT::v4i64}) {
1635 }
1636
1637 if (HasInt256) {
1638 // The custom lowering for UINT_TO_FP for v8i32 becomes interesting
1639 // when we have a 256bit-wide blend with immediate.
1642
1643 // AVX2 also has wider vector sign/zero extending loads, VPMOV[SZ]X
1644 for (auto LoadExtOp : { ISD::SEXTLOAD, ISD::ZEXTLOAD }) {
1645 setLoadExtAction(LoadExtOp, MVT::v16i16, MVT::v16i8, Legal);
1646 setLoadExtAction(LoadExtOp, MVT::v8i32, MVT::v8i8, Legal);
1647 setLoadExtAction(LoadExtOp, MVT::v4i64, MVT::v4i8, Legal);
1648 setLoadExtAction(LoadExtOp, MVT::v8i32, MVT::v8i16, Legal);
1649 setLoadExtAction(LoadExtOp, MVT::v4i64, MVT::v4i16, Legal);
1650 setLoadExtAction(LoadExtOp, MVT::v4i64, MVT::v4i32, Legal);
1651 }
1652 }
1653
1654 for (auto VT : { MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64,
1655 MVT::v4f32, MVT::v8f32, MVT::v2f64, MVT::v4f64 }) {
1656 setOperationAction(ISD::MLOAD, VT, Subtarget.hasVLX() ? Legal : Custom);
1657 setOperationAction(ISD::MSTORE, VT, Legal);
1658 }
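// Masked vector loads/stores at these widths correspond to VMASKMOVPS/PD on
// AVX and VPMASKMOVD/Q on AVX2 (the integer-element cases need AVX2).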
1659
1660 // Extract subvector is special because the value type
1661 // (result) is 128-bit but the source is 256-bit wide.
1662 for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64,
1663 MVT::v8f16, MVT::v4f32, MVT::v2f64 }) {
1665 }
1666
1667 // Custom lower several nodes for 256-bit types.
1668 for (MVT VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64,
1669 MVT::v16f16, MVT::v8f32, MVT::v4f64 }) {
1678 setOperationAction(ISD::STORE, VT, Custom);
1679 }
1680 setF16Action(MVT::v16f16, Expand);
1681 setOperationAction(ISD::FNEG, MVT::v16f16, Custom);
1682 setOperationAction(ISD::FABS, MVT::v16f16, Custom);
1684 setOperationAction(ISD::FADD, MVT::v16f16, Expand);
1685 setOperationAction(ISD::FSUB, MVT::v16f16, Expand);
1686 setOperationAction(ISD::FMUL, MVT::v16f16, Expand);
1687 setOperationAction(ISD::FDIV, MVT::v16f16, Expand);
1688
1689 if (HasInt256) {
1691
1692 // Custom legalize 2x32 to get a little better code.
1693 setOperationAction(ISD::MGATHER, MVT::v2f32, Custom);
1694 setOperationAction(ISD::MGATHER, MVT::v2i32, Custom);
1695
1696 for (auto VT : { MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64,
1697 MVT::v4f32, MVT::v8f32, MVT::v2f64, MVT::v4f64 })
1698 setOperationAction(ISD::MGATHER, VT, Custom);
1699 }
1700 }
1701
1702 if (!Subtarget.useSoftFloat() && !Subtarget.hasFP16() &&
1703 Subtarget.hasF16C()) {
1704 for (MVT VT : { MVT::f16, MVT::v2f16, MVT::v4f16, MVT::v8f16 }) {
1707 }
1708 for (MVT VT : { MVT::f32, MVT::v2f32, MVT::v4f32, MVT::v8f32 }) {
1709 setOperationAction(ISD::FP_EXTEND, VT, Custom);
1711 }
1712 for (unsigned Opc : {ISD::FADD, ISD::FSUB, ISD::FMUL, ISD::FDIV}) {
1713 setOperationPromotedToType(Opc, MVT::v8f16, MVT::v8f32);
1714 setOperationPromotedToType(Opc, MVT::v16f16, MVT::v16f32);
1715 }
1716 setOperationAction(ISD::SETCC, MVT::v8f16, Custom);
1717 setOperationAction(ISD::SETCC, MVT::v16f16, Custom);
1718 }
1719
1720 // This block controls legalization of the mask vector sizes that are
1721 // available with AVX512. 512-bit vectors are in a separate block controlled
1722 // by useAVX512Regs.
1723 if (!Subtarget.useSoftFloat() && Subtarget.hasAVX512()) {
1724 addRegisterClass(MVT::v1i1, &X86::VK1RegClass);
1725 addRegisterClass(MVT::v2i1, &X86::VK2RegClass);
1726 addRegisterClass(MVT::v4i1, &X86::VK4RegClass);
1727 addRegisterClass(MVT::v8i1, &X86::VK8RegClass);
1728 addRegisterClass(MVT::v16i1, &X86::VK16RegClass);
1729
1733
1734 setOperationPromotedToType(ISD::FP_TO_SINT, MVT::v8i1, MVT::v8i32);
1735 setOperationPromotedToType(ISD::FP_TO_UINT, MVT::v8i1, MVT::v8i32);
1736 setOperationPromotedToType(ISD::FP_TO_SINT, MVT::v4i1, MVT::v4i32);
1737 setOperationPromotedToType(ISD::FP_TO_UINT, MVT::v4i1, MVT::v4i32);
1738 setOperationPromotedToType(ISD::STRICT_FP_TO_SINT, MVT::v8i1, MVT::v8i32);
1739 setOperationPromotedToType(ISD::STRICT_FP_TO_UINT, MVT::v8i1, MVT::v8i32);
1740 setOperationPromotedToType(ISD::STRICT_FP_TO_SINT, MVT::v4i1, MVT::v4i32);
1741 setOperationPromotedToType(ISD::STRICT_FP_TO_UINT, MVT::v4i1, MVT::v4i32);
1749
1750 // There is no byte sized k-register load or store without AVX512DQ.
1751 if (!Subtarget.hasDQI()) {
1752 setOperationAction(ISD::LOAD, MVT::v1i1, Custom);
1753 setOperationAction(ISD::LOAD, MVT::v2i1, Custom);
1754 setOperationAction(ISD::LOAD, MVT::v4i1, Custom);
1755 setOperationAction(ISD::LOAD, MVT::v8i1, Custom);
1756
1757 setOperationAction(ISD::STORE, MVT::v1i1, Custom);
1758 setOperationAction(ISD::STORE, MVT::v2i1, Custom);
1759 setOperationAction(ISD::STORE, MVT::v4i1, Custom);
1760 setOperationAction(ISD::STORE, MVT::v8i1, Custom);
1761 }
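// KMOVB (byte-sized mask moves) is an AVX512DQ instruction, so without DQI
// these Custom handlers move the small mask types through a wider mask or GPR
// path instead (e.g. KMOVW).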
1762
1763 // Extends of v16i1/v8i1/v4i1/v2i1 to 128-bit vectors.
1764 for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64 }) {
1768 }
1769
1770 for (auto VT : { MVT::v1i1, MVT::v2i1, MVT::v4i1, MVT::v8i1, MVT::v16i1 })
1772
1773 for (auto VT : { MVT::v2i1, MVT::v4i1, MVT::v8i1, MVT::v16i1 }) {
1777
1784 }
1785
1786 for (auto VT : { MVT::v1i1, MVT::v2i1, MVT::v4i1, MVT::v8i1 })
1788 }
1789 if (Subtarget.hasDQI() && Subtarget.hasVLX()) {
1790 for (MVT VT : {MVT::v4f32, MVT::v8f32, MVT::v2f64, MVT::v4f64}) {
1791 setOperationAction(ISD::LRINT, VT, Legal);
1792 setOperationAction(ISD::LLRINT, VT, Legal);
1793 }
1794 }
1795
1796 // This block controls legalization for 512-bit operations with 8/16/32/64 bit
1797 // elements. 512-bits can be disabled based on prefer-vector-width and
1798 // required-vector-width function attributes.
1799 if (!Subtarget.useSoftFloat() && Subtarget.useAVX512Regs()) {
1800 bool HasBWI = Subtarget.hasBWI();
1801
1802 addRegisterClass(MVT::v16i32, &X86::VR512RegClass);
1803 addRegisterClass(MVT::v16f32, &X86::VR512RegClass);
1804 addRegisterClass(MVT::v8i64, &X86::VR512RegClass);
1805 addRegisterClass(MVT::v8f64, &X86::VR512RegClass);
1806 addRegisterClass(MVT::v32i16, &X86::VR512RegClass);
1807 addRegisterClass(MVT::v32f16, &X86::VR512RegClass);
1808 addRegisterClass(MVT::v64i8, &X86::VR512RegClass);
1809
1810 for (auto ExtType : {ISD::ZEXTLOAD, ISD::SEXTLOAD}) {
1811 setLoadExtAction(ExtType, MVT::v16i32, MVT::v16i8, Legal);
1812 setLoadExtAction(ExtType, MVT::v16i32, MVT::v16i16, Legal);
1813 setLoadExtAction(ExtType, MVT::v8i64, MVT::v8i8, Legal);
1814 setLoadExtAction(ExtType, MVT::v8i64, MVT::v8i16, Legal);
1815 setLoadExtAction(ExtType, MVT::v8i64, MVT::v8i32, Legal);
1816 if (HasBWI)
1817 setLoadExtAction(ExtType, MVT::v32i16, MVT::v32i8, Legal);
1818 }
1819
1820 for (MVT VT : { MVT::v16f32, MVT::v8f64 }) {
1821 setOperationAction(ISD::FMAXIMUM, VT, Custom);
1822 setOperationAction(ISD::FMINIMUM, VT, Custom);
1823 setOperationAction(ISD::FMAXIMUMNUM, VT, Custom);
1824 setOperationAction(ISD::FMINIMUMNUM, VT, Custom);
1825 setOperationAction(ISD::FNEG, VT, Custom);
1826 setOperationAction(ISD::FABS, VT, Custom);
1831 }
1832 setOperationAction(ISD::LRINT, MVT::v16f32,
1833 Subtarget.hasDQI() ? Legal : Custom);
1834 setOperationAction(ISD::LRINT, MVT::v8f64,
1835 Subtarget.hasDQI() ? Legal : Custom);
1836 if (Subtarget.hasDQI())
1837 setOperationAction(ISD::LLRINT, MVT::v8f64, Legal);
1838
1839 for (MVT VT : { MVT::v16i1, MVT::v16i8 }) {
1844 }
1845
1846 for (MVT VT : { MVT::v16i16, MVT::v16i32 }) {
1851 }
1852
1857 setOperationAction(ISD::FP_EXTEND, MVT::v8f64, Custom);
1859
1871
1872 setTruncStoreAction(MVT::v8i64, MVT::v8i8, Legal);
1873 setTruncStoreAction(MVT::v8i64, MVT::v8i16, Legal);
1874 setTruncStoreAction(MVT::v8i64, MVT::v8i32, Legal);
1875 setTruncStoreAction(MVT::v16i32, MVT::v16i8, Legal);
1876 setTruncStoreAction(MVT::v16i32, MVT::v16i16, Legal);
1877 if (HasBWI)
1878 setTruncStoreAction(MVT::v32i16, MVT::v32i8, Legal);
1879
1880 // With 512-bit vectors and no VLX, we prefer to widen MLOAD/MSTORE
1881 // to 512-bit rather than use the AVX2 instructions so that we can use
1882 // k-masks.
1883 if (!Subtarget.hasVLX()) {
1884 for (auto VT : {MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64,
1885 MVT::v4f32, MVT::v8f32, MVT::v2f64, MVT::v4f64}) {
1886 setOperationAction(ISD::MLOAD, VT, Custom);
1887 setOperationAction(ISD::MSTORE, VT, Custom);
1888 }
1889 }
1890
1892 setOperationAction(ISD::TRUNCATE, MVT::v16i16, Legal);
1893 setOperationAction(ISD::TRUNCATE, MVT::v32i8, HasBWI ? Legal : Custom);
1903
1904 if (HasBWI) {
1905 // Extends from v64i1 masks to 512-bit vectors.
1909 }
1910
1911 for (auto VT : { MVT::v16f32, MVT::v8f64 }) {
1912 setOperationAction(ISD::FFLOOR, VT, Legal);
1914 setOperationAction(ISD::FCEIL, VT, Legal);
1916 setOperationAction(ISD::FTRUNC, VT, Legal);
1918 setOperationAction(ISD::FRINT, VT, Legal);
1920 setOperationAction(ISD::FNEARBYINT, VT, Legal);
1922 setOperationAction(ISD::FROUNDEVEN, VT, Legal);
1924
1925 setOperationAction(ISD::FROUND, VT, Custom);
1926 }
1927
1928 for (auto VT : {MVT::v32i16, MVT::v16i32, MVT::v8i64}) {
1931 }
1932
1933 setOperationAction(ISD::ADD, MVT::v32i16, HasBWI ? Legal : Custom);
1934 setOperationAction(ISD::SUB, MVT::v32i16, HasBWI ? Legal : Custom);
1935 setOperationAction(ISD::ADD, MVT::v64i8, HasBWI ? Legal : Custom);
1936 setOperationAction(ISD::SUB, MVT::v64i8, HasBWI ? Legal : Custom);
1937
1938 setOperationAction(ISD::MUL, MVT::v8i64, Custom);
1939 setOperationAction(ISD::MUL, MVT::v16i32, Legal);
1940 setOperationAction(ISD::MUL, MVT::v32i16, HasBWI ? Legal : Custom);
1941 setOperationAction(ISD::MUL, MVT::v64i8, Custom);
1942
1943 setOperationAction(ISD::MULHU, MVT::v16i32, Custom);
1944 setOperationAction(ISD::MULHS, MVT::v16i32, Custom);
1945 setOperationAction(ISD::MULHS, MVT::v32i16, HasBWI ? Legal : Custom);
1946 setOperationAction(ISD::MULHU, MVT::v32i16, HasBWI ? Legal : Custom);
1947 setOperationAction(ISD::MULHS, MVT::v64i8, Custom);
1948 setOperationAction(ISD::MULHU, MVT::v64i8, Custom);
1949 setOperationAction(ISD::AVGCEILU, MVT::v32i16, HasBWI ? Legal : Custom);
1950 setOperationAction(ISD::AVGCEILU, MVT::v64i8, HasBWI ? Legal : Custom);
1951
1952 setOperationAction(ISD::SMULO, MVT::v64i8, Custom);
1953 setOperationAction(ISD::UMULO, MVT::v64i8, Custom);
1954
1955 for (auto VT : { MVT::v64i8, MVT::v32i16, MVT::v16i32, MVT::v8i64 }) {
1965
1966 // The condition codes aren't legal in SSE/AVX and under AVX512 we use
1967 // setcc all the way to isel and prefer SETGT in some isel patterns.
1970 }
1971
1972 setOperationAction(ISD::SETCC, MVT::v8f64, Custom);
1973 setOperationAction(ISD::SETCC, MVT::v16f32, Custom);
1978
1979 for (auto VT : { MVT::v16i32, MVT::v8i64 }) {
1986 }
1987
1988 for (auto VT : { MVT::v64i8, MVT::v32i16 }) {
1989 setOperationAction(ISD::ABS, VT, HasBWI ? Legal : Custom);
1990 setOperationAction(ISD::CTPOP, VT, Subtarget.hasBITALG() ? Legal : Custom);
1992 setOperationAction(ISD::SMAX, VT, HasBWI ? Legal : Custom);
1993 setOperationAction(ISD::UMAX, VT, HasBWI ? Legal : Custom);
1994 setOperationAction(ISD::SMIN, VT, HasBWI ? Legal : Custom);
1995 setOperationAction(ISD::UMIN, VT, HasBWI ? Legal : Custom);
2000 }
2001
2002 setOperationAction(ISD::FSHL, MVT::v64i8, Custom);
2003 setOperationAction(ISD::FSHR, MVT::v64i8, Custom);
2004 setOperationAction(ISD::FSHL, MVT::v32i16, Custom);
2005 setOperationAction(ISD::FSHR, MVT::v32i16, Custom);
2006 setOperationAction(ISD::FSHL, MVT::v16i32, Custom);
2007 setOperationAction(ISD::FSHR, MVT::v16i32, Custom);
2008
2009 if (Subtarget.hasDQI() || Subtarget.hasFP16())
2013 setOperationAction(Opc, MVT::v8i64, Custom);
2014
2015 if (Subtarget.hasDQI())
2016 setOperationAction(ISD::MUL, MVT::v8i64, Legal);
2017
2018 if (Subtarget.hasCDI()) {
2019 // NonVLX sub-targets extend 128/256 vectors to use the 512 version.
2020 for (auto VT : { MVT::v16i32, MVT::v8i64} ) {
2022 }
2023 } // Subtarget.hasCDI()
2024
2025 if (Subtarget.hasVPOPCNTDQ()) {
2026 for (auto VT : { MVT::v16i32, MVT::v8i64 })
2028 }
2029
2030 // Extract subvector is special because the value type
2031 // (result) is 256-bit but the source is 512-bit wide.
2032 // 128-bit was made Legal under AVX1.
2033 for (auto VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64,
2034 MVT::v16f16, MVT::v8f32, MVT::v4f64 })
2036
2037 for (auto VT : { MVT::v64i8, MVT::v32i16, MVT::v16i32, MVT::v8i64,
2038 MVT::v32f16, MVT::v16f32, MVT::v8f64 }) {
2048 }
2049 setF16Action(MVT::v32f16, Expand);
2052 setOperationAction(ISD::FP_EXTEND, MVT::v16f32, Custom);
2054 for (unsigned Opc : {ISD::FADD, ISD::FSUB, ISD::FMUL, ISD::FDIV})
2055 setOperationPromotedToType(Opc, MVT::v32f16, MVT::v32f32);
2056 setOperationAction(ISD::SETCC, MVT::v32f16, Custom);
2057
2058 for (auto VT : { MVT::v16i32, MVT::v8i64, MVT::v16f32, MVT::v8f64 }) {
2059 setOperationAction(ISD::MLOAD, VT, Legal);
2060 setOperationAction(ISD::MSTORE, VT, Legal);
2061 setOperationAction(ISD::MGATHER, VT, Custom);
2062 setOperationAction(ISD::MSCATTER, VT, Custom);
2063 }
2064 if (HasBWI) {
2065 for (auto VT : { MVT::v64i8, MVT::v32i16 }) {
2066 setOperationAction(ISD::MLOAD, VT, Legal);
2067 setOperationAction(ISD::MSTORE, VT, Legal);
2068 }
2069 } else {
2070 setOperationAction(ISD::STORE, MVT::v32i16, Custom);
2071 setOperationAction(ISD::STORE, MVT::v64i8, Custom);
2072 }
2073
2074 if (Subtarget.hasVBMI2()) {
2075 for (auto VT : {MVT::v32i16, MVT::v16i32, MVT::v8i64}) {
2078 }
2079
2080 setOperationAction(ISD::ROTL, MVT::v32i16, Custom);
2081 setOperationAction(ISD::ROTR, MVT::v32i16, Custom);
2082 }
2083
2084 setOperationAction(ISD::FNEG, MVT::v32f16, Custom);
2085 setOperationAction(ISD::FABS, MVT::v32f16, Custom);
2087 }// useAVX512Regs
2088
2089 if (!Subtarget.useSoftFloat() && Subtarget.hasVBMI2()) {
2090 for (auto VT : {MVT::v8i16, MVT::v4i32, MVT::v2i64, MVT::v16i16, MVT::v8i32,
2091 MVT::v4i64}) {
2094 }
2095 }
2096
2097 // This block controls legalization for operations that don't have
2098 // pre-AVX512 equivalents. Without VLX we use 512-bit operations for
2099 // narrower widths.
2100 if (!Subtarget.useSoftFloat() && Subtarget.hasAVX512()) {
2101 // These operations are handled on non-VLX by artificially widening in
2102 // isel patterns.
2103
2107
2108 if (Subtarget.hasDQI()) {
2109 // Fast v2f32 SINT_TO_FP( v2i64 ) custom conversion.
2110 // v2f32 UINT_TO_FP is already custom under SSE2.
2113 "Unexpected operation action!");
2114 // v2i64 FP_TO_S/UINT(v2f32) custom conversion.
2119 }
2120
2121 for (auto VT : { MVT::v2i64, MVT::v4i64 }) {
2127 }
2128
2129 for (auto VT : { MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64 }) {
2132 }
2133
2134 // Custom legalize 2x32 to get a little better code.
2135 setOperationAction(ISD::MSCATTER, MVT::v2f32, Custom);
2136 setOperationAction(ISD::MSCATTER, MVT::v2i32, Custom);
2137
2138 for (auto VT : { MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64,
2139 MVT::v4f32, MVT::v8f32, MVT::v2f64, MVT::v4f64 })
2140 setOperationAction(ISD::MSCATTER, VT, Custom);
2141
2142 if (Subtarget.hasDQI()) {
2146 setOperationAction(Opc, MVT::v2i64, Custom);
2147 setOperationAction(Opc, MVT::v4i64, Custom);
2148 }
2149 setOperationAction(ISD::MUL, MVT::v2i64, Legal);
2150 setOperationAction(ISD::MUL, MVT::v4i64, Legal);
2151 }
2152
2153 if (Subtarget.hasCDI()) {
2154 for (auto VT : { MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64 }) {
2156 }
2157 } // Subtarget.hasCDI()
2158
2159 if (Subtarget.hasVPOPCNTDQ()) {
2160 for (auto VT : { MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64 })
2162 }
2163
2164 // We can try to convert vectors to different sizes to leverage legal
2165 // `vpcompress` cases. So we mark these supported vector sizes as Custom and
2166 // then specialize to Legal below.
2167 for (MVT VT : {MVT::v8i32, MVT::v8f32, MVT::v4i32, MVT::v4f32, MVT::v4i64,
2168 MVT::v4f64, MVT::v2i64, MVT::v2f64, MVT::v16i8, MVT::v8i16,
2169 MVT::v16i16, MVT::v8i8})
2171
2172 // Legal vpcompress depends on various AVX512 extensions.
2173 // Legal in AVX512F
2174 for (MVT VT : {MVT::v16i32, MVT::v16f32, MVT::v8i64, MVT::v8f64})
2176
2177 // Legal in AVX512F + AVX512VL
2178 if (Subtarget.hasVLX())
2179 for (MVT VT : {MVT::v8i32, MVT::v8f32, MVT::v4i32, MVT::v4f32, MVT::v4i64,
2180 MVT::v4f64, MVT::v2i64, MVT::v2f64})
2182
2183 // Legal in AVX512F + AVX512VBMI2
2184 if (Subtarget.hasVBMI2())
2185 for (MVT VT : {MVT::v32i16, MVT::v64i8})
2187
2188 // Legal in AVX512F + AVX512VL + AVX512VBMI2
2189 if (Subtarget.hasVBMI2() && Subtarget.hasVLX())
2190 for (MVT VT : {MVT::v16i8, MVT::v8i16, MVT::v32i8, MVT::v16i16})
2192 }
2193
2194 // This block controls legalization of v32i1/v64i1, which are available with
2195 // AVX512BW.
2196 if (!Subtarget.useSoftFloat() && Subtarget.hasBWI()) {
2197 addRegisterClass(MVT::v32i1, &X86::VK32RegClass);
2198 addRegisterClass(MVT::v64i1, &X86::VK64RegClass);
2199
2200 for (auto VT : { MVT::v32i1, MVT::v64i1 }) {
2211 }
2212
2213 for (auto VT : { MVT::v16i1, MVT::v32i1 })
2215
2216 // Extends from v32i1 masks to 256-bit vectors.
2220
2221 for (auto VT : {MVT::v32i8, MVT::v16i8, MVT::v16i16, MVT::v8i16,
2222 MVT::v16f16, MVT::v8f16}) {
2223 setOperationAction(ISD::MLOAD, VT, Subtarget.hasVLX() ? Legal : Custom);
2224 setOperationAction(ISD::MSTORE, VT, Subtarget.hasVLX() ? Legal : Custom);
2225 }
2226
2227 // These operations are handled on non-VLX by artificially widening in
2228 // isel patterns.
2229 // TODO: Custom widen in lowering on non-VLX and drop the isel patterns?
2230
2231 if (Subtarget.hasBITALG()) {
2232 for (auto VT : { MVT::v16i8, MVT::v32i8, MVT::v8i16, MVT::v16i16 })
2234 }
2235 }
2236
2237 if (!Subtarget.useSoftFloat() && Subtarget.hasFP16()) {
2238 auto setGroup = [&] (MVT VT) {
2247 setOperationAction(ISD::FSQRT, VT, Legal);
2249
2250 setOperationAction(ISD::FFLOOR, VT, Legal);
2252 setOperationAction(ISD::FCEIL, VT, Legal);
2254 setOperationAction(ISD::FTRUNC, VT, Legal);
2256 setOperationAction(ISD::FRINT, VT, Legal);
2258 setOperationAction(ISD::FNEARBYINT, VT, Legal);
2260 setOperationAction(ISD::FROUNDEVEN, VT, Legal);
2262
2263 setOperationAction(ISD::FROUND, VT, Custom);
2264
2265 setOperationAction(ISD::LOAD, VT, Legal);
2266 setOperationAction(ISD::STORE, VT, Legal);
2267
2273
2274 setOperationAction(ISD::FNEG, VT, Custom);
2275 setOperationAction(ISD::FABS, VT, Custom);
2279
2283 };
2284
2285 // AVX512_FP16 scalar operations
2286 setGroup(MVT::f16);
2290 setOperationAction(ISD::BR_CC, MVT::f16, Expand);
2292 setOperationAction(ISD::FROUNDEVEN, MVT::f16, Legal);
2296 setOperationAction(ISD::FMAXIMUM, MVT::f16, Custom);
2297 setOperationAction(ISD::FMINIMUM, MVT::f16, Custom);
2298 setOperationAction(ISD::FMAXIMUMNUM, MVT::f16, Custom);
2299 setOperationAction(ISD::FMINIMUMNUM, MVT::f16, Custom);
2300 setOperationAction(ISD::FP_EXTEND, MVT::f32, Legal);
2302 setOperationAction(ISD::LRINT, MVT::f16, Legal);
2303 setOperationAction(ISD::LLRINT, MVT::f16, Legal);
2304
2307
2308 if (Subtarget.useAVX512Regs()) {
2309 setGroup(MVT::v32f16);
2315 setOperationAction(ISD::FP_ROUND, MVT::v16f16, Legal);
2317 setOperationAction(ISD::FP_EXTEND, MVT::v16f32, Custom);
2319 setOperationAction(ISD::FP_EXTEND, MVT::v8f64, Custom);
2322
2327 setOperationPromotedToType(ISD::FP_TO_SINT, MVT::v32i8, MVT::v32i16);
2329 MVT::v32i16);
2330 setOperationPromotedToType(ISD::FP_TO_UINT, MVT::v32i8, MVT::v32i16);
2332 MVT::v32i16);
2333 setOperationPromotedToType(ISD::FP_TO_SINT, MVT::v32i1, MVT::v32i16);
2335 MVT::v32i16);
2336 setOperationPromotedToType(ISD::FP_TO_UINT, MVT::v32i1, MVT::v32i16);
2338 MVT::v32i16);
2339
2343
2344 setLoadExtAction(ISD::EXTLOAD, MVT::v8f64, MVT::v8f16, Legal);
2345 setLoadExtAction(ISD::EXTLOAD, MVT::v16f32, MVT::v16f16, Legal);
2346
2347 setOperationAction(ISD::FMINIMUM, MVT::v32f16, Custom);
2348 setOperationAction(ISD::FMAXIMUM, MVT::v32f16, Custom);
2349 setOperationAction(ISD::FMINIMUMNUM, MVT::v32f16, Custom);
2350 setOperationAction(ISD::FMAXIMUMNUM, MVT::v32f16, Custom);
2351 setOperationAction(ISD::LRINT, MVT::v32f16, Legal);
2352 setOperationAction(ISD::LLRINT, MVT::v8f16, Legal);
2353 }
2354
2359
2360 if (Subtarget.hasVLX()) {
2361 setGroup(MVT::v8f16);
2362 setGroup(MVT::v16f16);
2363
2374
2377 setOperationAction(ISD::FP_EXTEND, MVT::v8f32, Custom);
2379 setOperationAction(ISD::FP_EXTEND, MVT::v4f64, Custom);
2381
2382 // INSERT_VECTOR_ELT v8f16 extended to VECTOR_SHUFFLE
2385
2389
2390 setLoadExtAction(ISD::EXTLOAD, MVT::v4f64, MVT::v4f16, Legal);
2391 setLoadExtAction(ISD::EXTLOAD, MVT::v2f64, MVT::v2f16, Legal);
2392 setLoadExtAction(ISD::EXTLOAD, MVT::v8f32, MVT::v8f16, Legal);
2393 setLoadExtAction(ISD::EXTLOAD, MVT::v4f32, MVT::v4f16, Legal);
2394
2395 // Need to custom widen these to prevent scalarization.
2396 setOperationAction(ISD::LOAD, MVT::v4f16, Custom);
2397 setOperationAction(ISD::STORE, MVT::v4f16, Custom);
2398
2399 setOperationAction(ISD::FMINIMUM, MVT::v8f16, Custom);
2400 setOperationAction(ISD::FMAXIMUM, MVT::v8f16, Custom);
2401 setOperationAction(ISD::FMINIMUMNUM, MVT::v8f16, Custom);
2402 setOperationAction(ISD::FMAXIMUMNUM, MVT::v8f16, Custom);
2403
2404 setOperationAction(ISD::FMINIMUM, MVT::v16f16, Custom);
2405 setOperationAction(ISD::FMAXIMUM, MVT::v16f16, Custom);
2406 setOperationAction(ISD::FMINIMUMNUM, MVT::v16f16, Custom);
2407 setOperationAction(ISD::FMAXIMUMNUM, MVT::v16f16, Custom);
2408 setOperationAction(ISD::LRINT, MVT::v8f16, Legal);
2409 setOperationAction(ISD::LRINT, MVT::v16f16, Legal);
2410 }
2411 }
2412
2413 if (!Subtarget.useSoftFloat() &&
2414 (Subtarget.hasAVXNECONVERT() || Subtarget.hasBF16())) {
2415 addRegisterClass(MVT::v8bf16, Subtarget.hasAVX512() ? &X86::VR128XRegClass
2416 : &X86::VR128RegClass);
2417 addRegisterClass(MVT::v16bf16, Subtarget.hasAVX512() ? &X86::VR256XRegClass
2418 : &X86::VR256RegClass);
2419 // We set the type action of bf16 to TypeSoftPromoteHalf, but we don't
2420 // provide the method to promote BUILD_VECTOR and INSERT_VECTOR_ELT.
2421 // Set the operation action Custom to do the customization later.
2424 for (auto VT : {MVT::v8bf16, MVT::v16bf16}) {
2425 setF16Action(VT, Expand);
2426 if (!Subtarget.hasBF16())
2432 }
2433 for (unsigned Opc : {ISD::FADD, ISD::FSUB, ISD::FMUL, ISD::FDIV}) {
2434 setOperationPromotedToType(Opc, MVT::v8bf16, MVT::v8f32);
2435 setOperationPromotedToType(Opc, MVT::v16bf16, MVT::v16f32);
2436 }
2437 setOperationAction(ISD::SETCC, MVT::v8bf16, Custom);
2438 setOperationAction(ISD::SETCC, MVT::v16bf16, Custom);
2440 addLegalFPImmediate(APFloat::getZero(APFloat::BFloat()));
2441 }
2442
2443 if (!Subtarget.useSoftFloat() && Subtarget.hasBF16() &&
2444 Subtarget.useAVX512Regs()) {
2445 addRegisterClass(MVT::v32bf16, &X86::VR512RegClass);
2446 setF16Action(MVT::v32bf16, Expand);
2447 for (unsigned Opc : {ISD::FADD, ISD::FSUB, ISD::FMUL, ISD::FDIV})
2448 setOperationPromotedToType(Opc, MVT::v32bf16, MVT::v32f32);
2449 setOperationAction(ISD::SETCC, MVT::v32bf16, Custom);
2451 setOperationAction(ISD::FP_ROUND, MVT::v16bf16, Custom);
2455 }
2456
2457 if (!Subtarget.useSoftFloat() && Subtarget.hasAVX10_2()) {
2458 setOperationAction(ISD::FADD, MVT::v32bf16, Legal);
2459 setOperationAction(ISD::FSUB, MVT::v32bf16, Legal);
2460 setOperationAction(ISD::FMUL, MVT::v32bf16, Legal);
2461 setOperationAction(ISD::FDIV, MVT::v32bf16, Legal);
2462 setOperationAction(ISD::FSQRT, MVT::v32bf16, Legal);
2463 setOperationAction(ISD::FMA, MVT::v32bf16, Legal);
2464 setOperationAction(ISD::SETCC, MVT::v32bf16, Custom);
2465 setOperationAction(ISD::FMINIMUM, MVT::v32bf16, Custom);
2466 setOperationAction(ISD::FMAXIMUM, MVT::v32bf16, Custom);
2467 setOperationAction(ISD::FMINIMUMNUM, MVT::v32bf16, Custom);
2468 setOperationAction(ISD::FMAXIMUMNUM, MVT::v32bf16, Custom);
2469 for (auto VT : {MVT::v8bf16, MVT::v16bf16}) {
2474 setOperationAction(ISD::FSQRT, VT, Legal);
2477 setOperationAction(ISD::FMINIMUM, VT, Custom);
2478 setOperationAction(ISD::FMAXIMUM, VT, Custom);
2479 setOperationAction(ISD::FMINIMUMNUM, VT, Custom);
2480 setOperationAction(ISD::FMAXIMUMNUM, VT, Custom);
2481 }
2482 for (auto VT : {MVT::f16, MVT::f32, MVT::f64}) {
2485 }
2486 }
2487
2488 if (!Subtarget.useSoftFloat() && Subtarget.hasVLX()) {
2489 setTruncStoreAction(MVT::v4i64, MVT::v4i8, Legal);
2490 setTruncStoreAction(MVT::v4i64, MVT::v4i16, Legal);
2491 setTruncStoreAction(MVT::v4i64, MVT::v4i32, Legal);
2492 setTruncStoreAction(MVT::v8i32, MVT::v8i8, Legal);
2493 setTruncStoreAction(MVT::v8i32, MVT::v8i16, Legal);
2494
2495 setTruncStoreAction(MVT::v2i64, MVT::v2i8, Legal);
2496 setTruncStoreAction(MVT::v2i64, MVT::v2i16, Legal);
2497 setTruncStoreAction(MVT::v2i64, MVT::v2i32, Legal);
2498 setTruncStoreAction(MVT::v4i32, MVT::v4i8, Legal);
2499 setTruncStoreAction(MVT::v4i32, MVT::v4i16, Legal);
2500
2501 if (Subtarget.hasBWI()) {
2502 setTruncStoreAction(MVT::v16i16, MVT::v16i8, Legal);
2503 setTruncStoreAction(MVT::v8i16, MVT::v8i8, Legal);
2504 }
2505
2506 if (Subtarget.hasFP16()) {
2507 // vcvttph2[u]dq v4f16 -> v4i32/64, v2f16 -> v2i32/64
2516 // vcvt[u]dq2ph v4i32/64 -> v4f16, v2i32/64 -> v2f16
2525 // vcvtps2phx v4f32 -> v4f16, v2f32 -> v2f16
2530 // vcvtph2psx v4f16 -> v4f32, v2f16 -> v2f32
2531 setOperationAction(ISD::FP_EXTEND, MVT::v2f16, Custom);
2533 setOperationAction(ISD::FP_EXTEND, MVT::v4f16, Custom);
2535 }
2536 }
2537
2538 if (!Subtarget.useSoftFloat() && Subtarget.hasAMXTILE()) {
2539 addRegisterClass(MVT::x86amx, &X86::TILERegClass);
2540 }
2541
2542 // We want to custom lower some of our intrinsics.
2546 if (!Subtarget.is64Bit()) {
2548 }
2549
2550 // Only custom-lower 64-bit SADDO and friends on 64-bit because we don't
2551 // handle type legalization for these operations here.
2552 //
2553 // FIXME: We really should do custom legalization for addition and
2554 // subtraction on x86-32 once PR3203 is fixed. We really can't do much better
2555 // than generic legalization for 64-bit multiplication-with-overflow, though.
2556 for (auto VT : { MVT::i8, MVT::i16, MVT::i32, MVT::i64 }) {
2557 if (VT == MVT::i64 && !Subtarget.is64Bit())
2558 continue;
2559 // Add/Sub/Mul with overflow operations are custom lowered.
2566
2567 // Support carry in as value rather than glue.
2573 }
2574
2575 // Combine sin / cos into _sincos_stret if it is available.
2576 if (getLibcallName(RTLIB::SINCOS_STRET_F32) != nullptr &&
2577 getLibcallName(RTLIB::SINCOS_STRET_F64) != nullptr) {
2578 setOperationAction(ISD::FSINCOS, MVT::f64, Custom);
2579 setOperationAction(ISD::FSINCOS, MVT::f32, Custom);
2580 }
2581
2582 if (Subtarget.isTargetWin64()) {
2583 setOperationAction(ISD::SDIV, MVT::i128, Custom);
2584 setOperationAction(ISD::UDIV, MVT::i128, Custom);
2585 setOperationAction(ISD::SREM, MVT::i128, Custom);
2586 setOperationAction(ISD::UREM, MVT::i128, Custom);
2595 }
2596
2597 // On 32 bit MSVC, `fmodf(f32)` is not defined - only `fmod(f64)`
2598 // is. We should promote the value to 64-bits to solve this.
2599 // This is what the CRT headers do - `fmodf` is an inline header
2600 // function casting to f64 and calling `fmod`.
2601 if (Subtarget.is32Bit() &&
2602 (Subtarget.isTargetWindowsMSVC() || Subtarget.isTargetWindowsItanium()))
2603 // clang-format off
2604 for (ISD::NodeType Op :
2605 {ISD::FACOS, ISD::STRICT_FACOS,
2606 ISD::FASIN, ISD::STRICT_FASIN,
2607 ISD::FATAN, ISD::STRICT_FATAN,
2608 ISD::FATAN2, ISD::STRICT_FATAN2,
2609 ISD::FCEIL, ISD::STRICT_FCEIL,
2610 ISD::FCOS, ISD::STRICT_FCOS,
2611 ISD::FCOSH, ISD::STRICT_FCOSH,
2612 ISD::FEXP, ISD::STRICT_FEXP,
2613 ISD::FFLOOR, ISD::STRICT_FFLOOR,
2615 ISD::FLOG, ISD::STRICT_FLOG,
2616 ISD::FLOG10, ISD::STRICT_FLOG10,
2617 ISD::FPOW, ISD::STRICT_FPOW,
2618 ISD::FSIN, ISD::STRICT_FSIN,
2619 ISD::FSINH, ISD::STRICT_FSINH,
2620 ISD::FTAN, ISD::STRICT_FTAN,
2621 ISD::FTANH, ISD::STRICT_FTANH,
2622 // TODO: Add ISD::STRICT_FMODF too once implemented.
2623 ISD::FMODF})
2624 if (isOperationExpand(Op, MVT::f32))
2625 setOperationAction(Op, MVT::f32, Promote);
2626 // clang-format on
2627
2628 // On MSVC, both 32-bit and 64-bit, ldexpf(f32) is not defined. MinGW has
2629 // it, but it's just a wrapper around ldexp.
2630 if (Subtarget.isOSWindows()) {
2631 for (ISD::NodeType Op : {ISD::FLDEXP, ISD::STRICT_FLDEXP, ISD::FFREXP})
2632 if (isOperationExpand(Op, MVT::f32))
2633 setOperationAction(Op, MVT::f32, Promote);
2634 }
2635
2636 // We have target-specific dag combine patterns for the following nodes:
2644 ISD::BITCAST,
2647 ISD::SHL,
2648 ISD::SRA,
2649 ISD::SRL,
2650 ISD::OR,
2651 ISD::AND,
2657 ISD::ADD,
2658 ISD::FADD,
2659 ISD::FSUB,
2660 ISD::FNEG,
2661 ISD::FMA,
2663 ISD::FMINNUM,
2664 ISD::FMAXNUM,
2665 ISD::SUB,
2666 ISD::LOAD,
2667 ISD::LRINT,
2668 ISD::LLRINT,
2669 ISD::MLOAD,
2670 ISD::STORE,
2671 ISD::MSTORE,
2687 ISD::SETCC,
2688 ISD::MUL,
2689 ISD::XOR,
2690 ISD::MSCATTER,
2691 ISD::MGATHER,
2692 ISD::FP16_TO_FP,
2693 ISD::FP_EXTEND,
2700
2701 computeRegisterProperties(Subtarget.getRegisterInfo());
2702
2703 MaxStoresPerMemset = 16; // For @llvm.memset -> sequence of stores
2705 MaxStoresPerMemcpy = 8; // For @llvm.memcpy -> sequence of stores
2707 MaxStoresPerMemmove = 8; // For @llvm.memmove -> sequence of stores
2709
2710 // TODO: These control memcmp expansion in CGP and could be raised higher, but
2711 // that needs to be benchmarked and balanced with the potential use of vector
2712 // load/store types (PR33329, PR33914).
2715
2716 // Default loop alignment, which can be overridden by -align-loops.
2718
2719 // An out-of-order CPU can speculatively execute past a predictable branch,
2720 // but a conditional move could be stalled by an expensive earlier operation.
2721 PredictableSelectIsExpensive = Subtarget.getSchedModel().isOutOfOrder();
2722 EnableExtLdPromotion = true;
2724
2726
2727 // Default to having -disable-strictnode-mutation on
2728 IsStrictFPEnabled = true;
2729}
2730
2731// This has so far only been implemented for 64-bit MachO.
2733 return Subtarget.isTargetMachO() && Subtarget.is64Bit();
2734}
2735
2737 // Currently only MSVC CRTs XOR the frame pointer into the stack guard value.
2738 return Subtarget.getTargetTriple().isOSMSVCRT() && !Subtarget.isTargetMachO();
2739}
2740
2742 const SDLoc &DL) const {
2743 EVT PtrTy = getPointerTy(DAG.getDataLayout());
2744 unsigned XorOp = Subtarget.is64Bit() ? X86::XOR64_FP : X86::XOR32_FP;
2745 MachineSDNode *Node = DAG.getMachineNode(XorOp, DL, PtrTy, Val);
2746 return SDValue(Node, 0);
2747}
2748
2751 if ((VT == MVT::v32i1 || VT == MVT::v64i1) && Subtarget.hasAVX512() &&
2752 !Subtarget.hasBWI())
2753 return TypeSplitVector;
2754
2755 // Since v8f16 is legal, widen anything over v4f16.
2756 if (!VT.isScalableVector() && VT.getVectorNumElements() != 1 &&
2757 VT.getVectorNumElements() <= 4 && !Subtarget.hasF16C() &&
2758 VT.getVectorElementType() == MVT::f16)
2759 return TypeSplitVector;
2760
2761 if (!VT.isScalableVector() && VT.getVectorNumElements() != 1 &&
2762 VT.getVectorElementType() != MVT::i1)
2763 return TypeWidenVector;
2764
2766}
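// Illustrative examples of the policy above (derived directly from the checks,
// not exhaustive): with AVX512F but no BWI, v32i1/v64i1 are split; without
// F16C support, a v4f16 vector is split rather than widened; and a v3i32
// vector falls through to TypeWidenVector.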
2767
2768FastISel *
2770 const TargetLibraryInfo *libInfo) const {
2771 return X86::createFastISel(funcInfo, libInfo);
2772}
2773
2774//===----------------------------------------------------------------------===//
2775// Other Lowering Hooks
2776//===----------------------------------------------------------------------===//
2777
2779 bool AssumeSingleUse) {
2780 if (!AssumeSingleUse && !Op.hasOneUse())
2781 return false;
2782 if (!ISD::isNormalLoad(Op.getNode()))
2783 return false;
2784
2785 // If this is an unaligned vector, make sure the target supports folding it.
2786 auto *Ld = cast<LoadSDNode>(Op.getNode());
2787 if (!Subtarget.hasAVX() && !Subtarget.hasSSEUnalignedMem() &&
2788 Ld->getValueSizeInBits(0) == 128 && Ld->getAlign() < Align(16))
2789 return false;
2790
2791 // TODO: If this is a non-temporal load and the target has an instruction
2792 // for it, it should not be folded. See "useNonTemporalLoad()".
2793
2794 return true;
2795}
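// Worked example (illustrative): a single-use, naturally aligned v4f32 load
// feeding an arithmetic op can usually be folded into the instruction's memory
// operand, whereas on pre-AVX targets without fast unaligned memory access a
// 128-bit load with alignment < 16 is rejected by the check above.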
2796
2798 const X86Subtarget &Subtarget,
2799 bool AssumeSingleUse) {
2800 assert(Subtarget.hasAVX() && "Expected AVX for broadcast from memory");
2801 if (!X86::mayFoldLoad(Op, Subtarget, AssumeSingleUse))
2802 return false;
2803
2804 // We can not replace a wide volatile load with a broadcast-from-memory,
2805 // because that would narrow the load, which isn't legal for volatiles.
2806 auto *Ld = cast<LoadSDNode>(Op.getNode());
2807 return !Ld->isVolatile() ||
2808 Ld->getValueSizeInBits(0) == EltVT.getScalarSizeInBits();
2809}
2810
2812 if (!Op.hasOneUse())
2813 return false;
2814 // Peek through (oneuse) bitcast users
2815 SDNode *User = *Op->user_begin();
2816 while (User->getOpcode() == ISD::BITCAST) {
2817 if (!User->hasOneUse())
2818 return false;
2819 User = *User->user_begin();
2820 }
2821 return ISD::isNormalStore(User);
2822}
2823
2825 if (Op.hasOneUse()) {
2826 unsigned Opcode = Op.getNode()->user_begin()->getOpcode();
2827 return (ISD::ZERO_EXTEND == Opcode);
2828 }
2829 return false;
2830}
2831
2832static bool isLogicOp(unsigned Opcode) {
2833 // TODO: Add support for X86ISD::FAND/FOR/FXOR/FANDN with test coverage.
2834 return ISD::isBitwiseLogicOp(Opcode) || X86ISD::ANDNP == Opcode;
2835}
2836
2837static bool isTargetShuffle(unsigned Opcode) {
2838 switch(Opcode) {
2839 default: return false;
2840 case X86ISD::BLENDI:
2841 case X86ISD::PSHUFB:
2842 case X86ISD::PSHUFD:
2843 case X86ISD::PSHUFHW:
2844 case X86ISD::PSHUFLW:
2845 case X86ISD::SHUFP:
2846 case X86ISD::INSERTPS:
2847 case X86ISD::EXTRQI:
2848 case X86ISD::INSERTQI:
2849 case X86ISD::VALIGN:
2850 case X86ISD::PALIGNR:
2851 case X86ISD::VSHLDQ:
2852 case X86ISD::VSRLDQ:
2853 case X86ISD::MOVLHPS:
2854 case X86ISD::MOVHLPS:
2855 case X86ISD::MOVSHDUP:
2856 case X86ISD::MOVSLDUP:
2857 case X86ISD::MOVDDUP:
2858 case X86ISD::MOVSS:
2859 case X86ISD::MOVSD:
2860 case X86ISD::MOVSH:
2861 case X86ISD::UNPCKL:
2862 case X86ISD::UNPCKH:
2863 case X86ISD::VBROADCAST:
2864 case X86ISD::VPERMILPI:
2865 case X86ISD::VPERMILPV:
2866 case X86ISD::VPERM2X128:
2867 case X86ISD::SHUF128:
2868 case X86ISD::VPERMIL2:
2869 case X86ISD::VPERMI:
2870 case X86ISD::VPPERM:
2871 case X86ISD::VPERMV:
2872 case X86ISD::VPERMV3:
2873 case X86ISD::VZEXT_MOVL:
2874 return true;
2875 }
2876}
2877
2878static bool isTargetShuffleVariableMask(unsigned Opcode) {
2879 switch (Opcode) {
2880 default: return false;
2881 // Target Shuffles.
2882 case X86ISD::PSHUFB:
2883 case X86ISD::VPERMILPV:
2884 case X86ISD::VPERMIL2:
2885 case X86ISD::VPPERM:
2886 case X86ISD::VPERMV:
2887 case X86ISD::VPERMV3:
2888 return true;
2889 // 'Faux' Target Shuffles.
2890 case ISD::OR:
2891 case ISD::AND:
2892 case X86ISD::ANDNP:
2893 return true;
2894 }
2895}
2896
2899 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
2901 int ReturnAddrIndex = FuncInfo->getRAIndex();
2902
2903 if (ReturnAddrIndex == 0) {
2904 // Set up a frame object for the return address.
2905 unsigned SlotSize = RegInfo->getSlotSize();
2906 ReturnAddrIndex = MF.getFrameInfo().CreateFixedObject(SlotSize,
2907 -(int64_t)SlotSize,
2908 false);
2909 FuncInfo->setRAIndex(ReturnAddrIndex);
2910 }
2911
2912 return DAG.getFrameIndex(ReturnAddrIndex, getPointerTy(DAG.getDataLayout()));
2913}
2914
2916 bool HasSymbolicDisplacement) {
2917 // Offset should fit into 32 bit immediate field.
2918 if (!isInt<32>(Offset))
2919 return false;
2920
2921 // If we don't have a symbolic displacement - we don't have any extra
2922 // restrictions.
2923 if (!HasSymbolicDisplacement)
2924 return true;
2925
2926 // We can fold large offsets in the large code model because we always use
2927 // 64-bit offsets.
2928 if (CM == CodeModel::Large)
2929 return true;
2930
2931 // For the kernel code model we know that all objects reside in the negative half
2932 // of the 32-bit address space. We may not accept negative offsets, since they may
2933 // be just off, and we may accept pretty large positive ones.
2934 if (CM == CodeModel::Kernel)
2935 return Offset >= 0;
2936
2937 // For other non-large code models we assume that the last small object is 16MB
2938 // before the end of the 31-bit boundary. We may also accept pretty large negative
2939 // constants, knowing that all objects are in the positive half of the address
2940 // space.
2941 return Offset < 16 * 1024 * 1024;
2942}
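// Worked example (illustrative): with a symbolic displacement in the kernel
// code model, GlobalAddr + 0x1000 is acceptable (offset >= 0) while
// GlobalAddr - 8 is not; in the small code model both pass, as long as the
// offset stays below the 16MB bound checked above.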
2943
2944/// Return true if the condition is a signed comparison operation.
2945static bool isX86CCSigned(X86::CondCode X86CC) {
2946 switch (X86CC) {
2947 default:
2948 llvm_unreachable("Invalid integer condition!");
2949 case X86::COND_E:
2950 case X86::COND_NE:
2951 case X86::COND_B:
2952 case X86::COND_A:
2953 case X86::COND_BE:
2954 case X86::COND_AE:
2955 return false;
2956 case X86::COND_G:
2957 case X86::COND_GE:
2958 case X86::COND_L:
2959 case X86::COND_LE:
2960 return true;
2961 }
2962}
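// Example (illustrative): COND_L/COND_G test the sign/overflow flags and are
// therefore signed, while COND_B/COND_A test the carry flag and correspond to
// unsigned comparisons; COND_E/COND_NE are reported as unsigned here since
// equality does not depend on signedness.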
2963
2965 switch (SetCCOpcode) {
2966 // clang-format off
2967 default: llvm_unreachable("Invalid integer condition!");
2968 case ISD::SETEQ: return X86::COND_E;
2969 case ISD::SETGT: return X86::COND_G;
2970 case ISD::SETGE: return X86::COND_GE;
2971 case ISD::SETLT: return X86::COND_L;
2972 case ISD::SETLE: return X86::COND_LE;
2973 case ISD::SETNE: return X86::COND_NE;
2974 case ISD::SETULT: return X86::COND_B;
2975 case ISD::SETUGT: return X86::COND_A;
2976 case ISD::SETULE: return X86::COND_BE;
2977 case ISD::SETUGE: return X86::COND_AE;
2978 // clang-format on
2979 }
2980}
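// Usage sketch (illustrative): an IR comparison such as (icmp ult %a, %b)
// reaches here as ISD::SETULT and maps to X86::COND_B, so the eventual
// CMP + Jcc/SETcc sequence uses the unsigned "below" condition.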
2981
2982/// Do a one-to-one translation of a ISD::CondCode to the X86-specific
2983/// condition code, returning the condition code and the LHS/RHS of the
2984/// comparison to make.
2986 bool isFP, SDValue &LHS, SDValue &RHS,
2987 SelectionDAG &DAG) {
2988 if (!isFP) {
2990 if (SetCCOpcode == ISD::SETGT && RHSC->isAllOnes()) {
2991 // X > -1 -> X == 0, jump !sign.
2992 RHS = DAG.getConstant(0, DL, RHS.getValueType());
2993 return X86::COND_NS;
2994 }
2995 if (SetCCOpcode == ISD::SETLT && RHSC->isZero()) {
2996 // X < 0 -> X == 0, jump on sign.
2997 return X86::COND_S;
2998 }
2999 if (SetCCOpcode == ISD::SETGE && RHSC->isZero()) {
3000 // X >= 0 -> X == 0, jump on !sign.
3001 return X86::COND_NS;
3002 }
3003 if (SetCCOpcode == ISD::SETLT && RHSC->isOne()) {
3004 // X < 1 -> X <= 0
3005 RHS = DAG.getConstant(0, DL, RHS.getValueType());
3006 return X86::COND_LE;
3007 }
3008 }
3009
3010 return TranslateIntegerX86CC(SetCCOpcode);
3011 }
3012
3013 // First determine if it is required or is profitable to flip the operands.
3014
3015 // If LHS is a foldable load, but RHS is not, flip the condition.
3016 if (ISD::isNON_EXTLoad(LHS.getNode()) &&
3017 !ISD::isNON_EXTLoad(RHS.getNode())) {
3018 SetCCOpcode = getSetCCSwappedOperands(SetCCOpcode);
3019 std::swap(LHS, RHS);
3020 }
3021
3022 switch (SetCCOpcode) {
3023 default: break;
3024 case ISD::SETOLT:
3025 case ISD::SETOLE:
3026 case ISD::SETUGT:
3027 case ISD::SETUGE:
3028 std::swap(LHS, RHS);
3029 break;
3030 }
3031
3032 // On a floating point condition, the flags are set as follows:
3033 // ZF PF CF op
3034 // 0 | 0 | 0 | X > Y
3035 // 0 | 0 | 1 | X < Y
3036 // 1 | 0 | 0 | X == Y
3037 // 1 | 1 | 1 | unordered
3038 switch (SetCCOpcode) {
3039 // clang-format off
3040 default: llvm_unreachable("Condcode should be pre-legalized away");
3041 case ISD::SETUEQ:
3042 case ISD::SETEQ: return X86::COND_E;
3043 case ISD::SETOLT: // flipped
3044 case ISD::SETOGT:
3045 case ISD::SETGT: return X86::COND_A;
3046 case ISD::SETOLE: // flipped
3047 case ISD::SETOGE:
3048 case ISD::SETGE: return X86::COND_AE;
3049 case ISD::SETUGT: // flipped
3050 case ISD::SETULT:
3051 case ISD::SETLT: return X86::COND_B;
3052 case ISD::SETUGE: // flipped
3053 case ISD::SETULE:
3054 case ISD::SETLE: return X86::COND_BE;
3055 case ISD::SETONE:
3056 case ISD::SETNE: return X86::COND_NE;
3057 case ISD::SETUO: return X86::COND_P;
3058 case ISD::SETO: return X86::COND_NP;
3059 case ISD::SETOEQ:
3060 case ISD::SETUNE: return X86::COND_INVALID;
3061 // clang-format on
3062 }
3063}
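// Worked example (illustrative): for an ordered (setolt %x, %y) the operands
// are swapped above and COND_A (CF==0 && ZF==0) is used; because an unordered
// compare sets ZF=PF=CF=1, the result is correctly false when either input
// is NaN.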
3064
3065/// Is there a floating point cmov for the specific X86 condition code?
3066/// The current x86 ISA includes the following FP cmov instructions:
3067/// fcmovb, fcmovbe, fcmove, fcmovu, fcmovae, fcmova, fcmovne, fcmovnu.
3068static bool hasFPCMov(unsigned X86CC) {
3069 switch (X86CC) {
3070 default:
3071 return false;
3072 case X86::COND_B:
3073 case X86::COND_BE:
3074 case X86::COND_E:
3075 case X86::COND_P:
3076 case X86::COND_A:
3077 case X86::COND_AE:
3078 case X86::COND_NE:
3079 case X86::COND_NP:
3080 return true;
3081 }
3082}
3083
3084static bool useVPTERNLOG(const X86Subtarget &Subtarget, MVT VT) {
3085 return Subtarget.hasVLX() || Subtarget.canExtendTo512DQ() ||
3086 VT.is512BitVector();
3087}
3088
3090 const CallInst &I,
3091 MachineFunction &MF,
3092 unsigned Intrinsic) const {
3093 Info.flags = MachineMemOperand::MONone;
3094 Info.offset = 0;
3095
3097 if (!IntrData) {
3098 switch (Intrinsic) {
3099 case Intrinsic::x86_aesenc128kl:
3100 case Intrinsic::x86_aesdec128kl:
3101 Info.opc = ISD::INTRINSIC_W_CHAIN;
3102 Info.ptrVal = I.getArgOperand(1);
3103 Info.memVT = EVT::getIntegerVT(I.getType()->getContext(), 48);
3104 Info.align = Align(1);
3105 Info.flags |= MachineMemOperand::MOLoad;
3106 return true;
3107 case Intrinsic::x86_aesenc256kl:
3108 case Intrinsic::x86_aesdec256kl:
3109 Info.opc = ISD::INTRINSIC_W_CHAIN;
3110 Info.ptrVal = I.getArgOperand(1);
3111 Info.memVT = EVT::getIntegerVT(I.getType()->getContext(), 64);
3112 Info.align = Align(1);
3113 Info.flags |= MachineMemOperand::MOLoad;
3114 return true;
3115 case Intrinsic::x86_aesencwide128kl:
3116 case Intrinsic::x86_aesdecwide128kl:
3117 Info.opc = ISD::INTRINSIC_W_CHAIN;
3118 Info.ptrVal = I.getArgOperand(0);
3119 Info.memVT = EVT::getIntegerVT(I.getType()->getContext(), 48);
3120 Info.align = Align(1);
3121 Info.flags |= MachineMemOperand::MOLoad;
3122 return true;
3123 case Intrinsic::x86_aesencwide256kl:
3124 case Intrinsic::x86_aesdecwide256kl:
3125 Info.opc = ISD::INTRINSIC_W_CHAIN;
3126 Info.ptrVal = I.getArgOperand(0);
3127 Info.memVT = EVT::getIntegerVT(I.getType()->getContext(), 64);
3128 Info.align = Align(1);
3129 Info.flags |= MachineMemOperand::MOLoad;
3130 return true;
3131 case Intrinsic::x86_cmpccxadd32:
3132 case Intrinsic::x86_cmpccxadd64:
3133 case Intrinsic::x86_atomic_bts:
3134 case Intrinsic::x86_atomic_btc:
3135 case Intrinsic::x86_atomic_btr: {
3136 Info.opc = ISD::INTRINSIC_W_CHAIN;
3137 Info.ptrVal = I.getArgOperand(0);
3138 unsigned Size = I.getType()->getScalarSizeInBits();
3139 Info.memVT = EVT::getIntegerVT(I.getType()->getContext(), Size);
3140 Info.align = Align(Size);
3143 return true;
3144 }
3145 case Intrinsic::x86_atomic_bts_rm:
3146 case Intrinsic::x86_atomic_btc_rm:
3147 case Intrinsic::x86_atomic_btr_rm: {
3148 Info.opc = ISD::INTRINSIC_W_CHAIN;
3149 Info.ptrVal = I.getArgOperand(0);
3150 unsigned Size = I.getArgOperand(1)->getType()->getScalarSizeInBits();
3151 Info.memVT = EVT::getIntegerVT(I.getType()->getContext(), Size);
3152 Info.align = Align(Size);
3155 return true;
3156 }
3157 case Intrinsic::x86_aadd32:
3158 case Intrinsic::x86_aadd64:
3159 case Intrinsic::x86_aand32:
3160 case Intrinsic::x86_aand64:
3161 case Intrinsic::x86_aor32:
3162 case Intrinsic::x86_aor64:
3163 case Intrinsic::x86_axor32:
3164 case Intrinsic::x86_axor64:
3165 case Intrinsic::x86_atomic_add_cc:
3166 case Intrinsic::x86_atomic_sub_cc:
3167 case Intrinsic::x86_atomic_or_cc:
3168 case Intrinsic::x86_atomic_and_cc:
3169 case Intrinsic::x86_atomic_xor_cc: {
3170 Info.opc = ISD::INTRINSIC_W_CHAIN;
3171 Info.ptrVal = I.getArgOperand(0);
3172 unsigned Size = I.getArgOperand(1)->getType()->getScalarSizeInBits();
3173 Info.memVT = EVT::getIntegerVT(I.getType()->getContext(), Size);
3174 Info.align = Align(Size);
3177 return true;
3178 }
3179 }
3180 return false;
3181 }
3182
3183 switch (IntrData->Type) {
3186 case TRUNCATE_TO_MEM_VI32: {
3187 Info.opc = ISD::INTRINSIC_VOID;
3188 Info.ptrVal = I.getArgOperand(0);
3189 MVT VT = MVT::getVT(I.getArgOperand(1)->getType());
3191 if (IntrData->Type == TRUNCATE_TO_MEM_VI8)
3192 ScalarVT = MVT::i8;
3193 else if (IntrData->Type == TRUNCATE_TO_MEM_VI16)
3194 ScalarVT = MVT::i16;
3195 else if (IntrData->Type == TRUNCATE_TO_MEM_VI32)
3196 ScalarVT = MVT::i32;
3197
3198 Info.memVT = MVT::getVectorVT(ScalarVT, VT.getVectorNumElements());
3199 Info.align = Align(1);
3200 Info.flags |= MachineMemOperand::MOStore;
3201 break;
3202 }
3203 case GATHER:
3204 case GATHER_AVX2: {
3205 Info.opc = ISD::INTRINSIC_W_CHAIN;
3206 Info.ptrVal = nullptr;
3207 MVT DataVT = MVT::getVT(I.getType());
3208 MVT IndexVT = MVT::getVT(I.getArgOperand(2)->getType());
3209 unsigned NumElts = std::min(DataVT.getVectorNumElements(),
3210 IndexVT.getVectorNumElements());
3211 Info.memVT = MVT::getVectorVT(DataVT.getVectorElementType(), NumElts);
3212 Info.align = Align(1);
3213 Info.flags |= MachineMemOperand::MOLoad;
3214 break;
3215 }
3216 case SCATTER: {
3217 Info.opc = ISD::INTRINSIC_VOID;
3218 Info.ptrVal = nullptr;
3219 MVT DataVT = MVT::getVT(I.getArgOperand(3)->getType());
3220 MVT IndexVT = MVT::getVT(I.getArgOperand(2)->getType());
3221 unsigned NumElts = std::min(DataVT.getVectorNumElements(),
3222 IndexVT.getVectorNumElements());
3223 Info.memVT = MVT::getVectorVT(DataVT.getVectorElementType(), NumElts);
3224 Info.align = Align(1);
3225 Info.flags |= MachineMemOperand::MOStore;
3226 break;
3227 }
3228 default:
3229 return false;
3230 }
3231
3232 return true;
3233}
3234
3235/// Returns true if the target can instruction select the
3236/// specified FP immediate natively. If false, the legalizer will
3237/// materialize the FP immediate as a load from a constant pool.
3239 bool ForCodeSize) const {
3240 for (const APFloat &FPImm : LegalFPImmediates)
3241 if (Imm.bitwiseIsEqual(FPImm))
3242 return true;
3243 return false;
3244}
3245
3247 SDNode *Load, ISD::LoadExtType ExtTy, EVT NewVT,
3248 std::optional<unsigned> ByteOffset) const {
3249 assert(cast<LoadSDNode>(Load)->isSimple() && "illegal to narrow");
3250
3251 auto PeekThroughOneUserBitcasts = [](const SDNode *N) {
3252 while (N->getOpcode() == ISD::BITCAST && N->hasOneUse())
3253 N = *N->user_begin();
3254 return N;
3255 };
3256
3257 // "ELF Handling for Thread-Local Storage" specifies that R_X86_64_GOTTPOFF
3258 // relocation targets a movq or addq instruction: don't let the load shrink.
3259 SDValue BasePtr = cast<LoadSDNode>(Load)->getBasePtr();
3260 if (BasePtr.getOpcode() == X86ISD::WrapperRIP)
3261 if (const auto *GA = dyn_cast<GlobalAddressSDNode>(BasePtr.getOperand(0)))
3262 return GA->getTargetFlags() != X86II::MO_GOTTPOFF;
3263
3264 // If this is (1) an AVX vector load with (2) multiple uses and either (3) all
3265 // of those uses are extracted directly into a store, so the extract + store
3266 // can be store-folded, or (4) some use is consumed by a legal full-width
3267 // instruction, then it's probably not worth splitting the load.
3268 EVT VT = Load->getValueType(0);
3269 if ((VT.is256BitVector() || VT.is512BitVector()) &&
3270 !SDValue(Load, 0).hasOneUse()) {
3271 bool FullWidthUse = false;
3272 bool AllExtractStores = true;
3273 for (SDUse &Use : Load->uses()) {
3274 // Skip uses of the chain value. Result 0 of the node is the load value.
3275 if (Use.getResNo() != 0)
3276 continue;
3277
3278 const SDNode *User = PeekThroughOneUserBitcasts(Use.getUser());
3279
3280 // If this use is an extract + store, it's probably not worth splitting.
3281 if (User->getOpcode() == ISD::EXTRACT_SUBVECTOR &&
3282 all_of(User->uses(), [&](const SDUse &U) {
3283 const SDNode *Inner = PeekThroughOneUserBitcasts(U.getUser());
3284 return Inner->getOpcode() == ISD::STORE;
3285 }))
3286 continue;
3287
3288 AllExtractStores = false;
3289
3290 // If any use is a full width legal/target bin op, then assume it's legal
3291 // and won't split.
3292 if (isBinOp(User->getOpcode()) &&
3293 (isOperationLegal(User->getOpcode(), User->getValueType(0)) ||
3294 User->getOpcode() > ISD::BUILTIN_OP_END))
3295 FullWidthUse = true;
3296 }
3297
3298 if (AllExtractStores)
3299 return false;
3300
3301 // If we have a user that uses the full vector width, then this use is
3302 // only worth splitting if the offset isn't 0 (to avoid an
3303 // EXTRACT_SUBVECTOR) or we're loading a scalar integer.
3304 if (FullWidthUse)
3305 return (ByteOffset.value_or(0) > 0) || NewVT.isScalarInteger();
3306 }
3307
3308 return true;
3309}
3310
3311/// Returns true if it is beneficial to convert a load of a constant
3312/// to just the constant itself.
3314 Type *Ty) const {
3315 assert(Ty->isIntegerTy());
3316
3317 unsigned BitSize = Ty->getPrimitiveSizeInBits();
3318 if (BitSize == 0 || BitSize > 64)
3319 return false;
3320 return true;
3321}
3322
3324 // If we are using XMM registers in the ABI and the condition of the select is
3325 // a floating-point compare and we have blendv or conditional move, then it is
3326 // cheaper to select instead of doing a cross-register move and creating a
3327 // load that depends on the compare result.
3328 bool IsFPSetCC = CmpOpVT.isFloatingPoint() && CmpOpVT != MVT::f128;
3329 return !IsFPSetCC || !Subtarget.isTarget64BitLP64() || !Subtarget.hasAVX();
3330}
3331
3333 // TODO: It might be a win to ease or lift this restriction, but the generic
3334 // folds in DAGCombiner conflict with vector folds for an AVX512 target.
3335 if (VT.isVector() && Subtarget.hasAVX512())
3336 return false;
3337
3338 return true;
3339}
3340
3342 SDValue C) const {
3343 // TODO: We handle scalars using custom code, but generic combining could make
3344 // that unnecessary.
3345 APInt MulC;
3346 if (!ISD::isConstantSplatVector(C.getNode(), MulC))
3347 return false;
3348
3349 // Find the type this will be legalized to. Otherwise we might prematurely
3350 // convert this to shl+add/sub and then still have to type legalize those ops.
3351 // Another choice would be to defer the decision for illegal types until
3352 // after type legalization. But constant splat vectors of i64 can't make it
3353 // through type legalization on 32-bit targets so we would need to special
3354 // case vXi64.
3355 while (getTypeAction(Context, VT) != TypeLegal)
3356 VT = getTypeToTransformTo(Context, VT);
3357
3358 // If vector multiply is legal, assume that's faster than shl + add/sub.
3359 // Multiply is a complex op with higher latency and lower throughput in
3360 // most implementations, sub-vXi32 vector multiplies are always fast,
3361 // vXi32 mustn't have a SlowMULLD implementation, and anything larger (vXi64)
3362 // is always going to be slow.
3363 unsigned EltSizeInBits = VT.getScalarSizeInBits();
3364 if (isOperationLegal(ISD::MUL, VT) && EltSizeInBits <= 32 &&
3365 (EltSizeInBits != 32 || !Subtarget.isPMULLDSlow()))
3366 return false;
3367
3368 // shl+add, shl+sub, shl+add+neg
3369 return (MulC + 1).isPowerOf2() || (MulC - 1).isPowerOf2() ||
3370 (1 - MulC).isPowerOf2() || (-(MulC + 1)).isPowerOf2();
3371}
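// Worked example (illustrative): a splat multiply by 9 satisfies
// (MulC - 1).isPowerOf2(), so it can be decomposed as (x << 3) + x; a multiply
// by 7 satisfies (MulC + 1).isPowerOf2() and becomes (x << 3) - x.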
3372
3374 unsigned Index) const {
3376 return false;
3377
3378 // Mask vectors support all subregister combinations and operations that
3379 // extract half of vector.
3380 if (ResVT.getVectorElementType() == MVT::i1)
3381 return Index == 0 || ((ResVT.getSizeInBits() == SrcVT.getSizeInBits()*2) &&
3382 (Index == ResVT.getVectorNumElements()));
3383
3384 return (Index % ResVT.getVectorNumElements()) == 0;
3385}
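// Example (illustrative): extracting a v4f32 subvector from a v8f32 source is
// considered cheap at indices 0 and 4 (multiples of the result width), but not
// at index 2, which does not line up with a whole result-sized subregister.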
3386
3388 unsigned Opc = VecOp.getOpcode();
3389
3390 // Assume target opcodes can't be scalarized.
3391 // TODO - do we have any exceptions?
3392 if (Opc >= ISD::BUILTIN_OP_END || !isBinOp(Opc))
3393 return false;
3394
3395 // If the vector op is not supported, try to convert to scalar.
3396 EVT VecVT = VecOp.getValueType();
3398 return true;
3399
3400 // If the vector op is supported, but the scalar op is not, the transform may
3401 // not be worthwhile.
3402 EVT ScalarVT = VecVT.getScalarType();
3403 return isOperationLegalOrCustomOrPromote(Opc, ScalarVT);
3404}
3405
3407 bool) const {
3408 // TODO: Allow vectors?
3409 if (VT.isVector())
3410 return false;
3411 return VT.isSimple() || !isOperationExpand(Opcode, VT);
3412}
3413
3415 // Speculate cttz only if we can directly use TZCNT/CMOV, can promote to
3416 // i32/i64 or can rely on BSF passthrough value.
3417 return Subtarget.hasBMI() || Subtarget.canUseCMOV() ||
3418 Subtarget.hasBitScanPassThrough() ||
3419 (!Ty->isVectorTy() &&
3420 Ty->getScalarSizeInBits() < (Subtarget.is64Bit() ? 64u : 32u));
3421}
3422
3424 // Speculate ctlz only if we can directly use LZCNT/CMOV, or can rely on BSR
3425 // passthrough value.
3426 return Subtarget.hasLZCNT() || Subtarget.canUseCMOV() ||
3427 Subtarget.hasBitScanPassThrough();
3428}
3429
3431 // Don't shrink FP constpool if SSE2 is available since cvtss2sd is more
3432 // expensive than a straight movsd. On the other hand, it's important to
3433 // shrink long double fp constants since fldt is very slow.
3434 return !Subtarget.hasSSE2() || VT == MVT::f80;
3435}
3436
3438 return (VT == MVT::f64 && Subtarget.hasSSE2()) ||
3439 (VT == MVT::f32 && Subtarget.hasSSE1()) || VT == MVT::f16;
3440}
3441
3443 const SelectionDAG &DAG,
3444 const MachineMemOperand &MMO) const {
3445 if (!Subtarget.hasAVX512() && !LoadVT.isVector() && BitcastVT.isVector() &&
3446 BitcastVT.getVectorElementType() == MVT::i1)
3447 return false;
3448
3449 if (!Subtarget.hasDQI() && BitcastVT == MVT::v8i1 && LoadVT == MVT::i8)
3450 return false;
3451
3452 // If both types are legal vectors, it's always ok to convert them.
3453 if (LoadVT.isVector() && BitcastVT.isVector() &&
3454 isTypeLegal(LoadVT) && isTypeLegal(BitcastVT))
3455 return true;
3456
3457 return TargetLowering::isLoadBitCastBeneficial(LoadVT, BitcastVT, DAG, MMO);
3458}
3459
3461 const MachineFunction &MF) const {
3462 // Do not merge to float value size (128 bits) if no implicit
3463 // float attribute is set.
3464 bool NoFloat = MF.getFunction().hasFnAttribute(Attribute::NoImplicitFloat);
3465
3466 if (NoFloat) {
3467 unsigned MaxIntSize = Subtarget.is64Bit() ? 64 : 32;
3468 return (MemVT.getSizeInBits() <= MaxIntSize);
3469 }
3470 // Make sure we don't merge greater than our preferred vector
3471 // width.
3472 if (MemVT.getSizeInBits() > Subtarget.getPreferVectorWidth())
3473 return false;
3474
3475 return true;
3476}
3477
3479 return Subtarget.hasFastLZCNT();
3480}
3481
3483 const Instruction &AndI) const {
3484 return true;
3485}
3486
3488 EVT VT = Y.getValueType();
3489
3490 if (VT.isVector())
3491 return false;
3492
3493 if (!Subtarget.hasBMI())
3494 return false;
3495
3496 // There are only 32-bit and 64-bit forms for 'andn'.
3497 if (VT != MVT::i32 && VT != MVT::i64)
3498 return false;
3499
3500 return !isa<ConstantSDNode>(Y) || cast<ConstantSDNode>(Y)->isOpaque();
3501}
3502
3504 EVT VT = Y.getValueType();
3505
3506 if (!VT.isVector())
3507 return hasAndNotCompare(Y);
3508
3509 // Vector.
3510
3511 if (!Subtarget.hasSSE1() || VT.getSizeInBits() < 128)
3512 return false;
3513
3514 if (VT == MVT::v4i32)
3515 return true;
3516
3517 return Subtarget.hasSSE2();
3518}
3519
3521 return X.getValueType().isScalarInteger(); // 'bt'
3522}
3523
3527 unsigned OldShiftOpcode, unsigned NewShiftOpcode,
3528 SelectionDAG &DAG) const {
3529 // Does baseline recommend not to perform the fold by default?
3531 X, XC, CC, Y, OldShiftOpcode, NewShiftOpcode, DAG))
3532 return false;
3533 // For scalars this transform is always beneficial.
3534 if (X.getValueType().isScalarInteger())
3535 return true;
3536 // If all the shift amounts are identical, then transform is beneficial even
3537 // with rudimentary SSE2 shifts.
3538 if (DAG.isSplatValue(Y, /*AllowUndefs=*/true))
3539 return true;
3540 // If we have AVX2 with its powerful shift operations, then it's also good.
3541 if (Subtarget.hasAVX2())
3542 return true;
3543 // Pre-AVX2 vector codegen for this pattern is best for variant with 'shl'.
3544 return NewShiftOpcode == ISD::SHL;
3545}
3546
3548 EVT VT, unsigned ShiftOpc, bool MayTransformRotate,
3549 const APInt &ShiftOrRotateAmt, const std::optional<APInt> &AndMask) const {
3550 if (!VT.isInteger())
3551 return ShiftOpc;
3552
3553 bool PreferRotate = false;
3554 if (VT.isVector()) {
3555 // For vectors, if we have rotate instruction support, then it's definitely
3556 // best. Otherwise it's not clear what's best, so just don't make changes.
3557 PreferRotate = Subtarget.hasAVX512() && (VT.getScalarType() == MVT::i32 ||
3558 VT.getScalarType() == MVT::i64);
3559 } else {
3560 // For scalars, if we have BMI2, prefer rotate for rorx. Otherwise prefer
3561 // rotate unless we have a zext mask+shr.
3562 PreferRotate = Subtarget.hasBMI2();
3563 if (!PreferRotate) {
3564 unsigned MaskBits =
3565 VT.getScalarSizeInBits() - ShiftOrRotateAmt.getZExtValue();
3566 PreferRotate = (MaskBits != 8) && (MaskBits != 16) && (MaskBits != 32);
3567 }
3568 }
3569
3570 if (ShiftOpc == ISD::SHL || ShiftOpc == ISD::SRL) {
3571 assert(AndMask.has_value() && "Null andmask when querying about shift+and");
3572
3573 if (PreferRotate && MayTransformRotate)
3574 return ISD::ROTL;
3575
3576 // For vectors we don't really get much benefit from swapping around constants.
3577 // Maybe we could check if the DAG has the flipped node already in the
3578 // future.
3579 if (VT.isVector())
3580 return ShiftOpc;
3581
3582 // See if it's beneficial to swap the shift type.
3583 if (ShiftOpc == ISD::SHL) {
3584 // If the current setup has imm64 mask, then inverse will have
3585 // at least imm32 mask (or be zext i32 -> i64).
3586 if (VT == MVT::i64)
3587 return AndMask->getSignificantBits() > 32 ? (unsigned)ISD::SRL
3588 : ShiftOpc;
3589
3590 // We can only benefit if the mask requires at least 7 bits. We
3591 // don't want to replace shl of 1,2,3 as they can be implemented
3592 // with lea/add.
3593 return ShiftOrRotateAmt.uge(7) ? (unsigned)ISD::SRL : ShiftOpc;
3594 }
3595
3596 if (VT == MVT::i64)
3597 // Keep exactly 32-bit imm64, this is zext i32 -> i64 which is
3598 // extremely efficient.
3599 return AndMask->getSignificantBits() > 33 ? (unsigned)ISD::SHL : ShiftOpc;
3600
3601 // Keep small shifts as shl so we can generate add/lea.
3602 return ShiftOrRotateAmt.ult(7) ? (unsigned)ISD::SHL : ShiftOpc;
3603 }
3604
3605 // We prefer rotate for vectors, or if we won't get a zext mask with SRL
3606 // (PreferRotate will be set in the latter case).
3607 if (PreferRotate || !MayTransformRotate || VT.isVector())
3608 return ShiftOpc;
3609
3610 // Non-vector type and we have a zext mask with SRL.
3611 return ISD::SRL;
3612}
3613
3616 const Value *Lhs,
3617 const Value *Rhs) const {
3618 using namespace llvm::PatternMatch;
3619 int BaseCost = BrMergingBaseCostThresh.getValue();
3620 // With CCMP, branches can be merged in a more efficient way.
3621 if (BaseCost >= 0 && Subtarget.hasCCMP())
3622 BaseCost += BrMergingCcmpBias;
3623 // a == b && a == c is a fast pattern on x86.
3624 if (BaseCost >= 0 && Opc == Instruction::And &&
3627 BaseCost += 1;
3628 return {BaseCost, BrMergingLikelyBias.getValue(),
3629 BrMergingUnlikelyBias.getValue()};
3630}
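// Usage sketch (illustrative): for a chain like (a == b && a == c) on a
// CCMP-capable target, the base threshold is raised by BrMergingCcmpBias and
// then by one more for the shared-operand equality pattern matched above,
// making it more likely that the two conditions are merged into one branch.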
3631
3633 return N->getOpcode() != ISD::FP_EXTEND;
3634}
3635
3637 const SDNode *N, CombineLevel Level) const {
3638 assert(((N->getOpcode() == ISD::SHL &&
3639 N->getOperand(0).getOpcode() == ISD::SRL) ||
3640 (N->getOpcode() == ISD::SRL &&
3641 N->getOperand(0).getOpcode() == ISD::SHL)) &&
3642 "Expected shift-shift mask");
3643 // TODO: Should we always create i64 masks? Or only folded immediates?
3644 EVT VT = N->getValueType(0);
3645 if ((Subtarget.hasFastVectorShiftMasks() && VT.isVector()) ||
3646 (Subtarget.hasFastScalarShiftMasks() && !VT.isVector())) {
3647 // Only fold if the shift values are equal - so it folds to AND.
3648 // TODO - we should fold if either is a non-uniform vector but we don't do
3649 // the fold for non-splats yet.
3650 return N->getOperand(1) == N->getOperand(0).getOperand(1);
3651 }
3653}
3654
3656 EVT VT = Y.getValueType();
3657
3658 // For vectors, we don't have a preference, but we probably want a mask.
3659 if (VT.isVector())
3660 return false;
3661
3662 // 64-bit shifts on 32-bit targets produce really bad bloated code.
3663 if (VT == MVT::i64 && !Subtarget.is64Bit())
3664 return false;
3665
3666 return true;
3667}
3668
3671 SelectionDAG &DAG, SDNode *N, unsigned ExpansionFactor) const {
3673 !Subtarget.isOSWindows())
3676 ExpansionFactor);
3677}
3678
3680 // Any legal vector type can be splatted more efficiently than
3681 // loading/spilling from memory.
3682 return isTypeLegal(VT);
3683}
3684
3686 MVT VT = MVT::getIntegerVT(NumBits);
3687 if (isTypeLegal(VT))
3688 return VT;
3689
3690 // PMOVMSKB can handle this.
3691 if (NumBits == 128 && isTypeLegal(MVT::v16i8))
3692 return MVT::v16i8;
3693
3694 // VPMOVMSKB can handle this.
3695 if (NumBits == 256 && isTypeLegal(MVT::v32i8))
3696 return MVT::v32i8;
3697
3698 // TODO: Allow 64-bit type for 32-bit target.
3699 // TODO: 512-bit types should be allowed, but make sure that those
3700 // cases are handled in combineVectorSizedSetCCEquality().
3701
3703}
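// Example (illustrative): a 16-byte memcmp-for-equality can be expanded as a
// v16i8 compare whose result is reduced with PMOVMSKB, and a 32-byte compare
// can use v32i8 with VPMOVMSKB when that type is legal, avoiding a chain of
// scalar compares.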
3704
3705/// Val is the undef sentinel value or equal to the specified value.
3706static bool isUndefOrEqual(int Val, int CmpVal) {
3707 return ((Val == SM_SentinelUndef) || (Val == CmpVal));
3708}
3709
3710/// Return true if every element in Mask is the undef sentinel value or equal to
3711/// the specified value.
3712static bool isUndefOrEqual(ArrayRef<int> Mask, int CmpVal) {
3713 return llvm::all_of(Mask, [CmpVal](int M) {
3714 return (M == SM_SentinelUndef) || (M == CmpVal);
3715 });
3716}
3717
3718/// Return true if every element in Mask, beginning from position Pos and ending
3719/// in Pos+Size is the undef sentinel value or equal to the specified value.
3720static bool isUndefOrEqualInRange(ArrayRef<int> Mask, int CmpVal, unsigned Pos,
3721 unsigned Size) {
3722 return llvm::all_of(Mask.slice(Pos, Size),
3723 [CmpVal](int M) { return isUndefOrEqual(M, CmpVal); });
3724}
3725
3726/// Val is either the undef or zero sentinel value.
3727static bool isUndefOrZero(int Val) {
3728 return ((Val == SM_SentinelUndef) || (Val == SM_SentinelZero));
3729}
3730
3731/// Return true if every element in Mask, beginning from position Pos and ending
3732/// in Pos+Size is the undef sentinel value.
3733static bool isUndefInRange(ArrayRef<int> Mask, unsigned Pos, unsigned Size) {
3734 return llvm::all_of(Mask.slice(Pos, Size),
3735 [](int M) { return M == SM_SentinelUndef; });
3736}
3737
3738/// Return true if the mask creates a vector whose lower half is undefined.
3740 unsigned NumElts = Mask.size();
3741 return isUndefInRange(Mask, 0, NumElts / 2);
3742}
3743
3744/// Return true if the mask creates a vector whose upper half is undefined.
3746 unsigned NumElts = Mask.size();
3747 return isUndefInRange(Mask, NumElts / 2, NumElts / 2);
3748}
3749
3751/// Return true if Val falls within the specified range [Low, Hi).
3751static bool isInRange(int Val, int Low, int Hi) {
3752 return (Val >= Low && Val < Hi);
3753}
3754
3755/// Return true if the value of any element in Mask falls within the specified
3756/// range [Low, Hi).
3757static bool isAnyInRange(ArrayRef<int> Mask, int Low, int Hi) {
3758 return llvm::any_of(Mask, [Low, Hi](int M) { return isInRange(M, Low, Hi); });
3759}
3760
3761/// Return true if the value of any element in Mask is the zero sentinel value.
3762static bool isAnyZero(ArrayRef<int> Mask) {
3763 return llvm::any_of(Mask, [](int M) { return M == SM_SentinelZero; });
3764}
3765
3766/// Return true if Val is undef or if its value falls within the
3767/// specified range [Low, Hi).
3768static bool isUndefOrInRange(int Val, int Low, int Hi) {
3769 return (Val == SM_SentinelUndef) || isInRange(Val, Low, Hi);
3770}
3771
3772/// Return true if every element in Mask is undef or if its value
3773/// falls within the specified range [Low, Hi).
3774static bool isUndefOrInRange(ArrayRef<int> Mask, int Low, int Hi) {
3775 return llvm::all_of(
3776 Mask, [Low, Hi](int M) { return isUndefOrInRange(M, Low, Hi); });
3777}
3778
3779/// Return true if Val is undef, zero or if its value falls within the
3780/// specified range [Low, Hi).
3781static bool isUndefOrZeroOrInRange(int Val, int Low, int Hi) {
3782 return isUndefOrZero(Val) || isInRange(Val, Low, Hi);
3783}
3784
3785/// Return true if every element in Mask is undef, zero or if its value
3786/// falls within the specified range [Low, Hi).
3787static bool isUndefOrZeroOrInRange(ArrayRef<int> Mask, int Low, int Hi) {
3788 return llvm::all_of(
3789 Mask, [Low, Hi](int M) { return isUndefOrZeroOrInRange(M, Low, Hi); });
3790}
3791
3792/// Return true if every element in Mask is an in-place blend/select mask or is
3793/// undef.
3795 unsigned NumElts = Mask.size();
3796 for (auto [I, M] : enumerate(Mask))
3797 if (!isUndefOrEqual(M, I) && !isUndefOrEqual(M, I + NumElts))
3798 return false;
3799 return true;
3800}
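// Example (illustrative): with NumElts == 4, the mask <0, 5, 2, 7> is an
// in-place blend (each lane selects element I or I + NumElts), while
// <1, 5, 2, 7> is not, because lane 0 reads a different lane of the input.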
3801
3802/// Return true if every element in Mask, beginning
3803/// from position Pos and ending in Pos + Size, falls within the specified
3804/// sequence (Low, Low + Step, ..., Low + (Size - 1) * Step) or is undef.
3805static bool isSequentialOrUndefInRange(ArrayRef<int> Mask, unsigned Pos,
3806 unsigned Size, int Low, int Step = 1) {
3807 for (unsigned i = Pos, e = Pos + Size; i != e; ++i, Low += Step)
3808 if (!isUndefOrEqual(Mask[i], Low))
3809 return false;
3810 return true;
3811}
3812
3813/// Return true if every element in Mask, beginning
3814/// from position Pos and ending in Pos+Size, falls within the specified
3815/// sequential range [Low, Low+Size), or is undef or is zero.
3817 unsigned Size, int Low,
3818 int Step = 1) {
3819 for (unsigned i = Pos, e = Pos + Size; i != e; ++i, Low += Step)
3820 if (!isUndefOrZero(Mask[i]) && Mask[i] != Low)
3821 return false;
3822 return true;
3823}
3824
3825/// Return true if every element in Mask, beginning
3826/// from position Pos and ending in Pos+Size is undef or is zero.
3827static bool isUndefOrZeroInRange(ArrayRef<int> Mask, unsigned Pos,
3828 unsigned Size) {
3829 return llvm::all_of(Mask.slice(Pos, Size), isUndefOrZero);
3830}
3831
3832/// Return true if every element of a single input is referenced by the shuffle
3833/// mask, i.e. it just permutes them all.
3835 unsigned NumElts = Mask.size();
3836 APInt DemandedElts = APInt::getZero(NumElts);
3837 for (int M : Mask)
3838 if (isInRange(M, 0, NumElts))
3839 DemandedElts.setBit(M);
3840 return DemandedElts.isAllOnes();
3841}
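// Example (illustrative): the mask <3, 0, 2, 1> references every element of
// the single input and is a complete permute, whereas <0, 0, 1, 1> leaves
// elements 2 and 3 unreferenced and is not.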
3842
3843/// Helper function to test whether a shuffle mask could be
3844/// simplified by widening the elements being shuffled.
3845///
3846/// Appends the mask for wider elements in WidenedMask if valid. Otherwise
3847/// leaves it in an unspecified state.
3848///
3849/// NOTE: This must handle normal vector shuffle masks and *target* vector
3850/// shuffle masks. The latter have the special property of a '-2' representing
3851/// a zero-ed lane of a vector.
3853 SmallVectorImpl<int> &WidenedMask) {
3854 WidenedMask.assign(Mask.size() / 2, 0);
3855 for (int i = 0, Size = Mask.size(); i < Size; i += 2) {
3856 int M0 = Mask[i];
3857 int M1 = Mask[i + 1];
3858
3859 // If both elements are undef, its trivial.
3860 if (M0 == SM_SentinelUndef && M1 == SM_SentinelUndef) {
3861 WidenedMask[i / 2] = SM_SentinelUndef;
3862 continue;
3863 }
3864
3865 // Check for an undef mask and a mask value properly aligned to fit with
3866 // a pair of values. If we find such a case, use the non-undef mask's value.
3867 if (M0 == SM_SentinelUndef && M1 >= 0 && (M1 % 2) == 1) {
3868 WidenedMask[i / 2] = M1 / 2;
3869 continue;
3870 }
3871 if (M1 == SM_SentinelUndef && M0 >= 0 && (M0 % 2) == 0) {
3872 WidenedMask[i / 2] = M0 / 2;
3873 continue;
3874 }
3875
3876 // When zeroing, we need to spread the zeroing across both lanes to widen.
3877 if (M0 == SM_SentinelZero || M1 == SM_SentinelZero) {
3878 if ((M0 == SM_SentinelZero || M0 == SM_SentinelUndef) &&
3880 WidenedMask[i / 2] = SM_SentinelZero;
3881 continue;
3882 }
3883 return false;
3884 }
3885
3886 // Finally check if the two mask values are adjacent and aligned with
3887 // a pair.
3888 if (M0 != SM_SentinelUndef && (M0 % 2) == 0 && (M0 + 1) == M1) {
3889 WidenedMask[i / 2] = M0 / 2;
3890 continue;
3891 }
3892
3893 // Otherwise we can't safely widen the elements used in this shuffle.
3894 return false;
3895 }
3896 assert(WidenedMask.size() == Mask.size() / 2 &&
3897 "Incorrect size of mask after widening the elements!");
3898
3899 return true;
3900}
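// Worked example (illustrative): the v4i32 mask <0, 1, 6, 7> widens to the
// v2i64 mask <0, 3>, since each pair is adjacent and starts on an even
// element; <1, 0, 6, 7> cannot be widened because the first pair is swapped.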
3901
3903 const APInt &Zeroable,
3904 bool V2IsZero,
3905 SmallVectorImpl<int> &WidenedMask) {
3906 // Create an alternative mask with info about zeroable elements.
3907 // Here we do not set undef elements as zeroable.
3908 SmallVector<int, 64> ZeroableMask(Mask);
3909 if (V2IsZero) {
3910 assert(!Zeroable.isZero() && "V2's non-undef elements are used?!");
3911 for (int i = 0, Size = Mask.size(); i != Size; ++i)
3912 if (Mask[i] != SM_SentinelUndef && Zeroable[i])
3913 ZeroableMask[i] = SM_SentinelZero;
3914 }
3915 return canWidenShuffleElements(ZeroableMask, WidenedMask);
3916}
3917
3918static bool canWidenShuffleElements(ArrayRef<int> Mask) {
3919 SmallVector<int, 32> WidenedMask;
3920 return canWidenShuffleElements(Mask, WidenedMask);
3921}
3922
3923// Attempt to narrow/widen shuffle mask until it matches the target number of
3924// elements.
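// For example, the two-operand 8-element mask <0,1,2,3,8,9,10,11> scaled down
// to 2 elements widens twice, first to <0,1,4,5> and then to <0,2>, while the
// 2-element mask <0,3> scaled up to 4 elements narrows to <0,1,6,7>.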
3925static bool scaleShuffleElements(ArrayRef<int> Mask, unsigned NumDstElts,
3926 SmallVectorImpl<int> &ScaledMask) {
3927 unsigned NumSrcElts = Mask.size();
3928 assert(((NumSrcElts % NumDstElts) == 0 || (NumDstElts % NumSrcElts) == 0) &&
3929 "Illegal shuffle scale factor");
3930
3931 // Narrowing is guaranteed to work.
3932 if (NumDstElts >= NumSrcElts) {
3933 int Scale = NumDstElts / NumSrcElts;
3934 llvm::narrowShuffleMaskElts(Scale, Mask, ScaledMask);
3935 return true;
3936 }
3937
3938 // We have to repeat the widening until we reach the target size, but we can
3939 // split out the first widening as it sets up ScaledMask for us.
3940 if (canWidenShuffleElements(Mask, ScaledMask)) {
3941 while (ScaledMask.size() > NumDstElts) {
3942 SmallVector<int, 16> WidenedMask;
3943 if (!canWidenShuffleElements(ScaledMask, WidenedMask))
3944 return false;
3945 ScaledMask = std::move(WidenedMask);
3946 }
3947 return true;
3948 }
3949
3950 return false;
3951}
3952
3953static bool canScaleShuffleElements(ArrayRef<int> Mask, unsigned NumDstElts) {
3954 SmallVector<int, 32> ScaledMask;
3955 return scaleShuffleElements(Mask, NumDstElts, ScaledMask);
3956}
3957
3958// Helper to grow the shuffle mask for a larger value type.
3959// NOTE: This differs from scaleShuffleElements, which keeps the same total vector size.
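// For example, growing the two-operand v4i32 mask <0, 5, 2, 7> from 128 to
// 256 bits remaps the second-operand indices past the widened first operand
// and pads with undefs, giving <0, 9, 2, 11, -1, -1, -1, -1>.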
3960static void growShuffleMask(ArrayRef<int> SrcMask,
3961 SmallVectorImpl<int> &DstMask,
3962 unsigned SrcSizeInBits, unsigned DstSizeInBits) {
3963 assert(DstMask.empty() && "Expected an empty shuffle mask");
3964 assert((DstSizeInBits % SrcSizeInBits) == 0 && "Illegal shuffle scale");
3965 unsigned Scale = DstSizeInBits / SrcSizeInBits;
3966 unsigned NumSrcElts = SrcMask.size();
3967 DstMask.assign(SrcMask.begin(), SrcMask.end());
3968 for (int &M : DstMask) {
3969 if (M < 0)
3970 continue;
3971 M = (M % NumSrcElts) + ((M / NumSrcElts) * Scale * NumSrcElts);
3972 }
3973 DstMask.append((Scale - 1) * NumSrcElts, SM_SentinelUndef);
3974}
3975
3976/// Returns true if Elt is a constant zero or a floating point constant +0.0.
3977bool X86::isZeroNode(SDValue Elt) {
3978 return isNullConstant(Elt) || isNullFPConstant(Elt);
3979}
3980
3981// Build a vector of constants.
3982// Use an UNDEF node if MaskElt == -1.
3983// Split 64-bit constants in 32-bit mode.
3984static SDValue getConstVector(ArrayRef<int> Values, MVT VT, SelectionDAG &DAG,
3985 const SDLoc &dl, bool IsMask = false) {
3986
3987 SmallVector<SDValue, 32> Ops;
3988 bool Split = false;
3989
3990 MVT ConstVecVT = VT;
3991 unsigned NumElts = VT.getVectorNumElements();
3992 bool In64BitMode = DAG.getTargetLoweringInfo().isTypeLegal(MVT::i64);
3993 if (!In64BitMode && VT.getVectorElementType() == MVT::i64) {
3994 ConstVecVT = MVT::getVectorVT(MVT::i32, NumElts * 2);
3995 Split = true;
3996 }
3997
3998 MVT EltVT = ConstVecVT.getVectorElementType();
3999 for (unsigned i = 0; i < NumElts; ++i) {
4000 bool IsUndef = Values[i] < 0 && IsMask;
4001 SDValue OpNode = IsUndef ? DAG.getUNDEF(EltVT) :
4002 DAG.getConstant(Values[i], dl, EltVT);
4003 Ops.push_back(OpNode);
4004 if (Split)
4005 Ops.push_back(IsUndef ? DAG.getUNDEF(EltVT) :
4006 DAG.getConstant(0, dl, EltVT));
4007 }
4008 SDValue ConstsNode = DAG.getBuildVector(ConstVecVT, dl, Ops);
4009 if (Split)
4010 ConstsNode = DAG.getBitcast(VT, ConstsNode);
4011 return ConstsNode;
4012}
4013
4014static SDValue getConstVector(ArrayRef<APInt> Bits, const APInt &Undefs,
4015 MVT VT, SelectionDAG &DAG, const SDLoc &dl) {
4016 assert(Bits.size() == Undefs.getBitWidth() &&
4017 "Unequal constant and undef arrays");
4018 SmallVector<SDValue, 32> Ops;
4019 bool Split = false;
4020
4021 MVT ConstVecVT = VT;
4022 unsigned NumElts = VT.getVectorNumElements();
4023 bool In64BitMode = DAG.getTargetLoweringInfo().isTypeLegal(MVT::i64);
4024 if (!In64BitMode && VT.getVectorElementType() == MVT::i64) {
4025 ConstVecVT = MVT::getVectorVT(MVT::i32, NumElts * 2);
4026 Split = true;
4027 }
4028
4029 MVT EltVT = ConstVecVT.getVectorElementType();
4030 MVT EltIntVT = EltVT.changeTypeToInteger();
4031 for (unsigned i = 0, e = Bits.size(); i != e; ++i) {
4032 if (Undefs[i]) {
4033 Ops.append(Split ? 2 : 1, DAG.getUNDEF(EltVT));
4034 continue;
4035 }
4036 const APInt &V = Bits[i];
4037 assert(V.getBitWidth() == VT.getScalarSizeInBits() && "Unexpected sizes");
4038 if (Split) {
4039 Ops.push_back(DAG.getConstant(V.extractBits(32, 0), dl, EltVT));
4040 Ops.push_back(DAG.getConstant(V.extractBits(32, 32), dl, EltVT));
4041 } else {
4042 Ops.push_back(DAG.getBitcast(EltVT, DAG.getConstant(V, dl, EltIntVT)));
4043 }
4044 }
4045
4046 SDValue ConstsNode = DAG.getBuildVector(ConstVecVT, dl, Ops);
4047 return DAG.getBitcast(VT, ConstsNode);
4048}
4049
4050static SDValue getConstVector(ArrayRef<APInt> Bits, MVT VT,
4051 SelectionDAG &DAG, const SDLoc &dl) {
4052 APInt Undefs = APInt::getZero(Bits.size());
4053 return getConstVector(Bits, Undefs, VT, DAG, dl);
4054}
4055
4056/// Returns a vector of specified type with all zero elements.
4057static SDValue getZeroVector(MVT VT, const X86Subtarget &Subtarget,
4058 SelectionDAG &DAG, const SDLoc &dl) {
4059 assert((VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector() ||
4060 VT.getVectorElementType() == MVT::i1) &&
4061 "Unexpected vector type");
4062
4063 // Try to build SSE/AVX zero vectors as <N x i32> bitcasted to their dest
4064 // type. This ensures they get CSE'd. But if the integer type is not
4065 // available, use a floating-point +0.0 instead.
4066 SDValue Vec;
4067 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
4068 if (!Subtarget.hasSSE2() && VT.is128BitVector()) {
4069 Vec = DAG.getConstantFP(+0.0, dl, MVT::v4f32);
4070 } else if (VT.isFloatingPoint() &&
4071 TLI.isTypeLegal(VT.getVectorElementType())) {
4072 Vec = DAG.getConstantFP(+0.0, dl, VT);
4073 } else if (VT.getVectorElementType() == MVT::i1) {
4074 assert((Subtarget.hasBWI() || VT.getVectorNumElements() <= 16) &&
4075 "Unexpected vector type");
4076 Vec = DAG.getConstant(0, dl, VT);
4077 } else {
4078 unsigned Num32BitElts = VT.getSizeInBits() / 32;
4079 Vec = DAG.getConstant(0, dl, MVT::getVectorVT(MVT::i32, Num32BitElts));
4080 }
4081 return DAG.getBitcast(VT, Vec);
4082}
4083
4084// Helper to determine if the ops are all extracted subvectors of the same
4085// single source. If we allow commute they don't have to be in order (Lo/Hi).
4086static SDValue getSplitVectorSrc(SDValue LHS, SDValue RHS, bool AllowCommute) {
4087 if (LHS.getOpcode() != ISD::EXTRACT_SUBVECTOR ||
4088 RHS.getOpcode() != ISD::EXTRACT_SUBVECTOR ||
4089 LHS.getValueType() != RHS.getValueType() ||
4090 LHS.getOperand(0) != RHS.getOperand(0))
4091 return SDValue();
4092
4093 SDValue Src = LHS.getOperand(0);
4094 if (Src.getValueSizeInBits() != (LHS.getValueSizeInBits() * 2))
4095 return SDValue();
4096
4097 unsigned NumElts = LHS.getValueType().getVectorNumElements();
4098 if ((LHS.getConstantOperandAPInt(1) == 0 &&
4099 RHS.getConstantOperandAPInt(1) == NumElts) ||
4100 (AllowCommute && RHS.getConstantOperandAPInt(1) == 0 &&
4101 LHS.getConstantOperandAPInt(1) == NumElts))
4102 return Src;
4103
4104 return SDValue();
4105}
4106
4107static SDValue extractSubVector(SDValue Vec, unsigned IdxVal, SelectionDAG &DAG,
4108 const SDLoc &dl, unsigned vectorWidth) {
4109 EVT VT = Vec.getValueType();
4110 EVT ElVT = VT.getVectorElementType();
4111 unsigned ResultNumElts =
4112 (VT.getVectorNumElements() * vectorWidth) / VT.getSizeInBits();
4113 EVT ResultVT = EVT::getVectorVT(*DAG.getContext(), ElVT, ResultNumElts);
4114
4115 assert(ResultVT.getSizeInBits() == vectorWidth &&
4116 "Illegal subvector extraction");
4117
4118 // Extract the relevant vectorWidth bits. Generate an EXTRACT_SUBVECTOR
4119 unsigned ElemsPerChunk = vectorWidth / ElVT.getSizeInBits();
4120 assert(isPowerOf2_32(ElemsPerChunk) && "Elements per chunk not power of 2");
4121
4122 // This is the index of the first element of the vectorWidth-bit chunk
4123 // we want. Since ElemsPerChunk is a power of 2 just need to clear bits.
4124 IdxVal &= ~(ElemsPerChunk - 1);
4125
4126 // If the input is a buildvector just emit a smaller one.
4127 if (Vec.getOpcode() == ISD::BUILD_VECTOR)
4128 return DAG.getBuildVector(ResultVT, dl,
4129 Vec->ops().slice(IdxVal, ElemsPerChunk));
4130
4131 // Check if we're extracting the upper undef of a widening pattern.
4132 if (Vec.getOpcode() == ISD::INSERT_SUBVECTOR && Vec.getOperand(0).isUndef() &&
4133 Vec.getOperand(1).getValueType().getVectorNumElements() <= IdxVal &&
4134 isNullConstant(Vec.getOperand(2)))
4135 return DAG.getUNDEF(ResultVT);
4136
4137 return DAG.getExtractSubvector(dl, ResultVT, Vec, IdxVal);
4138}
4139
4140/// Generate a DAG to grab 128-bits from a vector > 128 bits. This
4141/// sets things up to match to an AVX VEXTRACTF128 / VEXTRACTI128
4142/// or AVX-512 VEXTRACTF32x4 / VEXTRACTI32x4
4143/// instructions or a simple subregister reference. Idx is an index in the
4144/// 128 bits we want. It need not be aligned to a 128-bit boundary. That makes
4145/// lowering EXTRACT_VECTOR_ELT operations easier.
4146static SDValue extract128BitVector(SDValue Vec, unsigned IdxVal,
4147 SelectionDAG &DAG, const SDLoc &dl) {
4148 assert((Vec.getValueType().is256BitVector() ||
4149 Vec.getValueType().is512BitVector()) &&
4150 "Unexpected vector size!");
4151 return extractSubVector(Vec, IdxVal, DAG, dl, 128);
4152}
4153
4154/// Generate a DAG to grab 256-bits from a 512-bit vector.
4155static SDValue extract256BitVector(SDValue Vec, unsigned IdxVal,
4156 SelectionDAG &DAG, const SDLoc &dl) {
4157 assert(Vec.getValueType().is512BitVector() && "Unexpected vector size!");
4158 return extractSubVector(Vec, IdxVal, DAG, dl, 256);
4159}
4160
4161static SDValue insertSubVector(SDValue Result, SDValue Vec, unsigned IdxVal,
4162 SelectionDAG &DAG, const SDLoc &dl,
4163 unsigned vectorWidth) {
4164 assert((vectorWidth == 128 || vectorWidth == 256) &&
4165 "Unsupported vector width");
4166 // Inserting an UNDEF subvector leaves Result unchanged.
4167 if (Vec.isUndef())
4168 return Result;
4169
4170 // Insert the relevant vectorWidth bits.
4171 EVT VT = Vec.getValueType();
4172 unsigned ElemsPerChunk = vectorWidth / VT.getScalarSizeInBits();
4173 assert(isPowerOf2_32(ElemsPerChunk) && "Elements per chunk not power of 2");
4174
4175 // This is the index of the first element of the vectorWidth-bit chunk
4176 // we want. Since ElemsPerChunk is a power of 2 just need to clear bits.
4177 IdxVal &= ~(ElemsPerChunk - 1);
4178 return DAG.getInsertSubvector(dl, Result, Vec, IdxVal);
4179}
4180
4181/// Generate a DAG to put 128-bits into a vector > 128 bits. This
4182/// sets things up to match to an AVX VINSERTF128/VINSERTI128 or
4183/// AVX-512 VINSERTF32x4/VINSERTI32x4 instructions or a
4184/// simple superregister reference. Idx is an index in the 128 bits
4185/// we want. It need not be aligned to a 128-bit boundary. That makes
4186/// lowering INSERT_VECTOR_ELT operations easier.
4187static SDValue insert128BitVector(SDValue Result, SDValue Vec, unsigned IdxVal,
4188 SelectionDAG &DAG, const SDLoc &dl) {
4189 assert(Vec.getValueType().is128BitVector() && "Unexpected vector size!");
4190 return insertSubVector(Result, Vec, IdxVal, DAG, dl, 128);
4191}
4192
4193/// Widen a vector to a larger size with the same scalar type, with the new
4194/// elements either zero or undef.
4195static SDValue widenSubVector(MVT VT, SDValue Vec, bool ZeroNewElements,
4196 const X86Subtarget &Subtarget, SelectionDAG &DAG,
4197 const SDLoc &dl) {
4198 EVT VecVT = Vec.getValueType();
4199 assert(VecVT.getSizeInBits() <= VT.getSizeInBits() &&
4200 VecVT.getScalarType() == VT.getScalarType() &&
4201 "Unsupported vector widening type");
4202 // If the upper 128-bits of a build vector are already undef/zero, then try to
4203 // widen from the lower 128-bits.
4204 if (Vec.getOpcode() == ISD::BUILD_VECTOR && VecVT.is256BitVector()) {
4205 unsigned NumSrcElts = VecVT.getVectorNumElements();
4206 ArrayRef<SDUse> Hi = Vec->ops().drop_front(NumSrcElts / 2);
4207 if (all_of(Hi, [&](SDValue V) {
4208 return V.isUndef() || (ZeroNewElements && X86::isZeroNode(V));
4209 }))
4210 Vec = extract128BitVector(Vec, 0, DAG, dl);
4211 }
4212 SDValue Res = ZeroNewElements ? getZeroVector(VT, Subtarget, DAG, dl)
4213 : DAG.getUNDEF(VT);
4214 return DAG.getInsertSubvector(dl, Res, Vec, 0);
4215}
4216
4217/// Widen a vector to a larger size with the same scalar type, with the new
4218/// elements either zero or undef.
4219static SDValue widenSubVector(SDValue Vec, bool ZeroNewElements,
4220 const X86Subtarget &Subtarget, SelectionDAG &DAG,
4221 const SDLoc &dl, unsigned WideSizeInBits) {
4222 assert(Vec.getValueSizeInBits() <= WideSizeInBits &&
4223 (WideSizeInBits % Vec.getScalarValueSizeInBits()) == 0 &&
4224 "Unsupported vector widening type");
4225 unsigned WideNumElts = WideSizeInBits / Vec.getScalarValueSizeInBits();
4226 MVT SVT = Vec.getSimpleValueType().getScalarType();
4227 MVT VT = MVT::getVectorVT(SVT, WideNumElts);
4228 return widenSubVector(VT, Vec, ZeroNewElements, Subtarget, DAG, dl);
4229}
4230
4231/// Widen a mask vector type to a minimum of v8i1/v16i1 to allow use of KSHIFT
4232/// and bitcast with integer types.
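///
/// For example, v4i1 widens to v8i1 on subtargets with DQI (which provide
/// 8-bit mask instructions) and to v16i1 otherwise; v16i1 and wider mask
/// types are returned unchanged.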
4233static MVT widenMaskVectorType(MVT VT, const X86Subtarget &Subtarget) {
4234 assert(VT.getVectorElementType() == MVT::i1 && "Expected bool vector");
4235 unsigned NumElts = VT.getVectorNumElements();
4236 if ((!Subtarget.hasDQI() && NumElts == 8) || NumElts < 8)
4237 return Subtarget.hasDQI() ? MVT::v8i1 : MVT::v16i1;
4238 return VT;
4239}
4240
4241/// Widen a mask vector to a minimum of v8i1/v16i1 to allow use of KSHIFT and
4242/// bitcast with integer types.
4243static SDValue widenMaskVector(SDValue Vec, bool ZeroNewElements,
4244 const X86Subtarget &Subtarget, SelectionDAG &DAG,
4245 const SDLoc &dl) {
4246 MVT VT = widenMaskVectorType(Vec.getSimpleValueType(), Subtarget);
4247 return widenSubVector(VT, Vec, ZeroNewElements, Subtarget, DAG, dl);
4248}
4249
4250// Helper function to collect subvector ops that are concatenated together,
4251// either by ISD::CONCAT_VECTORS or an ISD::INSERT_SUBVECTOR series.
4252// The subvectors in Ops are guaranteed to be the same type.
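// For example, for a v8i32 node built as
//   insert_subvector(insert_subvector(undef, X, 0), Y, 4)
// with v4i32 subvectors X and Y, Ops becomes {X, Y}; for a plain
// insert_subvector(undef, X, 0), Ops becomes {X, undef}.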
4253static bool collectConcatOps(SDNode *N, SmallVectorImpl<SDValue> &Ops,
4254 SelectionDAG &DAG) {
4255 assert(Ops.empty() && "Expected an empty ops vector");
4256
4257 if (N->getOpcode() == ISD::CONCAT_VECTORS) {
4258 Ops.append(N->op_begin(), N->op_end());
4259 return true;
4260 }
4261
4262 if (N->getOpcode() == ISD::INSERT_SUBVECTOR) {
4263 SDValue Src = N->getOperand(0);
4264 SDValue Sub = N->getOperand(1);
4265 const APInt &Idx = N->getConstantOperandAPInt(2);
4266 EVT VT = Src.getValueType();
4267 EVT SubVT = Sub.getValueType();
4268
4269 if (VT.getSizeInBits() == (SubVT.getSizeInBits() * 2)) {
4270 // insert_subvector(undef, x, lo)
4271 if (Idx == 0 && Src.isUndef()) {
4272 Ops.push_back(Sub);
4273 Ops.push_back(DAG.getUNDEF(SubVT));
4274 return true;
4275 }
4276 if (Idx == (VT.getVectorNumElements() / 2)) {
4277 // insert_subvector(insert_subvector(undef, x, lo), y, hi)
4278 if (Src.getOpcode() == ISD::INSERT_SUBVECTOR &&
4279 Src.getOperand(1).getValueType() == SubVT &&
4280 isNullConstant(Src.getOperand(2))) {
4281 // Attempt to recurse into inner (matching) concats.
4282 SDValue Lo = Src.getOperand(1);
4283 SDValue Hi = Sub;
4284 SmallVector<SDValue, 2> LoOps, HiOps;
4285 if (collectConcatOps(Lo.getNode(), LoOps, DAG) &&
4286 collectConcatOps(Hi.getNode(), HiOps, DAG) &&
4287 LoOps.size() == HiOps.size()) {
4288 Ops.append(LoOps);
4289 Ops.append(HiOps);
4290 return true;
4291 }
4292 Ops.push_back(Lo);
4293 Ops.push_back(Hi);
4294 return true;
4295 }
4296 // insert_subvector(x, extract_subvector(x, lo), hi)
4297 if (Sub.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
4298 Sub.getOperand(0) == Src && isNullConstant(Sub.getOperand(1))) {
4299 Ops.append(2, Sub);
4300 return true;
4301 }
4302 // insert_subvector(undef, x, hi)
4303 if (Src.isUndef()) {
4304 Ops.push_back(DAG.getUNDEF(SubVT));
4305 Ops.push_back(Sub);
4306 return true;
4307 }
4308 }
4309 }
4310 }
4311
4312 if (N->getOpcode() == ISD::EXTRACT_SUBVECTOR) {
4313 EVT VT = N->getValueType(0);
4314 SDValue Src = N->getOperand(0);
4315 uint64_t Idx = N->getConstantOperandVal(1);
4316
4317 // Collect all the subvectors from the source vector and slice off the
4318 // extraction.
4319 SmallVector<SDValue, 4> SrcOps;
4320 if (collectConcatOps(Src.getNode(), SrcOps, DAG) &&
4321 VT.getSizeInBits() > SrcOps[0].getValueSizeInBits() &&
4322 (VT.getSizeInBits() % SrcOps[0].getValueSizeInBits()) == 0 &&
4323 (Idx % SrcOps[0].getValueType().getVectorNumElements()) == 0) {
4324 unsigned SubIdx = Idx / SrcOps[0].getValueType().getVectorNumElements();
4325 unsigned NumSubs = VT.getSizeInBits() / SrcOps[0].getValueSizeInBits();
4326 Ops.append(SrcOps.begin() + SubIdx, SrcOps.begin() + SubIdx + NumSubs);
4327 return true;
4328 }
4329 }
4330
4331 assert(Ops.empty() && "Expected an empty ops vector");
4332 return false;
4333}
4334
4335// Helper to check if \p V can be split into subvectors and the upper
4336// subvectors are all undef, in which case return the lower subvector.
4337static SDValue isUpperSubvectorUndef(SDValue V, const SDLoc &DL,
4338 SelectionDAG &DAG) {
4339 SmallVector<SDValue> SubOps;
4340 if (!collectConcatOps(V.getNode(), SubOps, DAG))
4341 return SDValue();
4342
4343 unsigned NumSubOps = SubOps.size();
4344 unsigned HalfNumSubOps = NumSubOps / 2;
4345 assert((NumSubOps % 2) == 0 && "Unexpected number of subvectors");
4346
4347 ArrayRef<SDValue> UpperOps(SubOps.begin() + HalfNumSubOps, SubOps.end());
4348 if (any_of(UpperOps, [](SDValue Op) { return !Op.isUndef(); }))
4349 return SDValue();
4350
4351 EVT HalfVT = V.getValueType().getHalfNumVectorElementsVT(*DAG.getContext());
4352 ArrayRef<SDValue> LowerOps(SubOps.begin(), SubOps.begin() + HalfNumSubOps);
4353 return DAG.getNode(ISD::CONCAT_VECTORS, DL, HalfVT, LowerOps);
4354}
4355
4356// Helper to check if we can access all the constituent subvectors without any
4357// extract ops.
4358static bool isFreeToSplitVector(SDValue V, SelectionDAG &DAG) {
4359 SmallVector<SDValue> Ops;
4360 return collectConcatOps(V.getNode(), Ops, DAG);
4361}
4362
4363static std::pair<SDValue, SDValue> splitVector(SDValue Op, SelectionDAG &DAG,
4364 const SDLoc &dl) {
4365 EVT VT = Op.getValueType();
4366 unsigned NumElems = VT.getVectorNumElements();
4367 unsigned SizeInBits = VT.getSizeInBits();
4368 assert((NumElems % 2) == 0 && (SizeInBits % 2) == 0 &&
4369 "Can't split odd sized vector");
4370
4371 SmallVector<SDValue, 4> SubOps;
4372 if (collectConcatOps(Op.getNode(), SubOps, DAG)) {
4373 assert((SubOps.size() % 2) == 0 && "Can't split odd sized vector concat");
4374 unsigned HalfOps = SubOps.size() / 2;
4375 EVT HalfVT = VT.getHalfNumVectorElementsVT(*DAG.getContext());
4376 SmallVector<SDValue, 2> LoOps(SubOps.begin(), SubOps.begin() + HalfOps);
4377 SmallVector<SDValue, 2> HiOps(SubOps.begin() + HalfOps, SubOps.end());
4378 SDValue Lo = DAG.getNode(ISD::CONCAT_VECTORS, dl, HalfVT, LoOps);
4379 SDValue Hi = DAG.getNode(ISD::CONCAT_VECTORS, dl, HalfVT, HiOps);
4380 return std::make_pair(Lo, Hi);
4381 }
4382
4383 // If this is a splat value (with no-undefs) then use the lower subvector,
4384 // which should be a free extraction.
4385 SDValue Lo = extractSubVector(Op, 0, DAG, dl, SizeInBits / 2);
4386 if (DAG.isSplatValue(Op, /*AllowUndefs*/ false))
4387 return std::make_pair(Lo, Lo);
4388
4389 SDValue Hi = extractSubVector(Op, NumElems / 2, DAG, dl, SizeInBits / 2);
4390 return std::make_pair(Lo, Hi);
4391}
4392
4393/// Break an operation into 2 half sized ops and then concatenate the results.
4394static SDValue splitVectorOp(SDValue Op, SelectionDAG &DAG, const SDLoc &dl) {
4395 unsigned NumOps = Op.getNumOperands();
4396 EVT VT = Op.getValueType();
4397
4398 // Extract the LHS Lo/Hi vectors
4399 SmallVector<SDValue, 4> LoOps(NumOps);
4400 SmallVector<SDValue, 4> HiOps(NumOps);
4401 for (unsigned I = 0; I != NumOps; ++I) {
4402 SDValue SrcOp = Op.getOperand(I);
4403 if (!SrcOp.getValueType().isVector()) {
4404 LoOps[I] = HiOps[I] = SrcOp;
4405 continue;
4406 }
4407 std::tie(LoOps[I], HiOps[I]) = splitVector(SrcOp, DAG, dl);
4408 }
4409
4410 EVT LoVT, HiVT;
4411 std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(VT);
4412 return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT,
4413 DAG.getNode(Op.getOpcode(), dl, LoVT, LoOps),
4414 DAG.getNode(Op.getOpcode(), dl, HiVT, HiOps));
4415}
4416
4417/// Break a unary integer operation into 2 half sized ops and then
4418/// concatenate the result back.
4419static SDValue splitVectorIntUnary(SDValue Op, SelectionDAG &DAG,
4420 const SDLoc &dl) {
4421 // Make sure we only try to split 256/512-bit types to avoid creating
4422 // narrow vectors.
4423 [[maybe_unused]] EVT VT = Op.getValueType();
4424 assert((Op.getOperand(0).getValueType().is256BitVector() ||
4425 Op.getOperand(0).getValueType().is512BitVector()) &&
4426 (VT.is256BitVector() || VT.is512BitVector()) && "Unsupported VT!");
4427 assert(Op.getOperand(0).getValueType().getVectorNumElements() ==
4428 VT.getVectorNumElements() &&
4429 "Unexpected VTs!");
4430 return splitVectorOp(Op, DAG, dl);
4431}
4432
4433/// Break a binary integer operation into 2 half sized ops and then
4434/// concatenate the result back.
4435static SDValue splitVectorIntBinary(SDValue Op, SelectionDAG &DAG,
4436 const SDLoc &dl) {
4437 // Assert that all the types match.
4438 [[maybe_unused]] EVT VT = Op.getValueType();
4439 assert(Op.getOperand(0).getValueType() == VT &&
4440 Op.getOperand(1).getValueType() == VT && "Unexpected VTs!");
4441 assert((VT.is256BitVector() || VT.is512BitVector()) && "Unsupported VT!");
4442 return splitVectorOp(Op, DAG, dl);
4443}
4444
4445// Helper for splitting operands of an operation to legal target size and
4446// apply a function on each part.
4447// Useful for operations that are available on SSE2 in 128-bit, on AVX2 in
4448// 256-bit and on AVX512BW in 512-bit. The argument VT is the type used for
4449// deciding if/how to split Ops. Ops elements do *not* have to be of type VT.
4450// The argument Builder is a function that will be applied on each split part:
4451// SDValue Builder(SelectionDAG &DAG, const SDLoc &DL, ArrayRef<SDValue> Ops)
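//
// Illustrative usage sketch (N0, N1, ResVT and PMADDWDBuilder are placeholder
// names, not taken from this file): a caller that wants a VPMADDWD at whatever
// width the subtarget handles natively can write:
//   auto PMADDWDBuilder = [](SelectionDAG &DAG, const SDLoc &DL,
//                            ArrayRef<SDValue> Ops) {
//     MVT VT = MVT::getVectorVT(MVT::i32, Ops[0].getValueSizeInBits() / 32);
//     return DAG.getNode(X86ISD::VPMADDWD, DL, VT, Ops);
//   };
//   SDValue R = SplitOpsAndApply(DAG, Subtarget, DL, ResVT, {N0, N1},
//                                PMADDWDBuilder);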
4452template <typename F>
4453SDValue SplitOpsAndApply(SelectionDAG &DAG, const X86Subtarget &Subtarget,
4454 const SDLoc &DL, EVT VT, ArrayRef<SDValue> Ops,
4455 F Builder, bool CheckBWI = true,
4456 bool AllowAVX512 = true) {
4457 assert(Subtarget.hasSSE2() && "Target assumed to support at least SSE2");
4458 unsigned NumSubs = 1;
4459 if (AllowAVX512 && ((CheckBWI && Subtarget.useBWIRegs()) ||
4460 (!CheckBWI && Subtarget.useAVX512Regs()))) {
4461 if (VT.getSizeInBits() > 512) {
4462 NumSubs = VT.getSizeInBits() / 512;
4463 assert((VT.getSizeInBits() % 512) == 0 && "Illegal vector size");
4464 }
4465 } else if (Subtarget.hasAVX2()) {
4466 if (VT.getSizeInBits() > 256) {
4467 NumSubs = VT.getSizeInBits() / 256;
4468 assert((VT.getSizeInBits() % 256) == 0 && "Illegal vector size");
4469 }
4470 } else {
4471 if (VT.getSizeInBits() > 128) {
4472 NumSubs = VT.getSizeInBits() / 128;
4473 assert((VT.getSizeInBits() % 128) == 0 && "Illegal vector size");
4474 }
4475 }
4476
4477 if (NumSubs == 1)
4478 return Builder(DAG, DL, Ops);
4479
4480 SmallVector<SDValue, 4> Subs;
4481 for (unsigned i = 0; i != NumSubs; ++i) {
4482 SmallVector<SDValue, 2> SubOps;
4483 for (SDValue Op : Ops) {
4484 EVT OpVT = Op.getValueType();
4485 unsigned NumSubElts = OpVT.getVectorNumElements() / NumSubs;
4486 unsigned SizeSub = OpVT.getSizeInBits() / NumSubs;
4487 SubOps.push_back(extractSubVector(Op, i * NumSubElts, DAG, DL, SizeSub));
4488 }
4489 Subs.push_back(Builder(DAG, DL, SubOps));
4490 }
4491 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Subs);
4492}
4493
4494// Helper function that extends a non-512-bit vector op to 512-bits on non-VLX
4495// targets.
4496static SDValue getAVX512Node(unsigned Opcode, const SDLoc &DL, MVT VT,
4497 ArrayRef<SDValue> Ops, SelectionDAG &DAG,
4498 const X86Subtarget &Subtarget) {
4499 assert(Subtarget.hasAVX512() && "AVX512 target expected");
4500 MVT SVT = VT.getScalarType();
4501
4502 // If we have a 32/64 splatted constant, splat it to DstTy to
4503 // encourage a foldable broadcast'd operand.
4504 auto MakeBroadcastOp = [&](SDValue Op, MVT OpVT, MVT DstVT) {
4505 unsigned OpEltSizeInBits = OpVT.getScalarSizeInBits();
4506 // AVX512 broadcasts 32/64-bit operands.
4507 // TODO: Support float once getAVX512Node is used by fp-ops.
4508 if (!OpVT.isInteger() || OpEltSizeInBits < 32 ||
4510 return SDValue();
4511 // If we're not widening, don't bother if we're not bitcasting.
4512 if (OpVT == DstVT && Op.getOpcode() != ISD::BITCAST)
4513 return SDValue();
4514 if (auto *BV = dyn_cast<BuildVectorSDNode>(peekThroughBitcasts(Op))) {
4515 APInt SplatValue, SplatUndef;
4516 unsigned SplatBitSize;
4517 bool HasAnyUndefs;
4518 if (BV->isConstantSplat(SplatValue, SplatUndef, SplatBitSize,
4519 HasAnyUndefs, OpEltSizeInBits) &&
4520 !HasAnyUndefs && SplatValue.getBitWidth() == OpEltSizeInBits)
4521 return DAG.getConstant(SplatValue, DL, DstVT);
4522 }
4523 return SDValue();
4524 };
4525
4526 bool Widen = !(Subtarget.hasVLX() || VT.is512BitVector());
4527
4528 MVT DstVT = VT;
4529 if (Widen)
4530 DstVT = MVT::getVectorVT(SVT, 512 / SVT.getSizeInBits());
4531
4532 // Canonicalize src operands.
4533 SmallVector<SDValue> SrcOps(Ops);
4534 for (SDValue &Op : SrcOps) {
4535 MVT OpVT = Op.getSimpleValueType();
4536 // Just pass through scalar operands.
4537 if (!OpVT.isVector())
4538 continue;
4539 assert(OpVT == VT && "Vector type mismatch");
4540
4541 if (SDValue BroadcastOp = MakeBroadcastOp(Op, OpVT, DstVT)) {
4542 Op = BroadcastOp;
4543 continue;
4544 }
4545
4546 // Just widen the subvector by inserting into an undef wide vector.
4547 if (Widen)
4548 Op = widenSubVector(Op, false, Subtarget, DAG, DL, 512);
4549 }
4550
4551 SDValue Res = DAG.getNode(Opcode, DL, DstVT, SrcOps);
4552
4553 // Perform the 512-bit op then extract the bottom subvector.
4554 if (Widen)
4555 Res = extractSubVector(Res, 0, DAG, DL, VT.getSizeInBits());
4556 return Res;
4557}
4558
4559/// Insert i1-subvector to i1-vector.
4560static SDValue insert1BitVector(SDValue Op, SelectionDAG &DAG,
4561 const X86Subtarget &Subtarget) {
4562
4563 SDLoc dl(Op);
4564 SDValue Vec = Op.getOperand(0);
4565 SDValue SubVec = Op.getOperand(1);
4566 SDValue Idx = Op.getOperand(2);
4567 unsigned IdxVal = Op.getConstantOperandVal(2);
4568
4569 // Inserting undef is a nop. We can just return the original vector.
4570 if (SubVec.isUndef())
4571 return Vec;
4572
4573 if (IdxVal == 0 && Vec.isUndef()) // the operation is legal
4574 return Op;
4575
4576 MVT OpVT = Op.getSimpleValueType();
4577 unsigned NumElems = OpVT.getVectorNumElements();
4578 SDValue ZeroIdx = DAG.getVectorIdxConstant(0, dl);
4579
4580 // Extend to natively supported kshift.
4581 MVT WideOpVT = widenMaskVectorType(OpVT, Subtarget);
4582
4583 // Inserting into the lsbs of a zero vector is legal. ISel will insert shifts
4584 // if necessary.
4585 if (IdxVal == 0 && ISD::isBuildVectorAllZeros(Vec.getNode())) {
4586 // May need to promote to a legal type.
4587 Op = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT,
4588 DAG.getConstant(0, dl, WideOpVT),
4589 SubVec, Idx);
4590 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, OpVT, Op, ZeroIdx);
4591 }
4592
4593 MVT SubVecVT = SubVec.getSimpleValueType();
4594 unsigned SubVecNumElems = SubVecVT.getVectorNumElements();
4595 assert(IdxVal + SubVecNumElems <= NumElems &&
4596 IdxVal % SubVecVT.getSizeInBits() == 0 &&
4597 "Unexpected index value in INSERT_SUBVECTOR");
4598
4599 SDValue Undef = DAG.getUNDEF(WideOpVT);
4600
4601 if (IdxVal == 0) {
4602 // Zero lower bits of the Vec
4603 SDValue ShiftBits = DAG.getTargetConstant(SubVecNumElems, dl, MVT::i8);
4604 Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT, Undef, Vec,
4605 ZeroIdx);
4606 Vec = DAG.getNode(X86ISD::KSHIFTR, dl, WideOpVT, Vec, ShiftBits);
4607 Vec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, Vec, ShiftBits);
4608 // Merge them together, SubVec should be zero extended.
4609 SubVec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT,
4610 DAG.getConstant(0, dl, WideOpVT),
4611 SubVec, ZeroIdx);
4612 Op = DAG.getNode(ISD::OR, dl, WideOpVT, Vec, SubVec);
4613 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, OpVT, Op, ZeroIdx);
4614 }
4615
4616 SubVec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT,
4617 Undef, SubVec, ZeroIdx);
4618
4619 if (Vec.isUndef()) {
4620 assert(IdxVal != 0 && "Unexpected index");
4621 SubVec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, SubVec,
4622 DAG.getTargetConstant(IdxVal, dl, MVT::i8));
4623 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, OpVT, SubVec, ZeroIdx);
4624 }
4625
4626 if (ISD::isBuildVectorAllZeros(Vec.getNode())) {
4627 assert(IdxVal != 0 && "Unexpected index");
4628 // If upper elements of Vec are known undef, then just shift into place.
4629 if (llvm::all_of(Vec->ops().slice(IdxVal + SubVecNumElems),
4630 [](SDValue V) { return V.isUndef(); })) {
4631 SubVec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, SubVec,
4632 DAG.getTargetConstant(IdxVal, dl, MVT::i8));
4633 } else {
4634 NumElems = WideOpVT.getVectorNumElements();
4635 unsigned ShiftLeft = NumElems - SubVecNumElems;
4636 unsigned ShiftRight = NumElems - SubVecNumElems - IdxVal;
4637 SubVec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, SubVec,
4638 DAG.getTargetConstant(ShiftLeft, dl, MVT::i8));
4639 if (ShiftRight != 0)
4640 SubVec = DAG.getNode(X86ISD::KSHIFTR, dl, WideOpVT, SubVec,
4641 DAG.getTargetConstant(ShiftRight, dl, MVT::i8));
4642 }
4643 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, OpVT, SubVec, ZeroIdx);
4644 }
4645
4646 // Simple case when we put subvector in the upper part
4647 if (IdxVal + SubVecNumElems == NumElems) {
4648 SubVec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, SubVec,
4649 DAG.getTargetConstant(IdxVal, dl, MVT::i8));
4650 if (SubVecNumElems * 2 == NumElems) {
4651 // Special case, use legal zero extending insert_subvector. This allows
4652 // isel to optimize when bits are known zero.
4653 Vec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, SubVecVT, Vec, ZeroIdx);
4654 Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT,
4655 DAG.getConstant(0, dl, WideOpVT),
4656 Vec, ZeroIdx);
4657 } else {
4658 // Otherwise use explicit shifts to zero the bits.
4659 Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT,
4660 Undef, Vec, ZeroIdx);
4661 NumElems = WideOpVT.getVectorNumElements();
4662 SDValue ShiftBits = DAG.getTargetConstant(NumElems - IdxVal, dl, MVT::i8);
4663 Vec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, Vec, ShiftBits);
4664 Vec = DAG.getNode(X86ISD::KSHIFTR, dl, WideOpVT, Vec, ShiftBits);
4665 }
4666 Op = DAG.getNode(ISD::OR, dl, WideOpVT, Vec, SubVec);
4667 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, OpVT, Op, ZeroIdx);
4668 }
4669
4670 // Inserting into the middle is more complicated.
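  // For example, inserting a v4i1 subvector at IdxVal = 4 into a v16i1 vector
  // gives ShiftLeft = 12 and ShiftRight = 8 below: KSHIFTL by 12 moves the
  // four subvector bits to elements 12..15 (zeroing everything else), KSHIFTR
  // by 8 brings them down to elements 4..7 still surrounded by zeros, and the
  // existing Vec bits outside 4..7 are preserved (by the AND mask or the shift
  // pairs) before the pieces are ORed back together.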
4671
4672 NumElems = WideOpVT.getVectorNumElements();
4673
4674 // Widen the vector if needed.
4675 Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT, Undef, Vec, ZeroIdx);
4676
4677 unsigned ShiftLeft = NumElems - SubVecNumElems;
4678 unsigned ShiftRight = NumElems - SubVecNumElems - IdxVal;
4679
4680 // Do an optimization for the most frequently used types.
4681 if (WideOpVT != MVT::v64i1 || Subtarget.is64Bit()) {
4682 APInt Mask0 = APInt::getBitsSet(NumElems, IdxVal, IdxVal + SubVecNumElems);
4683 Mask0.flipAllBits();
4684 SDValue CMask0 = DAG.getConstant(Mask0, dl, MVT::getIntegerVT(NumElems));
4685 SDValue VMask0 = DAG.getNode(ISD::BITCAST, dl, WideOpVT, CMask0);
4686 Vec = DAG.getNode(ISD::AND, dl, WideOpVT, Vec, VMask0);
4687 SubVec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, SubVec,
4688 DAG.getTargetConstant(ShiftLeft, dl, MVT::i8));
4689 SubVec = DAG.getNode(X86ISD::KSHIFTR, dl, WideOpVT, SubVec,
4690 DAG.getTargetConstant(ShiftRight, dl, MVT::i8));
4691 Op = DAG.getNode(ISD::OR, dl, WideOpVT, Vec, SubVec);
4692
4693 // Reduce to original width if needed.
4694 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, OpVT, Op, ZeroIdx);
4695 }
4696
4697 // Clear the upper bits of the subvector and move it to its insert position.
4698 SubVec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, SubVec,
4699 DAG.getTargetConstant(ShiftLeft, dl, MVT::i8));
4700 SubVec = DAG.getNode(X86ISD::KSHIFTR, dl, WideOpVT, SubVec,
4701 DAG.getTargetConstant(ShiftRight, dl, MVT::i8));
4702
4703 // Isolate the bits below the insertion point.
4704 unsigned LowShift = NumElems - IdxVal;
4705 SDValue Low = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, Vec,
4706 DAG.getTargetConstant(LowShift, dl, MVT::i8));
4707 Low = DAG.getNode(X86ISD::KSHIFTR, dl, WideOpVT, Low,
4708 DAG.getTargetConstant(LowShift, dl, MVT::i8));
4709
4710 // Isolate the bits after the last inserted bit.
4711 unsigned HighShift = IdxVal + SubVecNumElems;
4712 SDValue High = DAG.getNode(X86ISD::KSHIFTR, dl, WideOpVT, Vec,
4713 DAG.getTargetConstant(HighShift, dl, MVT::i8));
4714 High = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, High,
4715 DAG.getTargetConstant(HighShift, dl, MVT::i8));
4716
4717 // Now OR all 3 pieces together.
4718 Vec = DAG.getNode(ISD::OR, dl, WideOpVT, Low, High);
4719 SubVec = DAG.getNode(ISD::OR, dl, WideOpVT, SubVec, Vec);
4720
4721 // Reduce to original width if needed.
4722 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, OpVT, SubVec, ZeroIdx);
4723}
4724
4725static SDValue concatSubVectors(SDValue V1, SDValue V2, SelectionDAG &DAG,
4726 const SDLoc &dl) {
4727 assert(V1.getValueType() == V2.getValueType() && "subvector type mismatch");
4728 EVT SubVT = V1.getValueType();
4729 EVT SubSVT = SubVT.getScalarType();
4730 unsigned SubNumElts = SubVT.getVectorNumElements();
4731 unsigned SubVectorWidth = SubVT.getSizeInBits();
4732 EVT VT = EVT::getVectorVT(*DAG.getContext(), SubSVT, 2 * SubNumElts);
4733 SDValue V = insertSubVector(DAG.getUNDEF(VT), V1, 0, DAG, dl, SubVectorWidth);
4734 return insertSubVector(V, V2, SubNumElts, DAG, dl, SubVectorWidth);
4735}
4736
4737/// Returns a vector of specified type with all bits set.
4738/// Always build ones vectors as <4 x i32>, <8 x i32> or <16 x i32>.
4739/// Then bitcast to their original type, ensuring they get CSE'd.
4740static SDValue getOnesVector(EVT VT, SelectionDAG &DAG, const SDLoc &dl) {
4741 assert((VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector()) &&
4742 "Expected a 128/256/512-bit vector type");
4743 unsigned NumElts = VT.getSizeInBits() / 32;
4744 SDValue Vec = DAG.getAllOnesConstant(dl, MVT::getVectorVT(MVT::i32, NumElts));
4745 return DAG.getBitcast(VT, Vec);
4746}
4747
4748static SDValue getEXTEND_VECTOR_INREG(unsigned Opcode, const SDLoc &DL, EVT VT,
4749 SDValue In, SelectionDAG &DAG) {
4750 EVT InVT = In.getValueType();
4751 assert(VT.isVector() && InVT.isVector() && "Expected vector VTs.");
4752
4753 // Canonicalize Opcode to general extension version.
4754 switch (Opcode) {
4755 case ISD::ANY_EXTEND:
4756 case ISD::ANY_EXTEND_VECTOR_INREG:
4757 Opcode = ISD::ANY_EXTEND;
4758 break;
4759 case ISD::SIGN_EXTEND:
4760 case ISD::SIGN_EXTEND_VECTOR_INREG:
4761 Opcode = ISD::SIGN_EXTEND;
4762 break;
4763 case ISD::ZERO_EXTEND:
4764 case ISD::ZERO_EXTEND_VECTOR_INREG:
4765 Opcode = ISD::ZERO_EXTEND;
4766 break;
4767 default:
4768 llvm_unreachable("Unknown extension opcode");
4769 }
4770
4771 // For 256-bit vectors, we only need the lower (128-bit) input half.
4772 // For 512-bit vectors, we only need the lower input half or quarter.
4773 if (InVT.getSizeInBits() > 128) {
4774 assert(VT.getSizeInBits() == InVT.getSizeInBits() &&
4775 "Expected VTs to be the same size!");
4776 unsigned Scale = VT.getScalarSizeInBits() / InVT.getScalarSizeInBits();
4777 In = extractSubVector(In, 0, DAG, DL,
4778 std::max(128U, (unsigned)VT.getSizeInBits() / Scale));
4779 InVT = In.getValueType();
4780 }
4781
4782 if (VT.getVectorNumElements() != InVT.getVectorNumElements())
4783 Opcode = DAG.getOpcode_EXTEND_VECTOR_INREG(Opcode);
4784
4785 return DAG.getNode(Opcode, DL, VT, In);
4786}
4787
4788// Create OR(AND(LHS,MASK),AND(RHS,~MASK)) bit select pattern
4789static SDValue getBitSelect(const SDLoc &DL, MVT VT, SDValue LHS, SDValue RHS,
4790 SDValue Mask, SelectionDAG &DAG) {
4791 LHS = DAG.getNode(ISD::AND, DL, VT, LHS, Mask);
4792 RHS = DAG.getNode(X86ISD::ANDNP, DL, VT, Mask, RHS);
4793 return DAG.getNode(ISD::OR, DL, VT, LHS, RHS);
4794}
4795
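// Create a shuffle mask that matches PUNPCKL/PUNPCKH within each 128-bit lane.
// For example, for v8i16 with Lo = true and Unary = false this produces
// <0, 8, 1, 9, 2, 10, 3, 11>, and with Lo = false it produces
// <4, 12, 5, 13, 6, 14, 7, 15>.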
4796static void createUnpackShuffleMask(EVT VT, SmallVectorImpl<int> &Mask,
4797 bool Lo, bool Unary) {
4798 assert(VT.getScalarType().isSimple() && (VT.getSizeInBits() % 128) == 0 &&
4799 "Illegal vector type to unpack");
4800 assert(Mask.empty() && "Expected an empty shuffle mask vector");
4801 int NumElts = VT.getVectorNumElements();
4802 int NumEltsInLane = 128 / VT.getScalarSizeInBits();
4803 for (int i = 0; i < NumElts; ++i) {
4804 unsigned LaneStart = (i / NumEltsInLane) * NumEltsInLane;
4805 int Pos = (i % NumEltsInLane) / 2 + LaneStart;
4806 Pos += (Unary ? 0 : NumElts * (i % 2));
4807 Pos += (Lo ? 0 : NumEltsInLane / 2);
4808 Mask.push_back(Pos);
4809 }
4810}
4811
4812/// Similar to unpacklo/unpackhi, but without the 128-bit lane limitation
4813/// imposed by AVX and specific to the unary pattern. Example:
4814/// v8iX Lo --> <0, 0, 1, 1, 2, 2, 3, 3>
4815/// v8iX Hi --> <4, 4, 5, 5, 6, 6, 7, 7>
4816static void createSplat2ShuffleMask(MVT VT, SmallVectorImpl<int> &Mask,
4817 bool Lo) {
4818 assert(Mask.empty() && "Expected an empty shuffle mask vector");
4819 int NumElts = VT.getVectorNumElements();
4820 for (int i = 0; i < NumElts; ++i) {
4821 int Pos = i / 2;
4822 Pos += (Lo ? 0 : NumElts / 2);
4823 Mask.push_back(Pos);
4824 }
4825}
4826
4827// Attempt to constant fold, else just create a VECTOR_SHUFFLE.
4828static SDValue getVectorShuffle(SelectionDAG &DAG, EVT VT, const SDLoc &dl,
4829 SDValue V1, SDValue V2, ArrayRef<int> Mask) {
4830 if ((ISD::isBuildVectorOfConstantSDNodes(V1.getNode()) || V1.isUndef()) &&
4831 (ISD::isBuildVectorOfConstantSDNodes(V2.getNode()) || V2.isUndef())) {
4832 SmallVector<SDValue> Ops(Mask.size(), DAG.getUNDEF(VT.getScalarType()));
4833 for (int I = 0, NumElts = Mask.size(); I != NumElts; ++I) {
4834 int M = Mask[I];
4835 if (M < 0)
4836 continue;
4837 SDValue V = (M < NumElts) ? V1 : V2;
4838 if (V.isUndef())
4839 continue;
4840 Ops[I] = V.getOperand(M % NumElts);
4841 }
4842 return DAG.getBuildVector(VT, dl, Ops);
4843 }
4844
4845 return DAG.getVectorShuffle(VT, dl, V1, V2, Mask);
4846}
4847
4848/// Returns a vector_shuffle node for an unpackl operation.
4849static SDValue getUnpackl(SelectionDAG &DAG, const SDLoc &dl, EVT VT,
4850 SDValue V1, SDValue V2) {
4851 SmallVector<int, 8> Mask;
4852 createUnpackShuffleMask(VT, Mask, /* Lo = */ true, /* Unary = */ false);
4853 return getVectorShuffle(DAG, VT, dl, V1, V2, Mask);
4854}
4855
4856/// Returns a vector_shuffle node for an unpackh operation.
4857static SDValue getUnpackh(SelectionDAG &DAG, const SDLoc &dl, EVT VT,
4858 SDValue V1, SDValue V2) {
4859 SmallVector<int, 8> Mask;
4860 createUnpackShuffleMask(VT, Mask, /* Lo = */ false, /* Unary = */ false);
4861 return getVectorShuffle(DAG, VT, dl, V1, V2, Mask);
4862}
4863
4864/// Returns a node that packs the LHS + RHS nodes together at half width.
4865/// May return X86ISD::PACKSS/PACKUS, packing the top/bottom half.
4866/// TODO: Add subvector splitting if/when we have a need for it.
4867static SDValue getPack(SelectionDAG &DAG, const X86Subtarget &Subtarget,
4868 const SDLoc &dl, MVT VT, SDValue LHS, SDValue RHS,
4869 bool PackHiHalf = false) {
4870 MVT OpVT = LHS.getSimpleValueType();
4871 unsigned EltSizeInBits = VT.getScalarSizeInBits();
4872 bool UsePackUS = Subtarget.hasSSE41() || EltSizeInBits == 8;
4873 assert(OpVT == RHS.getSimpleValueType() &&
4874 VT.getSizeInBits() == OpVT.getSizeInBits() &&
4875 (EltSizeInBits * 2) == OpVT.getScalarSizeInBits() &&
4876 "Unexpected PACK operand types");
4877 assert((EltSizeInBits == 8 || EltSizeInBits == 16 || EltSizeInBits == 32) &&
4878 "Unexpected PACK result type");
4879
4880 // Rely on vector shuffles for vXi64 -> vXi32 packing.
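  // For example, packing 2 x v4i64 into v8i32 with PackHiHalf = false builds
  // the mask <0, 2, 8, 10, 4, 6, 12, 14> (the low 32 bits of each i64),
  // mirroring the per-128-bit-lane operand ordering of the PACK instructions.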
4881 if (EltSizeInBits == 32) {
4882 SmallVector<int> PackMask;
4883 int Offset = PackHiHalf ? 1 : 0;
4884 int NumElts = VT.getVectorNumElements();
4885 for (int I = 0; I != NumElts; I += 4) {
4886 PackMask.push_back(I + Offset);
4887 PackMask.push_back(I + Offset + 2);
4888 PackMask.push_back(I + Offset + NumElts);
4889 PackMask.push_back(I + Offset + NumElts + 2);
4890 }
4891 return DAG.getVectorShuffle(VT, dl, DAG.getBitcast(VT, LHS),
4892 DAG.getBitcast(VT, RHS), PackMask);
4893 }
4894
4895 // See if we already have sufficient leading bits for PACKSS/PACKUS.
4896 if (!PackHiHalf) {
4897 if (UsePackUS &&
4898 DAG.computeKnownBits(LHS).countMaxActiveBits() <= EltSizeInBits &&
4899 DAG.computeKnownBits(RHS).countMaxActiveBits() <= EltSizeInBits)
4900 return DAG.getNode(X86ISD::PACKUS, dl, VT, LHS, RHS);
4901
4902 if (DAG.ComputeMaxSignificantBits(LHS) <= EltSizeInBits &&
4903 DAG.ComputeMaxSignificantBits(RHS) <= EltSizeInBits)
4904 return DAG.getNode(X86ISD::PACKSS, dl, VT, LHS, RHS);
4905 }
4906
4907 // Fallback to sign/zero extending the requested half and pack.
4908 SDValue Amt = DAG.getTargetConstant(EltSizeInBits, dl, MVT::i8);
4909 if (UsePackUS) {
4910 if (PackHiHalf) {
4911 LHS = DAG.getNode(X86ISD::VSRLI, dl, OpVT, LHS, Amt);
4912 RHS = DAG.getNode(X86ISD::VSRLI, dl, OpVT, RHS, Amt);
4913 } else {
4914 SDValue Mask = DAG.getConstant((1ULL << EltSizeInBits) - 1, dl, OpVT);
4915 LHS = DAG.getNode(ISD::AND, dl, OpVT, LHS, Mask);
4916 RHS = DAG.getNode(ISD::AND, dl, OpVT, RHS, Mask);
4917 };
4918 return DAG.getNode(X86ISD::PACKUS, dl, VT, LHS, RHS);
4919 };
4920
4921 if (!PackHiHalf) {
4922 LHS = DAG.getNode(X86ISD::VSHLI, dl, OpVT, LHS, Amt);
4923 RHS = DAG.getNode(X86ISD::VSHLI, dl, OpVT, RHS, Amt);
4924 }
4925 LHS = DAG.getNode(X86ISD::VSRAI, dl, OpVT, LHS, Amt);
4926 RHS = DAG.getNode(X86ISD::VSRAI, dl, OpVT, RHS, Amt);
4927 return DAG.getNode(X86ISD::PACKSS, dl, VT, LHS, RHS);
4928}
4929
4930/// Return a vector_shuffle of the specified vector and a zero or undef vector.
4931/// This produces a shuffle where the low element of V2 is swizzled into the
4932/// zero/undef vector, landing at element Idx.
4933/// This produces a shuffle mask like 4,1,2,3 (idx=0) or 0,1,2,4 (idx=3).
4934static SDValue getShuffleVectorZeroOrUndef(SDValue V2, int Idx,
4935 bool IsZero,
4936 const X86Subtarget &Subtarget,
4937 SelectionDAG &DAG) {
4938 MVT VT = V2.getSimpleValueType();
4939 SDValue V1 = IsZero
4940 ? getZeroVector(VT, Subtarget, DAG, SDLoc(V2)) : DAG.getUNDEF(VT);
4941 int NumElems = VT.getVectorNumElements();
4942 SmallVector<int, 16> MaskVec(NumElems);
4943 for (int i = 0; i != NumElems; ++i)
4944 // If this is the insertion idx, put the low elt of V2 here.
4945 MaskVec[i] = (i == Idx) ? NumElems : i;
4946 return DAG.getVectorShuffle(VT, SDLoc(V2), V1, V2, MaskVec);
4947}
4948
4949static ConstantPoolSDNode *getTargetConstantPoolFromBasePtr(SDValue Ptr) {
4950 if (Ptr.getOpcode() == X86ISD::Wrapper ||
4951 Ptr.getOpcode() == X86ISD::WrapperRIP)
4952 Ptr = Ptr.getOperand(0);
4953 return dyn_cast<ConstantPoolSDNode>(Ptr);
4954}
4955
4956// TODO: Add support for non-zero offsets.
4957static const Constant *getTargetConstantFromBasePtr(SDValue Ptr) {
4958 ConstantPoolSDNode *CNode = getTargetConstantPoolFromBasePtr(Ptr);
4959 if (!CNode || CNode->isMachineConstantPoolEntry() || CNode->getOffset() != 0)
4960 return nullptr;
4961 return CNode->getConstVal();
4962}
4963
4964static const Constant *getTargetConstantFromNode(LoadSDNode *Load) {
4965 if (!Load || !ISD::isNormalLoad(Load))
4966 return nullptr;
4967 return getTargetConstantFromBasePtr(Load->getBasePtr());
4968}
4969
4970static const Constant *getTargetConstantFromNode(SDValue Op) {
4971 Op = peekThroughBitcasts(Op);
4972 return getTargetConstantFromNode(dyn_cast<LoadSDNode>(Op));
4973}
4974
4975const Constant *
4976X86TargetLowering::getTargetConstantFromLoad(LoadSDNode *LD) const {
4977 assert(LD && "Unexpected null LoadSDNode");
4978 return getTargetConstantFromNode(LD);
4979}
4980
4982 // Do not fold (vselect not(C), X, 0s) to (vselect C, Os, X)
4983 SDValue Cond = N->getOperand(0);
4984 SDValue RHS = N->getOperand(2);
4985 EVT CondVT = Cond.getValueType();
4986 return N->getOpcode() == ISD::VSELECT && Subtarget.hasAVX512() &&
4987 CondVT.getVectorElementType() == MVT::i1 &&
4988 ISD::isBuildVectorAllZeros(RHS.getNode());
4989}
4990
4991// Extract raw constant bits from constant pools.
4992static bool getTargetConstantBitsFromNode(SDValue Op, unsigned EltSizeInBits,
4993 APInt &UndefElts,
4994 SmallVectorImpl<APInt> &EltBits,
4995 bool AllowWholeUndefs = true,
4996 bool AllowPartialUndefs = false) {
4997 assert(EltBits.empty() && "Expected an empty EltBits vector");
4998
5000
5001 EVT VT = Op.getValueType();
5002 unsigned SizeInBits = VT.getSizeInBits();
5003 unsigned NumElts = SizeInBits / EltSizeInBits;
5004
5005 // Can't split constant.
5006 if ((SizeInBits % EltSizeInBits) != 0)
5007 return false;
5008
5009 // Bitcast a source array of element bits to the target size.
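  // For example, recasting the 2 x 64-bit constants {0x0000000100000002, undef}
  // to 4 x 32-bit elements yields {0x2, 0x1, undef, undef}: bits are packed
  // little-endian by element, and a destination element is marked undef only
  // if all of its source bits were undef (partial undefs are either rejected
  // or treated as zero, per the flags above).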
5010 auto CastBitData = [&](APInt &UndefSrcElts, ArrayRef<APInt> SrcEltBits) {
5011 unsigned NumSrcElts = UndefSrcElts.getBitWidth();
5012 unsigned SrcEltSizeInBits = SrcEltBits[0].getBitWidth();
5013 assert((NumSrcElts * SrcEltSizeInBits) == SizeInBits &&
5014 "Constant bit sizes don't match");
5015
5016 // Don't split if we don't allow undef bits.
5017 bool AllowUndefs = AllowWholeUndefs || AllowPartialUndefs;
5018 if (UndefSrcElts.getBoolValue() && !AllowUndefs)
5019 return false;
5020
5021 // If we're already the right size, don't bother bitcasting.
5022 if (NumSrcElts == NumElts) {
5023 UndefElts = UndefSrcElts;
5024 EltBits.assign(SrcEltBits.begin(), SrcEltBits.end());
5025 return true;
5026 }
5027
5028 // Extract all the undef/constant element data and pack into single bitsets.
5029 APInt UndefBits(SizeInBits, 0);
5030 APInt MaskBits(SizeInBits, 0);
5031
5032 for (unsigned i = 0; i != NumSrcElts; ++i) {
5033 unsigned BitOffset = i * SrcEltSizeInBits;
5034 if (UndefSrcElts[i])
5035 UndefBits.setBits(BitOffset, BitOffset + SrcEltSizeInBits);
5036 MaskBits.insertBits(SrcEltBits[i], BitOffset);
5037 }
5038
5039 // Split the undef/constant single bitset data into the target elements.
5040 UndefElts = APInt(NumElts, 0);
5041 EltBits.resize(NumElts, APInt(EltSizeInBits, 0));
5042
5043 for (unsigned i = 0; i != NumElts; ++i) {
5044 unsigned BitOffset = i * EltSizeInBits;
5045 APInt UndefEltBits = UndefBits.extractBits(EltSizeInBits, BitOffset);
5046
5047 // Only treat an element as UNDEF if all bits are UNDEF.
5048 if (UndefEltBits.isAllOnes()) {
5049 if (!AllowWholeUndefs)
5050 return false;
5051 UndefElts.setBit(i);
5052 continue;
5053 }
5054
5055 // If only some bits are UNDEF then treat them as zero (or bail if not
5056 // supported).
5057 if (UndefEltBits.getBoolValue() && !AllowPartialUndefs)
5058 return false;
5059
5060 EltBits[i] = MaskBits.extractBits(EltSizeInBits, BitOffset);
5061 }
5062 return true;
5063 };
5064
5065 // Collect constant bits and insert into mask/undef bit masks.
5066 auto CollectConstantBits = [](const Constant *Cst, APInt &Mask, APInt &Undefs,
5067 unsigned UndefBitIndex) {
5068 if (!Cst)
5069 return false;
5070 if (isa<UndefValue>(Cst)) {
5071 Undefs.setBit(UndefBitIndex);
5072 return true;
5073 }
5074 if (auto *CInt = dyn_cast<ConstantInt>(Cst)) {
5075 Mask = CInt->getValue();
5076 return true;
5077 }
5078 if (auto *CFP = dyn_cast<ConstantFP>(Cst)) {
5079 Mask = CFP->getValueAPF().bitcastToAPInt();
5080 return true;
5081 }
5082 if (auto *CDS = dyn_cast<ConstantDataSequential>(Cst)) {
5083 Type *Ty = CDS->getType();
5084 Mask = APInt::getZero(Ty->getPrimitiveSizeInBits());
5085 Type *EltTy = CDS->getElementType();
5086 bool IsInteger = EltTy->isIntegerTy();
5087 bool IsFP =
5088 EltTy->isHalfTy() || EltTy->isFloatTy() || EltTy->isDoubleTy();
5089 if (!IsInteger && !IsFP)
5090 return false;
5091 unsigned EltBits = EltTy->getPrimitiveSizeInBits();
5092 for (unsigned I = 0, E = CDS->getNumElements(); I != E; ++I)
5093 if (IsInteger)
5094 Mask.insertBits(CDS->getElementAsAPInt(I), I * EltBits);
5095 else
5096 Mask.insertBits(CDS->getElementAsAPFloat(I).bitcastToAPInt(),
5097 I * EltBits);
5098 return true;
5099 }
5100 return false;
5101 };
5102
5103 // Handle UNDEFs.
5104 if (Op.isUndef()) {
5105 APInt UndefSrcElts = APInt::getAllOnes(NumElts);
5106 SmallVector<APInt, 64> SrcEltBits(NumElts, APInt(EltSizeInBits, 0));
5107 return CastBitData(UndefSrcElts, SrcEltBits);
5108 }
5109
5110 // Extract scalar constant bits.
5111 if (auto *Cst = dyn_cast<ConstantSDNode>(Op)) {
5112 APInt UndefSrcElts = APInt::getZero(1);
5113 SmallVector<APInt, 64> SrcEltBits(1, Cst->getAPIntValue());
5114 return CastBitData(UndefSrcElts, SrcEltBits);
5115 }
5116 if (auto *Cst = dyn_cast<ConstantFPSDNode>(Op)) {
5117 APInt UndefSrcElts = APInt::getZero(1);
5118 APInt RawBits = Cst->getValueAPF().bitcastToAPInt();
5119 SmallVector<APInt, 64> SrcEltBits(1, RawBits);
5120 return CastBitData(UndefSrcElts, SrcEltBits);
5121 }
5122
5123 // Extract constant bits from build vector.
5124 if (auto *BV = dyn_cast<BuildVectorSDNode>(Op)) {
5125 BitVector Undefs;
5126 SmallVector<APInt> SrcEltBits;
5127 unsigned SrcEltSizeInBits = VT.getScalarSizeInBits();
5128 if (BV->getConstantRawBits(true, SrcEltSizeInBits, SrcEltBits, Undefs)) {
5129 APInt UndefSrcElts = APInt::getZero(SrcEltBits.size());
5130 for (unsigned I = 0, E = SrcEltBits.size(); I != E; ++I)
5131 if (Undefs[I])
5132 UndefSrcElts.setBit(I);
5133 return CastBitData(UndefSrcElts, SrcEltBits);
5134 }
5135 }
5136
5137 // Extract constant bits from constant pool vector.
5138 if (auto *Cst = getTargetConstantFromNode(Op)) {
5139 Type *CstTy = Cst->getType();
5140 unsigned CstSizeInBits = CstTy->getPrimitiveSizeInBits();
5141 if (!CstTy->isVectorTy() || (CstSizeInBits % SizeInBits) != 0)
5142 return false;
5143
5144 unsigned SrcEltSizeInBits = CstTy->getScalarSizeInBits();
5145 unsigned NumSrcElts = SizeInBits / SrcEltSizeInBits;
5146 if ((SizeInBits % SrcEltSizeInBits) != 0)
5147 return false;
5148
5149 APInt UndefSrcElts(NumSrcElts, 0);
5150 SmallVector<APInt, 64> SrcEltBits(NumSrcElts, APInt(SrcEltSizeInBits, 0));
5151 for (unsigned i = 0; i != NumSrcElts; ++i)
5152 if (!CollectConstantBits(Cst->getAggregateElement(i), SrcEltBits[i],
5153 UndefSrcElts, i))
5154 return false;
5155
5156 return CastBitData(UndefSrcElts, SrcEltBits);
5157 }
5158
5159 // Extract constant bits from a broadcasted constant pool scalar.
5160 if (Op.getOpcode() == X86ISD::VBROADCAST_LOAD &&
5161 EltSizeInBits <= VT.getScalarSizeInBits()) {
5162 auto *MemIntr = cast<MemIntrinsicSDNode>(Op);
5163 if (MemIntr->getMemoryVT().getStoreSizeInBits() != VT.getScalarSizeInBits())
5164 return false;
5165
5166 SDValue Ptr = MemIntr->getBasePtr();
5167 if (const Constant *C = getTargetConstantFromBasePtr(Ptr)) {
5168 unsigned SrcEltSizeInBits = VT.getScalarSizeInBits();
5169 unsigned NumSrcElts = SizeInBits / SrcEltSizeInBits;
5170
5171 APInt UndefSrcElts(NumSrcElts, 0);
5172 SmallVector<APInt, 64> SrcEltBits(1, APInt(SrcEltSizeInBits, 0));
5173 if (CollectConstantBits(C, SrcEltBits[0], UndefSrcElts, 0)) {
5174 if (UndefSrcElts[0])
5175 UndefSrcElts.setBits(0, NumSrcElts);
5176 if (SrcEltBits[0].getBitWidth() != SrcEltSizeInBits)
5177 SrcEltBits[0] = SrcEltBits[0].trunc(SrcEltSizeInBits);
5178 SrcEltBits.append(NumSrcElts - 1, SrcEltBits[0]);
5179 return CastBitData(UndefSrcElts, SrcEltBits);
5180 }
5181 }
5182 }
5183
5184 // Extract constant bits from a subvector broadcast.
5185 if (Op.getOpcode() == X86ISD::SUBV_BROADCAST_LOAD) {
5186 auto *MemIntr = cast<MemIntrinsicSDNode>(Op);
5187 SDValue Ptr = MemIntr->getBasePtr();
5188 // The source constant may be larger than the subvector broadcast, so
5189 // ensure we extract the correct subvector constants.
5190 if (const Constant *Cst = getTargetConstantFromBasePtr(Ptr)) {
5191 Type *CstTy = Cst->getType();
5192 unsigned CstSizeInBits = CstTy->getPrimitiveSizeInBits();
5193 unsigned SubVecSizeInBits = MemIntr->getMemoryVT().getStoreSizeInBits();
5194 if (!CstTy->isVectorTy() || (CstSizeInBits % SubVecSizeInBits) != 0 ||
5195 (SizeInBits % SubVecSizeInBits) != 0)
5196 return false;
5197 unsigned CstEltSizeInBits = CstTy->getScalarSizeInBits();
5198 unsigned NumSubElts = SubVecSizeInBits / CstEltSizeInBits;
5199 unsigned NumSubVecs = SizeInBits / SubVecSizeInBits;
5200 APInt UndefSubElts(NumSubElts, 0);
5201 SmallVector<APInt, 64> SubEltBits(NumSubElts * NumSubVecs,
5202 APInt(CstEltSizeInBits, 0));
5203 for (unsigned i = 0; i != NumSubElts; ++i) {
5204 if (!CollectConstantBits(Cst->getAggregateElement(i), SubEltBits[i],
5205 UndefSubElts, i))
5206 return false;
5207 for (unsigned j = 1; j != NumSubVecs; ++j)
5208 SubEltBits[i + (j * NumSubElts)] = SubEltBits[i];
5209 }
5210 UndefSubElts = APInt::getSplat(NumSubVecs * UndefSubElts.getBitWidth(),
5211 UndefSubElts);
5212 return CastBitData(UndefSubElts, SubEltBits);
5213 }
5214 }
5215
5216 // Extract a rematerialized scalar constant insertion.
5217 if (Op.getOpcode() == X86ISD::VZEXT_MOVL &&
5218 Op.getOperand(0).getOpcode() == ISD::SCALAR_TO_VECTOR &&
5219 isa<ConstantSDNode>(Op.getOperand(0).getOperand(0))) {
5220 unsigned SrcEltSizeInBits = VT.getScalarSizeInBits();
5221 unsigned NumSrcElts = SizeInBits / SrcEltSizeInBits;
5222
5223 APInt UndefSrcElts(NumSrcElts, 0);
5224 SmallVector<APInt, 64> SrcEltBits;
5225 const APInt &C = Op.getOperand(0).getConstantOperandAPInt(0);
5226 SrcEltBits.push_back(C.zextOrTrunc(SrcEltSizeInBits));
5227 SrcEltBits.append(NumSrcElts - 1, APInt(SrcEltSizeInBits, 0));
5228 return CastBitData(UndefSrcElts, SrcEltBits);
5229 }
5230
5231 // Insert constant bits from a base and sub vector sources.
5232 if (Op.getOpcode() == ISD::INSERT_SUBVECTOR) {
5233 // If we bitcast to larger elements we might lose track of undefs, so to
5234 // be safe don't allow any.
5235 unsigned SrcEltSizeInBits = VT.getScalarSizeInBits();
5236 bool AllowUndefs = EltSizeInBits >= SrcEltSizeInBits;
5237
5238 APInt UndefSrcElts, UndefSubElts;
5239 SmallVector<APInt, 32> EltSrcBits, EltSubBits;
5240 if (getTargetConstantBitsFromNode(Op.getOperand(1), SrcEltSizeInBits,
5241 UndefSubElts, EltSubBits,
5242 AllowWholeUndefs && AllowUndefs,
5243 AllowPartialUndefs && AllowUndefs) &&
5244 getTargetConstantBitsFromNode(Op.getOperand(0), SrcEltSizeInBits,
5245 UndefSrcElts, EltSrcBits,
5246 AllowWholeUndefs && AllowUndefs,
5247 AllowPartialUndefs && AllowUndefs)) {
5248 unsigned BaseIdx = Op.getConstantOperandVal(2);
5249 UndefSrcElts.insertBits(UndefSubElts, BaseIdx);
5250 for (unsigned i = 0, e = EltSubBits.size(); i != e; ++i)
5251 EltSrcBits[BaseIdx + i] = EltSubBits[i];
5252 return CastBitData(UndefSrcElts, EltSrcBits);
5253 }
5254 }
5255
5256 // Extract constant bits from a subvector's source.
5257 if (Op.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
5258 getTargetConstantBitsFromNode(Op.getOperand(0), EltSizeInBits, UndefElts,
5259 EltBits, AllowWholeUndefs,
5260 AllowPartialUndefs)) {
5261 EVT SrcVT = Op.getOperand(0).getValueType();
5262 unsigned NumSrcElts = SrcVT.getSizeInBits() / EltSizeInBits;
5263 unsigned NumSubElts = VT.getSizeInBits() / EltSizeInBits;
5264 unsigned BaseOfs = Op.getConstantOperandVal(1) * VT.getScalarSizeInBits();
5265 unsigned BaseIdx = BaseOfs / EltSizeInBits;
5266 assert((SrcVT.getSizeInBits() % EltSizeInBits) == 0 &&
5267 (VT.getSizeInBits() % EltSizeInBits) == 0 &&
5268 (BaseOfs % EltSizeInBits) == 0 && "Bad subvector index");
5269
5270 UndefElts = UndefElts.extractBits(NumSubElts, BaseIdx);
5271 if ((BaseIdx + NumSubElts) != NumSrcElts)
5272 EltBits.erase(EltBits.begin() + BaseIdx + NumSubElts, EltBits.end());
5273 if (BaseIdx != 0)
5274 EltBits.erase(EltBits.begin(), EltBits.begin() + BaseIdx);
5275 return true;
5276 }
5277
5278 // Extract constant bits from shuffle node sources.
5279 if (auto *SVN = dyn_cast<ShuffleVectorSDNode>(Op)) {
5280 // TODO - support shuffle through bitcasts.
5281 if (EltSizeInBits != VT.getScalarSizeInBits())
5282 return false;
5283
5284 ArrayRef<int> Mask = SVN->getMask();
5285 if ((!AllowWholeUndefs || !AllowPartialUndefs) &&
5286 llvm::any_of(Mask, [](int M) { return M < 0; }))
5287 return false;
5288
5289 APInt UndefElts0, UndefElts1;
5290 SmallVector<APInt, 32> EltBits0, EltBits1;
5291 if (isAnyInRange(Mask, 0, NumElts) &&
5292 !getTargetConstantBitsFromNode(Op.getOperand(0), EltSizeInBits,
5293 UndefElts0, EltBits0, AllowWholeUndefs,
5294 AllowPartialUndefs))
5295 return false;
5296 if (isAnyInRange(Mask, NumElts, 2 * NumElts) &&
5297 !getTargetConstantBitsFromNode(Op.getOperand(1), EltSizeInBits,
5298 UndefElts1, EltBits1, AllowWholeUndefs,
5299 AllowPartialUndefs))
5300 return false;
5301
5302 UndefElts = APInt::getZero(NumElts);
5303 for (int i = 0; i != (int)NumElts; ++i) {
5304 int M = Mask[i];
5305 if (M < 0) {
5306 UndefElts.setBit(i);
5307 EltBits.push_back(APInt::getZero(EltSizeInBits));
5308 } else if (M < (int)NumElts) {
5309 if (UndefElts0[M])
5310 UndefElts.setBit(i);
5311 EltBits.push_back(EltBits0[M]);
5312 } else {
5313 if (UndefElts1[M - NumElts])
5314 UndefElts.setBit(i);
5315 EltBits.push_back(EltBits1[M - NumElts]);
5316 }
5317 }
5318 return true;
5319 }
5320
5321 return false;
5322}
5323
5324namespace llvm {
5325namespace X86 {
5326bool isConstantSplat(SDValue Op, APInt &SplatVal, bool AllowPartialUndefs) {
5327 APInt UndefElts;
5328 SmallVector<APInt, 16> EltBits;
5329 if (getTargetConstantBitsFromNode(
5330 Op, Op.getScalarValueSizeInBits(), UndefElts, EltBits,
5331 /*AllowWholeUndefs*/ true, AllowPartialUndefs)) {
5332 int SplatIndex = -1;
5333 for (int i = 0, e = EltBits.size(); i != e; ++i) {
5334 if (UndefElts[i])
5335 continue;
5336 if (0 <= SplatIndex && EltBits[i] != EltBits[SplatIndex]) {
5337 SplatIndex = -1;
5338 break;
5339 }
5340 SplatIndex = i;
5341 }
5342 if (0 <= SplatIndex) {
5343 SplatVal = EltBits[SplatIndex];
5344 return true;
5345 }
5346 }
5347
5348 return false;
5349}
5350
5351int getRoundingModeX86(unsigned RM) {
5352 switch (static_cast<::llvm::RoundingMode>(RM)) {
5353 // clang-format off
5354 case ::llvm::RoundingMode::NearestTiesToEven: return X86::rmToNearest;
5355 case ::llvm::RoundingMode::TowardNegative: return X86::rmDownward;
5356 case ::llvm::RoundingMode::TowardPositive: return X86::rmUpward;
5357 case ::llvm::RoundingMode::TowardZero: return X86::rmTowardZero;
5358 default:
5359 return X86::rmInvalid; // Invalid rounding mode
5360 }
5361}
5362
5363} // namespace X86
5364} // namespace llvm
5365
5366static bool getTargetShuffleMaskIndices(SDValue MaskNode,
5367 unsigned MaskEltSizeInBits,
5368 SmallVectorImpl<uint64_t> &RawMask,
5369 APInt &UndefElts) {
5370 // Extract the raw target constant bits.
5371 SmallVector<APInt, 64> EltBits;
5372 if (!getTargetConstantBitsFromNode(MaskNode, MaskEltSizeInBits, UndefElts,
5373 EltBits, /* AllowWholeUndefs */ true,
5374 /* AllowPartialUndefs */ false))
5375 return false;
5376
5377 // Insert the extracted elements into the mask.
5378 for (const APInt &Elt : EltBits)
5379 RawMask.push_back(Elt.getZExtValue());
5380
5381 return true;
5382}
5383
5384static bool isConstantPowerOf2(SDValue V, unsigned EltSizeInBits,
5385 bool AllowUndefs) {
5386 APInt UndefElts;
5387 SmallVector<APInt, 64> EltBits;
5388 if (!getTargetConstantBitsFromNode(V, EltSizeInBits, UndefElts, EltBits,
5389 /*AllowWholeUndefs*/ AllowUndefs,
5390 /*AllowPartialUndefs*/ false))
5391 return false;
5392
5393 bool IsPow2OrUndef = true;
5394 for (unsigned I = 0, E = EltBits.size(); I != E; ++I)
5395 IsPow2OrUndef &= UndefElts[I] || EltBits[I].isPowerOf2();
5396 return IsPow2OrUndef;
5397}
5398
5399// Helper to attempt to return a cheaper, bit-inverted version of \p V.
5400 static SDValue IsNOT(SDValue V, SelectionDAG &DAG) {
5401 // TODO: don't always ignore oneuse constraints.
5402 V = peekThroughBitcasts(V);
5403 EVT VT = V.getValueType();
5404
5405 // Match not(xor X, -1) -> X.
5406 if (V.getOpcode() == ISD::XOR &&
5407 (ISD::isBuildVectorAllOnes(V.getOperand(1).getNode()) ||
5408 isAllOnesConstant(V.getOperand(1))))
5409 return V.getOperand(0);
5410
5411 // Match not(extract_subvector(not(X)) -> extract_subvector(X).
5412 if (V.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
5413 (isNullConstant(V.getOperand(1)) || V.getOperand(0).hasOneUse())) {
5414 if (SDValue Not = IsNOT(V.getOperand(0), DAG)) {
5415 Not = DAG.getBitcast(V.getOperand(0).getValueType(), Not);
5416 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, SDLoc(Not), VT, Not,
5417 V.getOperand(1));
5418 }
5419 }
5420
5421 // Match not(pcmpgt(C, X)) -> pcmpgt(X, C - 1).
5422 if (V.getOpcode() == X86ISD::PCMPGT &&
5423 !ISD::isBuildVectorAllZeros(V.getOperand(0).getNode()) &&
5424 !ISD::isBuildVectorAllOnes(V.getOperand(0).getNode()) &&
5425 V.getOperand(0).hasOneUse()) {
5426 APInt UndefElts;
5427 SmallVector<APInt> EltBits;
5428 if (getTargetConstantBitsFromNode(V.getOperand(0),
5429 V.getScalarValueSizeInBits(), UndefElts,
5430 EltBits) &&
5431 !ISD::isBuildVectorOfConstantSDNodes(V.getOperand(1).getNode())) {
5432 // Don't fold min_signed_value -> (min_signed_value - 1)
5433 bool MinSigned = false;
5434 for (APInt &Elt : EltBits) {
5435 MinSigned |= Elt.isMinSignedValue();
5436 Elt -= 1;
5437 }
5438 if (!MinSigned) {
5439 SDLoc DL(V);
5440 MVT VT = V.getSimpleValueType();
5441 return DAG.getNode(X86ISD::PCMPGT, DL, VT, V.getOperand(1),
5442 getConstVector(EltBits, UndefElts, VT, DAG, DL));
5443 }
5444 }
5445 }
5446
5447 // Match not(concat_vectors(not(X), not(Y))) -> concat_vectors(X, Y).
5448 SmallVector<SDValue> CatOps;
5449 if (collectConcatOps(V.getNode(), CatOps, DAG)) {
5450 for (SDValue &CatOp : CatOps) {
5451 SDValue NotCat = IsNOT(CatOp, DAG);
5452 if (!NotCat)
5453 return SDValue();
5454 CatOp = DAG.getBitcast(CatOp.getValueType(), NotCat);
5455 }
5456 return DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(V), VT, CatOps);
5457 }
5458
5459 // Match not(or(not(X),not(Y))) -> and(X, Y).
5460 if (V.getOpcode() == ISD::OR && DAG.getTargetLoweringInfo().isTypeLegal(VT) &&
5461 V.getOperand(0).hasOneUse() && V.getOperand(1).hasOneUse()) {
5462 // TODO: Handle cases with single NOT operand -> ANDNP
5463 if (SDValue Op1 = IsNOT(V.getOperand(1), DAG))
5464 if (SDValue Op0 = IsNOT(V.getOperand(0), DAG))
5465 return DAG.getNode(ISD::AND, SDLoc(V), VT, DAG.getBitcast(VT, Op0),
5466 DAG.getBitcast(VT, Op1));
5467 }
5468
5469 return SDValue();
5470}
5471
5472/// Create a shuffle mask that matches the PACKSS/PACKUS truncation.
5473/// A multi-stage pack shuffle mask is created by specifying NumStages > 1.
5474/// Note: This ignores saturation, so inputs must be checked first.
5475 static void createPackShuffleMask(MVT VT, SmallVectorImpl<int> &Mask,
5476 bool Unary, unsigned NumStages = 1) {
5477 assert(Mask.empty() && "Expected an empty shuffle mask vector");
5478 unsigned NumElts = VT.getVectorNumElements();
5479 unsigned NumLanes = VT.getSizeInBits() / 128;
5480 unsigned NumEltsPerLane = 128 / VT.getScalarSizeInBits();
5481 unsigned Offset = Unary ? 0 : NumElts;
5482 unsigned Repetitions = 1u << (NumStages - 1);
5483 unsigned Increment = 1u << NumStages;
5484 assert((NumEltsPerLane >> NumStages) > 0 && "Illegal packing compaction");
5485
5486 for (unsigned Lane = 0; Lane != NumLanes; ++Lane) {
5487 for (unsigned Stage = 0; Stage != Repetitions; ++Stage) {
5488 for (unsigned Elt = 0; Elt != NumEltsPerLane; Elt += Increment)
5489 Mask.push_back(Elt + (Lane * NumEltsPerLane));
5490 for (unsigned Elt = 0; Elt != NumEltsPerLane; Elt += Increment)
5491 Mask.push_back(Elt + (Lane * NumEltsPerLane) + Offset);
5492 }
5493 }
5494}
5495
5496// Split the demanded elts of a PACKSS/PACKUS node between its operands.
5497static void getPackDemandedElts(EVT VT, const APInt &DemandedElts,
5498 APInt &DemandedLHS, APInt &DemandedRHS) {
5499 int NumLanes = VT.getSizeInBits() / 128;
5500 int NumElts = DemandedElts.getBitWidth();
5501 int NumInnerElts = NumElts / 2;
5502 int NumEltsPerLane = NumElts / NumLanes;
5503 int NumInnerEltsPerLane = NumInnerElts / NumLanes;
5504
5505 DemandedLHS = APInt::getZero(NumInnerElts);
5506 DemandedRHS = APInt::getZero(NumInnerElts);
5507
5508 // Map DemandedElts to the packed operands.
5509 for (int Lane = 0; Lane != NumLanes; ++Lane) {
5510 for (int Elt = 0; Elt != NumInnerEltsPerLane; ++Elt) {
5511 int OuterIdx = (Lane * NumEltsPerLane) + Elt;
5512 int InnerIdx = (Lane * NumInnerEltsPerLane) + Elt;
5513 if (DemandedElts[OuterIdx])
5514 DemandedLHS.setBit(InnerIdx);
5515 if (DemandedElts[OuterIdx + NumInnerEltsPerLane])
5516 DemandedRHS.setBit(InnerIdx);
5517 }
5518 }
5519}
5520
5521// Split the demanded elts of a HADD/HSUB node between its operands.
5522static void getHorizDemandedElts(EVT VT, const APInt &DemandedElts,
5523 APInt &DemandedLHS, APInt &DemandedRHS) {
5524 getHorizDemandedEltsForFirstOperand(VT.getSizeInBits(), DemandedElts,
5525 DemandedLHS, DemandedRHS);
5526 DemandedLHS |= DemandedLHS << 1;
5527 DemandedRHS |= DemandedRHS << 1;
5528}
5529
5530/// Calculates the shuffle mask corresponding to the target-specific opcode.
5531/// If the mask could be calculated, returns it in \p Mask, returns the shuffle
5532/// operands in \p Ops, and returns true.
5533/// Sets \p IsUnary to true if only one source is used. Note that this will set
5534/// IsUnary for shuffles which use a single input multiple times, and in those
5535/// cases it will adjust the mask to only have indices within that single input.
5536/// It is an error to call this with non-empty Mask/Ops vectors.
5537static bool getTargetShuffleMask(SDValue N, bool AllowSentinelZero,
5538 SmallVectorImpl<SDValue> &Ops,
5539 SmallVectorImpl<int> &Mask, bool &IsUnary) {
5540 if (!isTargetShuffle(N.getOpcode()))
5541 return false;
5542
5543 MVT VT = N.getSimpleValueType();
5544 unsigned NumElems = VT.getVectorNumElements();
5545 unsigned MaskEltSize = VT.getScalarSizeInBits();
5546 SmallVector<uint64_t, 32> RawMask;
5547 APInt RawUndefs;
5548 uint64_t ImmN;
5549
5550 assert(Mask.empty() && "getTargetShuffleMask expects an empty Mask vector");
5551 assert(Ops.empty() && "getTargetShuffleMask expects an empty Ops vector");
5552
5553 IsUnary = false;
5554 bool IsFakeUnary = false;
5555 switch (N.getOpcode()) {
5556 case X86ISD::BLENDI:
5557 assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
5558 assert(N.getOperand(1).getValueType() == VT && "Unexpected value type");
5559 ImmN = N.getConstantOperandVal(N.getNumOperands() - 1);
5560 DecodeBLENDMask(NumElems, ImmN, Mask);
5561 IsUnary = IsFakeUnary = N.getOperand(0) == N.getOperand(1);
5562 break;
5563 case X86ISD::SHUFP:
5564 assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
5565 assert(N.getOperand(1).getValueType() == VT && "Unexpected value type");
5566 ImmN = N.getConstantOperandVal(N.getNumOperands() - 1);
5567 DecodeSHUFPMask(NumElems, MaskEltSize, ImmN, Mask);
5568 IsUnary = IsFakeUnary = N.getOperand(0) == N.getOperand(1);
5569 break;
5570 case X86ISD::INSERTPS:
5571 assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
5572 assert(N.getOperand(1).getValueType() == VT && "Unexpected value type");
5573 ImmN = N.getConstantOperandVal(N.getNumOperands() - 1);
5574 DecodeINSERTPSMask(ImmN, Mask, /*SrcIsMem=*/false);
5575 IsUnary = IsFakeUnary = N.getOperand(0) == N.getOperand(1);
5576 break;
5577 case X86ISD::EXTRQI:
5578 assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
5579 if (isa<ConstantSDNode>(N.getOperand(1)) &&
5580 isa<ConstantSDNode>(N.getOperand(2))) {
5581 int BitLen = N.getConstantOperandVal(1);
5582 int BitIdx = N.getConstantOperandVal(2);
5583 DecodeEXTRQIMask(NumElems, MaskEltSize, BitLen, BitIdx, Mask);
5584 IsUnary = true;
5585 }
5586 break;
5587 case X86ISD::INSERTQI:
5588 assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
5589 assert(N.getOperand(1).getValueType() == VT && "Unexpected value type");
5590 if (isa<ConstantSDNode>(N.getOperand(2)) &&
5591 isa<ConstantSDNode>(N.getOperand(3))) {
5592 int BitLen = N.getConstantOperandVal(2);
5593 int BitIdx = N.getConstantOperandVal(3);
5594 DecodeINSERTQIMask(NumElems, MaskEltSize, BitLen, BitIdx, Mask);
5595 IsUnary = IsFakeUnary = N.getOperand(0) == N.getOperand(1);
5596 }
5597 break;
5598 case X86ISD::UNPCKH:
5599 assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
5600 assert(N.getOperand(1).getValueType() == VT && "Unexpected value type");
5601 DecodeUNPCKHMask(NumElems, MaskEltSize, Mask);
5602 IsUnary = IsFakeUnary = N.getOperand(0) == N.getOperand(1);
5603 break;
5604 case X86ISD::UNPCKL:
5605 assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
5606 assert(N.getOperand(1).getValueType() == VT && "Unexpected value type");
5607 DecodeUNPCKLMask(NumElems, MaskEltSize, Mask);
5608 IsUnary = IsFakeUnary = N.getOperand(0) == N.getOperand(1);
5609 break;
5610 case X86ISD::MOVHLPS:
5611 assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
5612 assert(N.getOperand(1).getValueType() == VT && "Unexpected value type");
5613 DecodeMOVHLPSMask(NumElems, Mask);
5614 IsUnary = IsFakeUnary = N.getOperand(0) == N.getOperand(1);
5615 break;
5616 case X86ISD::MOVLHPS:
5617 assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
5618 assert(N.getOperand(1).getValueType() == VT && "Unexpected value type");
5619 DecodeMOVLHPSMask(NumElems, Mask);
5620 IsUnary = IsFakeUnary = N.getOperand(0) == N.getOperand(1);
5621 break;
5622 case X86ISD::VALIGN:
5623 assert((VT.getScalarType() == MVT::i32 || VT.getScalarType() == MVT::i64) &&
5624 "Only 32-bit and 64-bit elements are supported!");
5625 assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
5626 assert(N.getOperand(1).getValueType() == VT && "Unexpected value type");
5627 ImmN = N.getConstantOperandVal(N.getNumOperands() - 1);
5628 DecodeVALIGNMask(NumElems, ImmN, Mask);
5629 IsUnary = IsFakeUnary = N.getOperand(0) == N.getOperand(1);
5630 Ops.push_back(N.getOperand(1));
5631 Ops.push_back(N.getOperand(0));
5632 break;
5633 case X86ISD::PALIGNR:
5634 assert(VT.getScalarType() == MVT::i8 && "Byte vector expected");
5635 assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
5636 assert(N.getOperand(1).getValueType() == VT && "Unexpected value type");
5637 ImmN = N.getConstantOperandVal(N.getNumOperands() - 1);
5638 DecodePALIGNRMask(NumElems, ImmN, Mask);
5639 IsUnary = IsFakeUnary = N.getOperand(0) == N.getOperand(1);
5640 Ops.push_back(N.getOperand(1));
5641 Ops.push_back(N.getOperand(0));
5642 break;
5643 case X86ISD::VSHLDQ:
5644 assert(VT.getScalarType() == MVT::i8 && "Byte vector expected");
5645 assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
5646 ImmN = N.getConstantOperandVal(N.getNumOperands() - 1);
5647 DecodePSLLDQMask(NumElems, ImmN, Mask);
5648 IsUnary = true;
5649 break;
5650 case X86ISD::VSRLDQ:
5651 assert(VT.getScalarType() == MVT::i8 && "Byte vector expected");
5652 assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
5653 ImmN = N.getConstantOperandVal(N.getNumOperands() - 1);
5654 DecodePSRLDQMask(NumElems, ImmN, Mask);
5655 IsUnary = true;
5656 break;
5657 case X86ISD::PSHUFD:
5658 case X86ISD::VPERMILPI:
5659 assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
5660 ImmN = N.getConstantOperandVal(N.getNumOperands() - 1);
5661 DecodePSHUFMask(NumElems, MaskEltSize, ImmN, Mask);
5662 IsUnary = true;
5663 break;
5664 case X86ISD::PSHUFHW:
5665 assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
5666 ImmN = N.getConstantOperandVal(N.getNumOperands() - 1);
5667 DecodePSHUFHWMask(NumElems, ImmN, Mask);
5668 IsUnary = true;
5669 break;
5670 case X86ISD::PSHUFLW:
5671 assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
5672 ImmN = N.getConstantOperandVal(N.getNumOperands() - 1);
5673 DecodePSHUFLWMask(NumElems, ImmN, Mask);
5674 IsUnary = true;
5675 break;
5676 case X86ISD::VZEXT_MOVL:
5677 assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
5678 DecodeZeroMoveLowMask(NumElems, Mask);
5679 IsUnary = true;
5680 break;
5681 case X86ISD::VBROADCAST:
5682 // We only decode broadcasts of same-sized vectors, peeking through to
5683 // extracted subvectors is likely to cause hasOneUse issues with
5684 // SimplifyDemandedBits etc.
5685 if (N.getOperand(0).getValueType() == VT) {
5686 DecodeVectorBroadcast(NumElems, Mask);
5687 IsUnary = true;
5688 break;
5689 }
5690 return false;
5691 case X86ISD::VPERMILPV: {
5692 assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
5693 IsUnary = true;
5694 SDValue MaskNode = N.getOperand(1);
5695 if (getTargetShuffleMaskIndices(MaskNode, MaskEltSize, RawMask,
5696 RawUndefs)) {
5697 DecodeVPERMILPMask(NumElems, MaskEltSize, RawMask, RawUndefs, Mask);
5698 break;
5699 }
5700 return false;
5701 }
5702 case X86ISD::PSHUFB: {
5703 assert(VT.getScalarType() == MVT::i8 && "Byte vector expected");
5704 assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
5705 assert(N.getOperand(1).getValueType() == VT && "Unexpected value type");
5706 IsUnary = true;
5707 SDValue MaskNode = N.getOperand(1);
5708 if (getTargetShuffleMaskIndices(MaskNode, 8, RawMask, RawUndefs)) {
5709 DecodePSHUFBMask(RawMask, RawUndefs, Mask);
5710 break;
5711 }
5712 return false;
5713 }
5714 case X86ISD::VPERMI:
5715 assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
5716 ImmN = N.getConstantOperandVal(N.getNumOperands() - 1);
5717 DecodeVPERMMask(NumElems, ImmN, Mask);
5718 IsUnary = true;
5719 break;
5720 case X86ISD::MOVSS:
5721 case X86ISD::MOVSD:
5722 case X86ISD::MOVSH:
5723 assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
5724 assert(N.getOperand(1).getValueType() == VT && "Unexpected value type");
5725 DecodeScalarMoveMask(NumElems, /* IsLoad */ false, Mask);
5726 break;
5727 case X86ISD::VPERM2X128:
5728 assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
5729 assert(N.getOperand(1).getValueType() == VT && "Unexpected value type");
5730 ImmN = N.getConstantOperandVal(N.getNumOperands() - 1);
5731 DecodeVPERM2X128Mask(NumElems, ImmN, Mask);
5732 IsUnary = IsFakeUnary = N.getOperand(0) == N.getOperand(1);
5733 break;
5734 case X86ISD::SHUF128:
5735 assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
5736 assert(N.getOperand(1).getValueType() == VT && "Unexpected value type");
5737 ImmN = N.getConstantOperandVal(N.getNumOperands() - 1);
5738 decodeVSHUF64x2FamilyMask(NumElems, MaskEltSize, ImmN, Mask);
5739 IsUnary = IsFakeUnary = N.getOperand(0) == N.getOperand(1);
5740 break;
5741 case X86ISD::MOVSLDUP:
5742 assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
5743 DecodeMOVSLDUPMask(NumElems, Mask);
5744 IsUnary = true;
5745 break;
5746 case X86ISD::MOVSHDUP:
5747 assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
5748 DecodeMOVSHDUPMask(NumElems, Mask);
5749 IsUnary = true;
5750 break;
5751 case X86ISD::MOVDDUP:
5752 assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
5753 DecodeMOVDDUPMask(NumElems, Mask);
5754 IsUnary = true;
5755 break;
5756 case X86ISD::VPERMIL2: {
5757 assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
5758 assert(N.getOperand(1).getValueType() == VT && "Unexpected value type");
5759 IsUnary = IsFakeUnary = N.getOperand(0) == N.getOperand(1);
5760 SDValue MaskNode = N.getOperand(2);
5761 SDValue CtrlNode = N.getOperand(3);
5762 if (ConstantSDNode *CtrlOp = dyn_cast<ConstantSDNode>(CtrlNode)) {
5763 unsigned CtrlImm = CtrlOp->getZExtValue();
5764 if (getTargetShuffleMaskIndices(MaskNode, MaskEltSize, RawMask,
5765 RawUndefs)) {
5766 DecodeVPERMIL2PMask(NumElems, MaskEltSize, CtrlImm, RawMask, RawUndefs,
5767 Mask);
5768 break;
5769 }
5770 }
5771 return false;
5772 }
5773 case X86ISD::VPPERM: {
5774 assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
5775 assert(N.getOperand(1).getValueType() == VT && "Unexpected value type");
5776 IsUnary = IsFakeUnary = N.getOperand(0) == N.getOperand(1);
5777 SDValue MaskNode = N.getOperand(2);
5778 if (getTargetShuffleMaskIndices(MaskNode, 8, RawMask, RawUndefs)) {
5779 DecodeVPPERMMask(RawMask, RawUndefs, Mask);
5780 break;
5781 }
5782 return false;
5783 }
5784 case X86ISD::VPERMV: {
5785 assert(N.getOperand(1).getValueType() == VT && "Unexpected value type");
5786 IsUnary = true;
5787 // Unlike most shuffle nodes, VPERMV's mask operand is operand 0.
5788 Ops.push_back(N.getOperand(1));
5789 SDValue MaskNode = N.getOperand(0);
5790 if (getTargetShuffleMaskIndices(MaskNode, MaskEltSize, RawMask,
5791 RawUndefs)) {
5792 DecodeVPERMVMask(RawMask, RawUndefs, Mask);
5793 break;
5794 }
5795 return false;
5796 }
5797 case X86ISD::VPERMV3: {
5798 assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
5799 assert(N.getOperand(2).getValueType() == VT && "Unexpected value type");
5800 IsUnary = IsFakeUnary = N.getOperand(0) == N.getOperand(2);
5801 // Unlike most shuffle nodes, VPERMV3's mask operand is the middle one.
5802 Ops.push_back(N.getOperand(0));
5803 Ops.push_back(N.getOperand(2));
5804 SDValue MaskNode = N.getOperand(1);
5805 if (getTargetShuffleMaskIndices(MaskNode, MaskEltSize, RawMask,
5806 RawUndefs)) {
5807 DecodeVPERMV3Mask(RawMask, RawUndefs, Mask);
5808 break;
5809 }
5810 return false;
5811 }
5812 default:
5813 llvm_unreachable("unknown target shuffle node");
5814 }
5815
5816 // Empty mask indicates the decode failed.
5817 if (Mask.empty())
5818 return false;
5819
5820 // Check if we're getting a shuffle mask with zero'd elements.
5821 if (!AllowSentinelZero && isAnyZero(Mask))
5822 return false;
5823
5824 // If we have a fake unary shuffle, the shuffle mask is spread across two
5825 // inputs that are actually the same node. Re-map the mask to always point
5826 // into the first input.
5827 if (IsFakeUnary)
5828 for (int &M : Mask)
5829 if (M >= (int)Mask.size())
5830 M -= Mask.size();
5831
5832 // If we didn't already add operands in the opcode-specific code, default to
5833 // adding 1 or 2 operands starting at 0.
5834 if (Ops.empty()) {
5835 Ops.push_back(N.getOperand(0));
5836 if (!IsUnary || IsFakeUnary)
5837 Ops.push_back(N.getOperand(1));
5838 }
5839
5840 return true;
5841}
5842
5843 // Wrapper for getTargetShuffleMask that does not expose the IsUnary flag.
5844static bool getTargetShuffleMask(SDValue N, bool AllowSentinelZero,
5845 SmallVectorImpl<SDValue> &Ops,
5846 SmallVectorImpl<int> &Mask) {
5847 bool IsUnary;
5848 return getTargetShuffleMask(N, AllowSentinelZero, Ops, Mask, IsUnary);
5849}
5850
5851/// Compute whether each element of a shuffle is zeroable.
5852///
5853/// A "zeroable" vector shuffle element is one which can be lowered to zero.
5854/// Either it is an undef element in the shuffle mask, the element of the input
5855/// referenced is undef, or the element of the input referenced is known to be
5856/// zero. Many x86 shuffles can zero lanes cheaply and we often want to handle
5857/// as many lanes with this technique as possible to simplify the remaining
5858/// shuffle.
5859 static void computeZeroableShuffleElements(ArrayRef<int> Mask,
5860 SDValue V1, SDValue V2,
5861 APInt &KnownUndef, APInt &KnownZero) {
5862 int Size = Mask.size();
5863 KnownUndef = KnownZero = APInt::getZero(Size);
5864
5865 V1 = peekThroughBitcasts(V1);
5866 V2 = peekThroughBitcasts(V2);
5867
5868 bool V1IsZero = ISD::isBuildVectorAllZeros(V1.getNode());
5869 bool V2IsZero = ISD::isBuildVectorAllZeros(V2.getNode());
5870
5871 int VectorSizeInBits = V1.getValueSizeInBits();
5872 int ScalarSizeInBits = VectorSizeInBits / Size;
5873 assert(!(VectorSizeInBits % ScalarSizeInBits) && "Illegal shuffle mask size");
5874
5875 for (int i = 0; i < Size; ++i) {
5876 int M = Mask[i];
5877 // Handle the easy cases.
5878 if (M < 0) {
5879 KnownUndef.setBit(i);
5880 continue;
5881 }
5882 if ((M >= 0 && M < Size && V1IsZero) || (M >= Size && V2IsZero)) {
5883 KnownZero.setBit(i);
5884 continue;
5885 }
5886
5887 // Determine shuffle input and normalize the mask.
5888 SDValue V = M < Size ? V1 : V2;
5889 M %= Size;
5890
5891 // Currently we can only search BUILD_VECTOR for UNDEF/ZERO elements.
5892 if (V.getOpcode() != ISD::BUILD_VECTOR)
5893 continue;
5894
5895 // If the BUILD_VECTOR has fewer elements than the bitcasted portion of
5896 // the (larger) source element must be UNDEF/ZERO.
5897 if ((Size % V.getNumOperands()) == 0) {
5898 int Scale = Size / V->getNumOperands();
5899 SDValue Op = V.getOperand(M / Scale);
5900 if (Op.isUndef())
5901 KnownUndef.setBit(i);
5902 if (X86::isZeroNode(Op))
5903 KnownZero.setBit(i);
5904 else if (ConstantSDNode *Cst = dyn_cast<ConstantSDNode>(Op)) {
5905 APInt Val = Cst->getAPIntValue();
5906 Val = Val.extractBits(ScalarSizeInBits, (M % Scale) * ScalarSizeInBits);
5907 if (Val == 0)
5908 KnownZero.setBit(i);
5909 } else if (ConstantFPSDNode *Cst = dyn_cast<ConstantFPSDNode>(Op)) {
5910 APInt Val = Cst->getValueAPF().bitcastToAPInt();
5911 Val = Val.extractBits(ScalarSizeInBits, (M % Scale) * ScalarSizeInBits);
5912 if (Val == 0)
5913 KnownZero.setBit(i);
5914 }
5915 continue;
5916 }
5917
5918 // If the BUILD_VECTOR has more elements than all the (smaller) source
5919 // elements must be UNDEF or ZERO.
5920 if ((V.getNumOperands() % Size) == 0) {
5921 int Scale = V->getNumOperands() / Size;
5922 bool AllUndef = true;
5923 bool AllZero = true;
5924 for (int j = 0; j < Scale; ++j) {
5925 SDValue Op = V.getOperand((M * Scale) + j);
5926 AllUndef &= Op.isUndef();
5927 AllZero &= X86::isZeroNode(Op);
5928 }
5929 if (AllUndef)
5930 KnownUndef.setBit(i);
5931 if (AllZero)
5932 KnownZero.setBit(i);
5933 continue;
5934 }
5935 }
5936}
5937
5938/// Decode a target shuffle mask and inputs and see if any values are
5939/// known to be undef or zero from their inputs.
5940/// Returns true if the target shuffle mask was decoded.
5941/// FIXME: Merge this with computeZeroableShuffleElements?
5942 static bool getTargetShuffleAndZeroables(SDValue N, SmallVectorImpl<int> &Mask,
5943 SmallVectorImpl<SDValue> &Ops,
5944 APInt &KnownUndef, APInt &KnownZero) {
5945 bool IsUnary;
5946 if (!isTargetShuffle(N.getOpcode()))
5947 return false;
5948
5949 MVT VT = N.getSimpleValueType();
5950 if (!getTargetShuffleMask(N, true, Ops, Mask, IsUnary))
5951 return false;
5952
5953 int Size = Mask.size();
5954 SDValue V1 = Ops[0];
5955 SDValue V2 = IsUnary ? V1 : Ops[1];
5956 KnownUndef = KnownZero = APInt::getZero(Size);
5957
5958 V1 = peekThroughBitcasts(V1);
5959 V2 = peekThroughBitcasts(V2);
5960
5961 assert((VT.getSizeInBits() % Size) == 0 &&
5962 "Illegal split of shuffle value type");
5963 unsigned EltSizeInBits = VT.getSizeInBits() / Size;
5964
5965 // Extract known constant input data.
5966 APInt UndefSrcElts[2];
5967 SmallVector<APInt, 32> SrcEltBits[2];
5968 bool IsSrcConstant[2] = {
5969 getTargetConstantBitsFromNode(V1, EltSizeInBits, UndefSrcElts[0],
5970 SrcEltBits[0], /*AllowWholeUndefs*/ true,
5971 /*AllowPartialUndefs*/ false),
5972 getTargetConstantBitsFromNode(V2, EltSizeInBits, UndefSrcElts[1],
5973 SrcEltBits[1], /*AllowWholeUndefs*/ true,
5974 /*AllowPartialUndefs*/ false)};
5975
5976 for (int i = 0; i < Size; ++i) {
5977 int M = Mask[i];
5978
5979 // Already decoded as SM_SentinelZero / SM_SentinelUndef.
5980 if (M < 0) {
5981 assert(isUndefOrZero(M) && "Unknown shuffle sentinel value!");
5982 if (SM_SentinelUndef == M)
5983 KnownUndef.setBit(i);
5984 if (SM_SentinelZero == M)
5985 KnownZero.setBit(i);
5986 continue;
5987 }
5988
5989 // Determine shuffle input and normalize the mask.
5990 unsigned SrcIdx = M / Size;
5991 SDValue V = M < Size ? V1 : V2;
5992 M %= Size;
5993
5994 // We are referencing an UNDEF input.
5995 if (V.isUndef()) {
5996 KnownUndef.setBit(i);
5997 continue;
5998 }
5999
6000 // SCALAR_TO_VECTOR - only the first element is defined, and the rest UNDEF.
6001 // TODO: We currently only set UNDEF for integer types - floats use the same
6002 // registers as vectors and many of the scalar folded loads rely on the
6003 // SCALAR_TO_VECTOR pattern.
6004 if (V.getOpcode() == ISD::SCALAR_TO_VECTOR &&
6005 (Size % V.getValueType().getVectorNumElements()) == 0) {
6006 int Scale = Size / V.getValueType().getVectorNumElements();
6007 int Idx = M / Scale;
6008 if (Idx != 0 && !VT.isFloatingPoint())
6009 KnownUndef.setBit(i);
6010 else if (Idx == 0 && X86::isZeroNode(V.getOperand(0)))
6011 KnownZero.setBit(i);
6012 continue;
6013 }
6014
6015 // INSERT_SUBVECTOR - to widen vectors we often insert them into UNDEF
6016 // base vectors.
6017 if (V.getOpcode() == ISD::INSERT_SUBVECTOR) {
6018 SDValue Vec = V.getOperand(0);
6019 int NumVecElts = Vec.getValueType().getVectorNumElements();
6020 if (Vec.isUndef() && Size == NumVecElts) {
6021 int Idx = V.getConstantOperandVal(2);
6022 int NumSubElts = V.getOperand(1).getValueType().getVectorNumElements();
6023 if (M < Idx || (Idx + NumSubElts) <= M)
6024 KnownUndef.setBit(i);
6025 }
6026 continue;
6027 }
6028
6029 // Attempt to extract from the source's constant bits.
6030 if (IsSrcConstant[SrcIdx]) {
6031 if (UndefSrcElts[SrcIdx][M])
6032 KnownUndef.setBit(i);
6033 else if (SrcEltBits[SrcIdx][M] == 0)
6034 KnownZero.setBit(i);
6035 }
6036 }
6037
6038 assert(VT.getVectorNumElements() == (unsigned)Size &&
6039 "Different mask size from vector size!");
6040 return true;
6041}
6042
6043// Replace target shuffle mask elements with known undef/zero sentinels.
6044 static void resolveTargetShuffleFromZeroables(SmallVectorImpl<int> &Mask,
6045 const APInt &KnownUndef,
6046 const APInt &KnownZero,
6047 bool ResolveKnownZeros = true) {
6048 unsigned NumElts = Mask.size();
6049 assert(KnownUndef.getBitWidth() == NumElts &&
6050 KnownZero.getBitWidth() == NumElts && "Shuffle mask size mismatch");
6051
6052 for (unsigned i = 0; i != NumElts; ++i) {
6053 if (KnownUndef[i])
6054 Mask[i] = SM_SentinelUndef;
6055 else if (ResolveKnownZeros && KnownZero[i])
6056 Mask[i] = SM_SentinelZero;
6057 }
6058}
6059
6060// Extract target shuffle mask sentinel elements to known undef/zero bitmasks.
6061 static void resolveZeroablesFromTargetShuffle(const SmallVectorImpl<int> &Mask,
6062 APInt &KnownUndef,
6063 APInt &KnownZero) {
6064 unsigned NumElts = Mask.size();
6065 KnownUndef = KnownZero = APInt::getZero(NumElts);
6066
6067 for (unsigned i = 0; i != NumElts; ++i) {
6068 int M = Mask[i];
6069 if (SM_SentinelUndef == M)
6070 KnownUndef.setBit(i);
6071 if (SM_SentinelZero == M)
6072 KnownZero.setBit(i);
6073 }
6074}
6075
6076// Attempt to create a shuffle mask from a VSELECT/BLENDV condition mask.
6077 static bool createShuffleMaskFromVSELECT(SmallVectorImpl<int> &Mask,
6078 SDValue Cond, bool IsBLENDV = false) {
6079 EVT CondVT = Cond.getValueType();
6080 unsigned EltSizeInBits = CondVT.getScalarSizeInBits();
6081 unsigned NumElts = CondVT.getVectorNumElements();
6082
6083 APInt UndefElts;
6084 SmallVector<APInt, 32> EltBits;
6085 if (!getTargetConstantBitsFromNode(Cond, EltSizeInBits, UndefElts, EltBits,
6086 /*AllowWholeUndefs*/ true,
6087 /*AllowPartialUndefs*/ false))
6088 return false;
6089
6090 Mask.resize(NumElts, SM_SentinelUndef);
6091
6092 for (int i = 0; i != (int)NumElts; ++i) {
6093 Mask[i] = i;
6094 // Arbitrarily choose from the 2nd operand if the select condition element
6095 // is undef.
6096 // TODO: Can we do better by matching patterns such as even/odd?
6097 if (UndefElts[i] || (!IsBLENDV && EltBits[i].isZero()) ||
6098 (IsBLENDV && EltBits[i].isNonNegative()))
6099 Mask[i] += NumElts;
6100 }
6101
6102 return true;
6103}
6104
6105// Forward declaration (for getFauxShuffleMask recursive check).
6106static bool getTargetShuffleInputs(SDValue Op, const APInt &DemandedElts,
6107 SmallVectorImpl<SDValue> &Inputs,
6108 SmallVectorImpl<int> &Mask,
6109 const SelectionDAG &DAG, unsigned Depth,
6110 bool ResolveKnownElts);
6111
6112// Attempt to decode ops that could be represented as a shuffle mask.
6113// The decoded shuffle mask may contain a different number of elements to the
6114// destination value type.
6115// TODO: Merge into getTargetShuffleInputs()
6116static bool getFauxShuffleMask(SDValue N, const APInt &DemandedElts,
6117 SmallVectorImpl<int> &Mask,
6118 SmallVectorImpl<SDValue> &Ops,
6119 const SelectionDAG &DAG, unsigned Depth,
6120 bool ResolveKnownElts) {
6121 Mask.clear();
6122 Ops.clear();
6123
6124 MVT VT = N.getSimpleValueType();
6125 unsigned NumElts = VT.getVectorNumElements();
6126 unsigned NumSizeInBits = VT.getSizeInBits();
6127 unsigned NumBitsPerElt = VT.getScalarSizeInBits();
6128 if ((NumBitsPerElt % 8) != 0 || (NumSizeInBits % 8) != 0)
6129 return false;
6130 assert(NumElts == DemandedElts.getBitWidth() && "Unexpected vector size");
6131 unsigned NumSizeInBytes = NumSizeInBits / 8;
6132 unsigned NumBytesPerElt = NumBitsPerElt / 8;
6133
6134 unsigned Opcode = N.getOpcode();
6135 switch (Opcode) {
6136 case ISD::VECTOR_SHUFFLE: {
6137 // Don't treat ISD::VECTOR_SHUFFLE as a target shuffle so decode it here.
6138 ArrayRef<int> ShuffleMask = cast<ShuffleVectorSDNode>(N)->getMask();
6139 if (isUndefOrInRange(ShuffleMask, 0, 2 * NumElts)) {
6140 Mask.append(ShuffleMask.begin(), ShuffleMask.end());
6141 Ops.push_back(N.getOperand(0));
6142 Ops.push_back(N.getOperand(1));
6143 return true;
6144 }
6145 return false;
6146 }
6147 case ISD::AND:
6148 case X86ISD::ANDNP: {
6149 // Attempt to decode as a per-byte mask.
6150 APInt UndefElts;
6151 SmallVector<APInt, 32> EltBits;
6152 SDValue N0 = N.getOperand(0);
6153 SDValue N1 = N.getOperand(1);
6154 bool IsAndN = (X86ISD::ANDNP == Opcode);
6155 uint64_t ZeroMask = IsAndN ? 255 : 0;
6156 if (!getTargetConstantBitsFromNode(IsAndN ? N0 : N1, 8, UndefElts, EltBits,
6157 /*AllowWholeUndefs*/ false,
6158 /*AllowPartialUndefs*/ false))
6159 return false;
6160 // We can't assume an undef src element gives an undef dst - the other src
6161 // might be zero.
6162 assert(UndefElts.isZero() && "Unexpected UNDEF element in AND/ANDNP mask");
6163 for (int i = 0, e = (int)EltBits.size(); i != e; ++i) {
6164 const APInt &ByteBits = EltBits[i];
6165 if (ByteBits != 0 && ByteBits != 255)
6166 return false;
6167 Mask.push_back(ByteBits == ZeroMask ? SM_SentinelZero : i);
6168 }
6169 Ops.push_back(IsAndN ? N1 : N0);
6170 return true;
6171 }
6172 case ISD::OR: {
6173 // Handle OR(SHUFFLE,SHUFFLE) case where one source is zero and the other
6174 // is a valid shuffle index.
6175 SDValue N0 = peekThroughBitcasts(N.getOperand(0));
6176 SDValue N1 = peekThroughBitcasts(N.getOperand(1));
6177 if (!N0.getValueType().isVector() || !N1.getValueType().isVector())
6178 return false;
6179
6180 SmallVector<int, 64> SrcMask0, SrcMask1;
6181 SmallVector<SDValue, 2> SrcInputs0, SrcInputs1;
6182 APInt Demand0 = APInt::getAllOnes(N0.getValueType().getVectorNumElements());
6183 APInt Demand1 = APInt::getAllOnes(N1.getValueType().getVectorNumElements());
6184 if (!getTargetShuffleInputs(N0, Demand0, SrcInputs0, SrcMask0, DAG,
6185 Depth + 1, true) ||
6186 !getTargetShuffleInputs(N1, Demand1, SrcInputs1, SrcMask1, DAG,
6187 Depth + 1, true))
6188 return false;
6189
6190 size_t MaskSize = std::max(SrcMask0.size(), SrcMask1.size());
6191 SmallVector<int, 64> Mask0, Mask1;
6192 narrowShuffleMaskElts(MaskSize / SrcMask0.size(), SrcMask0, Mask0);
6193 narrowShuffleMaskElts(MaskSize / SrcMask1.size(), SrcMask1, Mask1);
6194 for (int i = 0; i != (int)MaskSize; ++i) {
6195 // NOTE: Don't handle SM_SentinelUndef, as we can end up in infinite
6196 // loops converting between OR and BLEND shuffles due to
6197 // canWidenShuffleElements merging away undef elements, meaning we
6198 // fail to recognise the OR as the undef element isn't known zero.
6199 if (Mask0[i] == SM_SentinelZero && Mask1[i] == SM_SentinelZero)
6200 Mask.push_back(SM_SentinelZero);
6201 else if (Mask1[i] == SM_SentinelZero)
6202 Mask.push_back(i);
6203 else if (Mask0[i] == SM_SentinelZero)
6204 Mask.push_back(i + MaskSize);
6205 else
6206 return false;
6207 }
6208 Ops.push_back(N.getOperand(0));
6209 Ops.push_back(N.getOperand(1));
6210 return true;
6211 }
6212 case ISD::CONCAT_VECTORS: {
6213 // Limit this to vXi64 vector cases to make the most of cross lane shuffles.
6214 unsigned NumSubElts = N.getOperand(0).getValueType().getVectorNumElements();
6215 if (NumBitsPerElt == 64) {
6216 for (unsigned I = 0, E = N.getNumOperands(); I != E; ++I) {
6217 for (unsigned M = 0; M != NumSubElts; ++M)
6218 Mask.push_back((I * NumElts) + M);
6219 Ops.push_back(N.getOperand(I));
6220 }
6221 return true;
6222 }
6223 return false;
6224 }
6225 case ISD::INSERT_SUBVECTOR: {
6226 SDValue Src = N.getOperand(0);
6227 SDValue Sub = N.getOperand(1);
6228 EVT SubVT = Sub.getValueType();
6229 unsigned NumSubElts = SubVT.getVectorNumElements();
6230 uint64_t InsertIdx = N.getConstantOperandVal(2);
6231 // Subvector isn't demanded - just return the base vector.
6232 if (DemandedElts.extractBits(NumSubElts, InsertIdx) == 0) {
6233 Mask.resize(NumElts);
6234 std::iota(Mask.begin(), Mask.end(), 0);
6235 Ops.push_back(Src);
6236 return true;
6237 }
6238 // Handle CONCAT(SUB0, SUB1).
6239 // Limit to vXi64/splat cases to make the most of cross lane shuffles.
6240 if (Depth > 0 && InsertIdx == NumSubElts && NumElts == (2 * NumSubElts) &&
6241 Src.getOpcode() == ISD::INSERT_SUBVECTOR &&
6242 Src.getOperand(0).isUndef() &&
6243 Src.getOperand(1).getValueType() == SubVT &&
6244 Src.getConstantOperandVal(2) == 0 &&
6245 (NumBitsPerElt == 64 || Src.getOperand(1) == Sub) &&
6246 SDNode::areOnlyUsersOf({N.getNode(), Src.getNode()}, Sub.getNode())) {
6247 Mask.resize(NumElts);
6248 std::iota(Mask.begin(), Mask.begin() + NumSubElts, 0);
6249 std::iota(Mask.begin() + NumSubElts, Mask.end(), NumElts);
6250 Ops.push_back(Src.getOperand(1));
6251 Ops.push_back(Sub);
6252 return true;
6253 }
6254 if (!N->isOnlyUserOf(Sub.getNode()))
6255 return false;
6256
6257 SmallVector<int, 64> SubMask;
6258 SmallVector<SDValue, 2> SubInputs;
6259 SDValue SubSrc = peekThroughOneUseBitcasts(Sub);
6260 EVT SubSrcVT = SubSrc.getValueType();
6261 if (!SubSrcVT.isVector())
6262 return false;
6263
6264 // Handle INSERT_SUBVECTOR(SRC0, EXTRACT_SUBVECTOR(SRC1)).
6265 if (SubSrc.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
6266 SubSrc.getOperand(0).getValueSizeInBits() == NumSizeInBits) {
6267 uint64_t ExtractIdx = SubSrc.getConstantOperandVal(1);
6268 SDValue SubSrcSrc = SubSrc.getOperand(0);
6269 unsigned NumSubSrcSrcElts =
6270 SubSrcSrc.getValueType().getVectorNumElements();
6271 unsigned MaxElts = std::max(NumElts, NumSubSrcSrcElts);
6272 assert((MaxElts % NumElts) == 0 && (MaxElts % NumSubSrcSrcElts) == 0 &&
6273 "Subvector valuetype mismatch");
6274 InsertIdx *= (MaxElts / NumElts);
6275 ExtractIdx *= (MaxElts / NumSubSrcSrcElts);
6276 NumSubElts *= (MaxElts / NumElts);
6277 bool SrcIsUndef = Src.isUndef();
6278 for (int i = 0; i != (int)MaxElts; ++i)
6279 Mask.push_back(SrcIsUndef ? SM_SentinelUndef : i);
6280 for (int i = 0; i != (int)NumSubElts; ++i)
6281 Mask[InsertIdx + i] = (SrcIsUndef ? 0 : MaxElts) + ExtractIdx + i;
6282 if (!SrcIsUndef)
6283 Ops.push_back(Src);
6284 Ops.push_back(SubSrcSrc);
6285 return true;
6286 }
6287
6288 // Handle INSERT_SUBVECTOR(SRC0, SHUFFLE(SRC1)).
6289 APInt SubDemand = APInt::getAllOnes(SubSrcVT.getVectorNumElements());
6290 if (!getTargetShuffleInputs(SubSrc, SubDemand, SubInputs, SubMask, DAG,
6291 Depth + 1, ResolveKnownElts))
6292 return false;
6293
6294 // Subvector shuffle inputs must not be larger than the subvector.
6295 if (llvm::any_of(SubInputs, [SubVT](SDValue SubInput) {
6296 return SubVT.getFixedSizeInBits() <
6297 SubInput.getValueSizeInBits().getFixedValue();
6298 }))
6299 return false;
6300
6301 if (SubMask.size() != NumSubElts) {
6302 assert(((SubMask.size() % NumSubElts) == 0 ||
6303 (NumSubElts % SubMask.size()) == 0) &&
6304 "Illegal submask scale");
6305 if ((NumSubElts % SubMask.size()) == 0) {
6306 int Scale = NumSubElts / SubMask.size();
6307 SmallVector<int, 64> ScaledSubMask;
6308 narrowShuffleMaskElts(Scale, SubMask, ScaledSubMask);
6309 SubMask = ScaledSubMask;
6310 } else {
6311 int Scale = SubMask.size() / NumSubElts;
6312 NumSubElts = SubMask.size();
6313 NumElts *= Scale;
6314 InsertIdx *= Scale;
6315 }
6316 }
6317 Ops.push_back(Src);
6318 Ops.append(SubInputs.begin(), SubInputs.end());
6319 if (ISD::isBuildVectorAllZeros(Src.getNode()))
6320 Mask.append(NumElts, SM_SentinelZero);
6321 else
6322 for (int i = 0; i != (int)NumElts; ++i)
6323 Mask.push_back(i);
6324 for (int i = 0; i != (int)NumSubElts; ++i) {
6325 int M = SubMask[i];
6326 if (0 <= M) {
6327 int InputIdx = M / NumSubElts;
6328 M = (NumElts * (1 + InputIdx)) + (M % NumSubElts);
6329 }
6330 Mask[i + InsertIdx] = M;
6331 }
6332 return true;
6333 }
6334 case X86ISD::PINSRB:
6335 case X86ISD::PINSRW:
6336 case ISD::SCALAR_TO_VECTOR:
6337 case ISD::INSERT_VECTOR_ELT: {
6338 // Match against an insert_vector_elt/scalar_to_vector of an extract from a
6339 // vector, for matching src/dst vector types.
6340 SDValue Scl = N.getOperand(Opcode == ISD::SCALAR_TO_VECTOR ? 0 : 1);
6341
6342 unsigned DstIdx = 0;
6343 if (Opcode != ISD::SCALAR_TO_VECTOR) {
6344 // Check we have an in-range constant insertion index.
6345 if (!isa<ConstantSDNode>(N.getOperand(2)) ||
6346 N.getConstantOperandAPInt(2).uge(NumElts))
6347 return false;
6348 DstIdx = N.getConstantOperandVal(2);
6349
6350 // Attempt to recognise an INSERT*(VEC, 0, DstIdx) shuffle pattern.
6351 if (X86::isZeroNode(Scl)) {
6352 Ops.push_back(N.getOperand(0));
6353 for (unsigned i = 0; i != NumElts; ++i)
6354 Mask.push_back(i == DstIdx ? SM_SentinelZero : (int)i);
6355 return true;
6356 }
6357 }
6358
6359 // Peek through trunc/aext/zext/bitcast.
6360 // TODO: aext shouldn't require SM_SentinelZero padding.
6361 // TODO: handle shift of scalars.
6362 unsigned MinBitsPerElt = Scl.getScalarValueSizeInBits();
6363 while (Scl.getOpcode() == ISD::TRUNCATE ||
6364 Scl.getOpcode() == ISD::ANY_EXTEND ||
6365 Scl.getOpcode() == ISD::ZERO_EXTEND ||
6366 (Scl.getOpcode() == ISD::BITCAST &&
6367 Scl.getScalarValueSizeInBits() ==
6368 Scl.getOperand(0).getScalarValueSizeInBits())) {
6369 Scl = Scl.getOperand(0);
6370 MinBitsPerElt =
6371 std::min<unsigned>(MinBitsPerElt, Scl.getScalarValueSizeInBits());
6372 }
6373 if ((MinBitsPerElt % 8) != 0)
6374 return false;
6375
6376 // Attempt to find the source vector the scalar was extracted from.
6377 SDValue SrcExtract;
6378 if ((Scl.getOpcode() == ISD::EXTRACT_VECTOR_ELT ||
6379 Scl.getOpcode() == X86ISD::PEXTRW ||
6380 Scl.getOpcode() == X86ISD::PEXTRB) &&
6381 Scl.getOperand(0).getValueSizeInBits() == NumSizeInBits) {
6382 SrcExtract = Scl;
6383 }
6384 if (!SrcExtract || !isa<ConstantSDNode>(SrcExtract.getOperand(1)))
6385 return false;
6386
6387 SDValue SrcVec = SrcExtract.getOperand(0);
6388 EVT SrcVT = SrcVec.getValueType();
6389 if (!SrcVT.getScalarType().isByteSized())
6390 return false;
6391 unsigned SrcIdx = SrcExtract.getConstantOperandVal(1);
6392 unsigned SrcByte = SrcIdx * (SrcVT.getScalarSizeInBits() / 8);
6393 unsigned DstByte = DstIdx * NumBytesPerElt;
6394 MinBitsPerElt =
6395 std::min<unsigned>(MinBitsPerElt, SrcVT.getScalarSizeInBits());
6396
6397 // Create 'identity' byte level shuffle mask and then add inserted bytes.
6398 if (Opcode == ISD::SCALAR_TO_VECTOR) {
6399 Ops.push_back(SrcVec);
6400 Mask.append(NumSizeInBytes, SM_SentinelUndef);
6401 } else {
6402 Ops.push_back(SrcVec);
6403 Ops.push_back(N.getOperand(0));
6404 for (int i = 0; i != (int)NumSizeInBytes; ++i)
6405 Mask.push_back(NumSizeInBytes + i);
6406 }
6407
6408 unsigned MinBytesPerElts = MinBitsPerElt / 8;
6409 MinBytesPerElts = std::min(MinBytesPerElts, NumBytesPerElt);
6410 for (unsigned i = 0; i != MinBytesPerElts; ++i)
6411 Mask[DstByte + i] = SrcByte + i;
6412 for (unsigned i = MinBytesPerElts; i < NumBytesPerElt; ++i)
6413 Mask[DstByte + i] = SM_SentinelZero;
6414 return true;
6415 }
6416 case X86ISD::PACKSS:
6417 case X86ISD::PACKUS: {
6418 SDValue N0 = N.getOperand(0);
6419 SDValue N1 = N.getOperand(1);
6420 assert(N0.getValueType().getVectorNumElements() == (NumElts / 2) &&
6421 N1.getValueType().getVectorNumElements() == (NumElts / 2) &&
6422 "Unexpected input value type");
6423
6424 APInt EltsLHS, EltsRHS;
6425 getPackDemandedElts(VT, DemandedElts, EltsLHS, EltsRHS);
6426
6427 // If we know input saturation won't happen (or we don't care for particular
6428 // lanes), we can treat this as a truncation shuffle.
6429 bool Offset0 = false, Offset1 = false;
6430 if (Opcode == X86ISD::PACKSS) {
6431 if ((!(N0.isUndef() || EltsLHS.isZero()) &&
6432 DAG.ComputeNumSignBits(N0, EltsLHS, Depth + 1) <= NumBitsPerElt) ||
6433 (!(N1.isUndef() || EltsRHS.isZero()) &&
6434 DAG.ComputeNumSignBits(N1, EltsRHS, Depth + 1) <= NumBitsPerElt))
6435 return false;
6436 // We can't easily fold ASHR into a shuffle, but if it was feeding a
6437 // PACKSS then it was likely being used for sign-extension for a
6438 // truncation, so just peek through and adjust the mask accordingly.
6439 if (N0.getOpcode() == X86ISD::VSRAI && N->isOnlyUserOf(N0.getNode()) &&
6440 N0.getConstantOperandAPInt(1) == NumBitsPerElt) {
6441 Offset0 = true;
6442 N0 = N0.getOperand(0);
6443 }
6444 if (N1.getOpcode() == X86ISD::VSRAI && N->isOnlyUserOf(N1.getNode()) &&
6445 N1.getConstantOperandAPInt(1) == NumBitsPerElt) {
6446 Offset1 = true;
6447 N1 = N1.getOperand(0);
6448 }
6449 } else {
6450 APInt ZeroMask = APInt::getHighBitsSet(2 * NumBitsPerElt, NumBitsPerElt);
6451 if ((!(N0.isUndef() || EltsLHS.isZero()) &&
6452 !DAG.MaskedValueIsZero(N0, ZeroMask, EltsLHS, Depth + 1)) ||
6453 (!(N1.isUndef() || EltsRHS.isZero()) &&
6454 !DAG.MaskedValueIsZero(N1, ZeroMask, EltsRHS, Depth + 1)))
6455 return false;
6456 }
6457
6458 bool IsUnary = (N0 == N1);
6459
6460 Ops.push_back(N0);
6461 if (!IsUnary)
6462 Ops.push_back(N1);
6463
6464 createPackShuffleMask(VT, Mask, IsUnary);
6465
6466 if (Offset0 || Offset1) {
6467 for (int &M : Mask)
6468 if ((Offset0 && isInRange(M, 0, NumElts)) ||
6469 (Offset1 && isInRange(M, NumElts, 2 * NumElts)))
6470 ++M;
6471 }
6472 return true;
6473 }
6474 case ISD::VSELECT:
6475 case X86ISD::BLENDV: {
6476 SDValue Cond = N.getOperand(0);
6477 if (createShuffleMaskFromVSELECT(Mask, Cond, Opcode == X86ISD::BLENDV)) {
6478 Ops.push_back(N.getOperand(1));
6479 Ops.push_back(N.getOperand(2));
6480 return true;
6481 }
6482 return false;
6483 }
6484 case X86ISD::VTRUNC: {
6485 SDValue Src = N.getOperand(0);
6486 EVT SrcVT = Src.getValueType();
6487 if (SrcVT.getSizeInBits() != NumSizeInBits)
6488 return false;
6489 unsigned NumSrcElts = SrcVT.getVectorNumElements();
6490 unsigned NumBitsPerSrcElt = SrcVT.getScalarSizeInBits();
6491 unsigned Scale = NumBitsPerSrcElt / NumBitsPerElt;
6492 assert((NumBitsPerSrcElt % NumBitsPerElt) == 0 && "Illegal truncation");
6493 for (unsigned i = 0; i != NumSrcElts; ++i)
6494 Mask.push_back(i * Scale);
6495 Mask.append(NumElts - NumSrcElts, SM_SentinelZero);
6496 Ops.push_back(Src);
6497 return true;
6498 }
6499 case ISD::SHL:
6500 case ISD::SRL: {
6501 APInt UndefElts;
6502 SmallVector<APInt, 32> EltBits;
6503 if (!getTargetConstantBitsFromNode(N.getOperand(1), NumBitsPerElt,
6504 UndefElts, EltBits,
6505 /*AllowWholeUndefs*/ true,
6506 /*AllowPartialUndefs*/ false))
6507 return false;
6508
6509 // We can only decode 'whole byte' bit shifts as shuffles.
6510 for (unsigned I = 0; I != NumElts; ++I)
6511 if (DemandedElts[I] && !UndefElts[I] &&
6512 (EltBits[I].urem(8) != 0 || EltBits[I].uge(NumBitsPerElt)))
6513 return false;
6514
6515 Mask.append(NumSizeInBytes, SM_SentinelUndef);
6516 Ops.push_back(N.getOperand(0));
6517
6518 for (unsigned I = 0; I != NumElts; ++I) {
6519 if (!DemandedElts[I] || UndefElts[I])
6520 continue;
6521 unsigned ByteShift = EltBits[I].getZExtValue() / 8;
6522 unsigned Lo = I * NumBytesPerElt;
6523 unsigned Hi = Lo + NumBytesPerElt;
6524 // Clear mask to all zeros and insert the shifted byte indices.
6525 std::fill(Mask.begin() + Lo, Mask.begin() + Hi, SM_SentinelZero);
6526 if (ISD::SHL == Opcode)
6527 std::iota(Mask.begin() + Lo + ByteShift, Mask.begin() + Hi, Lo);
6528 else
6529 std::iota(Mask.begin() + Lo, Mask.begin() + Hi - ByteShift,
6530 Lo + ByteShift);
6531 }
6532 return true;
6533 }
6534 case X86ISD::VSHLI:
6535 case X86ISD::VSRLI: {
6536 uint64_t ShiftVal = N.getConstantOperandVal(1);
6537 // Out of range bit shifts are guaranteed to be zero.
6538 if (NumBitsPerElt <= ShiftVal) {
6539 Mask.append(NumElts, SM_SentinelZero);
6540 return true;
6541 }
6542
6543 // We can only decode 'whole byte' bit shifts as shuffles.
6544 if ((ShiftVal % 8) != 0)
6545 break;
6546
6547 uint64_t ByteShift = ShiftVal / 8;
6548 Ops.push_back(N.getOperand(0));
6549
6550 // Clear mask to all zeros and insert the shifted byte indices.
6551 Mask.append(NumSizeInBytes, SM_SentinelZero);
6552
6553 if (X86ISD::VSHLI == Opcode) {
6554 for (unsigned i = 0; i != NumSizeInBytes; i += NumBytesPerElt)
6555 for (unsigned j = ByteShift; j != NumBytesPerElt; ++j)
6556 Mask[i + j] = i + j - ByteShift;
6557 } else {
6558 for (unsigned i = 0; i != NumSizeInBytes; i += NumBytesPerElt)
6559 for (unsigned j = ByteShift; j != NumBytesPerElt; ++j)
6560 Mask[i + j - ByteShift] = i + j;
6561 }
6562 return true;
6563 }
6564 case X86ISD::VROTLI:
6565 case X86ISD::VROTRI: {
6566 // We can only decode 'whole byte' bit rotates as shuffles.
6567 uint64_t RotateVal = N.getConstantOperandAPInt(1).urem(NumBitsPerElt);
6568 if ((RotateVal % 8) != 0)
6569 return false;
6570 Ops.push_back(N.getOperand(0));
6571 int Offset = RotateVal / 8;
6572 Offset = (X86ISD::VROTLI == Opcode ? NumBytesPerElt - Offset : Offset);
6573 for (int i = 0; i != (int)NumElts; ++i) {
6574 int BaseIdx = i * NumBytesPerElt;
6575 for (int j = 0; j != (int)NumBytesPerElt; ++j) {
6576 Mask.push_back(BaseIdx + ((Offset + j) % NumBytesPerElt));
6577 }
6578 }
6579 return true;
6580 }
6581 case X86ISD::VBROADCAST: {
6582 SDValue Src = N.getOperand(0);
6583 if (!Src.getSimpleValueType().isVector()) {
6584 if (Src.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
6585 !isNullConstant(Src.getOperand(1)) ||
6586 Src.getOperand(0).getValueType().getScalarType() !=
6587 VT.getScalarType())
6588 return false;
6589 Src = Src.getOperand(0);
6590 }
6591 Ops.push_back(Src);
6592 Mask.append(NumElts, 0);
6593 return true;
6594 }
6595 case ISD::SIGN_EXTEND_VECTOR_INREG: {
6596 SDValue Src = N.getOperand(0);
6597 EVT SrcVT = Src.getValueType();
6598 unsigned NumBitsPerSrcElt = SrcVT.getScalarSizeInBits();
6599
6600 // Extended source must be a simple vector.
6601 if (!SrcVT.isSimple() || (SrcVT.getSizeInBits() % 128) != 0 ||
6602 (NumBitsPerSrcElt % 8) != 0)
6603 return false;
6604
6605 // We can only handle all-signbits extensions.
6606 APInt DemandedSrcElts =
6607 DemandedElts.zextOrTrunc(SrcVT.getVectorNumElements());
6608 if (DAG.ComputeNumSignBits(Src, DemandedSrcElts) != NumBitsPerSrcElt)
6609 return false;
6610
6611 assert((NumBitsPerElt % NumBitsPerSrcElt) == 0 && "Unexpected extension");
6612 unsigned Scale = NumBitsPerElt / NumBitsPerSrcElt;
6613 for (unsigned I = 0; I != NumElts; ++I)
6614 Mask.append(Scale, I);
6615 Ops.push_back(Src);
6616 return true;
6617 }
6618 case ISD::ZERO_EXTEND:
6619 case ISD::ANY_EXTEND:
6620 case ISD::ZERO_EXTEND_VECTOR_INREG:
6621 case ISD::ANY_EXTEND_VECTOR_INREG: {
6622 SDValue Src = N.getOperand(0);
6623 EVT SrcVT = Src.getValueType();
6624
6625 // Extended source must be a simple vector.
6626 if (!SrcVT.isSimple() || (SrcVT.getSizeInBits() % 128) != 0 ||
6627 (SrcVT.getScalarSizeInBits() % 8) != 0)
6628 return false;
6629
6630 bool IsAnyExtend =
6631 (ISD::ANY_EXTEND == Opcode || ISD::ANY_EXTEND_VECTOR_INREG == Opcode);
6632 DecodeZeroExtendMask(SrcVT.getScalarSizeInBits(), NumBitsPerElt, NumElts,
6633 IsAnyExtend, Mask);
6634 Ops.push_back(Src);
6635 return true;
6636 }
6637 }
6638
6639 return false;
6640}
6641
6642/// Removes unused/repeated shuffle source inputs and adjusts the shuffle mask.
6643 static void resolveTargetShuffleInputsAndMask(SmallVectorImpl<SDValue> &Inputs,
6644 SmallVectorImpl<int> &Mask) {
6645 int MaskWidth = Mask.size();
6646 SmallVector<SDValue, 16> UsedInputs;
6647 for (int i = 0, e = Inputs.size(); i < e; ++i) {
6648 int lo = UsedInputs.size() * MaskWidth;
6649 int hi = lo + MaskWidth;
6650
6651 // Strip UNDEF input usage.
6652 if (Inputs[i].isUndef())
6653 for (int &M : Mask)
6654 if ((lo <= M) && (M < hi))
6655 M = SM_SentinelUndef;
6656
6657 // Check for unused inputs.
6658 if (none_of(Mask, [lo, hi](int i) { return (lo <= i) && (i < hi); })) {
6659 for (int &M : Mask)
6660 if (lo <= M)
6661 M -= MaskWidth;
6662 continue;
6663 }
6664
6665 // Check for repeated inputs.
6666 bool IsRepeat = false;
6667 for (int j = 0, ue = UsedInputs.size(); j != ue; ++j) {
6668 if (UsedInputs[j] != Inputs[i])
6669 continue;
6670 for (int &M : Mask)
6671 if (lo <= M)
6672 M = (M < hi) ? ((M - lo) + (j * MaskWidth)) : (M - MaskWidth);
6673 IsRepeat = true;
6674 break;
6675 }
6676 if (IsRepeat)
6677 continue;
6678
6679 UsedInputs.push_back(Inputs[i]);
6680 }
6681 Inputs = UsedInputs;
6682}
6683
6684/// Calls getTargetShuffleAndZeroables to resolve a target shuffle mask's inputs
6685/// and then sets the SM_SentinelUndef and SM_SentinelZero values.
6686/// Returns true if the target shuffle mask was decoded.
6687static bool getTargetShuffleInputs(SDValue Op, const APInt &DemandedElts,
6688 SmallVectorImpl<SDValue> &Inputs,
6689 SmallVectorImpl<int> &Mask,
6690 APInt &KnownUndef, APInt &KnownZero,
6691 const SelectionDAG &DAG, unsigned Depth,
6692 bool ResolveKnownElts) {
6693 if (Depth >= SelectionDAG::MaxRecursionDepth)
6694 return false; // Limit search depth.
6695
6696 EVT VT = Op.getValueType();
6697 if (!VT.isSimple() || !VT.isVector())
6698 return false;
6699
6700 if (getTargetShuffleAndZeroables(Op, Mask, Inputs, KnownUndef, KnownZero)) {
6701 if (ResolveKnownElts)
6702 resolveTargetShuffleFromZeroables(Mask, KnownUndef, KnownZero);
6703 return true;
6704 }
6705 if (getFauxShuffleMask(Op, DemandedElts, Mask, Inputs, DAG, Depth,
6706 ResolveKnownElts)) {
6707 resolveZeroablesFromTargetShuffle(Mask, KnownUndef, KnownZero);
6708 return true;
6709 }
6710 return false;
6711}
6712
6713static bool getTargetShuffleInputs(SDValue Op, const APInt &DemandedElts,
6714 SmallVectorImpl<SDValue> &Inputs,
6715 SmallVectorImpl<int> &Mask,
6716 const SelectionDAG &DAG, unsigned Depth,
6717 bool ResolveKnownElts) {
6718 APInt KnownUndef, KnownZero;
6719 return getTargetShuffleInputs(Op, DemandedElts, Inputs, Mask, KnownUndef,
6720 KnownZero, DAG, Depth, ResolveKnownElts);
6721}
6722
6723 static bool getTargetShuffleInputs(SDValue Op, SmallVectorImpl<SDValue> &Inputs,
6724 SmallVectorImpl<int> &Mask,
6725 const SelectionDAG &DAG, unsigned Depth = 0,
6726 bool ResolveKnownElts = true) {
6727 EVT VT = Op.getValueType();
6728 if (!VT.isSimple() || !VT.isVector())
6729 return false;
6730
6731 unsigned NumElts = Op.getValueType().getVectorNumElements();
6732 APInt DemandedElts = APInt::getAllOnes(NumElts);
6733 return getTargetShuffleInputs(Op, DemandedElts, Inputs, Mask, DAG, Depth,
6734 ResolveKnownElts);
6735}
6736
6737// Attempt to create a scalar/subvector broadcast from the base MemSDNode.
6738static SDValue getBROADCAST_LOAD(unsigned Opcode, const SDLoc &DL, EVT VT,
6739 EVT MemVT, MemSDNode *Mem, unsigned Offset,
6740 SelectionDAG &DAG) {
6741 assert((Opcode == X86ISD::VBROADCAST_LOAD ||
6742 Opcode == X86ISD::SUBV_BROADCAST_LOAD) &&
6743 "Unknown broadcast load type");
6744
6745 // Ensure this is a simple (non-atomic, non-volatile), temporal read memop.
6746 if (!Mem || !Mem->readMem() || !Mem->isSimple() || Mem->isNonTemporal())
6747 return SDValue();
6748
6749 SDValue Ptr = DAG.getMemBasePlusOffset(Mem->getBasePtr(),
6750 TypeSize::getFixed(Offset), DL);
6751 SDVTList Tys = DAG.getVTList(VT, MVT::Other);
6752 SDValue Ops[] = {Mem->getChain(), Ptr};
6753 SDValue BcstLd = DAG.getMemIntrinsicNode(
6754 Opcode, DL, Tys, Ops, MemVT,
6755 DAG.getMachineFunction().getMachineMemOperand(
6756 Mem->getMemOperand(), Offset, MemVT.getStoreSize()));
6757 DAG.makeEquivalentMemoryOrdering(SDValue(Mem, 1), BcstLd.getValue(1));
6758 return BcstLd;
6759}
6760
6761/// Returns the scalar element that will make up the i'th
6762/// element of the result of the vector shuffle.
6763static SDValue getShuffleScalarElt(SDValue Op, unsigned Index,
6764 SelectionDAG &DAG, unsigned Depth) {
6765 if (Depth >= SelectionDAG::MaxRecursionDepth)
6766 return SDValue(); // Limit search depth.
6767
6768 EVT VT = Op.getValueType();
6769 unsigned Opcode = Op.getOpcode();
6770 unsigned NumElems = VT.getVectorNumElements();
6771
6772 // Recurse into ISD::VECTOR_SHUFFLE node to find scalars.
6773 if (auto *SV = dyn_cast<ShuffleVectorSDNode>(Op)) {
6774 int Elt = SV->getMaskElt(Index);
6775
6776 if (Elt < 0)
6777 return DAG.getUNDEF(VT.getVectorElementType());
6778
6779 SDValue Src = (Elt < (int)NumElems) ? SV->getOperand(0) : SV->getOperand(1);
6780 return getShuffleScalarElt(Src, Elt % NumElems, DAG, Depth + 1);
6781 }
6782
6783 // Recurse into target specific vector shuffles to find scalars.
6784 if (isTargetShuffle(Opcode)) {
6785 MVT ShufVT = VT.getSimpleVT();
6786 MVT ShufSVT = ShufVT.getVectorElementType();
6787 int NumElems = (int)ShufVT.getVectorNumElements();
6788 SmallVector<int, 16> ShuffleMask;
6789 SmallVector<SDValue, 16> ShuffleOps;
6790 if (!getTargetShuffleMask(Op, true, ShuffleOps, ShuffleMask))
6791 return SDValue();
6792
6793 int Elt = ShuffleMask[Index];
6794 if (Elt == SM_SentinelZero)
6795 return ShufSVT.isInteger() ? DAG.getConstant(0, SDLoc(Op), ShufSVT)
6796 : DAG.getConstantFP(+0.0, SDLoc(Op), ShufSVT);
6797 if (Elt == SM_SentinelUndef)
6798 return DAG.getUNDEF(ShufSVT);
6799
6800 assert(0 <= Elt && Elt < (2 * NumElems) && "Shuffle index out of range");
6801 SDValue Src = (Elt < NumElems) ? ShuffleOps[0] : ShuffleOps[1];
6802 return getShuffleScalarElt(Src, Elt % NumElems, DAG, Depth + 1);
6803 }
6804
6805 // Recurse into insert_subvector base/sub vector to find scalars.
6806 if (Opcode == ISD::INSERT_SUBVECTOR) {
6807 SDValue Vec = Op.getOperand(0);
6808 SDValue Sub = Op.getOperand(1);
6809 uint64_t SubIdx = Op.getConstantOperandVal(2);
6810 unsigned NumSubElts = Sub.getValueType().getVectorNumElements();
6811
6812 if (SubIdx <= Index && Index < (SubIdx + NumSubElts))
6813 return getShuffleScalarElt(Sub, Index - SubIdx, DAG, Depth + 1);
6814 return getShuffleScalarElt(Vec, Index, DAG, Depth + 1);
6815 }
6816
6817 // Recurse into concat_vectors sub vector to find scalars.
6818 if (Opcode == ISD::CONCAT_VECTORS) {
6819 EVT SubVT = Op.getOperand(0).getValueType();
6820 unsigned NumSubElts = SubVT.getVectorNumElements();
6821 uint64_t SubIdx = Index / NumSubElts;
6822 uint64_t SubElt = Index % NumSubElts;
6823 return getShuffleScalarElt(Op.getOperand(SubIdx), SubElt, DAG, Depth + 1);
6824 }
6825
6826 // Recurse into extract_subvector src vector to find scalars.
6827 if (Opcode == ISD::EXTRACT_SUBVECTOR) {
6828 SDValue Src = Op.getOperand(0);
6829 uint64_t SrcIdx = Op.getConstantOperandVal(1);
6830 return getShuffleScalarElt(Src, Index + SrcIdx, DAG, Depth + 1);
6831 }
6832
6833 // We only peek through bitcasts of the same vector width.
6834 if (Opcode == ISD::BITCAST) {
6835 SDValue Src = Op.getOperand(0);
6836 EVT SrcVT = Src.getValueType();
6837 if (SrcVT.isVector() && SrcVT.getVectorNumElements() == NumElems)
6838 return getShuffleScalarElt(Src, Index, DAG, Depth + 1);
6839 return SDValue();
6840 }
6841
6842 // Actual nodes that may contain scalar elements
6843
6844 // For insert_vector_elt - either return the index matching scalar or recurse
6845 // into the base vector.
6846 if (Opcode == ISD::INSERT_VECTOR_ELT &&
6847 isa<ConstantSDNode>(Op.getOperand(2))) {
6848 if (Op.getConstantOperandAPInt(2) == Index)
6849 return Op.getOperand(1);
6850 return getShuffleScalarElt(Op.getOperand(0), Index, DAG, Depth + 1);
6851 }
6852
6853 if (Opcode == ISD::SCALAR_TO_VECTOR)
6854 return (Index == 0) ? Op.getOperand(0)
6855 : DAG.getUNDEF(VT.getVectorElementType());
6856
6857 if (Opcode == ISD::BUILD_VECTOR)
6858 return Op.getOperand(Index);
6859
6860 return SDValue();
6861}
6862
6863// Use PINSRB/PINSRW/PINSRD to create a build vector.
6864 static SDValue LowerBuildVectorAsInsert(SDValue Op, const SDLoc &DL,
6865 const APInt &NonZeroMask,
6866 unsigned NumNonZero, unsigned NumZero,
6867 SelectionDAG &DAG,
6868 const X86Subtarget &Subtarget) {
6869 MVT VT = Op.getSimpleValueType();
6870 unsigned NumElts = VT.getVectorNumElements();
6871 assert(((VT == MVT::v8i16 && Subtarget.hasSSE2()) ||
6872 ((VT == MVT::v16i8 || VT == MVT::v4i32) && Subtarget.hasSSE41())) &&
6873 "Illegal vector insertion");
6874
6875 SDValue V;
6876 bool First = true;
6877
6878 for (unsigned i = 0; i < NumElts; ++i) {
6879 bool IsNonZero = NonZeroMask[i];
6880 if (!IsNonZero)
6881 continue;
6882
6883 // If the build vector contains zeros or our first insertion is not the
6884 // first index then insert into zero vector to break any register
6885 // dependency else use SCALAR_TO_VECTOR.
6886 if (First) {
6887 First = false;
6888 if (NumZero || 0 != i)
6889 V = getZeroVector(VT, Subtarget, DAG, DL);
6890 else {
6891 assert(0 == i && "Expected insertion into zero-index");
6892 V = DAG.getAnyExtOrTrunc(Op.getOperand(i), DL, MVT::i32);
6893 V = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v4i32, V);
6894 V = DAG.getBitcast(VT, V);
6895 continue;
6896 }
6897 }
6898 V = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, VT, V, Op.getOperand(i),
6899 DAG.getVectorIdxConstant(i, DL));
6900 }
6901
6902 return V;
6903}
6904
6905/// Custom lower build_vector of v16i8.
6906 static SDValue LowerBuildVectorv16i8(SDValue Op, const SDLoc &DL,
6907 const APInt &NonZeroMask,
6908 unsigned NumNonZero, unsigned NumZero,
6909 SelectionDAG &DAG,
6910 const X86Subtarget &Subtarget) {
6911 if (NumNonZero > 8 && !Subtarget.hasSSE41())
6912 return SDValue();
6913
6914 // SSE4.1 - use PINSRB to insert each byte directly.
6915 if (Subtarget.hasSSE41())
6916 return LowerBuildVectorAsInsert(Op, DL, NonZeroMask, NumNonZero, NumZero,
6917 DAG, Subtarget);
6918
6919 SDValue V;
6920
6921 // Pre-SSE4.1 - merge byte pairs and insert with PINSRW.
6922 // If both the lowest 16-bits are non-zero, then convert to MOVD.
6923 if (!NonZeroMask.extractBits(2, 0).isZero() &&
6924 !NonZeroMask.extractBits(2, 2).isZero()) {
6925 for (unsigned I = 0; I != 4; ++I) {
6926 if (!NonZeroMask[I])
6927 continue;
6928 SDValue Elt = DAG.getZExtOrTrunc(Op.getOperand(I), DL, MVT::i32);
6929 if (I != 0)
6930 Elt = DAG.getNode(ISD::SHL, DL, MVT::i32, Elt,
6931 DAG.getConstant(I * 8, DL, MVT::i8));
6932 V = V ? DAG.getNode(ISD::OR, DL, MVT::i32, V, Elt) : Elt;
6933 }
6934 assert(V && "Failed to fold v16i8 vector to zero");
6935 V = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v4i32, V);
6936 V = DAG.getNode(X86ISD::VZEXT_MOVL, DL, MVT::v4i32, V);
6937 V = DAG.getBitcast(MVT::v8i16, V);
6938 }
6939 for (unsigned i = V ? 4 : 0; i < 16; i += 2) {
6940 bool ThisIsNonZero = NonZeroMask[i];
6941 bool NextIsNonZero = NonZeroMask[i + 1];
6942 if (!ThisIsNonZero && !NextIsNonZero)
6943 continue;
6944
6945 SDValue Elt;
6946 if (ThisIsNonZero) {
6947 if (NumZero || NextIsNonZero)
6948 Elt = DAG.getZExtOrTrunc(Op.getOperand(i), DL, MVT::i32);
6949 else
6950 Elt = DAG.getAnyExtOrTrunc(Op.getOperand(i), DL, MVT::i32);
6951 }
6952
6953 if (NextIsNonZero) {
6954 SDValue NextElt = Op.getOperand(i + 1);
6955 if (i == 0 && NumZero)
6956 NextElt = DAG.getZExtOrTrunc(NextElt, DL, MVT::i32);
6957 else
6958 NextElt = DAG.getAnyExtOrTrunc(NextElt, DL, MVT::i32);
6959 NextElt = DAG.getNode(ISD::SHL, DL, MVT::i32, NextElt,
6960 DAG.getConstant(8, DL, MVT::i8));
6961 if (ThisIsNonZero)
6962 Elt = DAG.getNode(ISD::OR, DL, MVT::i32, NextElt, Elt);
6963 else
6964 Elt = NextElt;
6965 }
6966
6967 // If our first insertion is not the first index or zeros are needed, then
6968 // insert into zero vector. Otherwise, use SCALAR_TO_VECTOR (leaves high
6969 // elements undefined).
6970 if (!V) {
6971 if (i != 0 || NumZero)
6972 V = getZeroVector(MVT::v8i16, Subtarget, DAG, DL);
6973 else {
6974 V = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v4i32, Elt);
6975 V = DAG.getBitcast(MVT::v8i16, V);
6976 continue;
6977 }
6978 }
6979 Elt = DAG.getNode(ISD::TRUNCATE, DL, MVT::i16, Elt);
6980 V = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, MVT::v8i16, V, Elt,
6981 DAG.getVectorIdxConstant(i / 2, DL));
6982 }
6983
6984 return DAG.getBitcast(MVT::v16i8, V);
6985}
6986
6987/// Custom lower build_vector of v8i16.
6988static SDValue LowerBuildVectorv8i16(SDValue Op, const SDLoc &DL,
6989 const APInt &NonZeroMask,
6990 unsigned NumNonZero, unsigned NumZero,
6991 SelectionDAG &DAG,
6992 const X86Subtarget &Subtarget) {
6993 if (NumNonZero > 4 && !Subtarget.hasSSE41())
6994 return SDValue();
6995
6996 // Use PINSRW to insert each 16-bit element directly.
6997 return LowerBuildVectorAsInsert(Op, DL, NonZeroMask, NumNonZero, NumZero, DAG,
6998 Subtarget);
6999}
7000
7001/// Custom lower build_vector of v4i32 or v4f32.
7002static SDValue LowerBuildVectorv4x32(SDValue Op, const SDLoc &DL,
7003 SelectionDAG &DAG,
7004 const X86Subtarget &Subtarget) {
7005 // If this is a splat of a pair of elements, use MOVDDUP (unless the target
7006 // has XOP; in that case defer lowering to potentially use VPERMIL2PS).
7007 // Because we're creating a less complicated build vector here, we may enable
7008 // further folding of the MOVDDUP via shuffle transforms.
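  // For example, (build_vector x, y, x, y) becomes a build_vector of
  // (x, y, undef, undef), bitcast to v2f64, MOVDDUP'd, and bitcast back.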
7009 if (Subtarget.hasSSE3() && !Subtarget.hasXOP() &&
7010 Op.getOperand(0) == Op.getOperand(2) &&
7011 Op.getOperand(1) == Op.getOperand(3) &&
7012 Op.getOperand(0) != Op.getOperand(1)) {
7013 MVT VT = Op.getSimpleValueType();
7014 MVT EltVT = VT.getVectorElementType();
7015 // Create a new build vector with the first 2 elements followed by undef
7016 // padding, bitcast to v2f64, duplicate, and bitcast back.
7017 SDValue Ops[4] = { Op.getOperand(0), Op.getOperand(1),
7018 DAG.getUNDEF(EltVT), DAG.getUNDEF(EltVT) };
7019 SDValue NewBV = DAG.getBitcast(MVT::v2f64, DAG.getBuildVector(VT, DL, Ops));
7020 SDValue Dup = DAG.getNode(X86ISD::MOVDDUP, DL, MVT::v2f64, NewBV);
7021 return DAG.getBitcast(VT, Dup);
7022 }
7023
7024 // Find all zeroable elements.
7025 std::bitset<4> Zeroable, Undefs;
7026 for (int i = 0; i < 4; ++i) {
7027 SDValue Elt = Op.getOperand(i);
7028 Undefs[i] = Elt.isUndef();
7029 Zeroable[i] = (Elt.isUndef() || X86::isZeroNode(Elt));
7030 }
7031 assert(Zeroable.size() - Zeroable.count() > 1 &&
7032 "We expect at least two non-zero elements!");
7033
7034 // We only know how to deal with build_vector nodes where elements are either
7035 // zeroable or extract_vector_elt with constant index.
7036 SDValue FirstNonZero;
7037 unsigned FirstNonZeroIdx;
7038 for (unsigned i = 0; i < 4; ++i) {
7039 if (Zeroable[i])
7040 continue;
7041 SDValue Elt = Op.getOperand(i);
7042 if (Elt.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
7043 !isa<ConstantSDNode>(Elt.getOperand(1)))
7044 return SDValue();
7045 // Make sure that this node is extracting from a 128-bit vector.
7046 MVT VT = Elt.getOperand(0).getSimpleValueType();
7047 if (!VT.is128BitVector())
7048 return SDValue();
7049 if (!FirstNonZero.getNode()) {
7050 FirstNonZero = Elt;
7051 FirstNonZeroIdx = i;
7052 }
7053 }
7054
7055 assert(FirstNonZero.getNode() && "Unexpected build vector of all zeros!");
7056 SDValue V1 = FirstNonZero.getOperand(0);
7057 MVT VT = V1.getSimpleValueType();
7058
7059 // See if this build_vector can be lowered as a blend with zero.
7060 SDValue Elt;
7061 unsigned EltMaskIdx, EltIdx;
7062 int Mask[4];
7063 for (EltIdx = 0; EltIdx < 4; ++EltIdx) {
7064 if (Zeroable[EltIdx]) {
7065 // The zero vector will be on the right hand side.
7066 Mask[EltIdx] = EltIdx+4;
7067 continue;
7068 }
7069
7070 Elt = Op->getOperand(EltIdx);
7071 // By construction, Elt is a EXTRACT_VECTOR_ELT with constant index.
7072 EltMaskIdx = Elt.getConstantOperandVal(1);
7073 if (Elt.getOperand(0) != V1 || EltMaskIdx != EltIdx)
7074 break;
7075 Mask[EltIdx] = EltIdx;
7076 }
7077
7078 if (EltIdx == 4) {
7079 // Let the shuffle legalizer deal with blend operations.
7080 SDValue VZeroOrUndef = (Zeroable == Undefs)
7081 ? DAG.getUNDEF(VT)
7082 : getZeroVector(VT, Subtarget, DAG, DL);
7083 if (V1.getSimpleValueType() != VT)
7084 V1 = DAG.getBitcast(VT, V1);
7085 return DAG.getVectorShuffle(VT, SDLoc(V1), V1, VZeroOrUndef, Mask);
7086 }
7087
7088 // See if we can lower this build_vector to a INSERTPS.
7089 if (!Subtarget.hasSSE41())
7090 return SDValue();
7091
7092 SDValue V2 = Elt.getOperand(0);
7093 if (Elt == FirstNonZero && EltIdx == FirstNonZeroIdx)
7094 V1 = SDValue();
7095
7096 bool CanFold = true;
7097 for (unsigned i = EltIdx + 1; i < 4 && CanFold; ++i) {
7098 if (Zeroable[i])
7099 continue;
7100
7101 SDValue Current = Op->getOperand(i);
7102 SDValue SrcVector = Current->getOperand(0);
7103 if (!V1.getNode())
7104 V1 = SrcVector;
7105 CanFold = (SrcVector == V1) && (Current.getConstantOperandAPInt(1) == i);
7106 }
7107
7108 if (!CanFold)
7109 return SDValue();
7110
7111 assert(V1.getNode() && "Expected at least two non-zero elements!");
7112 if (V1.getSimpleValueType() != MVT::v4f32)
7113 V1 = DAG.getBitcast(MVT::v4f32, V1);
7114 if (V2.getSimpleValueType() != MVT::v4f32)
7115 V2 = DAG.getBitcast(MVT::v4f32, V2);
7116
7117 // Ok, we can emit an INSERTPS instruction.
7118 unsigned ZMask = Zeroable.to_ulong();
7119
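  // The INSERTPS immediate encodes: bits [7:6] = source element (CountS),
  // bits [5:4] = destination element (CountD), bits [3:0] = zero mask (ZMask).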
7120 unsigned InsertPSMask = EltMaskIdx << 6 | EltIdx << 4 | ZMask;
7121 assert((InsertPSMask & ~0xFFu) == 0 && "Invalid mask!");
7122 SDValue Result =
7123 DAG.getNode(X86ISD::INSERTPS, DL, MVT::v4f32, V1, V2,
7124 DAG.getTargetConstant(InsertPSMask, DL, MVT::i8));
7125 return DAG.getBitcast(VT, Result);
7126}
7127
7128/// Return a vector logical shift node.
7129static SDValue getVShift(bool isLeft, EVT VT, SDValue SrcOp, unsigned NumBits,
7130 SelectionDAG &DAG, const TargetLowering &TLI,
7131 const SDLoc &dl) {
7132 assert(VT.is128BitVector() && "Unknown type for VShift");
7133 MVT ShVT = MVT::v16i8;
7134 unsigned Opc = isLeft ? X86ISD::VSHLDQ : X86ISD::VSRLDQ;
7135 SrcOp = DAG.getBitcast(ShVT, SrcOp);
7136 assert(NumBits % 8 == 0 && "Only support byte sized shifts");
7137 SDValue ShiftVal = DAG.getTargetConstant(NumBits / 8, dl, MVT::i8);
7138 return DAG.getBitcast(VT, DAG.getNode(Opc, dl, ShVT, SrcOp, ShiftVal));
7139}
7140
7141static SDValue LowerAsSplatVectorLoad(SDValue SrcOp, MVT VT, const SDLoc &dl,
7142 SelectionDAG &DAG) {
7143
7144 // Check if the scalar load can be widened into a vector load. And if
7145 // the address is "base + cst" see if the cst can be "absorbed" into
7146 // the shuffle mask.
7147 auto *LD = cast<LoadSDNode>(SrcOp);
7148 SDValue Ptr = LD->getBasePtr();
7149 if (!ISD::isNormalLoad(LD) || !LD->isSimple())
7150 return SDValue();
7151 EVT PVT = LD->getValueType(0);
7152 if (PVT != MVT::i32 && PVT != MVT::f32)
7153 return SDValue();
7154
7155 int FI = -1;
7156 int64_t Offset = 0;
7157 if (FrameIndexSDNode *FINode = dyn_cast<FrameIndexSDNode>(Ptr)) {
7158 FI = FINode->getIndex();
7159 Offset = 0;
7160 } else if (DAG.isBaseWithConstantOffset(Ptr) &&
7161 isa<FrameIndexSDNode>(Ptr.getOperand(0))) {
7162 FI = cast<FrameIndexSDNode>(Ptr.getOperand(0))->getIndex();
7163 Offset = Ptr.getConstantOperandVal(1);
7164 Ptr = Ptr.getOperand(0);
7165 } else {
7166 return SDValue();
7167 }
7168
7169 // FIXME: 256-bit vector instructions don't require a strict alignment,
7170 // improve this code to support it better.
7171 Align RequiredAlign(VT.getSizeInBits() / 8);
7172 SDValue Chain = LD->getChain();
7173 // Make sure the stack object alignment is at least 16 or 32.
7174 MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
7175 MaybeAlign InferredAlign = DAG.InferPtrAlign(Ptr);
7176 if (!InferredAlign || *InferredAlign < RequiredAlign) {
7177 if (MFI.isFixedObjectIndex(FI)) {
7178 // Can't change the alignment. FIXME: It's possible to compute
7179 // the exact stack offset and reference FI + adjusted offset instead,
7180 // if someone *really* cares about this; that's the way to implement it.
7181 return SDValue();
7182 } else {
7183 MFI.setObjectAlignment(FI, RequiredAlign);
7184 }
7185 }
7186
7187 // (Offset % 16 or 32) must be a multiple of 4. The address is then
7188 // Ptr + (Offset & ~(RequiredAlign - 1)).
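  // For example, with a 16-byte RequiredAlign, Offset = 20 gives
  // StartOffset = 16 and a splat of element (20 - 16) / 4 = 1 of the wide load.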
7189 if (Offset < 0)
7190 return SDValue();
7191 if ((Offset % RequiredAlign.value()) & 3)
7192 return SDValue();
7193 int64_t StartOffset = Offset & ~int64_t(RequiredAlign.value() - 1);
7194 if (StartOffset) {
7195 SDLoc DL(Ptr);
7196 Ptr = DAG.getNode(ISD::ADD, DL, Ptr.getValueType(), Ptr,
7197 DAG.getConstant(StartOffset, DL, Ptr.getValueType()));
7198 }
7199
7200 int EltNo = (Offset - StartOffset) >> 2;
7201 unsigned NumElems = VT.getVectorNumElements();
7202
7203 EVT NVT = EVT::getVectorVT(*DAG.getContext(), PVT, NumElems);
7204 SDValue V1 = DAG.getLoad(NVT, dl, Chain, Ptr,
7205 LD->getPointerInfo().getWithOffset(StartOffset));
7206
7207 SmallVector<int, 8> Mask(NumElems, EltNo);
7208
7209 return DAG.getVectorShuffle(NVT, dl, V1, DAG.getUNDEF(NVT), Mask);
7210 }
7211
7212 return SDValue();
7213}
7214
7215// Recurse to find a LoadSDNode source and the accumulated ByteOffset.
7216static bool findEltLoadSrc(SDValue Elt, LoadSDNode *&Ld, int64_t &ByteOffset) {
7217 if (ISD::isNON_EXTLoad(Elt.getNode())) {
7218 auto *BaseLd = cast<LoadSDNode>(Elt);
7219 if (!BaseLd->isSimple())
7220 return false;
7221 Ld = BaseLd;
7222 ByteOffset = 0;
7223 return true;
7224 }
7225
7226 switch (Elt.getOpcode()) {
7227 case ISD::BITCAST:
7228 case ISD::TRUNCATE:
7229 case ISD::SCALAR_TO_VECTOR:
7230 return findEltLoadSrc(Elt.getOperand(0), Ld, ByteOffset);
7231 case ISD::SRL:
7232 if (auto *AmtC = dyn_cast<ConstantSDNode>(Elt.getOperand(1))) {
7233 uint64_t Amt = AmtC->getZExtValue();
7234 if ((Amt % 8) == 0 && findEltLoadSrc(Elt.getOperand(0), Ld, ByteOffset)) {
7235 ByteOffset += Amt / 8;
7236 return true;
7237 }
7238 }
7239 break;
7240 case ISD::EXTRACT_VECTOR_ELT:
7241 if (auto *IdxC = dyn_cast<ConstantSDNode>(Elt.getOperand(1))) {
7242 SDValue Src = Elt.getOperand(0);
7243 unsigned SrcSizeInBits = Src.getScalarValueSizeInBits();
7244 unsigned DstSizeInBits = Elt.getScalarValueSizeInBits();
7245 if (DstSizeInBits == SrcSizeInBits && (SrcSizeInBits % 8) == 0 &&
7246 findEltLoadSrc(Src, Ld, ByteOffset)) {
7247 uint64_t Idx = IdxC->getZExtValue();
7248 ByteOffset += Idx * (SrcSizeInBits / 8);
7249 return true;
7250 }
7251 }
7252 break;
7253 }
7254
7255 return false;
7256}
7257
7258/// Given the initializing elements 'Elts' of a vector of type 'VT', see if the
7259/// elements can be replaced by a single large load which has the same value as
7260/// a build_vector or insert_subvector whose loaded operands are 'Elts'.
7261///
7262/// Example: <load i32 *a, load i32 *a+4, zero, undef> -> zextload a
7263static SDValue EltsFromConsecutiveLoads(EVT VT, ArrayRef<SDValue> Elts,
7264 const SDLoc &DL, SelectionDAG &DAG,
7265 const X86Subtarget &Subtarget,
7266 bool IsAfterLegalize) {
7267 if ((VT.getScalarSizeInBits() % 8) != 0)
7268 return SDValue();
7269
7270 unsigned NumElems = Elts.size();
7271
7272 int LastLoadedElt = -1;
7273 APInt LoadMask = APInt::getZero(NumElems);
7274 APInt ZeroMask = APInt::getZero(NumElems);
7275 APInt UndefMask = APInt::getZero(NumElems);
7276
7277 SmallVector<LoadSDNode*, 8> Loads(NumElems, nullptr);
7278 SmallVector<int64_t, 8> ByteOffsets(NumElems, 0);
7279
7280 // For each element in the initializer, see if we've found a load, zero or an
7281 // undef.
7282 for (unsigned i = 0; i < NumElems; ++i) {
7283 SDValue Elt = peekThroughBitcasts(Elts[i]);
7284 if (!Elt.getNode())
7285 return SDValue();
7286 if (Elt.isUndef()) {
7287 UndefMask.setBit(i);
7288 continue;
7289 }
7290 if (X86::isZeroNode(Elt) || ISD::isBuildVectorAllZeros(Elt.getNode())) {
7291 ZeroMask.setBit(i);
7292 continue;
7293 }
7294
7295 // Each loaded element must be the correct fractional portion of the
7296 // requested vector load.
7297 unsigned EltSizeInBits = Elt.getValueSizeInBits();
7298 if ((NumElems * EltSizeInBits) != VT.getSizeInBits())
7299 return SDValue();
7300
7301 if (!findEltLoadSrc(Elt, Loads[i], ByteOffsets[i]) || ByteOffsets[i] < 0)
7302 return SDValue();
7303 unsigned LoadSizeInBits = Loads[i]->getValueSizeInBits(0);
7304 if (((ByteOffsets[i] * 8) + EltSizeInBits) > LoadSizeInBits)
7305 return SDValue();
7306
7307 LoadMask.setBit(i);
7308 LastLoadedElt = i;
7309 }
7310 assert((ZeroMask.popcount() + UndefMask.popcount() + LoadMask.popcount()) ==
7311 NumElems &&
7312 "Incomplete element masks");
7313
7314 // Handle Special Cases - all undef or undef/zero.
7315 if (UndefMask.popcount() == NumElems)
7316 return DAG.getUNDEF(VT);
7317 if ((ZeroMask.popcount() + UndefMask.popcount()) == NumElems)
7318 return VT.isInteger() ? DAG.getConstant(0, DL, VT)
7319 : DAG.getConstantFP(0.0, DL, VT);
7320
7321 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
7322 int FirstLoadedElt = LoadMask.countr_zero();
7323 SDValue EltBase = peekThroughBitcasts(Elts[FirstLoadedElt]);
7324 EVT EltBaseVT = EltBase.getValueType();
7325 assert(EltBaseVT.getSizeInBits() == EltBaseVT.getStoreSizeInBits() &&
7326 "Register/Memory size mismatch");
7327 LoadSDNode *LDBase = Loads[FirstLoadedElt];
7328 assert(LDBase && "Did not find base load for merging consecutive loads");
7329 unsigned BaseSizeInBits = EltBaseVT.getStoreSizeInBits();
7330 unsigned BaseSizeInBytes = BaseSizeInBits / 8;
7331 int NumLoadedElts = (1 + LastLoadedElt - FirstLoadedElt);
7332 int LoadSizeInBits = NumLoadedElts * BaseSizeInBits;
7333 assert((BaseSizeInBits % 8) == 0 && "Sub-byte element loads detected");
7334
7335 // TODO: Support offsetting the base load.
7336 if (ByteOffsets[FirstLoadedElt] != 0)
7337 return SDValue();
7338
7339 // Check to see if the element's load is consecutive to the base load
7340 // or offset from a previous (already checked) load.
7341 auto CheckConsecutiveLoad = [&](LoadSDNode *Base, int EltIdx) {
7342 LoadSDNode *Ld = Loads[EltIdx];
7343 int64_t ByteOffset = ByteOffsets[EltIdx];
7344 if (ByteOffset && (ByteOffset % BaseSizeInBytes) == 0) {
7345 int64_t BaseIdx = EltIdx - (ByteOffset / BaseSizeInBytes);
7346 return (0 <= BaseIdx && BaseIdx < (int)NumElems && LoadMask[BaseIdx] &&
7347 Loads[BaseIdx] == Ld && ByteOffsets[BaseIdx] == 0);
7348 }
7349 return DAG.areNonVolatileConsecutiveLoads(Ld, Base, BaseSizeInBytes,
7350 EltIdx - FirstLoadedElt);
7351 };
7352
7353 // Consecutive loads can contain UNDEFs but not ZERO elements.
7354 // Consecutive loads with UNDEF and ZERO elements require an
7355 // additional shuffle stage to clear the ZERO elements.
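  // e.g. <load a[0], load a[1], zero, load a[3]> can reuse a single wide load,
  // but needs a shuffle against a zero vector to re-clear element 2.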
7356 bool IsConsecutiveLoad = true;
7357 bool IsConsecutiveLoadWithZeros = true;
7358 for (int i = FirstLoadedElt + 1; i <= LastLoadedElt; ++i) {
7359 if (LoadMask[i]) {
7360 if (!CheckConsecutiveLoad(LDBase, i)) {
7361 IsConsecutiveLoad = false;
7362 IsConsecutiveLoadWithZeros = false;
7363 break;
7364 }
7365 } else if (ZeroMask[i]) {
7366 IsConsecutiveLoad = false;
7367 }
7368 }
7369
7370 auto CreateLoad = [&DAG, &DL, &Loads](EVT VT, LoadSDNode *LDBase) {
7371 auto MMOFlags = LDBase->getMemOperand()->getFlags();
7372 assert(LDBase->isSimple() &&
7373 "Cannot merge volatile or atomic loads.");
7374 SDValue NewLd =
7375 DAG.getLoad(VT, DL, LDBase->getChain(), LDBase->getBasePtr(),
7376 LDBase->getPointerInfo(), LDBase->getBaseAlign(), MMOFlags);
7377 for (auto *LD : Loads)
7378 if (LD)
7379 DAG.makeEquivalentMemoryOrdering(LD, NewLd);
7380 return NewLd;
7381 };
7382
7383 // Check if the base load is entirely dereferenceable.
7384 bool IsDereferenceable = LDBase->getPointerInfo().isDereferenceable(
7385 VT.getSizeInBits() / 8, *DAG.getContext(), DAG.getDataLayout());
7386
7387 // LOAD - all consecutive load/undefs (must start/end with a load or be
7388 // entirely dereferenceable). If we have found an entire vector of loads and
7389 // undefs, then return a large load of the entire vector width starting at the
7390 // base pointer. If the vector contains zeros, then attempt to shuffle those
7391 // elements.
7392 if (FirstLoadedElt == 0 &&
7393 (NumLoadedElts == (int)NumElems || IsDereferenceable) &&
7394 (IsConsecutiveLoad || IsConsecutiveLoadWithZeros)) {
7395 if (IsAfterLegalize && !TLI.isOperationLegal(ISD::LOAD, VT))
7396 return SDValue();
7397
7398 // Don't create 256-bit non-temporal aligned loads without AVX2 as these
7399 // will lower to regular temporal loads and use the cache.
7400 if (LDBase->isNonTemporal() && LDBase->getAlign() >= Align(32) &&
7401 VT.is256BitVector() && !Subtarget.hasInt256())
7402 return SDValue();
7403
7404 if (NumElems == 1)
7405 return DAG.getBitcast(VT, Elts[FirstLoadedElt]);
7406
7407 if (!ZeroMask)
7408 return CreateLoad(VT, LDBase);
7409
7410 // IsConsecutiveLoadWithZeros - we need to create a shuffle of the loaded
7411 // vector and a zero vector to clear out the zero elements.
7412 if (!IsAfterLegalize && VT.isVector()) {
7413 unsigned NumMaskElts = VT.getVectorNumElements();
7414 if ((NumMaskElts % NumElems) == 0) {
7415 unsigned Scale = NumMaskElts / NumElems;
7416 SmallVector<int, 4> ClearMask(NumMaskElts, -1);
7417 for (unsigned i = 0; i < NumElems; ++i) {
7418 if (UndefMask[i])
7419 continue;
7420 int Offset = ZeroMask[i] ? NumMaskElts : 0;
7421 for (unsigned j = 0; j != Scale; ++j)
7422 ClearMask[(i * Scale) + j] = (i * Scale) + j + Offset;
7423 }
7424 SDValue V = CreateLoad(VT, LDBase);
7425 SDValue Z = VT.isInteger() ? DAG.getConstant(0, DL, VT)
7426 : DAG.getConstantFP(0.0, DL, VT);
7427 return DAG.getVectorShuffle(VT, DL, V, Z, ClearMask);
7428 }
7429 }
7430 }
7431
7432 // If the upper half of a ymm/zmm load is undef then just load the lower half.
7433 if (VT.is256BitVector() || VT.is512BitVector()) {
7434 unsigned HalfNumElems = NumElems / 2;
7435 if (UndefMask.extractBits(HalfNumElems, HalfNumElems).isAllOnes()) {
7436 EVT HalfVT =
7437 EVT::getVectorVT(*DAG.getContext(), VT.getScalarType(), HalfNumElems);
7438 SDValue HalfLD =
7439 EltsFromConsecutiveLoads(HalfVT, Elts.drop_back(HalfNumElems), DL,
7440 DAG, Subtarget, IsAfterLegalize);
7441 if (HalfLD)
7442 return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, DAG.getUNDEF(VT),
7443 HalfLD, DAG.getVectorIdxConstant(0, DL));
7444 }
7445 }
7446
7447 // VZEXT_LOAD - consecutive 32/64-bit load/undefs followed by zeros/undefs.
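  // A VZEXT_LOAD loads the scalar into lane 0 and zeros all upper lanes,
  // e.g. <(load i32 *p), zero, zero, undef> -> (v4i32 (X86ISD::VZEXT_LOAD p)).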
7448 if (IsConsecutiveLoad && FirstLoadedElt == 0 &&
7449 ((LoadSizeInBits == 16 && Subtarget.hasFP16()) || LoadSizeInBits == 32 ||
7450 LoadSizeInBits == 64) &&
7451 ((VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector()))) {
7452 MVT VecSVT = VT.isFloatingPoint() ? MVT::getFloatingPointVT(LoadSizeInBits)
7453 : MVT::getIntegerVT(LoadSizeInBits);
7454 MVT VecVT = MVT::getVectorVT(VecSVT, VT.getSizeInBits() / LoadSizeInBits);
7455 // Allow v4f32 on SSE1 only targets.
7456 // FIXME: Add more isel patterns so we can just use VT directly.
7457 if (!Subtarget.hasSSE2() && VT == MVT::v4f32)
7458 VecVT = MVT::v4f32;
7459 if (TLI.isTypeLegal(VecVT)) {
7460 SDVTList Tys = DAG.getVTList(VecVT, MVT::Other);
7461 SDValue Ops[] = { LDBase->getChain(), LDBase->getBasePtr() };
7462 SDValue ResNode = DAG.getMemIntrinsicNode(
7463 X86ISD::VZEXT_LOAD, DL, Tys, Ops, VecSVT, LDBase->getPointerInfo(),
7464 LDBase->getBaseAlign(), MachineMemOperand::MOLoad);
7465 for (auto *LD : Loads)
7466 if (LD)
7467 DAG.makeEquivalentMemoryOrdering(LD, ResNode);
7468 return DAG.getBitcast(VT, ResNode);
7469 }
7470 }
7471
7472 // BROADCAST - match the smallest possible repetition pattern, load that
7473 // scalar/subvector element and then broadcast to the entire vector.
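  // e.g. a v8i32 <x, y, x, y, x, y, x, y> repeats every 64 bits, so it can be
  // lowered as a single 64-bit scalar load broadcast to every lane.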
7474 if (ZeroMask.isZero() && isPowerOf2_32(NumElems) && Subtarget.hasAVX() &&
7475 (VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector())) {
7476 for (unsigned SubElems = 1; SubElems < NumElems; SubElems *= 2) {
7477 unsigned RepeatSize = SubElems * BaseSizeInBits;
7478 unsigned ScalarSize = std::min(RepeatSize, 64u);
7479 if (!Subtarget.hasAVX2() && ScalarSize < 32)
7480 continue;
7481
7482 // Don't attempt a 1:N subvector broadcast - it should be caught by
7483 // combineConcatVectorOps, else it will cause infinite loops.
7484 if (RepeatSize > ScalarSize && SubElems == 1)
7485 continue;
7486
7487 bool Match = true;
7488 SmallVector<SDValue, 8> RepeatedLoads(SubElems, DAG.getUNDEF(EltBaseVT));
7489 for (unsigned i = 0; i != NumElems && Match; ++i) {
7490 if (!LoadMask[i])
7491 continue;
7492 SDValue Elt = peekThroughBitcasts(Elts[i]);
7493 if (RepeatedLoads[i % SubElems].isUndef())
7494 RepeatedLoads[i % SubElems] = Elt;
7495 else
7496 Match &= (RepeatedLoads[i % SubElems] == Elt);
7497 }
7498
7499 // We must have loads at both ends of the repetition.
7500 Match &= !RepeatedLoads.front().isUndef();
7501 Match &= !RepeatedLoads.back().isUndef();
7502 if (!Match)
7503 continue;
7504
7505 EVT RepeatVT =
7506 VT.isInteger() && (RepeatSize != 64 || TLI.isTypeLegal(MVT::i64))
7507 ? EVT::getIntegerVT(*DAG.getContext(), ScalarSize)
7508 : EVT::getFloatingPointVT(ScalarSize);
7509 if (RepeatSize > ScalarSize)
7510 RepeatVT = EVT::getVectorVT(*DAG.getContext(), RepeatVT,
7511 RepeatSize / ScalarSize);
7512 EVT BroadcastVT =
7513 EVT::getVectorVT(*DAG.getContext(), RepeatVT.getScalarType(),
7514 VT.getSizeInBits() / ScalarSize);
7515 if (TLI.isTypeLegal(BroadcastVT)) {
7516 if (SDValue RepeatLoad = EltsFromConsecutiveLoads(
7517 RepeatVT, RepeatedLoads, DL, DAG, Subtarget, IsAfterLegalize)) {
7518 SDValue Broadcast = RepeatLoad;
7519 if (RepeatSize > ScalarSize) {
7520 while (Broadcast.getValueSizeInBits() < VT.getSizeInBits())
7521 Broadcast = concatSubVectors(Broadcast, Broadcast, DAG, DL);
7522 } else {
7523 if (!Subtarget.hasAVX2() &&
7524 !X86::mayFoldLoadIntoBroadcastFromMem(
7525 RepeatLoad, RepeatVT.getScalarType().getSimpleVT(),
7526 Subtarget,
7527 /*AssumeSingleUse=*/true))
7528 return SDValue();
7529 Broadcast =
7530 DAG.getNode(X86ISD::VBROADCAST, DL, BroadcastVT, RepeatLoad);
7531 }
7532 return DAG.getBitcast(VT, Broadcast);
7533 }
7534 }
7535 }
7536 }
7537
7538 return SDValue();
7539}
7540
7541// Combine a vector ops (shuffles etc.) that is equal to build_vector load1,
7542// load2, load3, load4, <0, 1, 2, 3> into a vector load if the load addresses
7543// are consecutive, non-overlapping, and in the right order.
7544static SDValue combineToConsecutiveLoads(EVT VT, SDValue Op, const SDLoc &DL,
7545 SelectionDAG &DAG,
7546 const X86Subtarget &Subtarget,
7547 bool IsAfterLegalize) {
7548 SmallVector<SDValue, 64> Elts;
7549 for (unsigned i = 0, e = VT.getVectorNumElements(); i != e; ++i) {
7550 if (SDValue Elt = getShuffleScalarElt(Op, i, DAG, 0)) {
7551 Elts.push_back(Elt);
7552 continue;
7553 }
7554 return SDValue();
7555 }
7556 assert(Elts.size() == VT.getVectorNumElements());
7557 return EltsFromConsecutiveLoads(VT, Elts, DL, DAG, Subtarget,
7558 IsAfterLegalize);
7559}
7560
7561static Constant *getConstantVector(MVT VT, ArrayRef<APInt> Bits,
7562 const APInt &Undefs, LLVMContext &C) {
7563 unsigned ScalarSize = VT.getScalarSizeInBits();
7564 Type *Ty = EVT(VT.getScalarType()).getTypeForEVT(C);
7565
7566 auto getConstantScalar = [&](const APInt &Val) -> Constant * {
7567 if (VT.isFloatingPoint()) {
7568 if (ScalarSize == 16)
7569 return ConstantFP::get(C, APFloat(APFloat::IEEEhalf(), Val));
7570 if (ScalarSize == 32)
7571 return ConstantFP::get(C, APFloat(APFloat::IEEEsingle(), Val));
7572 assert(ScalarSize == 64 && "Unsupported floating point scalar size");
7573 return ConstantFP::get(C, APFloat(APFloat::IEEEdouble(), Val));
7574 }
7575 return Constant::getIntegerValue(Ty, Val);
7576 };
7577
7578 SmallVector<Constant *, 32> ConstantVec;
7579 for (unsigned I = 0, E = Bits.size(); I != E; ++I)
7580 ConstantVec.push_back(Undefs[I] ? UndefValue::get(Ty)
7581 : getConstantScalar(Bits[I]));
7582
7583 return ConstantVector::get(ArrayRef<Constant *>(ConstantVec));
7584}
7585
7586static Constant *getConstantVector(MVT VT, const APInt &SplatValue,
7587 unsigned SplatBitSize, LLVMContext &C) {
7588 unsigned ScalarSize = VT.getScalarSizeInBits();
7589
7590 auto getConstantScalar = [&](const APInt &Val) -> Constant * {
7591 if (VT.isFloatingPoint()) {
7592 if (ScalarSize == 16)
7593 return ConstantFP::get(C, APFloat(APFloat::IEEEhalf(), Val));
7594 if (ScalarSize == 32)
7595 return ConstantFP::get(C, APFloat(APFloat::IEEEsingle(), Val));
7596 assert(ScalarSize == 64 && "Unsupported floating point scalar size");
7597 return ConstantFP::get(C, APFloat(APFloat::IEEEdouble(), Val));
7598 }
7599 return Constant::getIntegerValue(Type::getIntNTy(C, ScalarSize), Val);
7600 };
7601
7602 if (ScalarSize == SplatBitSize)
7603 return getConstantScalar(SplatValue);
7604
7605 unsigned NumElm = SplatBitSize / ScalarSize;
7606 SmallVector<Constant *, 32> ConstantVec;
7607 for (unsigned I = 0; I != NumElm; ++I) {
7608 APInt Val = SplatValue.extractBits(ScalarSize, ScalarSize * I);
7609 ConstantVec.push_back(getConstantScalar(Val));
7610 }
7611 return ConstantVector::get(ArrayRef<Constant *>(ConstantVec));
7612}
7613
7614static bool isFoldableUseOfShuffle(SDNode *N) {
7615 for (auto *U : N->users()) {
7616 unsigned Opc = U->getOpcode();
7617 // VPERMV/VPERMV3 shuffles can never fold their index operands.
7618 if (Opc == X86ISD::VPERMV && U->getOperand(0).getNode() == N)
7619 return false;
7620 if (Opc == X86ISD::VPERMV3 && U->getOperand(1).getNode() == N)
7621 return false;
7622 if (isTargetShuffle(Opc))
7623 return true;
7624 if (Opc == ISD::BITCAST) // Ignore bitcasts
7625 return isFoldableUseOfShuffle(U);
7626 if (N->hasOneUse()) {
7627 // TODO: there may be some general way to know if an SDNode can
7628 // be folded. We currently only know whether an MI is foldable.
7629 if (Opc == X86ISD::VPDPBUSD && U->getOperand(2).getNode() != N)
7630 return false;
7631 return true;
7632 }
7633 }
7634 return false;
7635}
7636
7637// If the node has a single use by a VSELECT then AVX512 targets may be able to
7638// fold as a predicated instruction.
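// e.g. (vselect (v4i1 K), (add X, Y), X) may select to a single masked
// VPADDD when AVX512VL is available.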
7639static bool isMaskableNode(SDValue V, const X86Subtarget &Subtarget) {
7640 unsigned SizeInBits = V.getValueSizeInBits();
7641 if ((SizeInBits == 512 && Subtarget.hasAVX512()) ||
7642 (SizeInBits >= 128 && Subtarget.hasVLX())) {
7643 if (V.hasOneUse() && V->user_begin()->getOpcode() == ISD::VSELECT &&
7644 V->user_begin()->getOperand(0).getScalarValueSizeInBits() == 1) {
7645 return true;
7646 }
7647 }
7648 return false;
7649}
7650
7651/// Attempt to use the vbroadcast instruction to generate a splat value
7652/// from a splat BUILD_VECTOR which uses:
7653/// a. A single scalar load, or a constant.
7654/// b. Repeated pattern of constants (e.g. <0,1,0,1> or <0,1,2,3,0,1,2,3>).
7655///
7656/// The VBROADCAST node is returned when a pattern is found,
7657/// or SDValue() otherwise.
7658static SDValue lowerBuildVectorAsBroadcast(BuildVectorSDNode *BVOp,
7659 const SDLoc &dl,
7660 const X86Subtarget &Subtarget,
7661 SelectionDAG &DAG) {
7662 // VBROADCAST requires AVX.
7663 // TODO: Splats could be generated for non-AVX CPUs using SSE
7664 // instructions, but there's less potential gain for only 128-bit vectors.
7665 if (!Subtarget.hasAVX())
7666 return SDValue();
7667
7668 MVT VT = BVOp->getSimpleValueType(0);
7669 unsigned NumElts = VT.getVectorNumElements();
7670 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
7671 assert((VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector()) &&
7672 "Unsupported vector type for broadcast.");
7673
7674 // See if the build vector is a repeating sequence of scalars (inc. splat).
7675 SDValue Ld;
7676 BitVector UndefElements;
7677 SmallVector<SDValue, 16> Sequence;
7678 if (BVOp->getRepeatedSequence(Sequence, &UndefElements)) {
7679 assert((NumElts % Sequence.size()) == 0 && "Sequence doesn't fit.");
7680 if (Sequence.size() == 1)
7681 Ld = Sequence[0];
7682 }
7683
7684 // Attempt to use VBROADCASTM
7685 // From this pattern:
7686 // a. t0 = (zext_i64 (bitcast_i8 v2i1 X))
7687 // b. t1 = (build_vector t0 t0)
7688 //
7689 // Create (VBROADCASTM v2i1 X)
7690 if (!Sequence.empty() && Subtarget.hasCDI()) {
7691 // If not a splat, are the upper sequence values zeroable?
7692 unsigned SeqLen = Sequence.size();
7693 bool UpperZeroOrUndef =
7694 SeqLen == 1 ||
7695 llvm::all_of(ArrayRef(Sequence).drop_front(),
7696 [](SDValue V) { return !V || isNullConstantOrUndef(V); });
7697 SDValue Op0 = Sequence[0];
7698 if (UpperZeroOrUndef && ((Op0.getOpcode() == ISD::BITCAST) ||
7699 (Op0.getOpcode() == ISD::ZERO_EXTEND &&
7700 Op0.getOperand(0).getOpcode() == ISD::BITCAST))) {
7701 SDValue BOperand = Op0.getOpcode() == ISD::BITCAST
7702 ? Op0.getOperand(0)
7703 : Op0.getOperand(0).getOperand(0);
7704 MVT MaskVT = BOperand.getSimpleValueType();
7705 MVT EltType = MVT::getIntegerVT(VT.getScalarSizeInBits() * SeqLen);
7706 if ((EltType == MVT::i64 && MaskVT == MVT::v8i1) || // for broadcastmb2q
7707 (EltType == MVT::i32 && MaskVT == MVT::v16i1)) { // for broadcastmw2d
7708 MVT BcstVT = MVT::getVectorVT(EltType, NumElts / SeqLen);
7709 if (!VT.is512BitVector() && !Subtarget.hasVLX()) {
7710 unsigned Scale = 512 / VT.getSizeInBits();
7711 BcstVT = MVT::getVectorVT(EltType, Scale * (NumElts / SeqLen));
7712 }
7713 SDValue Bcst = DAG.getNode(X86ISD::VBROADCASTM, dl, BcstVT, BOperand);
7714 if (BcstVT.getSizeInBits() != VT.getSizeInBits())
7715 Bcst = extractSubVector(Bcst, 0, DAG, dl, VT.getSizeInBits());
7716 return DAG.getBitcast(VT, Bcst);
7717 }
7718 }
7719 }
7720
7721 unsigned NumUndefElts = UndefElements.count();
7722 if (!Ld || (NumElts - NumUndefElts) <= 1) {
7723 APInt SplatValue, Undef;
7724 unsigned SplatBitSize;
7725 bool HasUndef;
7726 // Check if this is a repeated constant pattern suitable for broadcasting.
7727 if (BVOp->isConstantSplat(SplatValue, Undef, SplatBitSize, HasUndef) &&
7728 SplatBitSize > VT.getScalarSizeInBits() &&
7729 SplatBitSize < VT.getSizeInBits()) {
7730 // Avoid replacing with broadcast when it's a use of a shuffle
7731 // instruction to preserve the present custom lowering of shuffles.
7732 if (isFoldableUseOfShuffle(BVOp))
7733 return SDValue();
7734 // replace BUILD_VECTOR with broadcast of the repeated constants.
7735 LLVMContext *Ctx = DAG.getContext();
7736 MVT PVT = TLI.getPointerTy(DAG.getDataLayout());
7737 if (SplatBitSize == 32 || SplatBitSize == 64 ||
7738 (SplatBitSize < 32 && Subtarget.hasAVX2())) {
7739 // Load the constant scalar/subvector and broadcast it.
7740 MVT CVT = MVT::getIntegerVT(SplatBitSize);
7741 Constant *C = getConstantVector(VT, SplatValue, SplatBitSize, *Ctx);
7742 SDValue CP = DAG.getConstantPool(C, PVT);
7743 unsigned Repeat = VT.getSizeInBits() / SplatBitSize;
7744
7745 Align Alignment = cast<ConstantPoolSDNode>(CP)->getAlign();
7746 SDVTList Tys = DAG.getVTList(MVT::getVectorVT(CVT, Repeat), MVT::Other);
7747 SDValue Ops[] = {DAG.getEntryNode(), CP};
7748 MachinePointerInfo MPI =
7749 MachinePointerInfo::getConstantPool(DAG.getMachineFunction());
7750 SDValue Brdcst =
7751 DAG.getMemIntrinsicNode(X86ISD::VBROADCAST_LOAD, dl, Tys, Ops, CVT,
7752 MPI, Alignment, MachineMemOperand::MOLoad);
7753 return DAG.getBitcast(VT, Brdcst);
7754 }
7755 if (SplatBitSize > 64) {
7756 // Load the vector of constants and broadcast it.
7757 Constant *VecC = getConstantVector(VT, SplatValue, SplatBitSize, *Ctx);
7758 SDValue VCP = DAG.getConstantPool(VecC, PVT);
7759 unsigned NumElm = SplatBitSize / VT.getScalarSizeInBits();
7760 MVT VVT = MVT::getVectorVT(VT.getScalarType(), NumElm);
7761 Align Alignment = cast<ConstantPoolSDNode>(VCP)->getAlign();
7762 SDVTList Tys = DAG.getVTList(VT, MVT::Other);
7763 SDValue Ops[] = {DAG.getEntryNode(), VCP};
7764 MachinePointerInfo MPI =
7765 MachinePointerInfo::getConstantPool(DAG.getMachineFunction());
7766 return DAG.getMemIntrinsicNode(X86ISD::SUBV_BROADCAST_LOAD, dl, Tys,
7767 Ops, VVT, MPI, Alignment,
7768 MachineMemOperand::MOLoad);
7769 }
7770 }
7771
7772 // If we are moving a scalar into a vector (Ld must be set and all elements
7773 // but 1 are undef) and that operation is not obviously supported by
7774 // vmovd/vmovq/vmovss/vmovsd, then keep trying to form a broadcast.
7775 // That's better than general shuffling and may eliminate a load to GPR and
7776 // move from scalar to vector register.
7777 if (!Ld || NumElts - NumUndefElts != 1)
7778 return SDValue();
7779 unsigned ScalarSize = Ld.getValueSizeInBits();
7780 if (!(UndefElements[0] || (ScalarSize != 32 && ScalarSize != 64)))
7781 return SDValue();
7782 }
7783
7784 bool ConstSplatVal =
7785 (Ld.getOpcode() == ISD::Constant || Ld.getOpcode() == ISD::ConstantFP);
7786 bool IsLoad = ISD::isNormalLoad(Ld.getNode());
7787
7788 // TODO: Handle broadcasts of non-constant sequences.
7789
7790 // Make sure that all of the users of a non-constant load are from the
7791 // BUILD_VECTOR node.
7792 // FIXME: Is the use count needed for non-constant, non-load case?
7793 if (!ConstSplatVal && !IsLoad && !BVOp->isOnlyUserOf(Ld.getNode()))
7794 return SDValue();
7795
7796 unsigned ScalarSize = Ld.getValueSizeInBits();
7797 bool IsGE256 = (VT.getSizeInBits() >= 256);
7798
7799 // When optimizing for size, generate up to 5 extra bytes for a broadcast
7800 // instruction to save 8 or more bytes of constant pool data.
7801 // TODO: If multiple splats are generated to load the same constant,
7802 // it may be detrimental to overall size. There needs to be a way to detect
7803 // that condition to know if this is truly a size win.
7804 bool OptForSize = DAG.shouldOptForSize();
7805
7806 // Handle broadcasting a single constant scalar from the constant pool
7807 // into a vector.
7808 // On Sandybridge (no AVX2), it is still better to load a constant vector
7809 // from the constant pool and not to broadcast it from a scalar.
7810 // But override that restriction when optimizing for size.
7811 // TODO: Check if splatting is recommended for other AVX-capable CPUs.
7812 if (ConstSplatVal && (Subtarget.hasAVX2() || OptForSize)) {
7813 EVT CVT = Ld.getValueType();
7814 assert(!CVT.isVector() && "Must not broadcast a vector type");
7815
7816 // Splat f16, f32, i32, v4f64, v4i64 in all cases with AVX2.
7817 // For size optimization, also splat v2f64 and v2i64, and for size opt
7818 // with AVX2, also splat i8 and i16.
7819 // With pattern matching, the VBROADCAST node may become a VMOVDDUP.
7820 if (ScalarSize == 32 ||
7821 (ScalarSize == 64 && (IsGE256 || Subtarget.hasVLX())) ||
7822 (CVT == MVT::f16 && Subtarget.hasAVX2()) ||
7823 (OptForSize && (ScalarSize == 64 || Subtarget.hasAVX2()))) {
7824 const Constant *C = nullptr;
7825 if (auto *CI = dyn_cast<ConstantSDNode>(Ld))
7826 C = CI->getConstantIntValue();
7827 else if (auto *CF = dyn_cast<ConstantFPSDNode>(Ld))
7828 C = CF->getConstantFPValue();
7829
7830 assert(C && "Invalid constant type");
7831
7832 SDValue CP =
7833 DAG.getConstantPool(C, TLI.getPointerTy(DAG.getDataLayout()));
7834 Align Alignment = cast<ConstantPoolSDNode>(CP)->getAlign();
7835
7836 SDVTList Tys = DAG.getVTList(VT, MVT::Other);
7837 SDValue Ops[] = {DAG.getEntryNode(), CP};
7838 MachinePointerInfo MPI =
7839 MachinePointerInfo::getConstantPool(DAG.getMachineFunction());
7840 return DAG.getMemIntrinsicNode(X86ISD::VBROADCAST_LOAD, dl, Tys, Ops, CVT,
7841 MPI, Alignment, MachineMemOperand::MOLoad);
7842 }
7843 }
7844
7845 // Handle AVX2 in-register broadcasts.
7846 if (!IsLoad && Subtarget.hasInt256() &&
7847 (ScalarSize == 32 || (IsGE256 && ScalarSize == 64)))
7848 return DAG.getNode(X86ISD::VBROADCAST, dl, VT, Ld);
7849
7850 // The scalar source must be a normal load.
7851 if (!IsLoad)
7852 return SDValue();
7853
7854 // Make sure the non-chain result is only used by this build vector.
7855 if (!Ld->hasNUsesOfValue(NumElts - NumUndefElts, 0))
7856 return SDValue();
7857
7858 if (ScalarSize == 32 || (IsGE256 && ScalarSize == 64) ||
7859 (Subtarget.hasVLX() && ScalarSize == 64)) {
7860 auto *LN = cast<LoadSDNode>(Ld);
7861 SDVTList Tys = DAG.getVTList(VT, MVT::Other);
7862 SDValue Ops[] = {LN->getChain(), LN->getBasePtr()};
7863 SDValue BCast =
7864 DAG.getMemIntrinsicNode(X86ISD::VBROADCAST_LOAD, dl, Tys, Ops,
7865 LN->getMemoryVT(), LN->getMemOperand());
7866 DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), BCast.getValue(1));
7867 return BCast;
7868 }
7869
7870 // The integer check is needed for the 64-bit into 128-bit case so it doesn't
7871 // match double, since there is no vbroadcastsd xmm instruction.
7872 if (Subtarget.hasInt256() && Ld.getValueType().isInteger() &&
7873 (ScalarSize == 8 || ScalarSize == 16 || ScalarSize == 64)) {
7874 auto *LN = cast<LoadSDNode>(Ld);
7875 SDVTList Tys = DAG.getVTList(VT, MVT::Other);
7876 SDValue Ops[] = {LN->getChain(), LN->getBasePtr()};
7877 SDValue BCast =
7878 DAG.getMemIntrinsicNode(X86ISD::VBROADCAST_LOAD, dl, Tys, Ops,
7879 LN->getMemoryVT(), LN->getMemOperand());
7880 DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), BCast.getValue(1));
7881 return BCast;
7882 }
7883
7884 if (ScalarSize == 16 && Subtarget.hasFP16() && IsGE256)
7885 return DAG.getNode(X86ISD::VBROADCAST, dl, VT, Ld);
7886
7887 // Unsupported broadcast.
7888 return SDValue();
7889}
7890
7891/// For an EXTRACT_VECTOR_ELT with a constant index return the real
7892/// underlying vector and index.
7893///
7894/// Modifies \p ExtractedFromVec to the real vector and returns the real
7895/// index.
7896static int getUnderlyingExtractedFromVec(SDValue &ExtractedFromVec,
7897 SDValue ExtIdx) {
7898 int Idx = ExtIdx->getAsZExtVal();
7899 if (!isa<ShuffleVectorSDNode>(ExtractedFromVec))
7900 return Idx;
7901
7902 // For 256-bit vectors, LowerEXTRACT_VECTOR_ELT_SSE4 may have already
7903 // lowered this:
7904 // (extract_vector_elt (v8f32 %1), Constant<6>)
7905 // to:
7906 // (extract_vector_elt (vector_shuffle<2,u,u,u>
7907 // (extract_subvector (v8f32 %0), Constant<4>),
7908 // undef)
7909 // Constant<0>)
7910 // In this case the vector is the extract_subvector expression and the index
7911 // is 2, as specified by the shuffle.
7912 ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(ExtractedFromVec);
7913 SDValue ShuffleVec = SVOp->getOperand(0);
7914 MVT ShuffleVecVT = ShuffleVec.getSimpleValueType();
7915 assert(ShuffleVecVT.getVectorElementType() ==
7916 ExtractedFromVec.getSimpleValueType().getVectorElementType());
7917
7918 int ShuffleIdx = SVOp->getMaskElt(Idx);
7919 if (isUndefOrInRange(ShuffleIdx, 0, ShuffleVecVT.getVectorNumElements())) {
7920 ExtractedFromVec = ShuffleVec;
7921 return ShuffleIdx;
7922 }
7923 return Idx;
7924}
7925
7926static SDValue buildFromShuffleMostly(SDValue Op, const SDLoc &DL,
7927 SelectionDAG &DAG) {
7928 MVT VT = Op.getSimpleValueType();
7929
7930 // Skip if insert_vec_elt is not supported.
7931 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
7932 if (!TLI.isOperationLegalOrCustom(ISD::INSERT_VECTOR_ELT, VT))
7933 return SDValue();
7934
7935 unsigned NumElems = Op.getNumOperands();
7936 SDValue VecIn1;
7937 SDValue VecIn2;
7938 SmallVector<unsigned, 4> InsertIndices;
7939 SmallVector<int, 8> Mask(NumElems, -1);
7940
7941 for (unsigned i = 0; i != NumElems; ++i) {
7942 unsigned Opc = Op.getOperand(i).getOpcode();
7943
7944 if (Opc == ISD::UNDEF)
7945 continue;
7946
7947 if (Opc != ISD::EXTRACT_VECTOR_ELT) {
7948 // Quit if more than 1 element needs inserting.
7949 if (InsertIndices.size() > 1)
7950 return SDValue();
7951
7952 InsertIndices.push_back(i);
7953 continue;
7954 }
7955
7956 SDValue ExtractedFromVec = Op.getOperand(i).getOperand(0);
7957 SDValue ExtIdx = Op.getOperand(i).getOperand(1);
7958
7959 // Quit if non-constant index.
7960 if (!isa<ConstantSDNode>(ExtIdx))
7961 return SDValue();
7962 int Idx = getUnderlyingExtractedFromVec(ExtractedFromVec, ExtIdx);
7963
7964 // Quit if extracted from vector of different type.
7965 if (ExtractedFromVec.getValueType() != VT)
7966 return SDValue();
7967
7968 if (!VecIn1.getNode())
7969 VecIn1 = ExtractedFromVec;
7970 else if (VecIn1 != ExtractedFromVec) {
7971 if (!VecIn2.getNode())
7972 VecIn2 = ExtractedFromVec;
7973 else if (VecIn2 != ExtractedFromVec)
7974 // Quit if more than 2 vectors to shuffle
7975 return SDValue();
7976 }
7977
7978 if (ExtractedFromVec == VecIn1)
7979 Mask[i] = Idx;
7980 else if (ExtractedFromVec == VecIn2)
7981 Mask[i] = Idx + NumElems;
7982 }
7983
7984 if (!VecIn1.getNode())
7985 return SDValue();
7986
7987 VecIn2 = VecIn2.getNode() ? VecIn2 : DAG.getUNDEF(VT);
7988 SDValue NV = DAG.getVectorShuffle(VT, DL, VecIn1, VecIn2, Mask);
7989
7990 for (unsigned Idx : InsertIndices)
7991 NV = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, VT, NV, Op.getOperand(Idx),
7992 DAG.getVectorIdxConstant(Idx, DL));
7993
7994 return NV;
7995}
7996
7997// Lower BUILD_VECTOR operation for v8bf16, v16bf16 and v32bf16 types.
7998static SDValue LowerBUILD_VECTORvXbf16(SDValue Op, SelectionDAG &DAG,
7999 const X86Subtarget &Subtarget) {
8000 MVT VT = Op.getSimpleValueType();
8001 MVT IVT =
8002 VT.changeVectorElementType(Subtarget.hasFP16() ? MVT::f16 : MVT::i16);
8003 SmallVector<SDValue, 32> NewOps;
8004 for (unsigned I = 0, E = Op.getNumOperands(); I != E; ++I)
8005 NewOps.push_back(DAG.getBitcast(Subtarget.hasFP16() ? MVT::f16 : MVT::i16,
8006 Op.getOperand(I)));
8007 SDValue Res = DAG.getNode(ISD::BUILD_VECTOR, SDLoc(), IVT, NewOps);
8008 return DAG.getBitcast(VT, Res);
8009}
8010
8011// Lower BUILD_VECTOR operation for v8i1 and v16i1 types.
8012static SDValue LowerBUILD_VECTORvXi1(SDValue Op, const SDLoc &dl,
8013 SelectionDAG &DAG,
8014 const X86Subtarget &Subtarget) {
8015
8016 MVT VT = Op.getSimpleValueType();
8017 assert((VT.getVectorElementType() == MVT::i1) &&
8018 "Unexpected type in LowerBUILD_VECTORvXi1!");
8019 if (ISD::isBuildVectorAllZeros(Op.getNode()) ||
8020 ISD::isBuildVectorAllOnes(Op.getNode()))
8021 return Op;
8022
8023 uint64_t Immediate = 0;
8024 SmallVector<unsigned, 16> NonConstIdx;
8025 bool IsSplat = true;
8026 bool HasConstElts = false;
8027 int SplatIdx = -1;
8028 for (unsigned idx = 0, e = Op.getNumOperands(); idx < e; ++idx) {
8029 SDValue In = Op.getOperand(idx);
8030 if (In.isUndef())
8031 continue;
8032 if (auto *InC = dyn_cast<ConstantSDNode>(In)) {
8033 Immediate |= (InC->getZExtValue() & 0x1) << idx;
8034 HasConstElts = true;
8035 } else {
8036 NonConstIdx.push_back(idx);
8037 }
8038 if (SplatIdx < 0)
8039 SplatIdx = idx;
8040 else if (In != Op.getOperand(SplatIdx))
8041 IsSplat = false;
8042 }
8043
8044 // For a splat, use "(select i1 splat_elt, all-ones, all-zeroes)".
8045 if (IsSplat) {
8046 // The build_vector allows the scalar element to be larger than the vector
8047 // element type. We need to mask it to use as a condition unless we know
8048 // the upper bits are zero.
8049 // FIXME: Use computeKnownBits instead of checking specific opcode?
8050 SDValue Cond = Op.getOperand(SplatIdx);
8051 assert(Cond.getValueType() == MVT::i8 && "Unexpected VT!");
8052 if (Cond.getOpcode() != ISD::SETCC)
8053 Cond = DAG.getNode(ISD::AND, dl, MVT::i8, Cond,
8054 DAG.getConstant(1, dl, MVT::i8));
8055
8056 // Perform the select in the scalar domain so we can use cmov.
8057 if (VT == MVT::v64i1 && !Subtarget.is64Bit()) {
8058 SDValue Select = DAG.getSelect(dl, MVT::i32, Cond,
8059 DAG.getAllOnesConstant(dl, MVT::i32),
8060 DAG.getConstant(0, dl, MVT::i32));
8061 Select = DAG.getBitcast(MVT::v32i1, Select);
8062 return DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v64i1, Select, Select);
8063 } else {
8064 MVT ImmVT = MVT::getIntegerVT(std::max((unsigned)VT.getSizeInBits(), 8U));
8065 SDValue Select = DAG.getSelect(dl, ImmVT, Cond,
8066 DAG.getAllOnesConstant(dl, ImmVT),
8067 DAG.getConstant(0, dl, ImmVT));
8068 MVT VecVT = VT.getSizeInBits() >= 8 ? VT : MVT::v8i1;
8069 Select = DAG.getBitcast(VecVT, Select);
8070 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, Select,
8071 DAG.getVectorIdxConstant(0, dl));
8072 }
8073 }
8074
8075 // insert elements one by one
8076 SDValue DstVec;
8077 if (HasConstElts) {
8078 if (VT == MVT::v64i1 && !Subtarget.is64Bit()) {
8079 SDValue ImmL = DAG.getConstant(Lo_32(Immediate), dl, MVT::i32);
8080 SDValue ImmH = DAG.getConstant(Hi_32(Immediate), dl, MVT::i32);
8081 ImmL = DAG.getBitcast(MVT::v32i1, ImmL);
8082 ImmH = DAG.getBitcast(MVT::v32i1, ImmH);
8083 DstVec = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v64i1, ImmL, ImmH);
8084 } else {
8085 MVT ImmVT = MVT::getIntegerVT(std::max((unsigned)VT.getSizeInBits(), 8U));
8086 SDValue Imm = DAG.getConstant(Immediate, dl, ImmVT);
8087 MVT VecVT = VT.getSizeInBits() >= 8 ? VT : MVT::v8i1;
8088 DstVec = DAG.getBitcast(VecVT, Imm);
8089 DstVec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, DstVec,
8090 DAG.getVectorIdxConstant(0, dl));
8091 }
8092 } else
8093 DstVec = DAG.getUNDEF(VT);
8094
8095 for (unsigned InsertIdx : NonConstIdx) {
8096 DstVec = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, DstVec,
8097 Op.getOperand(InsertIdx),
8098 DAG.getVectorIdxConstant(InsertIdx, dl));
8099 }
8100 return DstVec;
8101}
8102
8103LLVM_ATTRIBUTE_UNUSED static bool isHorizOp(unsigned Opcode) {
8104 switch (Opcode) {
8105 case X86ISD::PACKSS:
8106 case X86ISD::PACKUS:
8107 case X86ISD::FHADD:
8108 case X86ISD::FHSUB:
8109 case X86ISD::HADD:
8110 case X86ISD::HSUB:
8111 return true;
8112 }
8113 return false;
8114}
8115
8116/// This is a helper function of LowerToHorizontalOp().
8117/// This function checks that the build_vector \p N in input implements a
8118/// 128-bit partial horizontal operation on a 256-bit vector, but that operation
8119/// may not match the layout of an x86 256-bit horizontal instruction.
8120/// In other words, if this returns true, then some extraction/insertion will
8121/// be required to produce a valid horizontal instruction.
8122///
8123/// Parameter \p Opcode defines the kind of horizontal operation to match.
8124/// For example, if \p Opcode is equal to ISD::ADD, then this function
8125/// checks if \p N implements a horizontal arithmetic add; if instead \p Opcode
8126/// is equal to ISD::SUB, then this function checks if this is a horizontal
8127/// arithmetic sub.
8128///
8129/// This function only analyzes elements of \p N whose indices are
8130/// in range [BaseIdx, LastIdx).
8131///
8132/// TODO: This function was originally used to match both real and fake partial
8133/// horizontal operations, but the index-matching logic is incorrect for that.
8134/// See the corrected implementation in isHopBuildVector(). Can we reduce this
8135/// code because it is only used for partial h-op matching now?
8136static bool isHorizontalBinOpPart(const BuildVectorSDNode *N, unsigned Opcode,
8137 const SDLoc &DL, SelectionDAG &DAG,
8138 unsigned BaseIdx, unsigned LastIdx,
8139 SDValue &V0, SDValue &V1) {
8140 EVT VT = N->getValueType(0);
8141 assert(VT.is256BitVector() && "Only use for matching partial 256-bit h-ops");
8142 assert(BaseIdx * 2 <= LastIdx && "Invalid Indices in input!");
8143 assert(VT.isVector() && VT.getVectorNumElements() >= LastIdx &&
8144 "Invalid Vector in input!");
8145
8146 bool IsCommutable = (Opcode == ISD::ADD || Opcode == ISD::FADD);
8147 bool CanFold = true;
8148 unsigned ExpectedVExtractIdx = BaseIdx;
8149 unsigned NumElts = LastIdx - BaseIdx;
8150 V0 = DAG.getUNDEF(VT);
8151 V1 = DAG.getUNDEF(VT);
8152
8153 // Check if N implements a horizontal binop.
8154 for (unsigned i = 0, e = NumElts; i != e && CanFold; ++i) {
8155 SDValue Op = N->getOperand(i + BaseIdx);
8156
8157 // Skip UNDEFs.
8158 if (Op->isUndef()) {
8159 // Update the expected vector extract index.
8160 if (i * 2 == NumElts)
8161 ExpectedVExtractIdx = BaseIdx;
8162 ExpectedVExtractIdx += 2;
8163 continue;
8164 }
8165
8166 CanFold = Op->getOpcode() == Opcode && Op->hasOneUse();
8167
8168 if (!CanFold)
8169 break;
8170
8171 SDValue Op0 = Op.getOperand(0);
8172 SDValue Op1 = Op.getOperand(1);
8173
8174 // Try to match the following pattern:
8175 // (BINOP (extract_vector_elt A, I), (extract_vector_elt A, I+1))
8176 CanFold = (Op0.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
8177 Op1.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
8178 Op0.getOperand(0) == Op1.getOperand(0) &&
8179 isa<ConstantSDNode>(Op0.getOperand(1)) &&
8180 isa<ConstantSDNode>(Op1.getOperand(1)));
8181 if (!CanFold)
8182 break;
8183
8184 unsigned I0 = Op0.getConstantOperandVal(1);
8185 unsigned I1 = Op1.getConstantOperandVal(1);
8186
8187 if (i * 2 < NumElts) {
8188 if (V0.isUndef()) {
8189 V0 = Op0.getOperand(0);
8190 if (V0.getValueType() != VT)
8191 return false;
8192 }
8193 } else {
8194 if (V1.isUndef()) {
8195 V1 = Op0.getOperand(0);
8196 if (V1.getValueType() != VT)
8197 return false;
8198 }
8199 if (i * 2 == NumElts)
8200 ExpectedVExtractIdx = BaseIdx;
8201 }
8202
8203 SDValue Expected = (i * 2 < NumElts) ? V0 : V1;
8204 if (I0 == ExpectedVExtractIdx)
8205 CanFold = I1 == I0 + 1 && Op0.getOperand(0) == Expected;
8206 else if (IsCommutable && I1 == ExpectedVExtractIdx) {
8207 // Try to match the following dag sequence:
8208 // (BINOP (extract_vector_elt A, I+1), (extract_vector_elt A, I))
8209 CanFold = I0 == I1 + 1 && Op1.getOperand(0) == Expected;
8210 } else
8211 CanFold = false;
8212
8213 ExpectedVExtractIdx += 2;
8214 }
8215
8216 return CanFold;
8217}
8218
8219/// Emit a sequence of two 128-bit horizontal add/sub followed by
8220/// a concat_vector.
8221///
8222/// This is a helper function of LowerToHorizontalOp().
8223/// This function expects two 256-bit vectors called V0 and V1.
8224/// At first, each vector is split into two separate 128-bit vectors.
8225/// Then, the resulting 128-bit vectors are used to implement two
8226/// horizontal binary operations.
8227///
8228/// The kind of horizontal binary operation is defined by \p X86Opcode.
8229///
8230/// \p Mode specifies how the 128-bit parts of V0 and V1 are passed as input to
8231/// the two new horizontal binops.
8232/// When Mode is set, the first horizontal binop dag node would take as input
8233/// the lower 128-bit of V0 and the upper 128-bit of V0. The second
8234/// horizontal binop dag node would take as input the lower 128-bit of V1
8235/// and the upper 128-bit of V1.
8236/// Example:
8237/// HADD V0_LO, V0_HI
8238/// HADD V1_LO, V1_HI
8239///
8240/// Otherwise, the first horizontal binop dag node takes as input the lower
8241/// 128-bit of V0 and the lower 128-bit of V1, and the second horizontal binop
8242/// dag node takes the upper 128-bit of V0 and the upper 128-bit of V1.
8243/// Example:
8244/// HADD V0_LO, V1_LO
8245/// HADD V0_HI, V1_HI
8246///
8247/// If \p isUndefLO is set, then the algorithm propagates UNDEF to the lower
8248/// 128-bits of the result. If \p isUndefHI is set, then UNDEF is propagated to
8249/// the upper 128-bits of the result.
8250static SDValue ExpandHorizontalBinOp(const SDValue &V0, const SDValue &V1,
8251 const SDLoc &DL, SelectionDAG &DAG,
8252 unsigned X86Opcode, bool Mode,
8253 bool isUndefLO, bool isUndefHI) {
8254 MVT VT = V0.getSimpleValueType();
8255 assert(VT.is256BitVector() && VT == V1.getSimpleValueType() &&
8256 "Invalid nodes in input!");
8257
8258 unsigned NumElts = VT.getVectorNumElements();
8259 SDValue V0_LO = extract128BitVector(V0, 0, DAG, DL);
8260 SDValue V0_HI = extract128BitVector(V0, NumElts/2, DAG, DL);
8261 SDValue V1_LO = extract128BitVector(V1, 0, DAG, DL);
8262 SDValue V1_HI = extract128BitVector(V1, NumElts/2, DAG, DL);
8263 MVT NewVT = V0_LO.getSimpleValueType();
8264
8265 SDValue LO = DAG.getUNDEF(NewVT);
8266 SDValue HI = DAG.getUNDEF(NewVT);
8267
8268 if (Mode) {
8269 // Don't emit a horizontal binop if the result is expected to be UNDEF.
8270 if (!isUndefLO && !V0->isUndef())
8271 LO = DAG.getNode(X86Opcode, DL, NewVT, V0_LO, V0_HI);
8272 if (!isUndefHI && !V1->isUndef())
8273 HI = DAG.getNode(X86Opcode, DL, NewVT, V1_LO, V1_HI);
8274 } else {
8275 // Don't emit a horizontal binop if the result is expected to be UNDEF.
8276 if (!isUndefLO && (!V0_LO->isUndef() || !V1_LO->isUndef()))
8277 LO = DAG.getNode(X86Opcode, DL, NewVT, V0_LO, V1_LO);
8278
8279 if (!isUndefHI && (!V0_HI->isUndef() || !V1_HI->isUndef()))
8280 HI = DAG.getNode(X86Opcode, DL, NewVT, V0_HI, V1_HI);
8281 }
8282
8283 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, LO, HI);
8284}
8285
8286/// Returns true iff \p BV builds a vector with the result equivalent to
8287/// the result of ADDSUB/SUBADD operation.
8288/// If true is returned then the operands of ADDSUB = Opnd0 +- Opnd1
8289/// (SUBADD = Opnd0 -+ Opnd1) operation are written to the parameters
8290/// \p Opnd0 and \p Opnd1.
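/// For example, <(fsub a0, b0), (fadd a1, b1), (fsub a2, b2), (fadd a3, b3)>
/// matches ADDSUB(a, b); the opposite parity matches SUBADD(a, b).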
8291static bool isAddSubOrSubAdd(const BuildVectorSDNode *BV,
8292 const X86Subtarget &Subtarget, SelectionDAG &DAG,
8293 SDValue &Opnd0, SDValue &Opnd1,
8294 unsigned &NumExtracts, bool &IsSubAdd,
8295 bool &HasAllowContract) {
8296 using namespace SDPatternMatch;
8297
8298 MVT VT = BV->getSimpleValueType(0);
8299 if (!Subtarget.hasSSE3() || !VT.isFloatingPoint())
8300 return false;
8301
8302 unsigned NumElts = VT.getVectorNumElements();
8303 SDValue InVec0 = DAG.getUNDEF(VT);
8304 SDValue InVec1 = DAG.getUNDEF(VT);
8305
8306 NumExtracts = 0;
8307 HasAllowContract = NumElts != 0;
8308
8309 // Odd-numbered elements in the input build vector are obtained from
8310 // adding/subtracting two integer/float elements.
8311 // Even-numbered elements in the input build vector are obtained from
8312 // subtracting/adding two integer/float elements.
8313 unsigned Opc[2] = {0, 0};
8314 for (unsigned i = 0, e = NumElts; i != e; ++i) {
8315 SDValue Op = BV->getOperand(i);
8316
8317 // Skip 'undef' values.
8318 unsigned Opcode = Op.getOpcode();
8319 if (Opcode == ISD::UNDEF)
8320 continue;
8321
8322 // Early exit if we found an unexpected opcode.
8323 if (Opcode != ISD::FADD && Opcode != ISD::FSUB)
8324 return false;
8325
8326 SDValue Op0 = Op.getOperand(0);
8327 SDValue Op1 = Op.getOperand(1);
8328
8329 // Try to match the following pattern:
8330 // (BINOP (extract_vector_elt A, i), (extract_vector_elt B, i))
8331 // Early exit if we cannot match that sequence.
8332 if (!sd_match(Op0, m_ExtractElt(m_SpecificVT(VT), m_SpecificInt(i))) ||
8333 !sd_match(Op1, m_ExtractElt(m_SpecificVT(VT), m_SpecificInt(i))))
8334 return false;
8335
8336 // We found a valid add/sub node; make sure it's the same opcode as previous
8337 // elements for this parity.
8338 if (Opc[i % 2] != 0 && Opc[i % 2] != Opcode)
8339 return false;
8340 Opc[i % 2] = Opcode;
8341
8342 // Update InVec0 and InVec1.
8343 if (InVec0.isUndef())
8344 InVec0 = Op0.getOperand(0);
8345 if (InVec1.isUndef())
8346 InVec1 = Op1.getOperand(0);
8347
8348 // Make sure that the operands of each add/sub node always
8349 // come from the same pair of vectors.
8350 if (InVec0 != Op0.getOperand(0)) {
8351 if (Opcode == ISD::FSUB)
8352 return false;
8353
8354 // FADD is commutable. Try to commute the operands
8355 // and then test again.
8356 std::swap(Op0, Op1);
8357 if (InVec0 != Op0.getOperand(0))
8358 return false;
8359 }
8360
8361 if (InVec1 != Op1.getOperand(0))
8362 return false;
8363
8364 // Increment the number of extractions done.
8365 ++NumExtracts;
8366 HasAllowContract &= Op->getFlags().hasAllowContract();
8367 }
8368
8369 // Ensure we have found an opcode for both parities and that they are
8370 // different. Don't try to fold this build_vector into an ADDSUB/SUBADD if the
8371 // inputs are undef.
8372 if (!Opc[0] || !Opc[1] || Opc[0] == Opc[1] ||
8373 InVec0.isUndef() || InVec1.isUndef())
8374 return false;
8375
8376 IsSubAdd = Opc[0] == ISD::FADD;
8377
8378 Opnd0 = InVec0;
8379 Opnd1 = InVec1;
8380 return true;
8381}
8382
8383/// Returns true if it is possible to fold MUL and an idiom that has already been
8384/// recognized as ADDSUB/SUBADD(\p Opnd0, \p Opnd1) into
8385/// FMADDSUB/FMSUBADD(x, y, \p Opnd1). If (and only if) true is returned, the
8386/// operands of FMADDSUB/FMSUBADD are written to parameters \p Opnd0, \p Opnd1, \p Opnd2.
8387///
8388/// Prior to calling this function it should be known that there is some
8389/// SDNode that potentially can be replaced with an X86ISD::ADDSUB operation
8390/// using \p Opnd0 and \p Opnd1 as operands. Also, this method is called
8391/// before replacement of such SDNode with ADDSUB operation. Thus the number
8392/// of \p Opnd0 uses is expected to be equal to 2.
8393/// For example, this function may be called for the following IR:
8394/// %AB = fmul fast <2 x double> %A, %B
8395/// %Sub = fsub fast <2 x double> %AB, %C
8396/// %Add = fadd fast <2 x double> %AB, %C
8397/// %Addsub = shufflevector <2 x double> %Sub, <2 x double> %Add,
8398/// <2 x i32> <i32 0, i32 3>
8399/// There is a def for %Addsub here, which potentially can be replaced by
8400/// X86ISD::ADDSUB operation:
8401/// %Addsub = X86ISD::ADDSUB %AB, %C
8402/// and such ADDSUB can further be replaced with FMADDSUB:
8403/// %Addsub = FMADDSUB %A, %B, %C.
8404///
8405/// The main reason why this method is called before the replacement of the
8406/// recognized ADDSUB idiom with ADDSUB operation is that such replacement
8407/// is illegal sometimes. E.g. 512-bit ADDSUB is not available, while 512-bit
8408/// FMADDSUB is.
8409static bool isFMAddSubOrFMSubAdd(const X86Subtarget &Subtarget,
8410 SelectionDAG &DAG, SDValue &Opnd0,
8411 SDValue &Opnd1, SDValue &Opnd2,
8412 unsigned ExpectedUses,
8413 bool AllowSubAddOrAddSubContract) {
8414 if (Opnd0.getOpcode() != ISD::FMUL ||
8415 !Opnd0->hasNUsesOfValue(ExpectedUses, 0) || !Subtarget.hasAnyFMA())
8416 return false;
8417
8418 // FIXME: These checks must match the similar ones in
8419 // DAGCombiner::visitFADDForFMACombine. It would be good to have one
8420 // function that would answer if it is Ok to fuse MUL + ADD to FMADD
8421 // or MUL + ADDSUB to FMADDSUB.
8422 const TargetOptions &Options = DAG.getTarget().Options;
8423 bool AllowFusion =
8424 Options.AllowFPOpFusion == FPOpFusion::Fast ||
8425 (AllowSubAddOrAddSubContract && Opnd0->getFlags().hasAllowContract());
8426 if (!AllowFusion)
8427 return false;
8428
8429 Opnd2 = Opnd1;
8430 Opnd1 = Opnd0.getOperand(1);
8431 Opnd0 = Opnd0.getOperand(0);
8432
8433 return true;
8434}
8435
8436/// Try to fold a build_vector that performs an 'addsub', 'fmaddsub' or
8437/// 'fmsubadd' operation into an X86ISD::ADDSUB, X86ISD::FMADDSUB or
8438/// X86ISD::FMSUBADD node.
8439static SDValue lowerToAddSubOrFMAddSub(const BuildVectorSDNode *BV,
8440 const SDLoc &DL,
8441 const X86Subtarget &Subtarget,
8442 SelectionDAG &DAG) {
8443 SDValue Opnd0, Opnd1;
8444 unsigned NumExtracts;
8445 bool IsSubAdd;
8446 bool HasAllowContract;
8447 if (!isAddSubOrSubAdd(BV, Subtarget, DAG, Opnd0, Opnd1, NumExtracts, IsSubAdd,
8448 HasAllowContract))
8449 return SDValue();
8450
8451 MVT VT = BV->getSimpleValueType(0);
8452
8453 // Try to generate X86ISD::FMADDSUB node here.
8454 SDValue Opnd2;
8455 if (isFMAddSubOrFMSubAdd(Subtarget, DAG, Opnd0, Opnd1, Opnd2, NumExtracts,
8456 HasAllowContract)) {
8457 unsigned Opc = IsSubAdd ? X86ISD::FMSUBADD : X86ISD::FMADDSUB;
8458 return DAG.getNode(Opc, DL, VT, Opnd0, Opnd1, Opnd2);
8459 }
8460
8461 // We only support ADDSUB.
8462 if (IsSubAdd)
8463 return SDValue();
8464
8465 // There are no known X86 targets with 512-bit ADDSUB instructions!
8466 // Convert to blend(fsub,fadd).
8467 if (VT.is512BitVector()) {
8468 SmallVector<int> Mask;
8469 for (int I = 0, E = VT.getVectorNumElements(); I != E; I += 2) {
8470 Mask.push_back(I);
8471 Mask.push_back(I + E + 1);
8472 }
8473 SDValue Sub = DAG.getNode(ISD::FSUB, DL, VT, Opnd0, Opnd1);
8474 SDValue Add = DAG.getNode(ISD::FADD, DL, VT, Opnd0, Opnd1);
8475 return DAG.getVectorShuffle(VT, DL, Sub, Add, Mask);
8476 }
8477
8478 return DAG.getNode(X86ISD::ADDSUB, DL, VT, Opnd0, Opnd1);
8479}
8480
8481static bool isHopBuildVector(const BuildVectorSDNode *BV, SelectionDAG &DAG,
8482 unsigned &HOpcode, SDValue &V0, SDValue &V1) {
8483 // Initialize outputs to known values.
8484 MVT VT = BV->getSimpleValueType(0);
8485 HOpcode = ISD::DELETED_NODE;
8486 V0 = DAG.getUNDEF(VT);
8487 V1 = DAG.getUNDEF(VT);
8488
8489 // x86 256-bit horizontal ops are defined in a non-obvious way. Each 128-bit
8490 // half of the result is calculated independently from the 128-bit halves of
8491 // the inputs, so that makes the index-checking logic below more complicated.
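 // For example, for v8f32 HADD(A, B) each 128-bit half of the result is
 // computed from the matching halves of both inputs:
 //   { A0+A1, A2+A3, B0+B1, B2+B3, A4+A5, A6+A7, B4+B5, B6+B7 }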
8492 unsigned NumElts = VT.getVectorNumElements();
8493 unsigned GenericOpcode = ISD::DELETED_NODE;
8494 unsigned Num128BitChunks = VT.is256BitVector() ? 2 : 1;
8495 unsigned NumEltsIn128Bits = NumElts / Num128BitChunks;
8496 unsigned NumEltsIn64Bits = NumEltsIn128Bits / 2;
8497 for (unsigned i = 0; i != Num128BitChunks; ++i) {
8498 for (unsigned j = 0; j != NumEltsIn128Bits; ++j) {
8499 // Ignore undef elements.
8500 SDValue Op = BV->getOperand(i * NumEltsIn128Bits + j);
8501 if (Op.isUndef())
8502 continue;
8503
8504 // If there's an opcode mismatch, we're done.
8505 if (HOpcode != ISD::DELETED_NODE && Op.getOpcode() != GenericOpcode)
8506 return false;
8507
8508 // Initialize horizontal opcode.
8509 if (HOpcode == ISD::DELETED_NODE) {
8510 GenericOpcode = Op.getOpcode();
8511 switch (GenericOpcode) {
8512 // clang-format off
8513 case ISD::ADD: HOpcode = X86ISD::HADD; break;
8514 case ISD::SUB: HOpcode = X86ISD::HSUB; break;
8515 case ISD::FADD: HOpcode = X86ISD::FHADD; break;
8516 case ISD::FSUB: HOpcode = X86ISD::FHSUB; break;
8517 default: return false;
8518 // clang-format on
8519 }
8520 }
8521
8522 SDValue Op0 = Op.getOperand(0);
8523 SDValue Op1 = Op.getOperand(1);
8524 if (Op0.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
8525 Op1.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
8526 Op0.getOperand(0) != Op1.getOperand(0) ||
8527 !isa<ConstantSDNode>(Op0.getOperand(1)) ||
8528 !isa<ConstantSDNode>(Op1.getOperand(1)) || !Op.hasOneUse())
8529 return false;
8530
8531 // The source vector is chosen based on which 64-bit half of the
8532 // destination vector is being calculated.
8533 if (j < NumEltsIn64Bits) {
8534 if (V0.isUndef())
8535 V0 = Op0.getOperand(0);
8536 } else {
8537 if (V1.isUndef())
8538 V1 = Op0.getOperand(0);
8539 }
8540
8541 SDValue SourceVec = (j < NumEltsIn64Bits) ? V0 : V1;
8542 if (SourceVec != Op0.getOperand(0))
8543 return false;
8544
8545 // op (extract_vector_elt A, I), (extract_vector_elt A, I+1)
8546 unsigned ExtIndex0 = Op0.getConstantOperandVal(1);
8547 unsigned ExtIndex1 = Op1.getConstantOperandVal(1);
8548 unsigned ExpectedIndex = i * NumEltsIn128Bits +
8549 (j % NumEltsIn64Bits) * 2;
8550 if (ExpectedIndex == ExtIndex0 && ExtIndex1 == ExtIndex0 + 1)
8551 continue;
8552
8553 // If this is not a commutative op, this does not match.
8554 if (GenericOpcode != ISD::ADD && GenericOpcode != ISD::FADD)
8555 return false;
8556
8557 // Addition is commutative, so try swapping the extract indexes.
8558 // op (extract_vector_elt A, I+1), (extract_vector_elt A, I)
8559 if (ExpectedIndex == ExtIndex1 && ExtIndex0 == ExtIndex1 + 1)
8560 continue;
8561
8562 // Extract indexes do not match horizontal requirement.
8563 return false;
8564 }
8565 }
8566 // We matched. Opcode and operands are returned by reference as arguments.
8567 return true;
8568}
8569
8570static SDValue getHopForBuildVector(const BuildVectorSDNode *BV,
8571 const SDLoc &DL, SelectionDAG &DAG,
8572 unsigned HOpcode, SDValue V0, SDValue V1) {
8573 // If either input vector is not the same size as the build vector,
8574 // extract/insert the low bits to the correct size.
8575 // This is free (examples: zmm --> xmm, xmm --> ymm).
8576 MVT VT = BV->getSimpleValueType(0);
8577 unsigned Width = VT.getSizeInBits();
8578 if (V0.getValueSizeInBits() > Width)
8579 V0 = extractSubVector(V0, 0, DAG, DL, Width);
8580 else if (V0.getValueSizeInBits() < Width)
8581 V0 = insertSubVector(DAG.getUNDEF(VT), V0, 0, DAG, DL, Width);
8582
8583 if (V1.getValueSizeInBits() > Width)
8584 V1 = extractSubVector(V1, 0, DAG, DL, Width);
8585 else if (V1.getValueSizeInBits() < Width)
8586 V1 = insertSubVector(DAG.getUNDEF(VT), V1, 0, DAG, DL, Width);
8587
8588 unsigned NumElts = VT.getVectorNumElements();
8589 APInt DemandedElts = APInt::getAllOnes(NumElts);
8590 for (unsigned i = 0; i != NumElts; ++i)
8591 if (BV->getOperand(i).isUndef())
8592 DemandedElts.clearBit(i);
8593
8594 // If we don't need the upper xmm, then perform as an xmm hop.
8595 unsigned HalfNumElts = NumElts / 2;
8596 if (VT.is256BitVector() && DemandedElts.lshr(HalfNumElts) == 0) {
8597 MVT HalfVT = VT.getHalfNumVectorElementsVT();
8598 V0 = extractSubVector(V0, 0, DAG, DL, 128);
8599 V1 = extractSubVector(V1, 0, DAG, DL, 128);
8600 SDValue Half = DAG.getNode(HOpcode, DL, HalfVT, V0, V1);
8601 return insertSubVector(DAG.getUNDEF(VT), Half, 0, DAG, DL, 256);
8602 }
8603
8604 return DAG.getNode(HOpcode, DL, VT, V0, V1);
8605}
8606
8607/// Lower BUILD_VECTOR to a horizontal add/sub operation if possible.
8608static SDValue LowerToHorizontalOp(const BuildVectorSDNode *BV, const SDLoc &DL,
8609 const X86Subtarget &Subtarget,
8610 SelectionDAG &DAG) {
8611 // We need at least 2 non-undef elements to make this worthwhile by default.
8612 unsigned NumNonUndefs =
8613 count_if(BV->op_values(), [](SDValue V) { return !V.isUndef(); });
8614 if (NumNonUndefs < 2)
8615 return SDValue();
8616
8617 // There are 4 sets of horizontal math operations distinguished by type:
8618 // int/FP at 128-bit/256-bit. Each type was introduced with a different
8619 // subtarget feature. Try to match those "native" patterns first.
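 // (SSE3 provides haddps/haddpd and hsubps/hsubpd for FP, SSSE3 provides the
 // integer phadd/phsub forms, and AVX/AVX2 provide the 256-bit FP/integer
 // versions.)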
8620 MVT VT = BV->getSimpleValueType(0);
8621 if (((VT == MVT::v4f32 || VT == MVT::v2f64) && Subtarget.hasSSE3()) ||
8622 ((VT == MVT::v8i16 || VT == MVT::v4i32) && Subtarget.hasSSSE3()) ||
8623 ((VT == MVT::v8f32 || VT == MVT::v4f64) && Subtarget.hasAVX()) ||
8624 ((VT == MVT::v16i16 || VT == MVT::v8i32) && Subtarget.hasAVX2())) {
8625 unsigned HOpcode;
8626 SDValue V0, V1;
8627 if (isHopBuildVector(BV, DAG, HOpcode, V0, V1))
8628 return getHopForBuildVector(BV, DL, DAG, HOpcode, V0, V1);
8629 }
8630
8631 // Try harder to match 256-bit ops by using extract/concat.
8632 if (!Subtarget.hasAVX() || !VT.is256BitVector())
8633 return SDValue();
8634
8635 // Count the number of UNDEF operands in the input build_vector.
8636 unsigned NumElts = VT.getVectorNumElements();
8637 unsigned Half = NumElts / 2;
8638 unsigned NumUndefsLO = 0;
8639 unsigned NumUndefsHI = 0;
8640 for (unsigned i = 0, e = Half; i != e; ++i)
8641 if (BV->getOperand(i)->isUndef())
8642 NumUndefsLO++;
8643
8644 for (unsigned i = Half, e = NumElts; i != e; ++i)
8645 if (BV->getOperand(i)->isUndef())
8646 NumUndefsHI++;
8647
8648 SDValue InVec0, InVec1;
8649 if (VT == MVT::v8i32 || VT == MVT::v16i16) {
8650 SDValue InVec2, InVec3;
8651 unsigned X86Opcode;
8652 bool CanFold = true;
8653
8654 if (isHorizontalBinOpPart(BV, ISD::ADD, DL, DAG, 0, Half, InVec0, InVec1) &&
8655 isHorizontalBinOpPart(BV, ISD::ADD, DL, DAG, Half, NumElts, InVec2,
8656 InVec3) &&
8657 ((InVec0.isUndef() || InVec2.isUndef()) || InVec0 == InVec2) &&
8658 ((InVec1.isUndef() || InVec3.isUndef()) || InVec1 == InVec3))
8659 X86Opcode = X86ISD::HADD;
8660 else if (isHorizontalBinOpPart(BV, ISD::SUB, DL, DAG, 0, Half, InVec0,
8661 InVec1) &&
8662 isHorizontalBinOpPart(BV, ISD::SUB, DL, DAG, Half, NumElts, InVec2,
8663 InVec3) &&
8664 ((InVec0.isUndef() || InVec2.isUndef()) || InVec0 == InVec2) &&
8665 ((InVec1.isUndef() || InVec3.isUndef()) || InVec1 == InVec3))
8666 X86Opcode = X86ISD::HSUB;
8667 else
8668 CanFold = false;
8669
8670 if (CanFold) {
8671 // Do not try to expand this build_vector into a pair of horizontal
8672 // add/sub if we can emit a pair of scalar add/sub.
8673 if (NumUndefsLO + 1 == Half || NumUndefsHI + 1 == Half)
8674 return SDValue();
8675
8676 // Convert this build_vector into a pair of horizontal binops followed by
8677 // a concat vector. We must adjust the outputs from the partial horizontal
8678 // matching calls above to account for undefined vector halves.
8679 SDValue V0 = InVec0.isUndef() ? InVec2 : InVec0;
8680 SDValue V1 = InVec1.isUndef() ? InVec3 : InVec1;
8681 assert((!V0.isUndef() || !V1.isUndef()) && "Horizontal-op of undefs?");
8682 bool isUndefLO = NumUndefsLO == Half;
8683 bool isUndefHI = NumUndefsHI == Half;
8684 return ExpandHorizontalBinOp(V0, V1, DL, DAG, X86Opcode, false, isUndefLO,
8685 isUndefHI);
8686 }
8687 }
8688
8689 if (VT == MVT::v8f32 || VT == MVT::v4f64 || VT == MVT::v8i32 ||
8690 VT == MVT::v16i16) {
8691 unsigned X86Opcode;
8692 if (isHorizontalBinOpPart(BV, ISD::ADD, DL, DAG, 0, NumElts, InVec0,
8693 InVec1))
8694 X86Opcode = X86ISD::HADD;
8695 else if (isHorizontalBinOpPart(BV, ISD::SUB, DL, DAG, 0, NumElts, InVec0,
8696 InVec1))
8697 X86Opcode = X86ISD::HSUB;
8698 else if (isHorizontalBinOpPart(BV, ISD::FADD, DL, DAG, 0, NumElts, InVec0,
8699 InVec1))
8700 X86Opcode = X86ISD::FHADD;
8701 else if (isHorizontalBinOpPart(BV, ISD::FSUB, DL, DAG, 0, NumElts, InVec0,
8702 InVec1))
8703 X86Opcode = X86ISD::FHSUB;
8704 else
8705 return SDValue();
8706
8707 // Don't try to expand this build_vector into a pair of horizontal add/sub
8708 // if we can simply emit a pair of scalar add/sub.
8709 if (NumUndefsLO + 1 == Half || NumUndefsHI + 1 == Half)
8710 return SDValue();
8711
8712 // Convert this build_vector into two horizontal add/sub followed by
8713 // a concat vector.
8714 bool isUndefLO = NumUndefsLO == Half;
8715 bool isUndefHI = NumUndefsHI == Half;
8716 return ExpandHorizontalBinOp(InVec0, InVec1, DL, DAG, X86Opcode, true,
8717 isUndefLO, isUndefHI);
8718 }
8719
8720 return SDValue();
8721}
8722
8723static SDValue LowerShift(SDValue Op, const X86Subtarget &Subtarget,
8724 SelectionDAG &DAG);
8725
8726/// If a BUILD_VECTOR's source elements all apply the same bit operation and
8727/// one of their operands is constant, lower to a pair of BUILD_VECTOR and
8728/// just apply the bit to the vectors.
8729/// NOTE: It's not in our interest to start making a general-purpose vectorizer
8730/// from this, but enough scalar bit operations are created by the later
8731/// legalization + scalarization stages to need basic support.
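/// For example: (build_vector (and x, 1), (and y, 2), (and z, 4), (and w, 8))
/// becomes (and (build_vector x, y, z, w), (build_vector 1, 2, 4, 8)).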
8732static SDValue lowerBuildVectorToBitOp(BuildVectorSDNode *Op, const SDLoc &DL,
8733 const X86Subtarget &Subtarget,
8734 SelectionDAG &DAG) {
8735 MVT VT = Op->getSimpleValueType(0);
8736 unsigned NumElems = VT.getVectorNumElements();
8737 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
8738
8739 // Check that all elements have the same opcode.
8740 // TODO: Should we allow UNDEFS and if so how many?
8741 unsigned Opcode = Op->getOperand(0).getOpcode();
8742 for (unsigned i = 1; i < NumElems; ++i)
8743 if (Opcode != Op->getOperand(i).getOpcode())
8744 return SDValue();
8745
8746 // TODO: We may be able to add support for other Ops (ADD/SUB + shifts).
8747 bool IsShift = false;
8748 switch (Opcode) {
8749 default:
8750 return SDValue();
8751 case ISD::SHL:
8752 case ISD::SRL:
8753 case ISD::SRA:
8754 IsShift = true;
8755 break;
8756 case ISD::AND:
8757 case ISD::XOR:
8758 case ISD::OR:
8759 // Don't do this if the buildvector is a splat - we'd replace one
8760 // constant with an entire vector.
8761 if (Op->getSplatValue())
8762 return SDValue();
8763 if (!TLI.isOperationLegalOrPromote(Opcode, VT))
8764 return SDValue();
8765 break;
8766 }
8767
8768 SmallVector<SDValue, 4> LHSElts, RHSElts;
8769 for (SDValue Elt : Op->ops()) {
8770 SDValue LHS = Elt.getOperand(0);
8771 SDValue RHS = Elt.getOperand(1);
8772
8773 // We expect the canonicalized RHS operand to be the constant.
8774 if (!isa<ConstantSDNode>(RHS))
8775 return SDValue();
8776
8777 // Extend shift amounts.
8778 if (RHS.getValueSizeInBits() != VT.getScalarSizeInBits()) {
8779 if (!IsShift)
8780 return SDValue();
8781 RHS = DAG.getZExtOrTrunc(RHS, DL, VT.getScalarType());
8782 }
8783
8784 LHSElts.push_back(LHS);
8785 RHSElts.push_back(RHS);
8786 }
8787
8788 // Limit to shifts by uniform immediates.
8789 // TODO: Only accept vXi8/vXi64 special cases?
8790 // TODO: Permit non-uniform XOP/AVX2/MULLO cases?
8791 if (IsShift && any_of(RHSElts, [&](SDValue V) { return RHSElts[0] != V; }))
8792 return SDValue();
8793
8794 SDValue LHS = DAG.getBuildVector(VT, DL, LHSElts);
8795 SDValue RHS = DAG.getBuildVector(VT, DL, RHSElts);
8796 SDValue Res = DAG.getNode(Opcode, DL, VT, LHS, RHS);
8797
8798 if (!IsShift)
8799 return Res;
8800
8801 // Immediately lower the shift to ensure the constant build vector doesn't
8802 // get converted to a constant pool before the shift is lowered.
8803 return LowerShift(Res, Subtarget, DAG);
8804}
8805
8806static bool isShuffleFoldableLoad(SDValue);
8807
8808/// Attempt to lower a BUILD_VECTOR of scalar values to a shuffle of splats
8809/// representing a blend.
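/// For example, the v4f64 build_vector (a, b, b, a) becomes
/// shuffle(splat(a), splat(b), <0, 5, 6, 3>).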
8810static SDValue lowerBuildVectorAsBlend(BuildVectorSDNode *BVOp, SDLoc const &DL,
8811 X86Subtarget const &Subtarget,
8812 SelectionDAG &DAG) {
8813 MVT VT = BVOp->getSimpleValueType(0u);
8814
8815 if (VT != MVT::v4f64)
8816 return SDValue();
8817
8818 // Collect unique operands.
8819 auto UniqueOps = SmallSet<SDValue, 16u>();
8820 for (SDValue Op : BVOp->ops()) {
8821 if (isIntOrFPConstant(Op) || Op.isUndef())
8822 return SDValue();
8823 UniqueOps.insert(Op);
8824 }
8825
8826 // Candidate BUILD_VECTOR must have 2 unique operands.
8827 if (UniqueOps.size() != 2u)
8828 return SDValue();
8829
8830 SDValue Op0 = BVOp->getOperand(0u);
8831 UniqueOps.erase(Op0);
8832 SDValue Op1 = *UniqueOps.begin();
8833
8834 if (Subtarget.hasAVX2() || isShuffleFoldableLoad(Op0) ||
8835 isShuffleFoldableLoad(Op1)) {
8836 // Create shuffle mask.
8837 auto const NumElems = VT.getVectorNumElements();
8838 SmallVector<int, 16u> Mask(NumElems);
8839 for (auto I = 0u; I < NumElems; ++I) {
8840 SDValue Op = BVOp->getOperand(I);
8841 Mask[I] = Op == Op0 ? I : I + NumElems;
8842 }
8843 // Create shuffle of splats.
8844 SDValue NewOp0 = DAG.getSplatBuildVector(VT, DL, Op0);
8845 SDValue NewOp1 = DAG.getSplatBuildVector(VT, DL, Op1);
8846 return DAG.getVectorShuffle(VT, DL, NewOp0, NewOp1, Mask);
8847 }
8848
8849 return SDValue();
8850}
8851
8852/// Create a vector constant without a load. SSE/AVX provide the bare minimum
8853/// functionality to do this, so it's all zeros, all ones, or some derivation
8854/// that is cheap to calculate.
8855static SDValue materializeVectorConstant(SDValue Op, const SDLoc &DL,
8856 SelectionDAG &DAG,
8857 const X86Subtarget &Subtarget) {
8858 MVT VT = Op.getSimpleValueType();
8859
8860 // Vectors containing all zeros can be matched by pxor and xorps.
8861 if (ISD::isBuildVectorAllZeros(Op.getNode()))
8862 return Op;
8863
8864 // Vectors containing all ones can be matched by pcmpeqd on 128-bit width
8865 // vectors or broken into v4i32 operations on 256-bit vectors. AVX2 can use
8866 // vpcmpeqd on 256-bit vectors.
8867 if (Subtarget.hasSSE2() && ISD::isBuildVectorAllOnes(Op.getNode())) {
8868 if (VT == MVT::v4i32 || VT == MVT::v8i32 || VT == MVT::v16i32)
8869 return Op;
8870
8871 return getOnesVector(VT, DAG, DL);
8872 }
8873
8874 return SDValue();
8875}
8876
8877/// Look for opportunities to create a VPERMV/VPERMILPV/PSHUFB variable permute
8878/// from a vector of source values and a vector of extraction indices.
8879/// The vectors might be manipulated to match the type of the permute op.
8880static SDValue createVariablePermute(MVT VT, SDValue SrcVec, SDValue IndicesVec,
8881 const SDLoc &DL, SelectionDAG &DAG,
8882 const X86Subtarget &Subtarget) {
8883 MVT ShuffleVT = VT;
8884 EVT IndicesVT = EVT(VT).changeVectorElementTypeToInteger();
8885 unsigned NumElts = VT.getVectorNumElements();
8886 unsigned SizeInBits = VT.getSizeInBits();
8887
8888 // Adjust IndicesVec to match VT size.
8889 assert(IndicesVec.getValueType().getVectorNumElements() >= NumElts &&
8890 "Illegal variable permute mask size");
8891 if (IndicesVec.getValueType().getVectorNumElements() > NumElts) {
8892 // Narrow/widen the indices vector to the correct size.
8893 if (IndicesVec.getValueSizeInBits() > SizeInBits)
8894 IndicesVec = extractSubVector(IndicesVec, 0, DAG, SDLoc(IndicesVec),
8895 NumElts * VT.getScalarSizeInBits());
8896 else if (IndicesVec.getValueSizeInBits() < SizeInBits)
8897 IndicesVec = widenSubVector(IndicesVec, false, Subtarget, DAG,
8898 SDLoc(IndicesVec), SizeInBits);
8899 // Zero-extend the index elements within the vector.
8900 if (IndicesVec.getValueType().getVectorNumElements() > NumElts)
8901 IndicesVec = DAG.getNode(ISD::ZERO_EXTEND_VECTOR_INREG, SDLoc(IndicesVec),
8902 IndicesVT, IndicesVec);
8903 }
8904 IndicesVec = DAG.getZExtOrTrunc(IndicesVec, SDLoc(IndicesVec), IndicesVT);
8905
8906 // Handle a SrcVec that doesn't match the VT size.
8907 if (SrcVec.getValueSizeInBits() != SizeInBits) {
8908 if ((SrcVec.getValueSizeInBits() % SizeInBits) == 0) {
8909 // Handle larger SrcVec by treating it as a larger permute.
8910 unsigned Scale = SrcVec.getValueSizeInBits() / SizeInBits;
8911 VT = MVT::getVectorVT(VT.getScalarType(), Scale * NumElts);
8912 IndicesVT = EVT(VT).changeVectorElementTypeToInteger();
8913 IndicesVec = widenSubVector(IndicesVT.getSimpleVT(), IndicesVec, false,
8914 Subtarget, DAG, SDLoc(IndicesVec));
8915 SDValue NewSrcVec =
8916 createVariablePermute(VT, SrcVec, IndicesVec, DL, DAG, Subtarget);
8917 if (NewSrcVec)
8918 return extractSubVector(NewSrcVec, 0, DAG, DL, SizeInBits);
8919 return SDValue();
8920 } else if (SrcVec.getValueSizeInBits() < SizeInBits) {
8921 // Widen smaller SrcVec to match VT.
8922 SrcVec = widenSubVector(VT, SrcVec, false, Subtarget, DAG, SDLoc(SrcVec));
8923 } else
8924 return SDValue();
8925 }
8926
8927 auto ScaleIndices = [&DAG](SDValue Idx, uint64_t Scale) {
8928 assert(isPowerOf2_64(Scale) && "Illegal variable permute shuffle scale");
8929 EVT SrcVT = Idx.getValueType();
8930 unsigned NumDstBits = SrcVT.getScalarSizeInBits() / Scale;
8931 uint64_t IndexScale = 0;
8932 uint64_t IndexOffset = 0;
8933
8934 // If we're scaling a smaller permute op, then we need to repeat the
8935 // indices, scaling and offsetting them as well.
8936 // e.g. v4i32 -> v16i8 (Scale = 4)
8937 // IndexScale = v4i32 Splat(4 << 24 | 4 << 16 | 4 << 8 | 4)
8938 // IndexOffset = v4i32 Splat(3 << 24 | 2 << 16 | 1 << 8 | 0)
8939 for (uint64_t i = 0; i != Scale; ++i) {
8940 IndexScale |= Scale << (i * NumDstBits);
8941 IndexOffset |= i << (i * NumDstBits);
8942 }
8943
8944 Idx = DAG.getNode(ISD::MUL, SDLoc(Idx), SrcVT, Idx,
8945 DAG.getConstant(IndexScale, SDLoc(Idx), SrcVT));
8946 Idx = DAG.getNode(ISD::ADD, SDLoc(Idx), SrcVT, Idx,
8947 DAG.getConstant(IndexOffset, SDLoc(Idx), SrcVT));
8948 return Idx;
8949 };
8950
8951 unsigned Opcode = 0;
8952 switch (VT.SimpleTy) {
8953 default:
8954 break;
8955 case MVT::v16i8:
8956 if (Subtarget.hasSSSE3())
8957 Opcode = X86ISD::PSHUFB;
8958 break;
8959 case MVT::v8i16:
8960 if (Subtarget.hasVLX() && Subtarget.hasBWI())
8961 Opcode = X86ISD::VPERMV;
8962 else if (Subtarget.hasSSSE3()) {
8963 Opcode = X86ISD::PSHUFB;
8964 ShuffleVT = MVT::v16i8;
8965 }
8966 break;
8967 case MVT::v4f32:
8968 case MVT::v4i32:
8969 if (Subtarget.hasAVX()) {
8970 Opcode = X86ISD::VPERMILPV;
8971 ShuffleVT = MVT::v4f32;
8972 } else if (Subtarget.hasSSSE3()) {
8973 Opcode = X86ISD::PSHUFB;
8974 ShuffleVT = MVT::v16i8;
8975 }
8976 break;
8977 case MVT::v2f64:
8978 case MVT::v2i64:
8979 if (Subtarget.hasAVX()) {
8980 // VPERMILPD selects using bit#1 of the index vector, so scale IndicesVec.
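 // (Adding the vector to itself doubles each index, e.g. 1 -> 2, which moves
 // the selector into bit#1.)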
8981 IndicesVec = DAG.getNode(ISD::ADD, DL, IndicesVT, IndicesVec, IndicesVec);
8982 Opcode = X86ISD::VPERMILPV;
8983 ShuffleVT = MVT::v2f64;
8984 } else if (Subtarget.hasSSE41()) {
8985 // SSE41 can compare v2i64 - select between indices 0 and 1.
8986 return DAG.getSelectCC(
8987 DL, IndicesVec,
8988 getZeroVector(IndicesVT.getSimpleVT(), Subtarget, DAG, DL),
8989 DAG.getVectorShuffle(VT, DL, SrcVec, SrcVec, {0, 0}),
8990 DAG.getVectorShuffle(VT, DL, SrcVec, SrcVec, {1, 1}),
8991 ISD::CondCode::SETEQ);
8992 }
8993 break;
8994 case MVT::v32i8:
8995 if (Subtarget.hasVLX() && Subtarget.hasVBMI())
8996 Opcode = X86ISD::VPERMV;
8997 else if (Subtarget.hasXOP()) {
8998 SDValue LoSrc = extract128BitVector(SrcVec, 0, DAG, DL);
8999 SDValue HiSrc = extract128BitVector(SrcVec, 16, DAG, DL);
9000 SDValue LoIdx = extract128BitVector(IndicesVec, 0, DAG, DL);
9001 SDValue HiIdx = extract128BitVector(IndicesVec, 16, DAG, DL);
9002 return DAG.getNode(
9003 ISD::CONCAT_VECTORS, DL, VT,
9004 DAG.getNode(X86ISD::VPPERM, DL, MVT::v16i8, LoSrc, HiSrc, LoIdx),
9005 DAG.getNode(X86ISD::VPPERM, DL, MVT::v16i8, LoSrc, HiSrc, HiIdx));
9006 } else if (Subtarget.hasAVX()) {
9007 SDValue Lo = extract128BitVector(SrcVec, 0, DAG, DL);
9008 SDValue Hi = extract128BitVector(SrcVec, 16, DAG, DL);
9009 SDValue LoLo = DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Lo, Lo);
9010 SDValue HiHi = DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Hi, Hi);
9011 auto PSHUFBBuilder = [](SelectionDAG &DAG, const SDLoc &DL,
9012 ArrayRef<SDValue> Ops) {
9013 // Permute Lo and Hi and then select based on index range.
9014 // This works as PSHUFB uses bits[3:0] to permute elements and we don't
9015 // care about bit[7] as it's just an index vector.
9016 SDValue Idx = Ops[2];
9017 EVT VT = Idx.getValueType();
9018 return DAG.getSelectCC(DL, Idx, DAG.getConstant(15, DL, VT),
9019 DAG.getNode(X86ISD::PSHUFB, DL, VT, Ops[1], Idx),
9020 DAG.getNode(X86ISD::PSHUFB, DL, VT, Ops[0], Idx),
9021 ISD::CondCode::SETGT);
9022 };
9023 SDValue Ops[] = {LoLo, HiHi, IndicesVec};
9024 return SplitOpsAndApply(DAG, Subtarget, DL, MVT::v32i8, Ops,
9025 PSHUFBBuilder);
9026 }
9027 break;
9028 case MVT::v16i16:
9029 if (Subtarget.hasVLX() && Subtarget.hasBWI())
9030 Opcode = X86ISD::VPERMV;
9031 else if (Subtarget.hasAVX()) {
9032 // Scale to v32i8 and perform as v32i8.
9033 IndicesVec = ScaleIndices(IndicesVec, 2);
9034 return DAG.getBitcast(
9035 VT, createVariablePermute(
9036 MVT::v32i8, DAG.getBitcast(MVT::v32i8, SrcVec),
9037 DAG.getBitcast(MVT::v32i8, IndicesVec), DL, DAG, Subtarget));
9038 }
9039 break;
9040 case MVT::v8f32:
9041 case MVT::v8i32:
9042 if (Subtarget.hasAVX2())
9043 Opcode = X86ISD::VPERMV;
9044 else if (Subtarget.hasAVX()) {
9045 SrcVec = DAG.getBitcast(MVT::v8f32, SrcVec);
9046 SDValue LoLo = DAG.getVectorShuffle(MVT::v8f32, DL, SrcVec, SrcVec,
9047 {0, 1, 2, 3, 0, 1, 2, 3});
9048 SDValue HiHi = DAG.getVectorShuffle(MVT::v8f32, DL, SrcVec, SrcVec,
9049 {4, 5, 6, 7, 4, 5, 6, 7});
9050 if (Subtarget.hasXOP())
9051 return DAG.getBitcast(
9052 VT, DAG.getNode(X86ISD::VPERMIL2, DL, MVT::v8f32, LoLo, HiHi,
9053 IndicesVec, DAG.getTargetConstant(0, DL, MVT::i8)));
9054 // Permute Lo and Hi and then select based on index range.
9055 // This works as VPERMILPS only uses index bits[0:1] to permute elements.
9056 SDValue Res = DAG.getSelectCC(
9057 DL, IndicesVec, DAG.getConstant(3, DL, MVT::v8i32),
9058 DAG.getNode(X86ISD::VPERMILPV, DL, MVT::v8f32, HiHi, IndicesVec),
9059 DAG.getNode(X86ISD::VPERMILPV, DL, MVT::v8f32, LoLo, IndicesVec),
9060 ISD::CondCode::SETGT);
9061 return DAG.getBitcast(VT, Res);
9062 }
9063 break;
9064 case MVT::v4i64:
9065 case MVT::v4f64:
9066 if (Subtarget.hasAVX512()) {
9067 if (!Subtarget.hasVLX()) {
9068 MVT WidenSrcVT = MVT::getVectorVT(VT.getScalarType(), 8);
9069 SrcVec = widenSubVector(WidenSrcVT, SrcVec, false, Subtarget, DAG,
9070 SDLoc(SrcVec));
9071 IndicesVec = widenSubVector(MVT::v8i64, IndicesVec, false, Subtarget,
9072 DAG, SDLoc(IndicesVec));
9073 SDValue Res = createVariablePermute(WidenSrcVT, SrcVec, IndicesVec, DL,
9074 DAG, Subtarget);
9075 return extract256BitVector(Res, 0, DAG, DL);
9076 }
9077 Opcode = X86ISD::VPERMV;
9078 } else if (Subtarget.hasAVX()) {
9079 SrcVec = DAG.getBitcast(MVT::v4f64, SrcVec);
9080 SDValue LoLo =
9081 DAG.getVectorShuffle(MVT::v4f64, DL, SrcVec, SrcVec, {0, 1, 0, 1});
9082 SDValue HiHi =
9083 DAG.getVectorShuffle(MVT::v4f64, DL, SrcVec, SrcVec, {2, 3, 2, 3});
9084 // VPERMIL2PD selects with bit#1 of the index vector, so scale IndicesVec.
9085 IndicesVec = DAG.getNode(ISD::ADD, DL, IndicesVT, IndicesVec, IndicesVec);
9086 if (Subtarget.hasXOP())
9087 return DAG.getBitcast(
9088 VT, DAG.getNode(X86ISD::VPERMIL2, DL, MVT::v4f64, LoLo, HiHi,
9089 IndicesVec, DAG.getTargetConstant(0, DL, MVT::i8)));
9090 // Permute Lo and Hi and then select based on index range.
9091 // This works as VPERMILPD only uses index bit[1] to permute elements.
9092 SDValue Res = DAG.getSelectCC(
9093 DL, IndicesVec, DAG.getConstant(2, DL, MVT::v4i64),
9094 DAG.getNode(X86ISD::VPERMILPV, DL, MVT::v4f64, HiHi, IndicesVec),
9095 DAG.getNode(X86ISD::VPERMILPV, DL, MVT::v4f64, LoLo, IndicesVec),
9096 ISD::CondCode::SETGT);
9097 return DAG.getBitcast(VT, Res);
9098 }
9099 break;
9100 case MVT::v64i8:
9101 if (Subtarget.hasVBMI())
9102 Opcode = X86ISD::VPERMV;
9103 break;
9104 case MVT::v32i16:
9105 if (Subtarget.hasBWI())
9106 Opcode = X86ISD::VPERMV;
9107 break;
9108 case MVT::v16f32:
9109 case MVT::v16i32:
9110 case MVT::v8f64:
9111 case MVT::v8i64:
9112 if (Subtarget.hasAVX512())
9113 Opcode = X86ISD::VPERMV;
9114 break;
9115 }
9116 if (!Opcode)
9117 return SDValue();
9118
9119 assert((VT.getSizeInBits() == ShuffleVT.getSizeInBits()) &&
9120 (VT.getScalarSizeInBits() % ShuffleVT.getScalarSizeInBits()) == 0 &&
9121 "Illegal variable permute shuffle type");
9122
9123 uint64_t Scale = VT.getScalarSizeInBits() / ShuffleVT.getScalarSizeInBits();
9124 if (Scale > 1)
9125 IndicesVec = ScaleIndices(IndicesVec, Scale);
9126
9127 EVT ShuffleIdxVT = EVT(ShuffleVT).changeVectorElementTypeToInteger();
9128 IndicesVec = DAG.getBitcast(ShuffleIdxVT, IndicesVec);
9129
9130 SrcVec = DAG.getBitcast(ShuffleVT, SrcVec);
9131 SDValue Res = Opcode == X86ISD::VPERMV
9132 ? DAG.getNode(Opcode, DL, ShuffleVT, IndicesVec, SrcVec)
9133 : DAG.getNode(Opcode, DL, ShuffleVT, SrcVec, IndicesVec);
9134 return DAG.getBitcast(VT, Res);
9135}
9136
9137// Tries to lower a BUILD_VECTOR composed of extract-extract chains that can be
9138// treated as a permutation of a vector by indices in a non-constant vector.
9139// (build_vector (extract_elt V, (extract_elt I, 0)),
9140// (extract_elt V, (extract_elt I, 1)),
9141// ...
9142// ->
9143// (vpermv I, V)
9144//
9145// TODO: Handle undefs
9146// TODO: Utilize pshufb and zero mask blending to support more efficient
9147// construction of vectors with constant-0 elements.
9148static SDValue
9149LowerBUILD_VECTORAsVariablePermute(SDValue V, const SDLoc &DL,
9150 SelectionDAG &DAG,
9151 const X86Subtarget &Subtarget) {
9152 SDValue SrcVec, IndicesVec;
9153
9154 auto PeekThroughFreeze = [](SDValue N) {
9155 if (N->getOpcode() == ISD::FREEZE && N.hasOneUse())
9156 return N->getOperand(0);
9157 return N;
9158 };
9159 // Check for a match of the permute source vector and permute index elements.
9160 // This is done by checking that the i-th build_vector operand is of the form:
9161 // (extract_elt SrcVec, (extract_elt IndicesVec, i)).
9162 for (unsigned Idx = 0, E = V.getNumOperands(); Idx != E; ++Idx) {
9163 SDValue Op = PeekThroughFreeze(V.getOperand(Idx));
9164 if (Op.getOpcode() != ISD::EXTRACT_VECTOR_ELT)
9165 return SDValue();
9166
9167 // If this is the first extract encountered in V, set the source vector,
9168 // otherwise verify the extract is from the previously defined source
9169 // vector.
9170 if (!SrcVec)
9171 SrcVec = Op.getOperand(0);
9172 else if (SrcVec != Op.getOperand(0))
9173 return SDValue();
9174 SDValue ExtractedIndex = Op->getOperand(1);
9175 // Peek through extends.
9176 if (ExtractedIndex.getOpcode() == ISD::ZERO_EXTEND ||
9177 ExtractedIndex.getOpcode() == ISD::SIGN_EXTEND)
9178 ExtractedIndex = ExtractedIndex.getOperand(0);
9179 if (ExtractedIndex.getOpcode() != ISD::EXTRACT_VECTOR_ELT)
9180 return SDValue();
9181
9182 // If this is the first extract from the index vector candidate, set the
9183 // indices vector, otherwise verify the extract is from the previously
9184 // defined indices vector.
9185 if (!IndicesVec)
9186 IndicesVec = ExtractedIndex.getOperand(0);
9187 else if (IndicesVec != ExtractedIndex.getOperand(0))
9188 return SDValue();
9189
9190 auto *PermIdx = dyn_cast<ConstantSDNode>(ExtractedIndex.getOperand(1));
9191 if (!PermIdx || PermIdx->getAPIntValue() != Idx)
9192 return SDValue();
9193 }
9194
9195 MVT VT = V.getSimpleValueType();
9196 return createVariablePermute(VT, SrcVec, IndicesVec, DL, DAG, Subtarget);
9197}
9198
9199SDValue
9200X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const {
9201 SDLoc dl(Op);
9202
9203 MVT VT = Op.getSimpleValueType();
9204 MVT EltVT = VT.getVectorElementType();
9205 MVT OpEltVT = Op.getOperand(0).getSimpleValueType();
9206 unsigned NumElems = Op.getNumOperands();
9207
9208 // Generate vectors for predicate vectors.
9209 if (VT.getVectorElementType() == MVT::i1 && Subtarget.hasAVX512())
9210 return LowerBUILD_VECTORvXi1(Op, dl, DAG, Subtarget);
9211
9212 if (VT.getVectorElementType() == MVT::bf16 &&
9213 (Subtarget.hasAVXNECONVERT() || Subtarget.hasBF16()))
9214 return LowerBUILD_VECTORvXbf16(Op, DAG, Subtarget);
9215
9216 if (SDValue VectorCst = materializeVectorConstant(Op, dl, DAG, Subtarget))
9217 return VectorCst;
9218
9219 unsigned EVTBits = EltVT.getSizeInBits();
9220 APInt UndefMask = APInt::getZero(NumElems);
9221 APInt FrozenUndefMask = APInt::getZero(NumElems);
9222 APInt ZeroMask = APInt::getZero(NumElems);
9223 APInt NonZeroMask = APInt::getZero(NumElems);
9224 bool IsAllConstants = true;
9225 bool OneUseFrozenUndefs = true;
9226 SmallSet<SDValue, 8> Values;
9227 unsigned NumConstants = NumElems;
9228 for (unsigned i = 0; i < NumElems; ++i) {
9229 SDValue Elt = Op.getOperand(i);
9230 if (Elt.isUndef()) {
9231 UndefMask.setBit(i);
9232 continue;
9233 }
9234 if (ISD::isFreezeUndef(Elt.getNode())) {
9235 OneUseFrozenUndefs = OneUseFrozenUndefs && Elt->hasOneUse();
9236 FrozenUndefMask.setBit(i);
9237 continue;
9238 }
9239 Values.insert(Elt);
9240 if (!isIntOrFPConstant(Elt)) {
9241 IsAllConstants = false;
9242 NumConstants--;
9243 }
9244 if (X86::isZeroNode(Elt)) {
9245 ZeroMask.setBit(i);
9246 } else {
9247 NonZeroMask.setBit(i);
9248 }
9249 }
9250
9251 // All undef vector. Return an UNDEF.
9252 if (UndefMask.isAllOnes())
9253 return DAG.getUNDEF(VT);
9254
9255 // All undef/freeze(undef) vector. Return a FREEZE UNDEF.
9256 if (OneUseFrozenUndefs && (UndefMask | FrozenUndefMask).isAllOnes())
9257 return DAG.getFreeze(DAG.getUNDEF(VT));
9258
9259 // All undef/freeze(undef)/zero vector. Return a zero vector.
9260 if ((UndefMask | FrozenUndefMask | ZeroMask).isAllOnes())
9261 return getZeroVector(VT, Subtarget, DAG, dl);
9262
9263 // If we have multiple FREEZE-UNDEF operands, we are likely going to end up
9264 // lowering into a suboptimal insertion sequence. Instead, thaw the UNDEF in
9265 // our source BUILD_VECTOR, create another FREEZE-UNDEF splat BUILD_VECTOR,
9266 // and blend the FREEZE-UNDEF operands back in.
9267 // FIXME: is this worthwhile even for a single FREEZE-UNDEF operand?
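 // For example, <x, freeze(undef), y, freeze(undef)> becomes
 // shuffle(<x, undef, y, undef>, splat(freeze(undef)), <0, 5, 2, 7>).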
9268 if (unsigned NumFrozenUndefElts = FrozenUndefMask.popcount();
9269 NumFrozenUndefElts >= 2 && NumFrozenUndefElts < NumElems) {
9270 SmallVector<int, 16> BlendMask(NumElems, -1);
9271 SmallVector<SDValue, 16> Elts(NumElems, DAG.getUNDEF(OpEltVT));
9272 for (unsigned i = 0; i < NumElems; ++i) {
9273 if (UndefMask[i]) {
9274 BlendMask[i] = -1;
9275 continue;
9276 }
9277 BlendMask[i] = i;
9278 if (!FrozenUndefMask[i])
9279 Elts[i] = Op.getOperand(i);
9280 else
9281 BlendMask[i] += NumElems;
9282 }
9283 SDValue EltsBV = DAG.getBuildVector(VT, dl, Elts);
9284 SDValue FrozenUndefElt = DAG.getFreeze(DAG.getUNDEF(OpEltVT));
9285 SDValue FrozenUndefBV = DAG.getSplatBuildVector(VT, dl, FrozenUndefElt);
9286 return DAG.getVectorShuffle(VT, dl, EltsBV, FrozenUndefBV, BlendMask);
9287 }
9288
9289 BuildVectorSDNode *BV = cast<BuildVectorSDNode>(Op.getNode());
9290
9291 // If the upper elts of a ymm/zmm are undef/freeze(undef)/zero then we might
9292 // be better off lowering to a smaller build vector and padding with
9293 // undef/zero.
9294 if ((VT.is256BitVector() || VT.is512BitVector()) &&
9295 !isFoldableUseOfShuffle(BV)) {
9296 unsigned UpperElems = NumElems / 2;
9297 APInt UndefOrZeroMask = FrozenUndefMask | UndefMask | ZeroMask;
9298 unsigned NumUpperUndefsOrZeros = UndefOrZeroMask.countl_one();
9299 if (NumUpperUndefsOrZeros >= UpperElems) {
9300 if (VT.is512BitVector() &&
9301 NumUpperUndefsOrZeros >= (NumElems - (NumElems / 4)))
9302 UpperElems = NumElems - (NumElems / 4);
9303 // If freeze(undef) is in any upper elements, force to zero.
9304 bool UndefUpper = UndefMask.countl_one() >= UpperElems;
9305 MVT LowerVT = MVT::getVectorVT(EltVT, NumElems - UpperElems);
9306 SDValue NewBV =
9307 DAG.getBuildVector(LowerVT, dl, Op->ops().drop_back(UpperElems));
9308 return widenSubVector(VT, NewBV, !UndefUpper, Subtarget, DAG, dl);
9309 }
9310 }
9311
9312 if (SDValue AddSub = lowerToAddSubOrFMAddSub(BV, dl, Subtarget, DAG))
9313 return AddSub;
9314 if (SDValue HorizontalOp = LowerToHorizontalOp(BV, dl, Subtarget, DAG))
9315 return HorizontalOp;
9316 if (SDValue Broadcast = lowerBuildVectorAsBroadcast(BV, dl, Subtarget, DAG))
9317 return Broadcast;
9318 if (SDValue BitOp = lowerBuildVectorToBitOp(BV, dl, Subtarget, DAG))
9319 return BitOp;
9320 if (SDValue Blend = lowerBuildVectorAsBlend(BV, dl, Subtarget, DAG))
9321 return Blend;
9322
9323 unsigned NumZero = ZeroMask.popcount();
9324 unsigned NumNonZero = NonZeroMask.popcount();
9325
9326 // If we are inserting one variable into a vector of non-zero constants, try
9327 // to avoid loading each constant element as a scalar. Load the constants as a
9328 // vector and then insert the variable scalar element. If insertion is not
9329 // supported, fall back to a shuffle to get the scalar blended with the
9330 // constants. Insertion into a zero vector is handled as a special-case
9331 // somewhere below here.
9332 if (NumConstants == NumElems - 1 && NumNonZero != 1 &&
9333 FrozenUndefMask.isZero() &&
9334 (isOperationLegalOrCustom(ISD::INSERT_VECTOR_ELT, VT) ||
9335 isOperationLegalOrCustom(ISD::VECTOR_SHUFFLE, VT))) {
9336 // Create an all-constant vector. The variable element in the old
9337 // build vector is replaced by undef in the constant vector. Save the
9338 // variable scalar element and its index for use in the insertelement.
9339 LLVMContext &Context = *DAG.getContext();
9340 Type *EltType = Op.getValueType().getScalarType().getTypeForEVT(Context);
9341 SmallVector<Constant *, 16> ConstVecOps(NumElems, UndefValue::get(EltType));
9342 SDValue VarElt;
9343 SDValue InsIndex;
9344 for (unsigned i = 0; i != NumElems; ++i) {
9345 SDValue Elt = Op.getOperand(i);
9346 if (auto *C = dyn_cast<ConstantSDNode>(Elt))
9347 ConstVecOps[i] = ConstantInt::get(Context, C->getAPIntValue());
9348 else if (auto *C = dyn_cast<ConstantFPSDNode>(Elt))
9349 ConstVecOps[i] = ConstantFP::get(Context, C->getValueAPF());
9350 else if (!Elt.isUndef()) {
9351 assert(!VarElt.getNode() && !InsIndex.getNode() &&
9352 "Expected one variable element in this vector");
9353 VarElt = Elt;
9354 InsIndex = DAG.getVectorIdxConstant(i, dl);
9355 }
9356 }
9357 Constant *CV = ConstantVector::get(ConstVecOps);
9358 SDValue DAGConstVec = DAG.getConstantPool(CV, VT);
9359
9360 // The constants we just created may not be legal (e.g., floating point). We
9361 // must lower the vector right here because we cannot guarantee that we'll
9362 // legalize it before loading it. This is also why we could not just create
9363 // a new build vector here. If the build vector contains illegal constants,
9364 // it could get split back up into a series of insert elements.
9365 // TODO: Improve this by using shorter loads with broadcast/VZEXT_LOAD.
9366 SDValue LegalDAGConstVec = LowerConstantPool(DAGConstVec, DAG);
9367 MachineFunction &MF = DAG.getMachineFunction();
9368 MachinePointerInfo MPI = MachinePointerInfo::getConstantPool(MF);
9369 SDValue Ld = DAG.getLoad(VT, dl, DAG.getEntryNode(), LegalDAGConstVec, MPI);
9370 unsigned InsertC = InsIndex->getAsZExtVal();
9371 unsigned NumEltsInLow128Bits = 128 / VT.getScalarSizeInBits();
9372 if (InsertC < NumEltsInLow128Bits)
9373 return DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, Ld, VarElt, InsIndex);
9374
9375 // There's no good way to insert into the high elements of a >128-bit
9376 // vector, so use shuffles to avoid an extract/insert sequence.
9377 assert(VT.getSizeInBits() > 128 && "Invalid insertion index?");
9378 assert(Subtarget.hasAVX() && "Must have AVX with >16-byte vector");
9379 SmallVector<int, 8> ShuffleMask;
9380 unsigned NumElts = VT.getVectorNumElements();
9381 for (unsigned i = 0; i != NumElts; ++i)
9382 ShuffleMask.push_back(i == InsertC ? NumElts : i);
9383 SDValue S2V = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, VarElt);
9384 return DAG.getVectorShuffle(VT, dl, Ld, S2V, ShuffleMask);
9385 }
9386
9387 // Special case for single non-zero, non-undef, element.
9388 if (NumNonZero == 1) {
9389 unsigned Idx = NonZeroMask.countr_zero();
9390 SDValue Item = Op.getOperand(Idx);
9391
9392 // If we have a constant or non-constant insertion into the low element of
9393 // a vector, we can do this with SCALAR_TO_VECTOR + shuffle of zero into
9394 // the rest of the elements. This will be matched as movd/movq/movss/movsd
9395 // depending on what the source datatype is.
9396 if (Idx == 0) {
9397 if (NumZero == 0)
9398 return DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Item);
9399
9400 if (EltVT == MVT::i32 || EltVT == MVT::f16 || EltVT == MVT::f32 ||
9401 EltVT == MVT::f64 || (EltVT == MVT::i64 && Subtarget.is64Bit()) ||
9402 (EltVT == MVT::i16 && Subtarget.hasFP16())) {
9403 assert((VT.is128BitVector() || VT.is256BitVector() ||
9404 VT.is512BitVector()) &&
9405 "Expected an SSE value type!");
9406 Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Item);
9407 // Turn it into a MOVL (i.e. movsh, movss, movsd, movw or movd) to a
9408 // zero vector.
9409 return getShuffleVectorZeroOrUndef(Item, 0, true, Subtarget, DAG);
9410 }
9411
9412 // We can't directly insert an i8 or i16 into a vector, so zero extend
9413 // it to i32 first.
9414 if (EltVT == MVT::i16 || EltVT == MVT::i8) {
9415 Item = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, Item);
9416 MVT ShufVT = MVT::getVectorVT(MVT::i32, VT.getSizeInBits() / 32);
9417 Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, ShufVT, Item);
9418 Item = getShuffleVectorZeroOrUndef(Item, 0, true, Subtarget, DAG);
9419 return DAG.getBitcast(VT, Item);
9420 }
9421 }
9422
9423 // Is it a vector logical left shift?
9424 if (NumElems == 2 && Idx == 1 &&
9425 X86::isZeroNode(Op.getOperand(0)) &&
9426 !X86::isZeroNode(Op.getOperand(1))) {
9427 unsigned NumBits = VT.getSizeInBits();
9428 return getVShift(true, VT,
9429 DAG.getNode(ISD::SCALAR_TO_VECTOR, dl,
9430 VT, Op.getOperand(1)),
9431 NumBits/2, DAG, *this, dl);
9432 }
9433
9434 if (IsAllConstants) // Otherwise, it's better to do a constpool load.
9435 return SDValue();
9436
9437 // Otherwise, if this is a vector with i32 or f32 elements, and the element
9438 // is a non-constant being inserted into an element other than the low one,
9439 // we can't use a constant pool load. Instead, use SCALAR_TO_VECTOR (aka
9440 // movd/movss) to move this into the low element, then shuffle it into
9441 // place.
9442 if (EVTBits == 32) {
9443 Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Item);
9444 return getShuffleVectorZeroOrUndef(Item, Idx, NumZero > 0, Subtarget, DAG);
9445 }
9446 }
9447
9448 // Splat is obviously ok. Let legalizer expand it to a shuffle.
9449 if (Values.size() == 1) {
9450 if (EVTBits == 32) {
9451 // Instead of a shuffle like this:
9452 // shuffle (scalar_to_vector (load (ptr + 4))), undef, <0, 0, 0, 0>
9453 // Check if it's possible to issue this instead.
9454 // shuffle (vload ptr), undef, <1, 1, 1, 1>
9455 unsigned Idx = NonZeroMask.countr_zero();
9456 SDValue Item = Op.getOperand(Idx);
9457 if (Op.getNode()->isOnlyUserOf(Item.getNode()))
9458 return LowerAsSplatVectorLoad(Item, VT, dl, DAG);
9459 }
9460 return SDValue();
9461 }
9462
9463 // A vector full of immediates; various special cases are already
9464 // handled, so this is best done with a single constant-pool load.
9465 if (IsAllConstants)
9466 return SDValue();
9467
9468 if (SDValue V = LowerBUILD_VECTORAsVariablePermute(Op, dl, DAG, Subtarget))
9469 return V;
9470
9471 // See if we can use a vector load to get all of the elements.
9472 {
9473 SmallVector<SDValue, 64> Ops(Op->ops().take_front(NumElems));
9474 if (SDValue LD =
9475 EltsFromConsecutiveLoads(VT, Ops, dl, DAG, Subtarget, false))
9476 return LD;
9477 }
9478
9479 // If this is a splat of pairs of 32-bit elements, we can use a narrower
9480 // build_vector and broadcast it.
9481 // TODO: We could probably generalize this more.
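 // For example, v8i32 <a, b, a, b, a, b, a, b> is built as the v4i32 vector
 // <a, b, undef, undef>, bitcast to v2i64, broadcast, and bitcast back.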
9482 if (Subtarget.hasAVX2() && EVTBits == 32 && Values.size() == 2) {
9483 SDValue Ops[4] = { Op.getOperand(0), Op.getOperand(1),
9484 DAG.getUNDEF(EltVT), DAG.getUNDEF(EltVT) };
9485 auto CanSplat = [](SDValue Op, unsigned NumElems, ArrayRef<SDValue> Ops) {
9486 // Make sure all the even/odd operands match.
9487 for (unsigned i = 2; i != NumElems; ++i)
9488 if (Ops[i % 2] != Op.getOperand(i))
9489 return false;
9490 return true;
9491 };
9492 if (CanSplat(Op, NumElems, Ops)) {
9493 MVT WideEltVT = VT.isFloatingPoint() ? MVT::f64 : MVT::i64;
9494 MVT NarrowVT = MVT::getVectorVT(EltVT, 4);
9495 // Create a new build vector and cast to v2i64/v2f64.
9496 SDValue NewBV = DAG.getBitcast(MVT::getVectorVT(WideEltVT, 2),
9497 DAG.getBuildVector(NarrowVT, dl, Ops));
9498 // Broadcast from v2i64/v2f64 and cast to final VT.
9499 MVT BcastVT = MVT::getVectorVT(WideEltVT, NumElems / 2);
9500 return DAG.getBitcast(VT, DAG.getNode(X86ISD::VBROADCAST, dl, BcastVT,
9501 NewBV));
9502 }
9503 }
9504
9505 // For AVX-length vectors, build the individual 128-bit pieces and use
9506 // shuffles to put them in place.
9507 if (VT.getSizeInBits() > 128) {
9508 MVT HVT = MVT::getVectorVT(EltVT, NumElems / 2);
9509
9510 // Build both the lower and upper subvector.
9511 SDValue Lower =
9512 DAG.getBuildVector(HVT, dl, Op->ops().slice(0, NumElems / 2));
9513 SDValue Upper = DAG.getBuildVector(
9514 HVT, dl, Op->ops().slice(NumElems / 2, NumElems / 2));
9515
9516 // Recreate the wider vector with the lower and upper part.
9517 return concatSubVectors(Lower, Upper, DAG, dl);
9518 }
9519
9520 // Let legalizer expand 2-wide build_vectors.
9521 if (EVTBits == 64) {
9522 if (NumNonZero == 1) {
9523 // One half is zero or undef.
9524 unsigned Idx = NonZeroMask.countr_zero();
9525 SDValue V2 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT,
9526 Op.getOperand(Idx));
9527 return getShuffleVectorZeroOrUndef(V2, Idx, true, Subtarget, DAG);
9528 }
9529 return SDValue();
9530 }
9531
9532 // If element VT is < 32 bits, convert it to inserts into a zero vector.
9533 if (EVTBits == 8 && NumElems == 16)
9534 if (SDValue V = LowerBuildVectorv16i8(Op, dl, NonZeroMask, NumNonZero,
9535 NumZero, DAG, Subtarget))
9536 return V;
9537
9538 if (EltVT == MVT::i16 && NumElems == 8)
9539 if (SDValue V = LowerBuildVectorv8i16(Op, dl, NonZeroMask, NumNonZero,
9540 NumZero, DAG, Subtarget))
9541 return V;
9542
9543 // If element VT is == 32 bits and has 4 elems, try to generate an INSERTPS
9544 if (EVTBits == 32 && NumElems == 4)
9545 if (SDValue V = LowerBuildVectorv4x32(Op, dl, DAG, Subtarget))
9546 return V;
9547
9548 // If element VT is == 32 bits, turn it into a number of shuffles.
9549 if (NumElems == 4 && NumZero > 0) {
9550 SmallVector<SDValue, 8> Ops(NumElems);
9551 for (unsigned i = 0; i < 4; ++i) {
9552 bool isZero = !NonZeroMask[i];
9553 if (isZero)
9554 Ops[i] = getZeroVector(VT, Subtarget, DAG, dl);
9555 else
9556 Ops[i] = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Op.getOperand(i));
9557 }
9558
9559 for (unsigned i = 0; i < 2; ++i) {
9560 switch (NonZeroMask.extractBitsAsZExtValue(2, i * 2)) {
9561 default: llvm_unreachable("Unexpected NonZero count");
9562 case 0:
9563 Ops[i] = Ops[i*2]; // Must be a zero vector.
9564 break;
9565 case 1:
9566 Ops[i] = getMOVL(DAG, dl, VT, Ops[i*2+1], Ops[i*2]);
9567 break;
9568 case 2:
9569 Ops[i] = getMOVL(DAG, dl, VT, Ops[i*2], Ops[i*2+1]);
9570 break;
9571 case 3:
9572 Ops[i] = getUnpackl(DAG, dl, VT, Ops[i*2], Ops[i*2+1]);
9573 break;
9574 }
9575 }
9576
9577 bool Reverse1 = NonZeroMask.extractBitsAsZExtValue(2, 0) == 2;
9578 bool Reverse2 = NonZeroMask.extractBitsAsZExtValue(2, 2) == 2;
9579 int MaskVec[] = {
9580 Reverse1 ? 1 : 0,
9581 Reverse1 ? 0 : 1,
9582 static_cast<int>(Reverse2 ? NumElems+1 : NumElems),
9583 static_cast<int>(Reverse2 ? NumElems : NumElems+1)
9584 };
9585 return DAG.getVectorShuffle(VT, dl, Ops[0], Ops[1], MaskVec);
9586 }
9587
9588 assert(Values.size() > 1 && "Expected non-undef and non-splat vector");
9589
9590 // Check for a build vector from mostly shuffle plus few inserting.
9591 if (SDValue Sh = buildFromShuffleMostly(Op, dl, DAG))
9592 return Sh;
9593
9594 // For SSE 4.1, use insertps to put the high elements into the low element.
9595 if (Subtarget.hasSSE41() && EltVT != MVT::f16) {
9596 SDValue Result;
9597 if (!Op.getOperand(0).isUndef())
9598 Result = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Op.getOperand(0));
9599 else
9600 Result = DAG.getUNDEF(VT);
9601
9602 for (unsigned i = 1; i < NumElems; ++i) {
9603 if (Op.getOperand(i).isUndef()) continue;
9604 Result = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, Result,
9605 Op.getOperand(i), DAG.getVectorIdxConstant(i, dl));
9606 }
9607 return Result;
9608 }
9609
9610 // Otherwise, expand into a number of unpckl*, start by extending each of
9611 // our (non-undef) elements to the full vector width with the element in the
9612 // bottom slot of the vector (which generates no code for SSE).
9613 SmallVector<SDValue, 8> Ops(NumElems);
9614 for (unsigned i = 0; i < NumElems; ++i) {
9615 if (!Op.getOperand(i).isUndef())
9616 Ops[i] = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Op.getOperand(i));
9617 else
9618 Ops[i] = DAG.getUNDEF(VT);
9619 }
9620
9621 // Next, we iteratively mix elements, e.g. for v4f32:
9622 // Step 1: unpcklps 0, 1 ==> X: <?, ?, 1, 0>
9623 // : unpcklps 2, 3 ==> Y: <?, ?, 3, 2>
9624 // Step 2: unpcklpd X, Y ==> <3, 2, 1, 0>
9625 for (unsigned Scale = 1; Scale < NumElems; Scale *= 2) {
9626 // Generate scaled UNPCKL shuffle mask.
9627 SmallVector<int, 16> Mask;
9628 for(unsigned i = 0; i != Scale; ++i)
9629 Mask.push_back(i);
9630 for (unsigned i = 0; i != Scale; ++i)
9631 Mask.push_back(NumElems+i);
9632 Mask.append(NumElems - Mask.size(), SM_SentinelUndef);
9633
9634 for (unsigned i = 0, e = NumElems / (2 * Scale); i != e; ++i)
9635 Ops[i] = DAG.getVectorShuffle(VT, dl, Ops[2*i], Ops[(2*i)+1], Mask);
9636 }
9637 return Ops[0];
9638}
9639
9640// 256-bit AVX can use the vinsertf128 instruction
9641// to create 256-bit vectors from two other 128-bit ones.
9642// TODO: Detect subvector broadcast here instead of DAG combine?
9643static SDValue LowerAVXCONCAT_VECTORS(SDValue Op, const SDLoc &dl,
9644 SelectionDAG &DAG,
9645 const X86Subtarget &Subtarget) {
9646 MVT ResVT = Op.getSimpleValueType();
9647 assert((ResVT.is256BitVector() || ResVT.is512BitVector()) &&
9648 "Value type must be 256-/512-bit wide");
9649
9650 unsigned NumOperands = Op.getNumOperands();
9651 unsigned NumFreezeUndef = 0;
9652 unsigned NumZero = 0;
9653 unsigned NumNonZero = 0;
9654 unsigned NonZeros = 0;
9655 SmallSet<SDValue, 4> Undefs;
9656 for (unsigned i = 0; i != NumOperands; ++i) {
9657 SDValue SubVec = Op.getOperand(i);
9658 if (SubVec.isUndef())
9659 continue;
9660 if (ISD::isFreezeUndef(SubVec.getNode())) {
9661 // If the freeze(undef) has multiple uses then we must fold to zero.
9662 if (SubVec.hasOneUse()) {
9663 ++NumFreezeUndef;
9664 } else {
9665 ++NumZero;
9666 Undefs.insert(SubVec);
9667 }
9668 }
9669 else if (ISD::isBuildVectorAllZeros(SubVec.getNode()))
9670 ++NumZero;
9671 else {
9672 assert(i < sizeof(NonZeros) * CHAR_BIT); // Ensure the shift is in range.
9673 NonZeros |= 1 << i;
9674 ++NumNonZero;
9675 }
9676 }
9677
9678 // If we have more than 2 non-zeros, build each half separately.
9679 if (NumNonZero > 2) {
9680 MVT HalfVT = ResVT.getHalfNumVectorElementsVT();
9681 ArrayRef<SDUse> Ops = Op->ops();
9682 SDValue Lo = DAG.getNode(ISD::CONCAT_VECTORS, dl, HalfVT,
9683 Ops.slice(0, NumOperands/2));
9684 SDValue Hi = DAG.getNode(ISD::CONCAT_VECTORS, dl, HalfVT,
9685 Ops.slice(NumOperands/2));
9686 return DAG.getNode(ISD::CONCAT_VECTORS, dl, ResVT, Lo, Hi);
9687 }
9688
9689 // Otherwise, build it up through insert_subvectors.
9690 SDValue Vec = NumZero ? getZeroVector(ResVT, Subtarget, DAG, dl)
9691 : (NumFreezeUndef ? DAG.getFreeze(DAG.getUNDEF(ResVT))
9692 : DAG.getUNDEF(ResVT));
9693
9694 // Replace Undef operands with ZeroVector.
9695 for (SDValue U : Undefs)
9696 DAG.ReplaceAllUsesWith(
9697 U, getZeroVector(U.getSimpleValueType(), Subtarget, DAG, dl));
9698
9699 MVT SubVT = Op.getOperand(0).getSimpleValueType();
9700 unsigned NumSubElems = SubVT.getVectorNumElements();
9701 for (unsigned i = 0; i != NumOperands; ++i) {
9702 if ((NonZeros & (1 << i)) == 0)
9703 continue;
9704
9705 Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResVT, Vec, Op.getOperand(i),
9706 DAG.getVectorIdxConstant(i * NumSubElems, dl));
9707 }
9708
9709 return Vec;
9710}
9711
9712// Returns true if the given node is a type promotion (by concatenating i1
9713// zeros) of the result of a node that already zeros all upper bits of
9714// k-register.
9715// TODO: Merge this with LowerAVXCONCAT_VECTORS?
9716static SDValue LowerCONCAT_VECTORSvXi1(SDValue Op, const SDLoc &dl,
9717 const X86Subtarget &Subtarget,
9718 SelectionDAG & DAG) {
9719 MVT ResVT = Op.getSimpleValueType();
9720 unsigned NumOperands = Op.getNumOperands();
9721 assert(NumOperands > 1 && isPowerOf2_32(NumOperands) &&
9722 "Unexpected number of operands in CONCAT_VECTORS");
9723
9724 uint64_t Zeros = 0;
9725 uint64_t NonZeros = 0;
9726 for (unsigned i = 0; i != NumOperands; ++i) {
9727 SDValue SubVec = Op.getOperand(i);
9728 if (SubVec.isUndef())
9729 continue;
9730 assert(i < sizeof(NonZeros) * CHAR_BIT); // Ensure the shift is in range.
9731 if (ISD::isBuildVectorAllZeros(SubVec.getNode()))
9732 Zeros |= (uint64_t)1 << i;
9733 else
9734 NonZeros |= (uint64_t)1 << i;
9735 }
9736
9737 unsigned NumElems = ResVT.getVectorNumElements();
9738
9739 // If we are inserting a non-zero vector and there are zeros in the LSBs and
9740 // undef in the MSBs, we need to emit a KSHIFTL. The generic lowering to
9741 // insert_subvector will give us two kshifts.
9742 if (isPowerOf2_64(NonZeros) && Zeros != 0 && NonZeros > Zeros &&
9743 Log2_64(NonZeros) != NumOperands - 1) {
9744 unsigned Idx = Log2_64(NonZeros);
9745 SDValue SubVec = Op.getOperand(Idx);
9746 unsigned SubVecNumElts = SubVec.getSimpleValueType().getVectorNumElements();
9747 MVT ShiftVT = widenMaskVectorType(ResVT, Subtarget);
9748 Op = widenSubVector(ShiftVT, SubVec, false, Subtarget, DAG, dl);
9749 Op = DAG.getNode(X86ISD::KSHIFTL, dl, ShiftVT, Op,
9750 DAG.getTargetConstant(Idx * SubVecNumElts, dl, MVT::i8));
9751 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, ResVT, Op,
9752 DAG.getVectorIdxConstant(0, dl));
9753 }
9754
9755 // If there are zero or one non-zeros we can handle this very simply.
9756 if (NonZeros == 0 || isPowerOf2_64(NonZeros)) {
9757 SDValue Vec = Zeros ? DAG.getConstant(0, dl, ResVT) : DAG.getUNDEF(ResVT);
9758 if (!NonZeros)
9759 return Vec;
9760 unsigned Idx = Log2_64(NonZeros);
9761 SDValue SubVec = Op.getOperand(Idx);
9762 unsigned SubVecNumElts = SubVec.getSimpleValueType().getVectorNumElements();
9763 return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResVT, Vec, SubVec,
9764 DAG.getVectorIdxConstant(Idx * SubVecNumElts, dl));
9765 }
9766
9767 if (NumOperands > 2) {
9768 MVT HalfVT = ResVT.getHalfNumVectorElementsVT();
9769 ArrayRef<SDUse> Ops = Op->ops();
9770 SDValue Lo = DAG.getNode(ISD::CONCAT_VECTORS, dl, HalfVT,
9771 Ops.slice(0, NumOperands / 2));
9772 SDValue Hi = DAG.getNode(ISD::CONCAT_VECTORS, dl, HalfVT,
9773 Ops.slice(NumOperands / 2));
9774 return DAG.getNode(ISD::CONCAT_VECTORS, dl, ResVT, Lo, Hi);
9775 }
9776
9777 assert(llvm::popcount(NonZeros) == 2 && "Simple cases not handled?");
9778
9779 if (ResVT.getVectorNumElements() >= 16)
9780 return Op; // The operation is legal with KUNPCK
9781
9782 SDValue Vec =
9783 DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResVT, DAG.getUNDEF(ResVT),
9784 Op.getOperand(0), DAG.getVectorIdxConstant(0, dl));
9785 return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResVT, Vec, Op.getOperand(1),
9786 DAG.getVectorIdxConstant(NumElems / 2, dl));
9787}
9788
9789static SDValue LowerCONCAT_VECTORS(SDValue Op,
9790 const X86Subtarget &Subtarget,
9791 SelectionDAG &DAG) {
9792 SDLoc DL(Op);
9793 MVT VT = Op.getSimpleValueType();
9794 if (VT.getVectorElementType() == MVT::i1)
9795 return LowerCONCAT_VECTORSvXi1(Op, DL, Subtarget, DAG);
9796
9797 // AVX can use the vinsertf128 instruction to create 256-bit vectors
9798 // from two other 128-bit ones.
9799 // 512-bit vector may contain 2 256-bit vectors or 4 128-bit vectors
9800 assert((VT.is256BitVector() && Op.getNumOperands() == 2) ||
9801 (VT.is512BitVector() &&
9802 (Op.getNumOperands() == 2 || Op.getNumOperands() == 4)));
9803 return LowerAVXCONCAT_VECTORS(Op, DL, DAG, Subtarget);
9804}
9805
9806//===----------------------------------------------------------------------===//
9807// Vector shuffle lowering
9808//
9809// This is an experimental code path for lowering vector shuffles on x86. It is
9810// designed to handle arbitrary vector shuffles and blends, gracefully
9811// degrading performance as necessary. It works hard to recognize idiomatic
9812// shuffles and lower them to optimal instruction patterns without leaving
9813// a framework that allows reasonably efficient handling of all vector shuffle
9814// patterns.
9815//===----------------------------------------------------------------------===//
9816
9817/// Checks whether the vector elements referenced by two shuffle masks are
9818/// equivalent.
9819static bool IsElementEquivalent(int MaskSize, SDValue Op, SDValue ExpectedOp,
9820 int Idx, int ExpectedIdx) {
9821 assert(0 <= Idx && Idx < MaskSize && 0 <= ExpectedIdx &&
9822 ExpectedIdx < MaskSize && "Out of range element index");
9823 if (!Op || !ExpectedOp || Op.getOpcode() != ExpectedOp.getOpcode())
9824 return false;
9825
9826 EVT VT = Op.getValueType();
9827 EVT ExpectedVT = ExpectedOp.getValueType();
9828
9829 // Sources must be vectors and match the mask's element count.
9830 if (!VT.isVector() || !ExpectedVT.isVector() ||
9831 (int)VT.getVectorNumElements() != MaskSize ||
9832 (int)ExpectedVT.getVectorNumElements() != MaskSize)
9833 return false;
9834
9835 // Exact match.
9836 if (Idx == ExpectedIdx && Op == ExpectedOp)
9837 return true;
9838
9839 switch (Op.getOpcode()) {
9840 case ISD::BUILD_VECTOR:
9841 // If the values are build vectors, we can look through them to find
9842 // equivalent inputs that make the shuffles equivalent.
9843 return Op.getOperand(Idx) == ExpectedOp.getOperand(ExpectedIdx);
9844 case ISD::BITCAST: {
9845 SDValue Src = Op.getOperand(0);
9846 EVT SrcVT = Src.getValueType();
9847 if (Op == ExpectedOp && SrcVT.isVector()) {
9848 if ((SrcVT.getScalarSizeInBits() % VT.getScalarSizeInBits()) == 0) {
9849 unsigned Scale = SrcVT.getScalarSizeInBits() / VT.getScalarSizeInBits();
9850 return (Idx % Scale) == (ExpectedIdx % Scale) &&
9851 IsElementEquivalent(SrcVT.getVectorNumElements(), Src, Src,
9852 Idx / Scale, ExpectedIdx / Scale);
9853 }
9854 if ((VT.getScalarSizeInBits() % SrcVT.getScalarSizeInBits()) == 0) {
9855 unsigned Scale = VT.getScalarSizeInBits() / SrcVT.getScalarSizeInBits();
9856 for (unsigned I = 0; I != Scale; ++I)
9857 if (!IsElementEquivalent(SrcVT.getVectorNumElements(), Src, Src,
9858 (Idx * Scale) + I,
9859 (ExpectedIdx * Scale) + I))
9860 return false;
9861 return true;
9862 }
9863 }
9864 break;
9865 }
9866 case ISD::VECTOR_SHUFFLE: {
9867 auto *SVN = cast<ShuffleVectorSDNode>(Op);
9868 return Op == ExpectedOp &&
9869 SVN->getMaskElt(Idx) == SVN->getMaskElt(ExpectedIdx);
9870 }
9871 case X86ISD::VBROADCAST:
9872 case X86ISD::VBROADCAST_LOAD:
9873 return Op == ExpectedOp;
9874 case X86ISD::SUBV_BROADCAST_LOAD:
9875 if (Op == ExpectedOp) {
9876 auto *MemOp = cast<MemSDNode>(Op);
9877 unsigned NumMemElts = MemOp->getMemoryVT().getVectorNumElements();
9878 return (Idx % NumMemElts) == (ExpectedIdx % NumMemElts);
9879 }
9880 break;
9881 case X86ISD::VPERMI: {
9882 if (Op == ExpectedOp) {
9883 SmallVector<int, 8> Mask;
9884 DecodeVPERMMask(MaskSize, Op.getConstantOperandVal(1), Mask);
9885 SDValue Src = Op.getOperand(0);
9886 return IsElementEquivalent(MaskSize, Src, Src, Mask[Idx],
9887 Mask[ExpectedIdx]);
9888 }
9889 break;
9890 }
9891 case X86ISD::HADD:
9892 case X86ISD::HSUB:
9893 case X86ISD::FHADD:
9894 case X86ISD::FHSUB:
9895 case X86ISD::PACKSS:
9896 case X86ISD::PACKUS:
9897 // HOP(X,X) can refer to the elt from the lower/upper half of a lane.
9898 // TODO: Handle HOP(X,Y) vs HOP(Y,X) equivalence cases.
9899 if (Op == ExpectedOp && Op.getOperand(0) == Op.getOperand(1)) {
9900 int NumElts = VT.getVectorNumElements();
9901 int NumLanes = VT.getSizeInBits() / 128;
9902 int NumEltsPerLane = NumElts / NumLanes;
9903 int NumHalfEltsPerLane = NumEltsPerLane / 2;
9904 bool SameLane = (Idx / NumEltsPerLane) == (ExpectedIdx / NumEltsPerLane);
9905 bool SameElt =
9906 (Idx % NumHalfEltsPerLane) == (ExpectedIdx % NumHalfEltsPerLane);
9907 return SameLane && SameElt;
9908 }
9909 break;
9910 }
9911
9912 return false;
9913}
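// For example, given a v4i32 BUILD_VECTOR (a, a, b, c), IsElementEquivalent
// returns true for indices 0 and 1: a shuffle mask entry referencing element 1
// may be treated as referencing element 0 instead, which lets the matchers
// below recognise more masks as splats/blends.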
9914
9915/// Tiny helper function to identify a no-op mask.
9916///
9917/// This is a somewhat boring predicate function. It checks whether the mask
9918/// array input, which is assumed to be a single-input shuffle mask of the kind
9919/// used by the X86 shuffle instructions (not a fully general
9920/// ShuffleVectorSDNode mask) requires any shuffles to occur. Both undef and an
9921/// in-place shuffle are 'no-op's.
9922 static bool isNoopShuffleMask(ArrayRef<int> Mask) {
9923 for (int i = 0, Size = Mask.size(); i < Size; ++i) {
9924 assert(Mask[i] >= -1 && "Out of bound mask element!");
9925 if (Mask[i] >= 0 && Mask[i] != i)
9926 return false;
9927 }
9928 return true;
9929}
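// For example, <0, -1, 2, 3> is a no-op mask (the undef element may stay in
// place), while <1, 0, 2, 3> is not, since elements 0 and 1 move.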
9930
9931/// Test whether there are elements crossing LaneSizeInBits lanes in this
9932/// shuffle mask.
9933///
9934/// X86 divides up its shuffles into in-lane and cross-lane shuffle operations
9935/// and we routinely test for these.
9936static bool isLaneCrossingShuffleMask(unsigned LaneSizeInBits,
9937 unsigned ScalarSizeInBits,
9938 ArrayRef<int> Mask) {
9939 assert(LaneSizeInBits && ScalarSizeInBits &&
9940 (LaneSizeInBits % ScalarSizeInBits) == 0 &&
9941 "Illegal shuffle lane size");
9942 int LaneSize = LaneSizeInBits / ScalarSizeInBits;
9943 int Size = Mask.size();
9944 for (int i = 0; i < Size; ++i)
9945 if (Mask[i] >= 0 && (Mask[i] % Size) / LaneSize != i / LaneSize)
9946 return true;
9947 return false;
9948}
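// For example, with 128-bit lanes and 32-bit elements (LaneSize == 4), the
// v8i32 mask <0, 1, 2, 3, 4, 5, 6, 7> stays within its lanes, whereas
// <4, 5, 6, 7, 0, 1, 2, 3> swaps the two 128-bit halves and is lane-crossing.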
9949
9950/// Test whether there are elements crossing 128-bit lanes in this
9951/// shuffle mask.
9952 static bool is128BitLaneCrossingShuffleMask(MVT VT, ArrayRef<int> Mask) {
9953 return isLaneCrossingShuffleMask(128, VT.getScalarSizeInBits(), Mask);
9954}
9955
9956/// Test whether elements in each LaneSizeInBits lane in this shuffle mask come
9957/// from multiple lanes - this is different to isLaneCrossingShuffleMask to
9958/// better support 'repeated mask + lane permute' style shuffles.
9959static bool isMultiLaneShuffleMask(unsigned LaneSizeInBits,
9960 unsigned ScalarSizeInBits,
9961 ArrayRef<int> Mask) {
9962 assert(LaneSizeInBits && ScalarSizeInBits &&
9963 (LaneSizeInBits % ScalarSizeInBits) == 0 &&
9964 "Illegal shuffle lane size");
9965 int NumElts = Mask.size();
9966 int NumEltsPerLane = LaneSizeInBits / ScalarSizeInBits;
9967 int NumLanes = NumElts / NumEltsPerLane;
9968 if (NumLanes > 1) {
9969 for (int i = 0; i != NumLanes; ++i) {
9970 int SrcLane = -1;
9971 for (int j = 0; j != NumEltsPerLane; ++j) {
9972 int M = Mask[(i * NumEltsPerLane) + j];
9973 if (M < 0)
9974 continue;
9975 int Lane = (M % NumElts) / NumEltsPerLane;
9976 if (SrcLane >= 0 && SrcLane != Lane)
9977 return true;
9978 SrcLane = Lane;
9979 }
9980 }
9981 }
9982 return false;
9983}
9984
9985/// Test whether a shuffle mask is equivalent within each sub-lane.
9986///
9987/// This checks a shuffle mask to see if it is performing the same
9988/// lane-relative shuffle in each sub-lane. This trivially implies
9989/// that it is also not lane-crossing. It may however involve a blend from the
9990/// same lane of a second vector.
9991///
9992/// The specific repeated shuffle mask is populated in \p RepeatedMask, as it is
9993/// non-trivial to compute in the face of undef lanes. The representation is
9994/// suitable for use with existing 128-bit shuffles as entries from the second
9995/// vector have been remapped to [LaneSize, 2*LaneSize).
9996static bool isRepeatedShuffleMask(unsigned LaneSizeInBits, MVT VT,
9997 ArrayRef<int> Mask,
9998 SmallVectorImpl<int> &RepeatedMask) {
9999 auto LaneSize = LaneSizeInBits / VT.getScalarSizeInBits();
10000 RepeatedMask.assign(LaneSize, -1);
10001 int Size = Mask.size();
10002 for (int i = 0; i < Size; ++i) {
10003 assert(Mask[i] == SM_SentinelUndef || Mask[i] >= 0);
10004 if (Mask[i] < 0)
10005 continue;
10006 if ((Mask[i] % Size) / LaneSize != i / LaneSize)
10007 // This entry crosses lanes, so there is no way to model this shuffle.
10008 return false;
10009
10010 // Ok, handle the in-lane shuffles by detecting if and when they repeat.
10011 // Adjust second vector indices to start at LaneSize instead of Size.
10012 int LocalM = Mask[i] < Size ? Mask[i] % LaneSize
10013 : Mask[i] % LaneSize + LaneSize;
10014 if (RepeatedMask[i % LaneSize] < 0)
10015 // This is the first non-undef entry in this slot of a 128-bit lane.
10016 RepeatedMask[i % LaneSize] = LocalM;
10017 else if (RepeatedMask[i % LaneSize] != LocalM)
10018 // Found a mismatch with the repeated mask.
10019 return false;
10020 }
10021 return true;
10022}
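// For example, for v8f32 the binary mask <0, 8, 2, 10, 4, 12, 6, 14> repeats
// the same pattern in both 128-bit lanes and yields RepeatedMask <0, 4, 2, 6>
// (second-vector elements remapped into [4, 8)).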
10023
10024/// Test whether a shuffle mask is equivalent within each 128-bit lane.
10025static bool
10026 is128BitLaneRepeatedShuffleMask(MVT VT, ArrayRef<int> Mask,
10027 SmallVectorImpl<int> &RepeatedMask) {
10028 return isRepeatedShuffleMask(128, VT, Mask, RepeatedMask);
10029}
10030
10031static bool
10032 is128BitLaneRepeatedShuffleMask(MVT VT, ArrayRef<int> Mask) {
10033 SmallVector<int, 32> RepeatedMask;
10034 return isRepeatedShuffleMask(128, VT, Mask, RepeatedMask);
10035}
10036
10037/// Test whether a shuffle mask is equivalent within each 256-bit lane.
10038static bool
10039 is256BitLaneRepeatedShuffleMask(MVT VT, ArrayRef<int> Mask,
10040 SmallVectorImpl<int> &RepeatedMask) {
10041 return isRepeatedShuffleMask(256, VT, Mask, RepeatedMask);
10042}
10043
10044/// Test whether a target shuffle mask is equivalent within each sub-lane.
10045/// Unlike isRepeatedShuffleMask we must respect SM_SentinelZero.
10046static bool isRepeatedTargetShuffleMask(unsigned LaneSizeInBits,
10047 unsigned EltSizeInBits,
10048 ArrayRef<int> Mask,
10049 SmallVectorImpl<int> &RepeatedMask) {
10050 int LaneSize = LaneSizeInBits / EltSizeInBits;
10051 RepeatedMask.assign(LaneSize, SM_SentinelUndef);
10052 int Size = Mask.size();
10053 for (int i = 0; i < Size; ++i) {
10054 assert(isUndefOrZero(Mask[i]) || (Mask[i] >= 0));
10055 if (Mask[i] == SM_SentinelUndef)
10056 continue;
10057 if (Mask[i] == SM_SentinelZero) {
10058 if (!isUndefOrZero(RepeatedMask[i % LaneSize]))
10059 return false;
10060 RepeatedMask[i % LaneSize] = SM_SentinelZero;
10061 continue;
10062 }
10063 if ((Mask[i] % Size) / LaneSize != i / LaneSize)
10064 // This entry crosses lanes, so there is no way to model this shuffle.
10065 return false;
10066
10067 // Handle the in-lane shuffles by detecting if and when they repeat. Adjust
10068 // later vector indices to start at multiples of LaneSize instead of Size.
10069 int LaneM = Mask[i] / Size;
10070 int LocalM = (Mask[i] % LaneSize) + (LaneM * LaneSize);
10071 if (RepeatedMask[i % LaneSize] == SM_SentinelUndef)
10072 // This is the first non-undef entry in this slot of a 128-bit lane.
10073 RepeatedMask[i % LaneSize] = LocalM;
10074 else if (RepeatedMask[i % LaneSize] != LocalM)
10075 // Found a mismatch with the repeated mask.
10076 return false;
10077 }
10078 return true;
10079}
10080
10081/// Test whether a target shuffle mask is equivalent within each sub-lane.
10082/// Unlike isRepeatedShuffleMask we must respect SM_SentinelZero.
10083static bool isRepeatedTargetShuffleMask(unsigned LaneSizeInBits, MVT VT,
10084 ArrayRef<int> Mask,
10085 SmallVectorImpl<int> &RepeatedMask) {
10086 return isRepeatedTargetShuffleMask(LaneSizeInBits, VT.getScalarSizeInBits(),
10087 Mask, RepeatedMask);
10088}
10089
10090/// Checks whether a shuffle mask is equivalent to an explicit list of
10091/// arguments.
10092///
10093/// This is a fast way to test a shuffle mask against a fixed pattern:
10094///
10095 /// if (isShuffleEquivalent(Mask, {3, 2, 1, 0})) { ... }
10096///
10097/// It returns true if the mask is exactly as wide as the argument list, and
10098/// each element of the mask is either -1 (signifying undef) or the value given
10099/// in the argument.
10100static bool isShuffleEquivalent(ArrayRef<int> Mask, ArrayRef<int> ExpectedMask,
10101 SDValue V1 = SDValue(),
10102 SDValue V2 = SDValue()) {
10103 int Size = Mask.size();
10104 if (Size != (int)ExpectedMask.size())
10105 return false;
10106
10107 for (int i = 0; i < Size; ++i) {
10108 assert(Mask[i] >= -1 && "Out of bound mask element!");
10109 int MaskIdx = Mask[i];
10110 int ExpectedIdx = ExpectedMask[i];
10111 if (0 <= MaskIdx && MaskIdx != ExpectedIdx) {
10112 SDValue MaskV = MaskIdx < Size ? V1 : V2;
10113 SDValue ExpectedV = ExpectedIdx < Size ? V1 : V2;
10114 MaskIdx = MaskIdx < Size ? MaskIdx : (MaskIdx - Size);
10115 ExpectedIdx = ExpectedIdx < Size ? ExpectedIdx : (ExpectedIdx - Size);
10116 if (!IsElementEquivalent(Size, MaskV, ExpectedV, MaskIdx, ExpectedIdx))
10117 return false;
10118 }
10119 }
10120 return true;
10121}
10122
10123/// Checks whether a target shuffle mask is equivalent to an explicit pattern.
10124///
10125/// The masks must be exactly the same width.
10126///
10127/// If an element in Mask matches SM_SentinelUndef (-1) then the corresponding
10128/// value in ExpectedMask is always accepted. Otherwise the indices must match.
10129///
10130/// SM_SentinelZero is accepted as a valid negative index but must match in
10131/// both, or via a known bits test.
10132 static bool isTargetShuffleEquivalent(MVT VT, ArrayRef<int> Mask,
10133 ArrayRef<int> ExpectedMask,
10134 const SelectionDAG &DAG,
10135 SDValue V1 = SDValue(),
10136 SDValue V2 = SDValue()) {
10137 int Size = Mask.size();
10138 if (Size != (int)ExpectedMask.size())
10139 return false;
10140 assert(llvm::all_of(ExpectedMask,
10141 [Size](int M) {
10142 return M == SM_SentinelZero ||
10143 isInRange(M, 0, 2 * Size);
10144 }) &&
10145 "Illegal target shuffle mask");
10146
10147 // Check for out-of-range target shuffle mask indices.
10148 if (!isUndefOrZeroOrInRange(Mask, 0, 2 * Size))
10149 return false;
10150
10151 // Don't use V1/V2 if they're not the same size as the shuffle mask type.
10152 if (V1 && (V1.getValueSizeInBits() != VT.getSizeInBits() ||
10153 !V1.getValueType().isVector()))
10154 V1 = SDValue();
10155 if (V2 && (V2.getValueSizeInBits() != VT.getSizeInBits() ||
10156 !V2.getValueType().isVector()))
10157 V2 = SDValue();
10158
10159 APInt ZeroV1 = APInt::getZero(Size);
10160 APInt ZeroV2 = APInt::getZero(Size);
10161
10162 for (int i = 0; i < Size; ++i) {
10163 int MaskIdx = Mask[i];
10164 int ExpectedIdx = ExpectedMask[i];
10165 if (MaskIdx == SM_SentinelUndef || MaskIdx == ExpectedIdx)
10166 continue;
10167 // If we failed to match an expected SM_SentinelZero then early out.
10168 if (ExpectedIdx < 0)
10169 return false;
10170 if (MaskIdx == SM_SentinelZero) {
10171 // If we need this expected index to be a zero element, then update the
10172 // relevant zero mask and perform the known bits at the end to minimize
10173 // repeated computes.
10174 SDValue ExpectedV = ExpectedIdx < Size ? V1 : V2;
10175 if (ExpectedV &&
10176 Size == (int)ExpectedV.getValueType().getVectorNumElements()) {
10177 int BitIdx = ExpectedIdx < Size ? ExpectedIdx : (ExpectedIdx - Size);
10178 APInt &ZeroMask = ExpectedIdx < Size ? ZeroV1 : ZeroV2;
10179 ZeroMask.setBit(BitIdx);
10180 continue;
10181 }
10182 }
10183 if (MaskIdx >= 0) {
10184 SDValue MaskV = MaskIdx < Size ? V1 : V2;
10185 SDValue ExpectedV = ExpectedIdx < Size ? V1 : V2;
10186 MaskIdx = MaskIdx < Size ? MaskIdx : (MaskIdx - Size);
10187 ExpectedIdx = ExpectedIdx < Size ? ExpectedIdx : (ExpectedIdx - Size);
10188 if (IsElementEquivalent(Size, MaskV, ExpectedV, MaskIdx, ExpectedIdx))
10189 continue;
10190 }
10191 return false;
10192 }
10193 return (ZeroV1.isZero() || DAG.MaskedVectorIsZero(V1, ZeroV1)) &&
10194 (ZeroV2.isZero() || DAG.MaskedVectorIsZero(V2, ZeroV2));
10195}
10196
10197// Check if the shuffle mask is suitable for the AVX vpunpcklwd or vpunpckhwd
10198// instructions.
10199 static bool isUnpackWdShuffleMask(ArrayRef<int> Mask, MVT VT,
10200 const SelectionDAG &DAG) {
10201 if (VT != MVT::v8i32 && VT != MVT::v8f32)
10202 return false;
10203
10204 SmallVector<int, 8> Unpcklwd;
10205 createUnpackShuffleMask(MVT::v8i16, Unpcklwd, /* Lo = */ true,
10206 /* Unary = */ false);
10207 SmallVector<int, 8> Unpckhwd;
10208 createUnpackShuffleMask(MVT::v8i16, Unpckhwd, /* Lo = */ false,
10209 /* Unary = */ false);
10210 bool IsUnpackwdMask = (isTargetShuffleEquivalent(VT, Mask, Unpcklwd, DAG) ||
10211 isTargetShuffleEquivalent(VT, Mask, Unpckhwd, DAG));
10212 return IsUnpackwdMask;
10213}
10214
10215 static bool is128BitUnpackShuffleMask(ArrayRef<int> Mask,
10216 const SelectionDAG &DAG) {
10217 // Create 128-bit vector type based on mask size.
10218 MVT EltVT = MVT::getIntegerVT(128 / Mask.size());
10219 MVT VT = MVT::getVectorVT(EltVT, Mask.size());
10220
10221 // We can't assume a canonical shuffle mask, so try the commuted version too.
10222 SmallVector<int, 4> CommutedMask(Mask);
10223 ShuffleVectorSDNode::commuteMask(CommutedMask);
10224
10225 // Match any of unary/binary or low/high.
10226 for (unsigned i = 0; i != 4; ++i) {
10227 SmallVector<int, 16> UnpackMask;
10228 createUnpackShuffleMask(VT, UnpackMask, (i >> 1) % 2, i % 2);
10229 if (isTargetShuffleEquivalent(VT, Mask, UnpackMask, DAG) ||
10230 isTargetShuffleEquivalent(VT, CommutedMask, UnpackMask, DAG))
10231 return true;
10232 }
10233 return false;
10234}
10235
10236/// Return true if a shuffle mask chooses elements identically in its top and
10237/// bottom halves. For example, any splat mask has the same top and bottom
10238/// halves. If an element is undefined in only one half of the mask, the halves
10239/// are not considered identical.
10240 static bool hasIdenticalHalvesShuffleMask(ArrayRef<int> Mask) {
10241 assert(Mask.size() % 2 == 0 && "Expecting even number of elements in mask");
10242 unsigned HalfSize = Mask.size() / 2;
10243 for (unsigned i = 0; i != HalfSize; ++i) {
10244 if (Mask[i] != Mask[i + HalfSize])
10245 return false;
10246 }
10247 return true;
10248}
10249
10250/// Get a 4-lane 8-bit shuffle immediate for a mask.
10251///
10252/// This helper function produces an 8-bit shuffle immediate corresponding to
10253/// the ubiquitous shuffle encoding scheme used in x86 instructions for
10254/// shuffling 4 lanes. It can be used with most of the PSHUF instructions for
10255/// example.
10256///
10257/// NB: We rely heavily on "undef" masks preserving the input lane.
10258static unsigned getV4X86ShuffleImm(ArrayRef<int> Mask) {
10259 assert(Mask.size() == 4 && "Only 4-lane shuffle masks");
10260 assert(Mask[0] >= -1 && Mask[0] < 4 && "Out of bound mask element!");
10261 assert(Mask[1] >= -1 && Mask[1] < 4 && "Out of bound mask element!");
10262 assert(Mask[2] >= -1 && Mask[2] < 4 && "Out of bound mask element!");
10263 assert(Mask[3] >= -1 && Mask[3] < 4 && "Out of bound mask element!");
10264
10265 // If the mask only uses one non-undef element, then fully 'splat' it to
10266 // improve later broadcast matching.
10267 int FirstIndex = find_if(Mask, [](int M) { return M >= 0; }) - Mask.begin();
10268 assert(0 <= FirstIndex && FirstIndex < 4 && "All undef shuffle mask");
10269
10270 int FirstElt = Mask[FirstIndex];
10271 if (all_of(Mask, [FirstElt](int M) { return M < 0 || M == FirstElt; }))
10272 return (FirstElt << 6) | (FirstElt << 4) | (FirstElt << 2) | FirstElt;
10273
10274 unsigned Imm = 0;
10275 Imm |= (Mask[0] < 0 ? 0 : Mask[0]) << 0;
10276 Imm |= (Mask[1] < 0 ? 1 : Mask[1]) << 2;
10277 Imm |= (Mask[2] < 0 ? 2 : Mask[2]) << 4;
10278 Imm |= (Mask[3] < 0 ? 3 : Mask[3]) << 6;
10279 return Imm;
10280}
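// For example, the mask <3, 1, 2, 0> encodes as 0b00100111 (0x27):
// bits [1:0] = 3, [3:2] = 1, [5:4] = 2, [7:6] = 0. A single-element mask such
// as <-1, 2, -1, -1> is splatted to <2, 2, 2, 2> and encodes as 0xAA.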
10281
10282 static SDValue getV4X86ShuffleImm8ForMask(ArrayRef<int> Mask, const SDLoc &DL,
10283 SelectionDAG &DAG) {
10284 return DAG.getTargetConstant(getV4X86ShuffleImm(Mask), DL, MVT::i8);
10285}
10286
10287// Canonicalize SHUFPD mask to improve chances of further folding.
10288// Mask elements are assumed to be -1, 0 or 1 to match the SHUFPD lo/hi pattern.
10289static unsigned getSHUFPDImm(ArrayRef<int> Mask) {
10290 assert((Mask.size() == 2 || Mask.size() == 4 || Mask.size() == 8) &&
10291 "Unexpected SHUFPD mask size");
10292 assert(all_of(Mask, [](int M) { return -1 <= M && M <= 1; }) &&
10293 "Unexpected SHUFPD mask elements");
10294
10295 // If the mask only uses one non-undef element, then fully 'splat' it to
10296 // improve later broadcast matching.
10297 int FirstIndex = find_if(Mask, [](int M) { return M >= 0; }) - Mask.begin();
10298 assert(0 <= FirstIndex && FirstIndex < (int)Mask.size() &&
10299 "All undef shuffle mask");
10300
10301 int FirstElt = Mask[FirstIndex];
10302 if (all_of(Mask, [FirstElt](int M) { return M < 0 || M == FirstElt; }) &&
10303 count_if(Mask, [FirstElt](int M) { return M == FirstElt; }) > 1) {
10304 unsigned Imm = 0;
10305 for (unsigned I = 0, E = Mask.size(); I != E; ++I)
10306 Imm |= FirstElt << I;
10307 return Imm;
10308 }
10309
10310 // Attempt to keep any undef elements in place to improve chances of the
10311 // shuffle becoming a (commutative) blend.
10312 unsigned Imm = 0;
10313 for (unsigned I = 0, E = Mask.size(); I != E; ++I)
10314 Imm |= (Mask[I] < 0 ? (I & 1) : Mask[I]) << I;
10315
10316 return Imm;
10317}
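// For example, for a v4f64 SHUFPD mask <1, 0, -1, 1> the immediate is 0b1001:
// the undef element 2 keeps its default lo selection (I & 1 == 0). A mask such
// as <1, 1, -1, -1> splats to the immediate 0b1111.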
10318
10319 static SDValue getSHUFPDImmForMask(ArrayRef<int> Mask, const SDLoc &DL,
10320 SelectionDAG &DAG) {
10321 return DAG.getTargetConstant(getSHUFPDImm(Mask), DL, MVT::i8);
10322}
10323
10324 // The shuffle result is as follows:
10325 // 0*a[0] 0*a[1] ... 0*a[n], n >= 0, where the a[] elements are in ascending
10326 // order. Each Zeroable element corresponds to a particular Mask element,
10327 // as described in the computeZeroableShuffleElements function.
10328 //
10329 // The function looks for a sub-mask whose nonzero elements are in
10330 // increasing order. If such a sub-mask exists, the function returns true.
10331static bool isNonZeroElementsInOrder(const APInt &Zeroable,
10332 ArrayRef<int> Mask, const EVT &VectorType,
10333 bool &IsZeroSideLeft) {
10334 int NextElement = -1;
10335 // Check if the Mask's nonzero elements are in increasing order.
10336 for (int i = 0, e = Mask.size(); i < e; i++) {
10337 // Checks if the mask's zeros elements are built from only zeros.
10338 assert(Mask[i] >= -1 && "Out of bound mask element!");
10339 if (Mask[i] < 0)
10340 return false;
10341 if (Zeroable[i])
10342 continue;
10343 // Find the lowest non zero element
10344 if (NextElement < 0) {
10345 NextElement = Mask[i] != 0 ? VectorType.getVectorNumElements() : 0;
10346 IsZeroSideLeft = NextElement != 0;
10347 }
10348 // Exit if the mask's non zero elements are not in increasing order.
10349 if (NextElement != Mask[i])
10350 return false;
10351 NextElement++;
10352 }
10353 return true;
10354}
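// For example, for a v4i32 shuffle of V1 and V2 where result element 0 is
// known zero, the mask <0, 4, 5, 6> (element 0 zeroable) is accepted with
// IsZeroSideLeft == true, since the nonzero elements 4, 5, 6 are consecutive
// and start at the first element of the expanded source.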
10355
10356static SDValue combineConcatVectorOps(const SDLoc &DL, MVT VT,
10357 ArrayRef<SDValue> Ops, SelectionDAG &DAG,
10358 const X86Subtarget &Subtarget,
10359 unsigned Depth = 0);
10360
10361/// Try to lower a shuffle with a single PSHUFB of V1 or V2.
10362 static SDValue lowerShuffleWithPSHUFB(const SDLoc &DL, MVT VT,
10363 ArrayRef<int> Mask, SDValue V1,
10364 SDValue V2, const APInt &Zeroable,
10365 const X86Subtarget &Subtarget,
10366 SelectionDAG &DAG) {
10367 int Size = Mask.size();
10368 int LaneSize = 128 / VT.getScalarSizeInBits();
10369 const int NumBytes = VT.getSizeInBits() / 8;
10370 const int NumEltBytes = VT.getScalarSizeInBits() / 8;
10371
10372 assert((Subtarget.hasSSSE3() && VT.is128BitVector()) ||
10373 (Subtarget.hasAVX2() && VT.is256BitVector()) ||
10374 (Subtarget.hasBWI() && VT.is512BitVector()));
10375
10376 SmallVector<SDValue, 64> PSHUFBMask(NumBytes);
10377 // Sign bit set in i8 mask means zero element.
10378 SDValue ZeroMask = DAG.getConstant(0x80, DL, MVT::i8);
10379
10380 SDValue V;
10381 for (int i = 0; i < NumBytes; ++i) {
10382 int M = Mask[i / NumEltBytes];
10383 if (M < 0) {
10384 PSHUFBMask[i] = DAG.getUNDEF(MVT::i8);
10385 continue;
10386 }
10387 if (Zeroable[i / NumEltBytes]) {
10388 PSHUFBMask[i] = ZeroMask;
10389 continue;
10390 }
10391
10392 // We can only use a single input of V1 or V2.
10393 SDValue SrcV = (M >= Size ? V2 : V1);
10394 if (V && V != SrcV)
10395 return SDValue();
10396 V = SrcV;
10397 M %= Size;
10398
10399 // PSHUFB can't cross lanes, ensure this doesn't happen.
10400 if ((M / LaneSize) != ((i / NumEltBytes) / LaneSize))
10401 return SDValue();
10402
10403 M = M % LaneSize;
10404 M = M * NumEltBytes + (i % NumEltBytes);
10405 PSHUFBMask[i] = DAG.getConstant(M, DL, MVT::i8);
10406 }
10407 assert(V && "Failed to find a source input");
10408
10409 MVT I8VT = MVT::getVectorVT(MVT::i8, NumBytes);
10410 return DAG.getBitcast(
10411 VT, DAG.getNode(X86ISD::PSHUFB, DL, I8VT, DAG.getBitcast(I8VT, V),
10412 DAG.getBuildVector(I8VT, DL, PSHUFBMask)));
10413}
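// For example, a v4i32 shuffle with mask <1, -1, 2, 0> of a single source
// becomes a PSHUFB with the byte mask
// <4,5,6,7, u,u,u,u, 8,9,10,11, 0,1,2,3>.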
10414
10415static SDValue getMaskNode(SDValue Mask, MVT MaskVT,
10416 const X86Subtarget &Subtarget, SelectionDAG &DAG,
10417 const SDLoc &dl);
10418
10419// X86 has dedicated shuffle that can be lowered to VEXPAND
10420 static SDValue lowerShuffleToEXPAND(const SDLoc &DL, MVT VT, SDValue V1,
10421 SDValue V2, ArrayRef<int> Mask,
10422 const APInt &Zeroable,
10423 const X86Subtarget &Subtarget,
10424 SelectionDAG &DAG) {
10425 bool IsLeftZeroSide = true;
10426 if (!isNonZeroElementsInOrder(Zeroable, Mask, V1.getValueType(),
10427 IsLeftZeroSide))
10428 return SDValue();
10429 unsigned VEXPANDMask = (~Zeroable).getZExtValue();
10430 MVT IntegerType =
10431 MVT::getIntegerVT(std::max((int)VT.getVectorNumElements(), 8));
10432 SDValue MaskNode = DAG.getConstant(VEXPANDMask, DL, IntegerType);
10433 unsigned NumElts = VT.getVectorNumElements();
10434 assert((NumElts == 4 || NumElts == 8 || NumElts == 16) &&
10435 "Unexpected number of vector elements");
10436 SDValue VMask = getMaskNode(MaskNode, MVT::getVectorVT(MVT::i1, NumElts),
10437 Subtarget, DAG, DL);
10438 SDValue ZeroVector = getZeroVector(VT, Subtarget, DAG, DL);
10439 SDValue ExpandedVector = IsLeftZeroSide ? V2 : V1;
10440 return DAG.getNode(X86ISD::EXPAND, DL, VT, ExpandedVector, ZeroVector, VMask);
10441}
10442
10443static bool matchShuffleWithUNPCK(MVT VT, SDValue &V1, SDValue &V2,
10444 unsigned &UnpackOpcode, bool IsUnary,
10445 ArrayRef<int> TargetMask, const SDLoc &DL,
10446 SelectionDAG &DAG,
10447 const X86Subtarget &Subtarget) {
10448 int NumElts = VT.getVectorNumElements();
10449
10450 bool Undef1 = true, Undef2 = true, Zero1 = true, Zero2 = true;
10451 for (int i = 0; i != NumElts; i += 2) {
10452 int M1 = TargetMask[i + 0];
10453 int M2 = TargetMask[i + 1];
10454 Undef1 &= (SM_SentinelUndef == M1);
10455 Undef2 &= (SM_SentinelUndef == M2);
10456 Zero1 &= isUndefOrZero(M1);
10457 Zero2 &= isUndefOrZero(M2);
10458 }
10459 assert(!((Undef1 || Zero1) && (Undef2 || Zero2)) &&
10460 "Zeroable shuffle detected");
10461
10462 // Attempt to match the target mask against the unpack lo/hi mask patterns.
10463 SmallVector<int, 64> Unpckl, Unpckh;
10464 createUnpackShuffleMask(VT, Unpckl, /* Lo = */ true, IsUnary);
10465 if (isTargetShuffleEquivalent(VT, TargetMask, Unpckl, DAG, V1,
10466 (IsUnary ? V1 : V2))) {
10467 UnpackOpcode = X86ISD::UNPCKL;
10468 V2 = (Undef2 ? DAG.getUNDEF(VT) : (IsUnary ? V1 : V2));
10469 V1 = (Undef1 ? DAG.getUNDEF(VT) : V1);
10470 return true;
10471 }
10472
10473 createUnpackShuffleMask(VT, Unpckh, /* Lo = */ false, IsUnary);
10474 if (isTargetShuffleEquivalent(VT, TargetMask, Unpckh, DAG, V1,
10475 (IsUnary ? V1 : V2))) {
10476 UnpackOpcode = X86ISD::UNPCKH;
10477 V2 = (Undef2 ? DAG.getUNDEF(VT) : (IsUnary ? V1 : V2));
10478 V1 = (Undef1 ? DAG.getUNDEF(VT) : V1);
10479 return true;
10480 }
10481
10482 // If an unary shuffle, attempt to match as an unpack lo/hi with zero.
10483 if (IsUnary && (Zero1 || Zero2)) {
10484 // Don't bother if we can blend instead.
10485 if ((Subtarget.hasSSE41() || VT == MVT::v2i64 || VT == MVT::v2f64) &&
10486 isSequentialOrUndefOrZeroInRange(TargetMask, 0, NumElts, 0))
10487 return false;
10488
10489 bool MatchLo = true, MatchHi = true;
10490 for (int i = 0; (i != NumElts) && (MatchLo || MatchHi); ++i) {
10491 int M = TargetMask[i];
10492
10493 // Ignore if the input is known to be zero or the index is undef.
10494 if ((((i & 1) == 0) && Zero1) || (((i & 1) == 1) && Zero2) ||
10495 (M == SM_SentinelUndef))
10496 continue;
10497
10498 MatchLo &= (M == Unpckl[i]);
10499 MatchHi &= (M == Unpckh[i]);
10500 }
10501
10502 if (MatchLo || MatchHi) {
10503 UnpackOpcode = MatchLo ? X86ISD::UNPCKL : X86ISD::UNPCKH;
10504 V2 = Zero2 ? getZeroVector(VT, Subtarget, DAG, DL) : V1;
10505 V1 = Zero1 ? getZeroVector(VT, Subtarget, DAG, DL) : V1;
10506 return true;
10507 }
10508 }
10509
10510 // If a binary shuffle, commute and try again.
10511 if (!IsUnary) {
10512 ShuffleVectorSDNode::commuteMask(Unpckl);
10513 if (isTargetShuffleEquivalent(VT, TargetMask, Unpckl, DAG)) {
10514 UnpackOpcode = X86ISD::UNPCKL;
10515 std::swap(V1, V2);
10516 return true;
10517 }
10518
10519 ShuffleVectorSDNode::commuteMask(Unpckh);
10520 if (isTargetShuffleEquivalent(VT, TargetMask, Unpckh, DAG)) {
10521 UnpackOpcode = X86ISD::UNPCKH;
10522 std::swap(V1, V2);
10523 return true;
10524 }
10525 }
10526
10527 return false;
10528}
10529
10530// X86 has dedicated unpack instructions that can handle specific blend
10531// operations: UNPCKH and UNPCKL.
10532 static SDValue lowerShuffleWithUNPCK(const SDLoc &DL, MVT VT, SDValue V1,
10533 SDValue V2, ArrayRef<int> Mask,
10534 SelectionDAG &DAG) {
10535 SmallVector<int, 8> Unpckl;
10536 createUnpackShuffleMask(VT, Unpckl, /* Lo = */ true, /* Unary = */ false);
10537 if (isShuffleEquivalent(Mask, Unpckl, V1, V2))
10538 return DAG.getNode(X86ISD::UNPCKL, DL, VT, V1, V2);
10539
10540 SmallVector<int, 8> Unpckh;
10541 createUnpackShuffleMask(VT, Unpckh, /* Lo = */ false, /* Unary = */ false);
10542 if (isShuffleEquivalent(Mask, Unpckh, V1, V2))
10543 return DAG.getNode(X86ISD::UNPCKH, DL, VT, V1, V2);
10544
10545 // Commute and try again.
10546 ShuffleVectorSDNode::commuteMask(Unpckl);
10547 if (isShuffleEquivalent(Mask, Unpckl, V1, V2))
10548 return DAG.getNode(X86ISD::UNPCKL, DL, VT, V2, V1);
10549
10550 ShuffleVectorSDNode::commuteMask(Unpckh);
10551 if (isShuffleEquivalent(Mask, Unpckh, V1, V2))
10552 return DAG.getNode(X86ISD::UNPCKH, DL, VT, V2, V1);
10553
10554 return SDValue();
10555}
10556
10557/// Check if the mask can be mapped to a preliminary shuffle (vperm 64-bit)
10558/// followed by unpack 256-bit.
10559 static SDValue lowerShuffleWithUNPCK256(const SDLoc &DL, MVT VT, SDValue V1,
10560 SDValue V2, ArrayRef<int> Mask,
10561 SelectionDAG &DAG) {
10562 SmallVector<int, 32> Unpckl, Unpckh;
10563 createSplat2ShuffleMask(VT, Unpckl, /* Lo */ true);
10564 createSplat2ShuffleMask(VT, Unpckh, /* Lo */ false);
10565
10566 unsigned UnpackOpcode;
10567 if (isShuffleEquivalent(Mask, Unpckl, V1, V2))
10568 UnpackOpcode = X86ISD::UNPCKL;
10569 else if (isShuffleEquivalent(Mask, Unpckh, V1, V2))
10570 UnpackOpcode = X86ISD::UNPCKH;
10571 else
10572 return SDValue();
10573
10574 // This is a "natural" unpack operation (rather than the 128-bit sectored
10575 // operation implemented by AVX). We need to rearrange 64-bit chunks of the
10576 // input in order to use the x86 instruction.
10577 V1 = DAG.getVectorShuffle(MVT::v4f64, DL, DAG.getBitcast(MVT::v4f64, V1),
10578 DAG.getUNDEF(MVT::v4f64), {0, 2, 1, 3});
10579 V1 = DAG.getBitcast(VT, V1);
10580 return DAG.getNode(UnpackOpcode, DL, VT, V1, V1);
10581}
10582
10583// Check if the mask can be mapped to a TRUNCATE or VTRUNC, truncating the
10584// source into the lower elements and zeroing the upper elements.
10585static bool matchShuffleAsVTRUNC(MVT &SrcVT, MVT &DstVT, MVT VT,
10586 ArrayRef<int> Mask, const APInt &Zeroable,
10587 const X86Subtarget &Subtarget) {
10588 if (!VT.is512BitVector() && !Subtarget.hasVLX())
10589 return false;
10590
10591 unsigned NumElts = Mask.size();
10592 unsigned EltSizeInBits = VT.getScalarSizeInBits();
10593 unsigned MaxScale = 64 / EltSizeInBits;
10594
10595 for (unsigned Scale = 2; Scale <= MaxScale; Scale += Scale) {
10596 unsigned SrcEltBits = EltSizeInBits * Scale;
10597 if (SrcEltBits < 32 && !Subtarget.hasBWI())
10598 continue;
10599 unsigned NumSrcElts = NumElts / Scale;
10600 if (!isSequentialOrUndefInRange(Mask, 0, NumSrcElts, 0, Scale))
10601 continue;
10602 unsigned UpperElts = NumElts - NumSrcElts;
10603 if (!Zeroable.extractBits(UpperElts, NumSrcElts).isAllOnes())
10604 continue;
10605 SrcVT = MVT::getIntegerVT(EltSizeInBits * Scale);
10606 SrcVT = MVT::getVectorVT(SrcVT, NumSrcElts);
10607 DstVT = MVT::getIntegerVT(EltSizeInBits);
10608 if ((NumSrcElts * EltSizeInBits) >= 128) {
10609 // ISD::TRUNCATE
10610 DstVT = MVT::getVectorVT(DstVT, NumSrcElts);
10611 } else {
10612 // X86ISD::VTRUNC
10613 DstVT = MVT::getVectorVT(DstVT, 128 / EltSizeInBits);
10614 }
10615 return true;
10616 }
10617
10618 return false;
10619}
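// For example, a v16i8 shuffle mask <0,2,4,6,8,10,12,14> followed by eight
// zeroable elements matches Scale == 2 (given AVX512BW for the 16-bit source):
// SrcVT == v8i16 and DstVT == v16i8, i.e. an X86ISD::VTRUNC of the v8i16
// source with the upper bytes zeroed.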
10620
10621// Helper to create TRUNCATE/VTRUNC nodes, optionally with zero/undef upper
10622// element padding to the final DstVT.
10623static SDValue getAVX512TruncNode(const SDLoc &DL, MVT DstVT, SDValue Src,
10624 const X86Subtarget &Subtarget,
10625 SelectionDAG &DAG, bool ZeroUppers) {
10626 MVT SrcVT = Src.getSimpleValueType();
10627 MVT DstSVT = DstVT.getScalarType();
10628 unsigned NumDstElts = DstVT.getVectorNumElements();
10629 unsigned NumSrcElts = SrcVT.getVectorNumElements();
10630 unsigned DstEltSizeInBits = DstVT.getScalarSizeInBits();
10631
10632 if (!DAG.getTargetLoweringInfo().isTypeLegal(SrcVT))
10633 return SDValue();
10634
10635 // Perform a direct ISD::TRUNCATE if possible.
10636 if (NumSrcElts == NumDstElts)
10637 return DAG.getNode(ISD::TRUNCATE, DL, DstVT, Src);
10638
10639 if (NumSrcElts > NumDstElts) {
10640 MVT TruncVT = MVT::getVectorVT(DstSVT, NumSrcElts);
10641 SDValue Trunc = DAG.getNode(ISD::TRUNCATE, DL, TruncVT, Src);
10642 return extractSubVector(Trunc, 0, DAG, DL, DstVT.getSizeInBits());
10643 }
10644
10645 if ((NumSrcElts * DstEltSizeInBits) >= 128) {
10646 MVT TruncVT = MVT::getVectorVT(DstSVT, NumSrcElts);
10647 SDValue Trunc = DAG.getNode(ISD::TRUNCATE, DL, TruncVT, Src);
10648 return widenSubVector(Trunc, ZeroUppers, Subtarget, DAG, DL,
10649 DstVT.getSizeInBits());
10650 }
10651
10652 // Non-VLX targets must truncate from a 512-bit type, so we need to
10653 // widen, truncate and then possibly extract the original subvector.
10654 if (!Subtarget.hasVLX() && !SrcVT.is512BitVector()) {
10655 SDValue NewSrc = widenSubVector(Src, ZeroUppers, Subtarget, DAG, DL, 512);
10656 return getAVX512TruncNode(DL, DstVT, NewSrc, Subtarget, DAG, ZeroUppers);
10657 }
10658
10659 // Fallback to a X86ISD::VTRUNC, padding if necessary.
10660 MVT TruncVT = MVT::getVectorVT(DstSVT, 128 / DstEltSizeInBits);
10661 SDValue Trunc = DAG.getNode(X86ISD::VTRUNC, DL, TruncVT, Src);
10662 if (DstVT != TruncVT)
10663 Trunc = widenSubVector(Trunc, ZeroUppers, Subtarget, DAG, DL,
10664 DstVT.getSizeInBits());
10665 return Trunc;
10666}
10667
10668// Try to lower trunc+vector_shuffle to a vpmovdb or a vpmovdw instruction.
10669//
10670// An example is the following:
10671//
10672// t0: ch = EntryToken
10673// t2: v4i64,ch = CopyFromReg t0, Register:v4i64 %0
10674// t25: v4i32 = truncate t2
10675// t41: v8i16 = bitcast t25
10676// t21: v8i16 = BUILD_VECTOR undef:i16, undef:i16, undef:i16, undef:i16,
10677// Constant:i16<0>, Constant:i16<0>, Constant:i16<0>, Constant:i16<0>
10678// t51: v8i16 = vector_shuffle<0,2,4,6,12,13,14,15> t41, t21
10679// t18: v2i64 = bitcast t51
10680//
10681 // One can just use a single vpmovdw instruction; without avx512vl we need to
10682 // use the zmm variant and extract the lower subvector, padding with zeroes.
10683// TODO: Merge with lowerShuffleAsVTRUNC.
10684 static SDValue lowerShuffleWithVPMOV(const SDLoc &DL, MVT VT, SDValue V1,
10685 SDValue V2, ArrayRef<int> Mask,
10686 const APInt &Zeroable,
10687 const X86Subtarget &Subtarget,
10688 SelectionDAG &DAG) {
10689 assert((VT == MVT::v16i8 || VT == MVT::v8i16) && "Unexpected VTRUNC type");
10690 if (!Subtarget.hasAVX512())
10691 return SDValue();
10692
10693 unsigned NumElts = VT.getVectorNumElements();
10694 unsigned EltSizeInBits = VT.getScalarSizeInBits();
10695 unsigned MaxScale = 64 / EltSizeInBits;
10696 for (unsigned Scale = 2; Scale <= MaxScale; Scale += Scale) {
10697 unsigned SrcEltBits = EltSizeInBits * Scale;
10698 unsigned NumSrcElts = NumElts / Scale;
10699 unsigned UpperElts = NumElts - NumSrcElts;
10700 if (!isSequentialOrUndefInRange(Mask, 0, NumSrcElts, 0, Scale) ||
10701 !Zeroable.extractBits(UpperElts, NumSrcElts).isAllOnes())
10702 continue;
10703
10704 // Attempt to find a matching source truncation, but as a fall back VLX
10705 // cases can use the VPMOV directly.
10706 SDValue Src = peekThroughBitcasts(V1);
10707 if (Src.getOpcode() == ISD::TRUNCATE &&
10708 Src.getScalarValueSizeInBits() == SrcEltBits) {
10709 Src = Src.getOperand(0);
10710 } else if (Subtarget.hasVLX()) {
10711 MVT SrcSVT = MVT::getIntegerVT(SrcEltBits);
10712 MVT SrcVT = MVT::getVectorVT(SrcSVT, NumSrcElts);
10713 Src = DAG.getBitcast(SrcVT, Src);
10714 // Don't do this if PACKSS/PACKUS could perform it cheaper.
10715 if (Scale == 2 &&
10716 ((DAG.ComputeNumSignBits(Src) > EltSizeInBits) ||
10717 (DAG.computeKnownBits(Src).countMinLeadingZeros() >= EltSizeInBits)))
10718 return SDValue();
10719 } else
10720 return SDValue();
10721
10722 // VPMOVWB is only available with avx512bw.
10723 if (!Subtarget.hasBWI() && Src.getScalarValueSizeInBits() < 32)
10724 return SDValue();
10725
10726 bool UndefUppers = isUndefInRange(Mask, NumSrcElts, UpperElts);
10727 return getAVX512TruncNode(DL, VT, Src, Subtarget, DAG, !UndefUppers);
10728 }
10729
10730 return SDValue();
10731}
10732
10733// Attempt to match binary shuffle patterns as a truncate.
10734 static SDValue lowerShuffleAsVTRUNC(const SDLoc &DL, MVT VT, SDValue V1,
10735 SDValue V2, ArrayRef<int> Mask,
10736 const APInt &Zeroable,
10737 const X86Subtarget &Subtarget,
10738 SelectionDAG &DAG) {
10739 assert((VT.is128BitVector() || VT.is256BitVector()) &&
10740 "Unexpected VTRUNC type");
10741 if (!Subtarget.hasAVX512() ||
10742 (VT.is256BitVector() && !Subtarget.useAVX512Regs()))
10743 return SDValue();
10744
10745 unsigned NumElts = VT.getVectorNumElements();
10746 unsigned EltSizeInBits = VT.getScalarSizeInBits();
10747 unsigned MaxScale = 64 / EltSizeInBits;
10748 for (unsigned Scale = 2; Scale <= MaxScale; Scale += Scale) {
10749 // TODO: Support non-BWI VPMOVWB truncations?
10750 unsigned SrcEltBits = EltSizeInBits * Scale;
10751 if (SrcEltBits < 32 && !Subtarget.hasBWI())
10752 continue;
10753
10754 // Match shuffle <Ofs,Ofs+Scale,Ofs+2*Scale,..,undef_or_zero,undef_or_zero>
10755 // Bail if the V2 elements are undef.
10756 unsigned NumHalfSrcElts = NumElts / Scale;
10757 unsigned NumSrcElts = 2 * NumHalfSrcElts;
10758 for (unsigned Offset = 0; Offset != Scale; ++Offset) {
10759 if (!isSequentialOrUndefInRange(Mask, 0, NumSrcElts, Offset, Scale) ||
10760 isUndefInRange(Mask, NumHalfSrcElts, NumHalfSrcElts))
10761 continue;
10762
10763 // The elements beyond the truncation must be undef/zero.
10764 unsigned UpperElts = NumElts - NumSrcElts;
10765 if (UpperElts > 0 &&
10766 !Zeroable.extractBits(UpperElts, NumSrcElts).isAllOnes())
10767 continue;
10768 bool UndefUppers =
10769 UpperElts > 0 && isUndefInRange(Mask, NumSrcElts, UpperElts);
10770
10771 // As we're using both sources then we need to concat them together
10772 // and truncate from the double-sized src.
10773 MVT ConcatVT = VT.getDoubleNumVectorElementsVT();
10774
10775 // For offset truncations, ensure that the concat is cheap.
10776 SDValue Src =
10777 combineConcatVectorOps(DL, ConcatVT, {V1, V2}, DAG, Subtarget);
10778 if (!Src) {
10779 if (Offset)
10780 continue;
10781 Src = DAG.getNode(ISD::CONCAT_VECTORS, DL, ConcatVT, V1, V2);
10782 }
10783
10784 MVT SrcSVT = MVT::getIntegerVT(SrcEltBits);
10785 MVT SrcVT = MVT::getVectorVT(SrcSVT, NumSrcElts);
10786 Src = DAG.getBitcast(SrcVT, Src);
10787
10788 // Shift the offset'd elements into place for the truncation.
10789 // TODO: Use getTargetVShiftByConstNode.
10790 if (Offset)
10791 Src = DAG.getNode(
10792 X86ISD::VSRLI, DL, SrcVT, Src,
10793 DAG.getTargetConstant(Offset * EltSizeInBits, DL, MVT::i8));
10794
10795 return getAVX512TruncNode(DL, VT, Src, Subtarget, DAG, !UndefUppers);
10796 }
10797 }
10798
10799 return SDValue();
10800}
10801
10802/// Check whether a compaction lowering can be done by dropping even/odd
10803/// elements and compute how many times even/odd elements must be dropped.
10804///
10805/// This handles shuffles which take every Nth element where N is a power of
10806/// two. Example shuffle masks:
10807///
10808/// (even)
10809/// N = 1: 0, 2, 4, 6, 8, 10, 12, 14, 0, 2, 4, 6, 8, 10, 12, 14
10810/// N = 1: 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30
10811/// N = 2: 0, 4, 8, 12, 0, 4, 8, 12, 0, 4, 8, 12, 0, 4, 8, 12
10812/// N = 2: 0, 4, 8, 12, 16, 20, 24, 28, 0, 4, 8, 12, 16, 20, 24, 28
10813/// N = 3: 0, 8, 0, 8, 0, 8, 0, 8, 0, 8, 0, 8, 0, 8, 0, 8
10814/// N = 3: 0, 8, 16, 24, 0, 8, 16, 24, 0, 8, 16, 24, 0, 8, 16, 24
10815///
10816/// (odd)
10817/// N = 1: 1, 3, 5, 7, 9, 11, 13, 15, 0, 2, 4, 6, 8, 10, 12, 14
10818/// N = 1: 1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31
10819///
10820/// Any of these lanes can of course be undef.
10821///
10822/// This routine only supports N <= 3.
10823/// FIXME: Evaluate whether either AVX or AVX-512 have any opportunities here
10824/// for larger N.
10825///
10826/// \returns N above, or the number of times even/odd elements must be dropped
10827/// if there is such a number. Otherwise returns zero.
10828static int canLowerByDroppingElements(ArrayRef<int> Mask, bool MatchEven,
10829 bool IsSingleInput) {
10830 // The modulus for the shuffle vector entries is based on whether this is
10831 // a single input or not.
10832 int ShuffleModulus = Mask.size() * (IsSingleInput ? 1 : 2);
10833 assert(isPowerOf2_32((uint32_t)ShuffleModulus) &&
10834 "We should only be called with masks with a power-of-2 size!");
10835
10836 uint64_t ModMask = (uint64_t)ShuffleModulus - 1;
10837 int Offset = MatchEven ? 0 : 1;
10838
10839 // We track whether the input is viable for all power-of-2 strides 2^1, 2^2,
10840 // and 2^3 simultaneously. This is because we may have ambiguity with
10841 // partially undef inputs.
10842 bool ViableForN[3] = {true, true, true};
10843
10844 for (int i = 0, e = Mask.size(); i < e; ++i) {
10845 // Ignore undef lanes, we'll optimistically collapse them to the pattern we
10846 // want.
10847 if (Mask[i] < 0)
10848 continue;
10849
10850 bool IsAnyViable = false;
10851 for (unsigned j = 0; j != std::size(ViableForN); ++j)
10852 if (ViableForN[j]) {
10853 uint64_t N = j + 1;
10854
10855 // The shuffle mask must be equal to (i * 2^N) % M.
10856 if ((uint64_t)(Mask[i] - Offset) == (((uint64_t)i << N) & ModMask))
10857 IsAnyViable = true;
10858 else
10859 ViableForN[j] = false;
10860 }
10861 // Early exit if we exhaust the possible powers of two.
10862 if (!IsAnyViable)
10863 break;
10864 }
10865
10866 for (unsigned j = 0; j != std::size(ViableForN); ++j)
10867 if (ViableForN[j])
10868 return j + 1;
10869
10870 // Return 0 as there is no viable power of two.
10871 return 0;
10872}
10873
10874// X86 has dedicated pack instructions that can handle specific truncation
10875// operations: PACKSS and PACKUS.
10876// Checks for compaction shuffle masks if MaxStages > 1.
10877// TODO: Add support for matching multiple PACKSS/PACKUS stages.
10878static bool matchShuffleWithPACK(MVT VT, MVT &SrcVT, SDValue &V1, SDValue &V2,
10879 unsigned &PackOpcode, ArrayRef<int> TargetMask,
10880 const SelectionDAG &DAG,
10881 const X86Subtarget &Subtarget,
10882 unsigned MaxStages = 1) {
10883 unsigned NumElts = VT.getVectorNumElements();
10884 unsigned BitSize = VT.getScalarSizeInBits();
10885 assert(0 < MaxStages && MaxStages <= 3 && (BitSize << MaxStages) <= 64 &&
10886 "Illegal maximum compaction");
10887
10888 auto MatchPACK = [&](SDValue N1, SDValue N2, MVT PackVT) {
10889 unsigned NumSrcBits = PackVT.getScalarSizeInBits();
10890 unsigned NumPackedBits = NumSrcBits - BitSize;
10891 N1 = peekThroughBitcasts(N1);
10892 N2 = peekThroughBitcasts(N2);
10893 unsigned NumBits1 = N1.getScalarValueSizeInBits();
10894 unsigned NumBits2 = N2.getScalarValueSizeInBits();
10895 bool IsZero1 = llvm::isNullOrNullSplat(N1, /*AllowUndefs*/ false);
10896 bool IsZero2 = llvm::isNullOrNullSplat(N2, /*AllowUndefs*/ false);
10897 if ((!N1.isUndef() && !IsZero1 && NumBits1 != NumSrcBits) ||
10898 (!N2.isUndef() && !IsZero2 && NumBits2 != NumSrcBits))
10899 return false;
10900 if (Subtarget.hasSSE41() || BitSize == 8) {
10901 APInt ZeroMask = APInt::getHighBitsSet(NumSrcBits, NumPackedBits);
10902 if ((N1.isUndef() || IsZero1 || DAG.MaskedValueIsZero(N1, ZeroMask)) &&
10903 (N2.isUndef() || IsZero2 || DAG.MaskedValueIsZero(N2, ZeroMask))) {
10904 V1 = N1;
10905 V2 = N2;
10906 SrcVT = PackVT;
10907 PackOpcode = X86ISD::PACKUS;
10908 return true;
10909 }
10910 }
10911 bool IsAllOnes1 = llvm::isAllOnesOrAllOnesSplat(N1, /*AllowUndefs*/ false);
10912 bool IsAllOnes2 = llvm::isAllOnesOrAllOnesSplat(N2, /*AllowUndefs*/ false);
10913 if ((N1.isUndef() || IsZero1 || IsAllOnes1 ||
10914 DAG.ComputeNumSignBits(N1) > NumPackedBits) &&
10915 (N2.isUndef() || IsZero2 || IsAllOnes2 ||
10916 DAG.ComputeNumSignBits(N2) > NumPackedBits)) {
10917 V1 = N1;
10918 V2 = N2;
10919 SrcVT = PackVT;
10920 PackOpcode = X86ISD::PACKSS;
10921 return true;
10922 }
10923 return false;
10924 };
10925
10926 // Attempt to match against wider and wider compaction patterns.
10927 for (unsigned NumStages = 1; NumStages <= MaxStages; ++NumStages) {
10928 MVT PackSVT = MVT::getIntegerVT(BitSize << NumStages);
10929 MVT PackVT = MVT::getVectorVT(PackSVT, NumElts >> NumStages);
10930
10931 // Try binary shuffle.
10932 SmallVector<int, 32> BinaryMask;
10933 createPackShuffleMask(VT, BinaryMask, false, NumStages);
10934 if (isTargetShuffleEquivalent(VT, TargetMask, BinaryMask, DAG, V1, V2))
10935 if (MatchPACK(V1, V2, PackVT))
10936 return true;
10937
10938 // Try unary shuffle.
10939 SmallVector<int, 32> UnaryMask;
10940 createPackShuffleMask(VT, UnaryMask, true, NumStages);
10941 if (isTargetShuffleEquivalent(VT, TargetMask, UnaryMask, DAG, V1))
10942 if (MatchPACK(V1, V1, PackVT))
10943 return true;
10944 }
10945
10946 return false;
10947}
10948
10949 static SDValue lowerShuffleWithPACK(const SDLoc &DL, MVT VT, SDValue V1,
10950 SDValue V2, ArrayRef<int> Mask,
10951 const X86Subtarget &Subtarget,
10952 SelectionDAG &DAG) {
10953 MVT PackVT;
10954 unsigned PackOpcode;
10955 unsigned SizeBits = VT.getSizeInBits();
10956 unsigned EltBits = VT.getScalarSizeInBits();
10957 unsigned MaxStages = Log2_32(64 / EltBits);
10958 if (!matchShuffleWithPACK(VT, PackVT, V1, V2, PackOpcode, Mask, DAG,
10959 Subtarget, MaxStages))
10960 return SDValue();
10961
10962 unsigned CurrentEltBits = PackVT.getScalarSizeInBits();
10963 unsigned NumStages = Log2_32(CurrentEltBits / EltBits);
10964
10965 // Don't lower multi-stage packs on AVX512, truncation is better.
10966 if (NumStages != 1 && SizeBits == 128 && Subtarget.hasVLX())
10967 return SDValue();
10968
10969 // Pack to the largest type possible:
10970 // vXi64/vXi32 -> PACK*SDW and vXi16 -> PACK*SWB.
10971 unsigned MaxPackBits = 16;
10972 if (CurrentEltBits > 16 &&
10973 (PackOpcode == X86ISD::PACKSS || Subtarget.hasSSE41()))
10974 MaxPackBits = 32;
10975
10976 // Repeatedly pack down to the target size.
10977 SDValue Res;
10978 for (unsigned i = 0; i != NumStages; ++i) {
10979 unsigned SrcEltBits = std::min(MaxPackBits, CurrentEltBits);
10980 unsigned NumSrcElts = SizeBits / SrcEltBits;
10981 MVT SrcSVT = MVT::getIntegerVT(SrcEltBits);
10982 MVT DstSVT = MVT::getIntegerVT(SrcEltBits / 2);
10983 MVT SrcVT = MVT::getVectorVT(SrcSVT, NumSrcElts);
10984 MVT DstVT = MVT::getVectorVT(DstSVT, NumSrcElts * 2);
10985 Res = DAG.getNode(PackOpcode, DL, DstVT, DAG.getBitcast(SrcVT, V1),
10986 DAG.getBitcast(SrcVT, V2));
10987 V1 = V2 = Res;
10988 CurrentEltBits /= 2;
10989 }
10990 assert(Res && Res.getValueType() == VT &&
10991 "Failed to lower compaction shuffle");
10992 return Res;
10993}
10994
10995/// Try to emit a bitmask instruction for a shuffle.
10996///
10997/// This handles cases where we can model a blend exactly as a bitmask due to
10998/// one of the inputs being zeroable.
10999 static SDValue lowerShuffleAsBitMask(const SDLoc &DL, MVT VT, SDValue V1,
11000 SDValue V2, ArrayRef<int> Mask,
11001 const APInt &Zeroable,
11002 const X86Subtarget &Subtarget,
11003 SelectionDAG &DAG) {
11004 MVT MaskVT = VT;
11005 MVT EltVT = VT.getVectorElementType();
11006 SDValue Zero, AllOnes;
11007 // Use f64 if i64 isn't legal.
11008 if (EltVT == MVT::i64 && !Subtarget.is64Bit()) {
11009 EltVT = MVT::f64;
11010 MaskVT = MVT::getVectorVT(EltVT, Mask.size());
11011 }
11012
11013 MVT LogicVT = VT;
11014 if (EltVT.isFloatingPoint()) {
11015 Zero = DAG.getConstantFP(0.0, DL, EltVT);
11016 APFloat AllOnesValue = APFloat::getAllOnesValue(EltVT.getFltSemantics());
11017 AllOnes = DAG.getConstantFP(AllOnesValue, DL, EltVT);
11018 LogicVT = MVT::getVectorVT(EltVT.changeTypeToInteger(), Mask.size());
11019 } else {
11020 Zero = DAG.getConstant(0, DL, EltVT);
11021 AllOnes = DAG.getAllOnesConstant(DL, EltVT);
11022 }
11023
11024 SmallVector<SDValue, 16> VMaskOps(Mask.size(), Zero);
11025 SDValue V;
11026 for (int i = 0, Size = Mask.size(); i < Size; ++i) {
11027 if (Zeroable[i])
11028 continue;
11029 if (Mask[i] % Size != i)
11030 return SDValue(); // Not a blend.
11031 if (!V)
11032 V = Mask[i] < Size ? V1 : V2;
11033 else if (V != (Mask[i] < Size ? V1 : V2))
11034 return SDValue(); // Can only let one input through the mask.
11035
11036 VMaskOps[i] = AllOnes;
11037 }
11038 if (!V)
11039 return SDValue(); // No non-zeroable elements!
11040
11041 SDValue VMask = DAG.getBuildVector(MaskVT, DL, VMaskOps);
11042 VMask = DAG.getBitcast(LogicVT, VMask);
11043 V = DAG.getBitcast(LogicVT, V);
11044 SDValue And = DAG.getNode(ISD::AND, DL, LogicVT, V, VMask);
11045 return DAG.getBitcast(VT, And);
11046}
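// For example, a v4i32 shuffle with mask <0, u, 2, u> where elements 1 and 3
// are zeroable lowers to an AND of V1 with the constant <-1, 0, -1, 0>.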
11047
11048/// Try to emit a blend instruction for a shuffle using bit math.
11049///
11050/// This is used as a fallback approach when first class blend instructions are
11051/// unavailable. Currently it is only suitable for integer vectors, but could
11052/// be generalized for floating point vectors if desirable.
11053 static SDValue lowerShuffleAsBitBlend(const SDLoc &DL, MVT VT, SDValue V1,
11054 SDValue V2, ArrayRef<int> Mask,
11055 SelectionDAG &DAG) {
11056 assert(VT.isInteger() && "Only supports integer vector types!");
11057 MVT EltVT = VT.getVectorElementType();
11058 SDValue Zero = DAG.getConstant(0, DL, EltVT);
11059 SDValue AllOnes = DAG.getAllOnesConstant(DL, EltVT);
11060 SmallVector<SDValue, 16> MaskOps;
11061 for (int i = 0, Size = Mask.size(); i < Size; ++i) {
11062 if (Mask[i] >= 0 && Mask[i] != i && Mask[i] != i + Size)
11063 return SDValue(); // Shuffled input!
11064 MaskOps.push_back(Mask[i] < Size ? AllOnes : Zero);
11065 }
11066
11067 SDValue V1Mask = DAG.getBuildVector(VT, DL, MaskOps);
11068 return getBitSelect(DL, VT, V1, V2, V1Mask, DAG);
11069}
11070
11071 static SDValue getVectorMaskingNode(SDValue Op, SDValue Mask,
11072 SDValue PreservedSrc,
11073 const X86Subtarget &Subtarget,
11074 SelectionDAG &DAG);
11075
11076 static bool matchShuffleAsBlend(MVT VT, SDValue V1, SDValue V2,
11077 MutableArrayRef<int> Mask,
11078 const APInt &Zeroable, bool &ForceV1Zero,
11079 bool &ForceV2Zero, uint64_t &BlendMask) {
11080 bool V1IsZeroOrUndef =
11081 V1.isUndef() || ISD::isBuildVectorAllZeros(V1.getNode());
11082 bool V2IsZeroOrUndef =
11083 V2.isUndef() || ISD::isBuildVectorAllZeros(V2.getNode());
11084
11085 BlendMask = 0;
11086 ForceV1Zero = false, ForceV2Zero = false;
11087 assert(Mask.size() <= 64 && "Shuffle mask too big for blend mask");
11088
11089 int NumElts = Mask.size();
11090 int NumLanes = VT.getSizeInBits() / 128;
11091 int NumEltsPerLane = NumElts / NumLanes;
11092 assert((NumLanes * NumEltsPerLane) == NumElts && "Value type mismatch");
11093
11094 // For 32/64-bit elements, if we only reference one input (plus any undefs),
11095 // then ensure the blend mask part for that lane just references that input.
11096 bool ForceWholeLaneMasks =
11097 VT.is256BitVector() && VT.getScalarSizeInBits() >= 32;
11098
11099 // Attempt to generate the binary blend mask. If an input is zero then
11100 // we can use any lane.
11101 for (int Lane = 0; Lane != NumLanes; ++Lane) {
11102 // Keep track of the inputs used per lane.
11103 bool LaneV1InUse = false;
11104 bool LaneV2InUse = false;
11105 uint64_t LaneBlendMask = 0;
11106 for (int LaneElt = 0; LaneElt != NumEltsPerLane; ++LaneElt) {
11107 int Elt = (Lane * NumEltsPerLane) + LaneElt;
11108 int M = Mask[Elt];
11109 if (M == SM_SentinelUndef)
11110 continue;
11111 if (M == Elt || (0 <= M && M < NumElts &&
11112 IsElementEquivalent(NumElts, V1, V1, M, Elt))) {
11113 Mask[Elt] = Elt;
11114 LaneV1InUse = true;
11115 continue;
11116 }
11117 if (M == (Elt + NumElts) ||
11118 (NumElts <= M &&
11119 IsElementEquivalent(NumElts, V2, V2, M - NumElts, Elt))) {
11120 LaneBlendMask |= 1ull << LaneElt;
11121 Mask[Elt] = Elt + NumElts;
11122 LaneV2InUse = true;
11123 continue;
11124 }
11125 if (Zeroable[Elt]) {
11126 if (V1IsZeroOrUndef) {
11127 ForceV1Zero = true;
11128 Mask[Elt] = Elt;
11129 LaneV1InUse = true;
11130 continue;
11131 }
11132 if (V2IsZeroOrUndef) {
11133 ForceV2Zero = true;
11134 LaneBlendMask |= 1ull << LaneElt;
11135 Mask[Elt] = Elt + NumElts;
11136 LaneV2InUse = true;
11137 continue;
11138 }
11139 }
11140 return false;
11141 }
11142
11143 // If we only used V2 then splat the lane blend mask to avoid any demanded
11144 // elts from V1 in this lane (the V1 equivalent is implicit with a zero
11145 // blend mask bit).
11146 if (ForceWholeLaneMasks && LaneV2InUse && !LaneV1InUse)
11147 LaneBlendMask = (1ull << NumEltsPerLane) - 1;
11148
11149 BlendMask |= LaneBlendMask << (Lane * NumEltsPerLane);
11150 }
11151 return true;
11152}
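// For example, for v8i16 the mask <0, 9, 2, 11, 4, 13, 6, 15> is a blend
// taking the even elements from V1 and the odd elements from V2, producing
// BlendMask == 0b10101010.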
11153
11154/// Try to emit a blend instruction for a shuffle.
11155///
11156/// This doesn't do any checks for the availability of instructions for blending
11157/// these values. It relies on the availability of the X86ISD::BLENDI pattern to
11158/// be matched in the backend with the type given. What it does check for is
11159/// that the shuffle mask is a blend, or convertible into a blend with zero.
11160 static SDValue lowerShuffleAsBlend(const SDLoc &DL, MVT VT, SDValue V1,
11161 SDValue V2, ArrayRef<int> Original,
11162 const APInt &Zeroable,
11163 const X86Subtarget &Subtarget,
11164 SelectionDAG &DAG) {
11165 uint64_t BlendMask = 0;
11166 bool ForceV1Zero = false, ForceV2Zero = false;
11167 SmallVector<int, 64> Mask(Original);
11168 if (!matchShuffleAsBlend(VT, V1, V2, Mask, Zeroable, ForceV1Zero, ForceV2Zero,
11169 BlendMask))
11170 return SDValue();
11171
11172 // Create a REAL zero vector - ISD::isBuildVectorAllZeros allows UNDEFs.
11173 if (ForceV1Zero)
11174 V1 = getZeroVector(VT, Subtarget, DAG, DL);
11175 if (ForceV2Zero)
11176 V2 = getZeroVector(VT, Subtarget, DAG, DL);
11177
11178 unsigned NumElts = VT.getVectorNumElements();
11179
11180 switch (VT.SimpleTy) {
11181 case MVT::v4i64:
11182 case MVT::v8i32:
11183 assert(Subtarget.hasAVX2() && "256-bit integer blends require AVX2!");
11184 [[fallthrough]];
11185 case MVT::v4f64:
11186 case MVT::v8f32:
11187 assert(Subtarget.hasAVX() && "256-bit float blends require AVX!");
11188 [[fallthrough]];
11189 case MVT::v2f64:
11190 case MVT::v2i64:
11191 case MVT::v4f32:
11192 case MVT::v4i32:
11193 case MVT::v8i16:
11194 assert(Subtarget.hasSSE41() && "128-bit blends require SSE41!");
11195 return DAG.getNode(X86ISD::BLENDI, DL, VT, V1, V2,
11196 DAG.getTargetConstant(BlendMask, DL, MVT::i8));
11197 case MVT::v16i16: {
11198 assert(Subtarget.hasAVX2() && "v16i16 blends require AVX2!");
11199 SmallVector<int, 8> RepeatedMask;
11200 if (is128BitLaneRepeatedShuffleMask(MVT::v16i16, Mask, RepeatedMask)) {
11201 // We can lower these with PBLENDW which is mirrored across 128-bit lanes.
11202 assert(RepeatedMask.size() == 8 && "Repeated mask size doesn't match!");
11203 BlendMask = 0;
11204 for (int i = 0; i < 8; ++i)
11205 if (RepeatedMask[i] >= 8)
11206 BlendMask |= 1ull << i;
11207 return DAG.getNode(X86ISD::BLENDI, DL, MVT::v16i16, V1, V2,
11208 DAG.getTargetConstant(BlendMask, DL, MVT::i8));
11209 }
11210 // Use PBLENDW for lower/upper lanes and then blend lanes.
11211 // TODO - we should allow 2 PBLENDW here and leave shuffle combine to
11212 // merge to VSELECT where useful.
11213 uint64_t LoMask = BlendMask & 0xFF;
11214 uint64_t HiMask = (BlendMask >> 8) & 0xFF;
11215 if (LoMask == 0 || LoMask == 255 || HiMask == 0 || HiMask == 255) {
11216 SDValue Lo = DAG.getNode(X86ISD::BLENDI, DL, MVT::v16i16, V1, V2,
11217 DAG.getTargetConstant(LoMask, DL, MVT::i8));
11218 SDValue Hi = DAG.getNode(X86ISD::BLENDI, DL, MVT::v16i16, V1, V2,
11219 DAG.getTargetConstant(HiMask, DL, MVT::i8));
11220 return DAG.getVectorShuffle(
11221 MVT::v16i16, DL, Lo, Hi,
11222 {0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31});
11223 }
11224 [[fallthrough]];
11225 }
11226 case MVT::v32i8:
11227 assert(Subtarget.hasAVX2() && "256-bit byte-blends require AVX2!");
11228 [[fallthrough]];
11229 case MVT::v16i8: {
11230 assert(Subtarget.hasSSE41() && "128-bit byte-blends require SSE41!");
11231
11232 // Attempt to lower to a bitmask if we can. VPAND is faster than VPBLENDVB.
11233 if (SDValue Masked = lowerShuffleAsBitMask(DL, VT, V1, V2, Mask, Zeroable,
11234 Subtarget, DAG))
11235 return Masked;
11236
11237 if (Subtarget.hasBWI() && Subtarget.hasVLX()) {
11238 MVT IntegerType = MVT::getIntegerVT(std::max<unsigned>(NumElts, 8));
11239 SDValue MaskNode = DAG.getConstant(BlendMask, DL, IntegerType);
11240 return getVectorMaskingNode(V2, MaskNode, V1, Subtarget, DAG);
11241 }
11242
11243 // If we have VPTERNLOG, we can use that as a bit blend.
11244 if (Subtarget.hasVLX())
11245 if (SDValue BitBlend =
11246 lowerShuffleAsBitBlend(DL, VT, V1, V2, Mask, DAG))
11247 return BitBlend;
11248
11249 // Scale the blend by the number of bytes per element.
11250 int Scale = VT.getScalarSizeInBits() / 8;
11251
11252 // This form of blend is always done on bytes. Compute the byte vector
11253 // type.
11254 MVT BlendVT = MVT::getVectorVT(MVT::i8, VT.getSizeInBits() / 8);
11255
11256 // x86 allows load folding with blendvb from the 2nd source operand. But
11257 // we are still using LLVM select here (see comment below), so that's V1.
11258 // If V2 can be load-folded and V1 cannot be load-folded, then commute to
11259 // allow that load-folding possibility.
11260 if (!ISD::isNormalLoad(V1.getNode()) && ISD::isNormalLoad(V2.getNode())) {
11261 ShuffleVectorSDNode::commuteMask(Mask);
11262 std::swap(V1, V2);
11263 }
11264
11265 // Compute the VSELECT mask. Note that VSELECT is really confusing in the
11266 // mix of LLVM's code generator and the x86 backend. We tell the code
11267 // generator that boolean values in the elements of an x86 vector register
11268 // are -1 for true and 0 for false. We then use the LLVM semantics of 'true'
11269 // mapping a select to operand #1, and 'false' mapping to operand #2. The
11270 // reality in x86 is that vector masks (pre-AVX-512) use only the high bit
11271 // of the element (the remaining are ignored) and 0 in that high bit would
11272 // mean operand #1 while 1 in the high bit would mean operand #2. So while
11273 // the LLVM model for boolean values in vector elements gets the relevant
11274 // bit set, it is set backwards and over constrained relative to x86's
11275 // actual model.
11276 SmallVector<SDValue, 32> VSELECTMask;
11277 for (int i = 0, Size = Mask.size(); i < Size; ++i)
11278 for (int j = 0; j < Scale; ++j)
11279 VSELECTMask.push_back(
11280 Mask[i] < 0
11281 ? DAG.getUNDEF(MVT::i8)
11282 : DAG.getSignedConstant(Mask[i] < Size ? -1 : 0, DL, MVT::i8));
11283
11284 V1 = DAG.getBitcast(BlendVT, V1);
11285 V2 = DAG.getBitcast(BlendVT, V2);
11286 return DAG.getBitcast(
11287 VT,
11288 DAG.getSelect(DL, BlendVT, DAG.getBuildVector(BlendVT, DL, VSELECTMask),
11289 V1, V2));
11290 }
11291 case MVT::v16f32:
11292 case MVT::v8f64:
11293 case MVT::v8i64:
11294 case MVT::v16i32:
11295 case MVT::v32i16:
11296 case MVT::v64i8: {
11297 // Attempt to lower to a bitmask if we can. Only if not optimizing for size.
11298 bool OptForSize = DAG.shouldOptForSize();
11299 if (!OptForSize) {
11300 if (SDValue Masked = lowerShuffleAsBitMask(DL, VT, V1, V2, Mask, Zeroable,
11301 Subtarget, DAG))
11302 return Masked;
11303 }
11304
11305 // Otherwise load an immediate into a GPR, cast to k-register, and use a
11306 // masked move.
11307 MVT IntegerType = MVT::getIntegerVT(std::max<unsigned>(NumElts, 8));
11308 SDValue MaskNode = DAG.getConstant(BlendMask, DL, IntegerType);
11309 return getVectorMaskingNode(V2, MaskNode, V1, Subtarget, DAG);
11310 }
11311 default:
11312 llvm_unreachable("Not a supported integer vector type!");
11313 }
11314}
11315
11316/// Try to lower as a blend of elements from two inputs followed by
11317/// a single-input permutation.
11318///
11319/// This matches the pattern where we can blend elements from two inputs and
11320/// then reduce the shuffle to a single-input permutation.
11321 static SDValue lowerShuffleAsBlendAndPermute(const SDLoc &DL, MVT VT,
11322 SDValue V1, SDValue V2,
11323 ArrayRef<int> Mask,
11324 SelectionDAG &DAG,
11325 bool ImmBlends = false) {
11326 // We build up the blend mask while checking whether a blend is a viable way
11327 // to reduce the shuffle.
11328 SmallVector<int, 32> BlendMask(Mask.size(), -1);
11329 SmallVector<int, 32> PermuteMask(Mask.size(), -1);
11330
11331 for (int i = 0, Size = Mask.size(); i < Size; ++i) {
11332 if (Mask[i] < 0)
11333 continue;
11334
11335 assert(Mask[i] < Size * 2 && "Shuffle input is out of bounds.");
11336
11337 if (BlendMask[Mask[i] % Size] < 0)
11338 BlendMask[Mask[i] % Size] = Mask[i];
11339 else if (BlendMask[Mask[i] % Size] != Mask[i])
11340 return SDValue(); // Can't blend in the needed input!
11341
11342 PermuteMask[i] = Mask[i] % Size;
11343 }
11344
11345 // If only immediate blends, then bail if the blend mask can't be widened to
11346 // i16.
11347 unsigned EltSize = VT.getScalarSizeInBits();
11348 if (ImmBlends && EltSize == 8 && !canWidenShuffleElements(BlendMask))
11349 return SDValue();
11350
11351 SDValue V = DAG.getVectorShuffle(VT, DL, V1, V2, BlendMask);
11352 return DAG.getVectorShuffle(VT, DL, V, DAG.getUNDEF(VT), PermuteMask);
11353}
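// For example, the v4i32 mask <6, 1, 4, 3> is first blended as <4, 1, 6, 3>
// (taking elements 4 and 6 from V2) and then permuted with <2, 1, 0, 3> to
// put them into their requested positions.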
11354
11355/// Try to lower as an unpack of elements from two inputs followed by
11356/// a single-input permutation.
11357///
11358/// This matches the pattern where we can unpack elements from two inputs and
11359/// then reduce the shuffle to a single-input (wider) permutation.
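/// For example (illustrative v4i32 mask): <1, 5, 0, 4> becomes
/// UNPCKL(V1, V2) = <V1[0], V2[0], V1[1], V2[1]> followed by the single-input
/// permutation <2, 3, 0, 1>.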
11360 static SDValue lowerShuffleAsUNPCKAndPermute(const SDLoc &DL, MVT VT,
11361 SDValue V1, SDValue V2,
11362 ArrayRef<int> Mask,
11363 SelectionDAG &DAG) {
11364 int NumElts = Mask.size();
11365 int NumLanes = VT.getSizeInBits() / 128;
11366 int NumLaneElts = NumElts / NumLanes;
11367 int NumHalfLaneElts = NumLaneElts / 2;
11368
11369 bool MatchLo = true, MatchHi = true;
11370 SDValue Ops[2] = {DAG.getUNDEF(VT), DAG.getUNDEF(VT)};
11371
11372 // Determine UNPCKL/UNPCKH type and operand order.
11373 for (int Elt = 0; Elt != NumElts; ++Elt) {
11374 int M = Mask[Elt];
11375 if (M < 0)
11376 continue;
11377
11378 // Normalize the mask value depending on whether it's V1 or V2.
11379 int NormM = M;
11380 SDValue &Op = Ops[Elt & 1];
11381 if (M < NumElts && (Op.isUndef() || Op == V1))
11382 Op = V1;
11383 else if (NumElts <= M && (Op.isUndef() || Op == V2)) {
11384 Op = V2;
11385 NormM -= NumElts;
11386 } else
11387 return SDValue();
11388
11389 bool MatchLoAnyLane = false, MatchHiAnyLane = false;
11390 for (int Lane = 0; Lane != NumElts; Lane += NumLaneElts) {
11391 int Lo = Lane, Mid = Lane + NumHalfLaneElts, Hi = Lane + NumLaneElts;
11392 MatchLoAnyLane |= isUndefOrInRange(NormM, Lo, Mid);
11393 MatchHiAnyLane |= isUndefOrInRange(NormM, Mid, Hi);
11394 if (MatchLoAnyLane || MatchHiAnyLane) {
11395 assert((MatchLoAnyLane ^ MatchHiAnyLane) &&
11396 "Failed to match UNPCKLO/UNPCKHI");
11397 break;
11398 }
11399 }
11400 MatchLo &= MatchLoAnyLane;
11401 MatchHi &= MatchHiAnyLane;
11402 if (!MatchLo && !MatchHi)
11403 return SDValue();
11404 }
11405 assert((MatchLo ^ MatchHi) && "Failed to match UNPCKLO/UNPCKHI");
11406
11407 // Element indices have changed after unpacking. Calculate a permute mask
11408 // so that they are put back in the positions dictated by the original
11409 // shuffle mask indices.
11410 SmallVector<int, 32> PermuteMask(NumElts, -1);
11411 for (int Elt = 0; Elt != NumElts; ++Elt) {
11412 int M = Mask[Elt];
11413 if (M < 0)
11414 continue;
11415 int NormM = M;
11416 if (NumElts <= M)
11417 NormM -= NumElts;
11418 bool IsFirstOp = M < NumElts;
11419 int BaseMaskElt =
11420 NumLaneElts * (NormM / NumLaneElts) + (2 * (NormM % NumHalfLaneElts));
11421 if ((IsFirstOp && V1 == Ops[0]) || (!IsFirstOp && V2 == Ops[0]))
11422 PermuteMask[Elt] = BaseMaskElt;
11423 else if ((IsFirstOp && V1 == Ops[1]) || (!IsFirstOp && V2 == Ops[1]))
11424 PermuteMask[Elt] = BaseMaskElt + 1;
11425 assert(PermuteMask[Elt] != -1 &&
11426 "Input mask element is defined but failed to assign permute mask");
11427 }
11428
11429 unsigned UnpckOp = MatchLo ? X86ISD::UNPCKL : X86ISD::UNPCKH;
11430 SDValue Unpck = DAG.getNode(UnpckOp, DL, VT, Ops);
11431 return DAG.getVectorShuffle(VT, DL, Unpck, DAG.getUNDEF(VT), PermuteMask);
11432}
11433
11434/// Try to lower a shuffle as a permute of the inputs followed by an
11435/// UNPCK instruction.
11436///
11437/// This specifically targets cases where we end up with alternating between
11438/// the two inputs, and so can permute them into something that feeds a single
11439/// UNPCK instruction. Note that this routine only targets integer vectors
11440/// because for floating point vectors we have a generalized SHUFPS lowering
11441/// strategy that handles everything that doesn't *exactly* match an unpack,
11442/// making this clever lowering unnecessary.
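/// For example (illustrative v8i16 mask): <1, 8, 0, 9, 3, 10, 2, 11> is
/// handled by pre-shuffling V1 with <1, 0, 3, 2, -1, -1, -1, -1> (V2 already
/// feeds the odd slots in order) and then emitting a single UNPCKL of the two
/// inputs.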
11443 static SDValue lowerShuffleAsPermuteAndUnpack(const SDLoc &DL, MVT VT,
11444 SDValue V1, SDValue V2,
11445 ArrayRef<int> Mask,
11446 const X86Subtarget &Subtarget,
11447 SelectionDAG &DAG) {
11448 int Size = Mask.size();
11449 assert(Mask.size() >= 2 && "Single element masks are invalid.");
11450
11451 // This routine only supports 128-bit integer dual input vectors.
11452 if (VT.isFloatingPoint() || !VT.is128BitVector() || V2.isUndef())
11453 return SDValue();
11454
11455 int NumLoInputs =
11456 count_if(Mask, [Size](int M) { return M >= 0 && M % Size < Size / 2; });
11457 int NumHiInputs =
11458 count_if(Mask, [Size](int M) { return M % Size >= Size / 2; });
11459
11460 bool UnpackLo = NumLoInputs >= NumHiInputs;
11461
11462 auto TryUnpack = [&](int ScalarSize, int Scale) {
11463 SmallVector<int, 16> V1Mask((unsigned)Size, -1);
11464 SmallVector<int, 16> V2Mask((unsigned)Size, -1);
11465
11466 for (int i = 0; i < Size; ++i) {
11467 if (Mask[i] < 0)
11468 continue;
11469
11470 // Each element of the unpack contains Scale elements from this mask.
11471 int UnpackIdx = i / Scale;
11472
11473 // We only handle the case where V1 feeds the first slots of the unpack.
11474 // We rely on canonicalization to ensure this is the case.
11475 if ((UnpackIdx % 2 == 0) != (Mask[i] < Size))
11476 return SDValue();
11477
11478 // Setup the mask for this input. The indexing is tricky as we have to
11479 // handle the unpack stride.
11480 SmallVectorImpl<int> &VMask = (UnpackIdx % 2 == 0) ? V1Mask : V2Mask;
11481 VMask[(UnpackIdx / 2) * Scale + i % Scale + (UnpackLo ? 0 : Size / 2)] =
11482 Mask[i] % Size;
11483 }
11484
11485 // If we will have to shuffle both inputs to use the unpack, check whether
11486 // we can just unpack first and shuffle the result. If so, skip this unpack.
11487 if ((NumLoInputs == 0 || NumHiInputs == 0) && !isNoopShuffleMask(V1Mask) &&
11488 !isNoopShuffleMask(V2Mask))
11489 return SDValue();
11490
11491 // Shuffle the inputs into place.
11492 V1 = DAG.getVectorShuffle(VT, DL, V1, DAG.getUNDEF(VT), V1Mask);
11493 V2 = DAG.getVectorShuffle(VT, DL, V2, DAG.getUNDEF(VT), V2Mask);
11494
11495 // Cast the inputs to the type we will use to unpack them.
11496 MVT UnpackVT =
11497 MVT::getVectorVT(MVT::getIntegerVT(ScalarSize), Size / Scale);
11498 V1 = DAG.getBitcast(UnpackVT, V1);
11499 V2 = DAG.getBitcast(UnpackVT, V2);
11500
11501 // Unpack the inputs and cast the result back to the desired type.
11502 return DAG.getBitcast(
11503 VT, DAG.getNode(UnpackLo ? X86ISD::UNPCKL : X86ISD::UNPCKH, DL,
11504 UnpackVT, V1, V2));
11505 };
11506
11507 // We try each unpack from the largest to the smallest to try and find one
11508 // that fits this mask.
11509 int OrigScalarSize = VT.getScalarSizeInBits();
11510 for (int ScalarSize = 64; ScalarSize >= OrigScalarSize; ScalarSize /= 2)
11511 if (SDValue Unpack = TryUnpack(ScalarSize, ScalarSize / OrigScalarSize))
11512 return Unpack;
11513
11514 // If we're shuffling with a zero vector then we're better off not doing
11515 // VECTOR_SHUFFLE(UNPCK()) as we lose track of those zero elements.
11516 if (ISD::isBuildVectorAllZeros(V1.getNode()) ||
11517 ISD::isBuildVectorAllZeros(V2.getNode()))
11518 return SDValue();
11519
11520 // If none of the unpack-rooted lowerings worked (or were profitable) try an
11521 // initial unpack.
11522 if (NumLoInputs == 0 || NumHiInputs == 0) {
11523 assert((NumLoInputs > 0 || NumHiInputs > 0) &&
11524 "We have to have *some* inputs!");
11525 int HalfOffset = NumLoInputs == 0 ? Size / 2 : 0;
11526
11527 // FIXME: We could consider the total complexity of the permute of each
11528 // possible unpacking. Or at the least we should consider how many
11529 // half-crossings are created.
11530 // FIXME: We could consider commuting the unpacks.
11531
11532 SmallVector<int, 32> PermMask((unsigned)Size, -1);
11533 for (int i = 0; i < Size; ++i) {
11534 if (Mask[i] < 0)
11535 continue;
11536
11537 assert(Mask[i] % Size >= HalfOffset && "Found input from wrong half!");
11538
11539 PermMask[i] =
11540 2 * ((Mask[i] % Size) - HalfOffset) + (Mask[i] < Size ? 0 : 1);
11541 }
11542 return DAG.getVectorShuffle(
11543 VT, DL,
11544 DAG.getNode(NumLoInputs == 0 ? X86ISD::UNPCKH : X86ISD::UNPCKL, DL, VT,
11545 V1, V2),
11546 DAG.getUNDEF(VT), PermMask);
11547 }
11548
11549 return SDValue();
11550}
11551
11552/// Helper to form a PALIGNR-based rotate+permute, merging 2 inputs and then
11553/// permuting the elements of the result in place.
11554 static SDValue lowerShuffleAsByteRotateAndPermute(
11555 const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
11556 const X86Subtarget &Subtarget, SelectionDAG &DAG) {
11557 if ((VT.is128BitVector() && !Subtarget.hasSSSE3()) ||
11558 (VT.is256BitVector() && !Subtarget.hasAVX2()) ||
11559 (VT.is512BitVector() && !Subtarget.hasBWI()))
11560 return SDValue();
11561
11562 // We don't currently support lane crossing permutes.
11563 if (is128BitLaneCrossingShuffleMask(VT, Mask))
11564 return SDValue();
11565
11566 int Scale = VT.getScalarSizeInBits() / 8;
11567 int NumLanes = VT.getSizeInBits() / 128;
11568 int NumElts = VT.getVectorNumElements();
11569 int NumEltsPerLane = NumElts / NumLanes;
11570
11571 // Determine range of mask elts.
11572 bool Blend1 = true;
11573 bool Blend2 = true;
11574 std::pair<int, int> Range1 = std::make_pair(INT_MAX, INT_MIN);
11575 std::pair<int, int> Range2 = std::make_pair(INT_MAX, INT_MIN);
11576 for (int Lane = 0; Lane != NumElts; Lane += NumEltsPerLane) {
11577 for (int Elt = 0; Elt != NumEltsPerLane; ++Elt) {
11578 int M = Mask[Lane + Elt];
11579 if (M < 0)
11580 continue;
11581 if (M < NumElts) {
11582 Blend1 &= (M == (Lane + Elt));
11583 assert(Lane <= M && M < (Lane + NumEltsPerLane) && "Out of range mask");
11584 M = M % NumEltsPerLane;
11585 Range1.first = std::min(Range1.first, M);
11586 Range1.second = std::max(Range1.second, M);
11587 } else {
11588 M -= NumElts;
11589 Blend2 &= (M == (Lane + Elt));
11590 assert(Lane <= M && M < (Lane + NumEltsPerLane) && "Out of range mask");
11591 M = M % NumEltsPerLane;
11592 Range2.first = std::min(Range2.first, M);
11593 Range2.second = std::max(Range2.second, M);
11594 }
11595 }
11596 }
11597
11598 // Bail if we don't need both elements.
11599 // TODO - it might be worth doing this for unary shuffles if the permute
11600 // can be widened.
11601 if (!(0 <= Range1.first && Range1.second < NumEltsPerLane) ||
11602 !(0 <= Range2.first && Range2.second < NumEltsPerLane))
11603 return SDValue();
11604
11605 if (VT.getSizeInBits() > 128 && (Blend1 || Blend2))
11606 return SDValue();
11607
11608 // Rotate the 2 ops so we can access both ranges, then permute the result.
11609 auto RotateAndPermute = [&](SDValue Lo, SDValue Hi, int RotAmt, int Ofs) {
11610 MVT ByteVT = MVT::getVectorVT(MVT::i8, VT.getSizeInBits() / 8);
11611 SDValue Rotate = DAG.getBitcast(
11612 VT, DAG.getNode(X86ISD::PALIGNR, DL, ByteVT, DAG.getBitcast(ByteVT, Hi),
11613 DAG.getBitcast(ByteVT, Lo),
11614 DAG.getTargetConstant(Scale * RotAmt, DL, MVT::i8)));
11615 SmallVector<int, 64> PermMask(NumElts, SM_SentinelUndef);
11616 for (int Lane = 0; Lane != NumElts; Lane += NumEltsPerLane) {
11617 for (int Elt = 0; Elt != NumEltsPerLane; ++Elt) {
11618 int M = Mask[Lane + Elt];
11619 if (M < 0)
11620 continue;
11621 if (M < NumElts)
11622 PermMask[Lane + Elt] = Lane + ((M + Ofs - RotAmt) % NumEltsPerLane);
11623 else
11624 PermMask[Lane + Elt] = Lane + ((M - Ofs - RotAmt) % NumEltsPerLane);
11625 }
11626 }
11627 return DAG.getVectorShuffle(VT, DL, Rotate, DAG.getUNDEF(VT), PermMask);
11628 };
11629
11630 // Check if the ranges are small enough to rotate from either direction.
11631 if (Range2.second < Range1.first)
11632 return RotateAndPermute(V1, V2, Range1.first, 0);
11633 if (Range1.second < Range2.first)
11634 return RotateAndPermute(V2, V1, Range2.first, NumElts);
11635 return SDValue();
11636}
11637
11638 static bool isBroadcastShuffleMask(ArrayRef<int> Mask) {
11639 return isUndefOrEqual(Mask, 0);
11640}
11641
11642 static bool isNoopOrBroadcastShuffleMask(ArrayRef<int> Mask) {
11643 return isNoopShuffleMask(Mask) || isBroadcastShuffleMask(Mask);
11644}
11645
11646/// Check if the Mask consists of the same element repeated multiple times.
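/// For example (illustrative masks): <3, -1, 3, 3, -1, 3, 3, 3> qualifies
/// (element 3 repeated, only 2 undefs), while <3, -1, -1, -1, -1, -1, -1, 3>
/// does not, since more than half of the mask is undef.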
11647 static bool isSingleElementRepeatedMask(ArrayRef<int> Mask) {
11648 size_t NumUndefs = 0;
11649 std::optional<int> UniqueElt;
11650 for (int Elt : Mask) {
11651 if (Elt == SM_SentinelUndef) {
11652 NumUndefs++;
11653 continue;
11654 }
11655 if (UniqueElt.has_value() && UniqueElt.value() != Elt)
11656 return false;
11657 UniqueElt = Elt;
11658 }
11659 // Make sure the element is repeated enough times by checking that the
11660 // number of undefs is small.
11661 return NumUndefs <= Mask.size() / 2 && UniqueElt.has_value();
11662}
11663
11664/// Generic routine to decompose a shuffle and blend into independent
11665/// blends and permutes.
11666///
11667/// This matches the extremely common pattern for handling combined
11668/// shuffle+blend operations on newer X86 ISAs where we have very fast blend
11669/// operations. It will try to pick the best arrangement of shuffles and
11670/// blends. For vXi8/vXi16 shuffles we may use unpack instead of blend.
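/// For example (illustrative v4i32 mask): <2, 6, 0, 4> decomposes into the
/// per-input shuffles V1Mask = <2, -1, 0, -1> and V2Mask = <-1, 2, -1, 0>
/// with the final blend mask <0, 5, 2, 7>; the code below then picks
/// whichever of the blend/unpack/rotate strategies is cheapest for the target.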
11671 static SDValue lowerShuffleAsDecomposedShuffleMerge(
11672 const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
11673 const APInt &Zeroable, const X86Subtarget &Subtarget, SelectionDAG &DAG) {
11674 int NumElts = Mask.size();
11675 int NumLanes = VT.getSizeInBits() / 128;
11676 int NumEltsPerLane = NumElts / NumLanes;
11677
11678 // Shuffle the input elements into the desired positions in V1 and V2 and
11679 // unpack/blend them together.
11680 bool IsAlternating = true;
11681 bool V1Zero = true, V2Zero = true;
11682 SmallVector<int, 32> V1Mask(NumElts, -1);
11683 SmallVector<int, 32> V2Mask(NumElts, -1);
11684 SmallVector<int, 32> FinalMask(NumElts, -1);
11685 for (int i = 0; i < NumElts; ++i) {
11686 int M = Mask[i];
11687 if (M >= 0 && M < NumElts) {
11688 V1Mask[i] = M;
11689 FinalMask[i] = i;
11690 V1Zero &= Zeroable[i];
11691 IsAlternating &= (i & 1) == 0;
11692 } else if (M >= NumElts) {
11693 V2Mask[i] = M - NumElts;
11694 FinalMask[i] = i + NumElts;
11695 V2Zero &= Zeroable[i];
11696 IsAlternating &= (i & 1) == 1;
11697 }
11698 }
11699
11700 // If we effectively only demand the 0'th element of \p Input (though not
11701 // necessarily only in the 0'th position), then broadcast said input and
11702 // change \p InputMask to be a no-op (identity) mask.
11703 auto canonicalizeBroadcastableInput = [DL, VT, &Subtarget,
11704 &DAG](SDValue &Input,
11705 MutableArrayRef<int> InputMask) {
11706 unsigned EltSizeInBits = Input.getScalarValueSizeInBits();
11707 if (!Subtarget.hasAVX2() && (!Subtarget.hasAVX() || EltSizeInBits < 32 ||
11708 !X86::mayFoldLoad(Input, Subtarget)))
11709 return;
11710 if (isNoopShuffleMask(InputMask))
11711 return;
11712 assert(isBroadcastShuffleMask(InputMask) &&
11713 "Expected to demand only the 0'th element.");
11714 Input = DAG.getNode(X86ISD::VBROADCAST, DL, VT, Input);
11715 for (auto I : enumerate(InputMask)) {
11716 int &InputMaskElt = I.value();
11717 if (InputMaskElt >= 0)
11718 InputMaskElt = I.index();
11719 }
11720 };
11721
11722 // Currently, we may need to produce one shuffle per input, and blend results.
11723 // It is possible that the shuffle for one of the inputs is already a no-op.
11724 // See if we can simplify non-no-op shuffles into broadcasts,
11725 // which we consider to be strictly better than an arbitrary shuffle.
11726 if (isNoopOrBroadcastShuffleMask(V1Mask) &&
11727 isNoopOrBroadcastShuffleMask(V2Mask)) {
11728 canonicalizeBroadcastableInput(V1, V1Mask);
11729 canonicalizeBroadcastableInput(V2, V2Mask);
11730 }
11731
11732 // Try to lower with the simpler initial blend/unpack/rotate strategies unless
11733 // one of the input shuffles would be a no-op. We prefer to shuffle inputs as
11734 // the shuffle may be able to fold with a load or other benefit. However, when
11735 // we'll have to do 2x as many shuffles in order to achieve this, a 2-input
11736 // pre-shuffle first is a better strategy.
11737 if (!isNoopShuffleMask(V1Mask) && !isNoopShuffleMask(V2Mask)) {
11738 // If we don't have blends, see if we can create a cheap unpack.
11739 if (!Subtarget.hasSSE41() && VT.is128BitVector() &&
11740 (is128BitUnpackShuffleMask(V1Mask, DAG) ||
11741 is128BitUnpackShuffleMask(V2Mask, DAG)))
11742 if (SDValue PermUnpack = lowerShuffleAsPermuteAndUnpack(
11743 DL, VT, V1, V2, Mask, Subtarget, DAG))
11744 return PermUnpack;
11745
11746 // Only prefer immediate blends to unpack/rotate.
11747 if (SDValue BlendPerm =
11748 lowerShuffleAsBlendAndPermute(DL, VT, V1, V2, Mask, DAG, true))
11749 return BlendPerm;
11750
11751 // If either input vector provides only a single element which is repeated
11752 // multiple times, unpacking from both input vectors would generate worse
11753 // code. e.g. for
11754 // t5: v16i8 = vector_shuffle<16,0,16,1,16,2,16,3,16,4,16,5,16,6,16,7> t2, t4
11755 // it is better to process t4 first to create a vector of t4[0], then unpack
11756 // that vector with t2.
11757 if (!V1Zero && !V2Zero && !isSingleElementRepeatedMask(V1Mask) &&
11758 !isSingleElementRepeatedMask(V2Mask))
11759 if (SDValue UnpackPerm =
11760 lowerShuffleAsUNPCKAndPermute(DL, VT, V1, V2, Mask, DAG))
11761 return UnpackPerm;
11762
11763 if (SDValue RotatePerm = lowerShuffleAsByteRotateAndPermute(
11764 DL, VT, V1, V2, Mask, Subtarget, DAG))
11765 return RotatePerm;
11766
11767 // Unpack/rotate failed - try again with variable blends.
11768 if (SDValue BlendPerm = lowerShuffleAsBlendAndPermute(DL, VT, V1, V2, Mask,
11769 DAG))
11770 return BlendPerm;
11771
11772 if (VT.getScalarSizeInBits() >= 32)
11773 if (SDValue PermUnpack = lowerShuffleAsPermuteAndUnpack(
11774 DL, VT, V1, V2, Mask, Subtarget, DAG))
11775 return PermUnpack;
11776 }
11777
11778 // If the final mask is an alternating blend of vXi8/vXi16, convert to an
11779 // UNPCKL(SHUFFLE, SHUFFLE) pattern.
11780 // TODO: It doesn't have to be alternating - but each lane mustn't have more
11781 // than half the elements coming from each source.
11782 if (IsAlternating && VT.getScalarSizeInBits() < 32) {
11783 V1Mask.assign(NumElts, -1);
11784 V2Mask.assign(NumElts, -1);
11785 FinalMask.assign(NumElts, -1);
11786 for (int i = 0; i != NumElts; i += NumEltsPerLane)
11787 for (int j = 0; j != NumEltsPerLane; ++j) {
11788 int M = Mask[i + j];
11789 if (M >= 0 && M < NumElts) {
11790 V1Mask[i + (j / 2)] = M;
11791 FinalMask[i + j] = i + (j / 2);
11792 } else if (M >= NumElts) {
11793 V2Mask[i + (j / 2)] = M - NumElts;
11794 FinalMask[i + j] = i + (j / 2) + NumElts;
11795 }
11796 }
11797 }
11798
11799 V1 = DAG.getVectorShuffle(VT, DL, V1, DAG.getUNDEF(VT), V1Mask);
11800 V2 = DAG.getVectorShuffle(VT, DL, V2, DAG.getUNDEF(VT), V2Mask);
11801 return DAG.getVectorShuffle(VT, DL, V1, V2, FinalMask);
11802}
11803
11804static int matchShuffleAsBitRotate(MVT &RotateVT, int EltSizeInBits,
11805 const X86Subtarget &Subtarget,
11806 ArrayRef<int> Mask) {
11807 assert(!isNoopShuffleMask(Mask) && "We shouldn't lower no-op shuffles!");
11808 assert(EltSizeInBits < 64 && "Can't rotate 64-bit integers");
11809
11810 // AVX512 only has vXi32/vXi64 rotates, so limit the rotation sub group size.
11811 int MinSubElts = Subtarget.hasAVX512() ? std::max(32 / EltSizeInBits, 2) : 2;
11812 int MaxSubElts = 64 / EltSizeInBits;
11813 unsigned RotateAmt, NumSubElts;
11814 if (!ShuffleVectorInst::isBitRotateMask(Mask, EltSizeInBits, MinSubElts,
11815 MaxSubElts, NumSubElts, RotateAmt))
11816 return -1;
11817 unsigned NumElts = Mask.size();
11818 MVT RotateSVT = MVT::getIntegerVT(EltSizeInBits * NumSubElts);
11819 RotateVT = MVT::getVectorVT(RotateSVT, NumElts / NumSubElts);
11820 return RotateAmt;
11821}
11822
11823/// Lower shuffle using X86ISD::VROTLI rotations.
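/// For example (illustrative v16i8 mask): <3, 0, 1, 2, 7, 4, 5, 6, 11, 8, 9,
/// 10, 15, 12, 13, 14> is a rotate-left of each 32-bit element by 8 bits, so
/// it can match with RotateVT == v4i32 and a rotation amount of 8 (e.g. a
/// single VPROLD on AVX512).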
11824 static SDValue lowerShuffleAsBitRotate(const SDLoc &DL, MVT VT, SDValue V1,
11825 ArrayRef<int> Mask,
11826 const X86Subtarget &Subtarget,
11827 SelectionDAG &DAG) {
11828 // Only XOP + AVX512 targets have bit rotation instructions.
11829 // If we at least have SSSE3 (PSHUFB) then we shouldn't attempt to use this.
11830 bool IsLegal =
11831 (VT.is128BitVector() && Subtarget.hasXOP()) || Subtarget.hasAVX512();
11832 if (!IsLegal && Subtarget.hasSSE3())
11833 return SDValue();
11834
11835 MVT RotateVT;
11836 int RotateAmt = matchShuffleAsBitRotate(RotateVT, VT.getScalarSizeInBits(),
11837 Subtarget, Mask);
11838 if (RotateAmt < 0)
11839 return SDValue();
11840
11841 // For pre-SSSE3 targets, if we are shuffling vXi8 elts then ISD::ROTL,
11842 // expanded to OR(SRL,SHL), will be more efficient, but if they can
11843 // widen to vXi16 or more then the existing lowering will be better.
11844 if (!IsLegal) {
11845 if ((RotateAmt % 16) == 0)
11846 return SDValue();
11847 // TODO: Use getTargetVShiftByConstNode.
11848 unsigned ShlAmt = RotateAmt;
11849 unsigned SrlAmt = RotateVT.getScalarSizeInBits() - RotateAmt;
11850 V1 = DAG.getBitcast(RotateVT, V1);
11851 SDValue SHL = DAG.getNode(X86ISD::VSHLI, DL, RotateVT, V1,
11852 DAG.getTargetConstant(ShlAmt, DL, MVT::i8));
11853 SDValue SRL = DAG.getNode(X86ISD::VSRLI, DL, RotateVT, V1,
11854 DAG.getTargetConstant(SrlAmt, DL, MVT::i8));
11855 SDValue Rot = DAG.getNode(ISD::OR, DL, RotateVT, SHL, SRL);
11856 return DAG.getBitcast(VT, Rot);
11857 }
11858
11859 SDValue Rot =
11860 DAG.getNode(X86ISD::VROTLI, DL, RotateVT, DAG.getBitcast(RotateVT, V1),
11861 DAG.getTargetConstant(RotateAmt, DL, MVT::i8));
11862 return DAG.getBitcast(VT, Rot);
11863}
11864
11865/// Try to match a vector shuffle as an element rotation.
11866///
11867/// This is used for support PALIGNR for SSSE3 or VALIGND/Q for AVX512.
11868 static int matchShuffleAsElementRotate(SDValue &V1, SDValue &V2,
11869 ArrayRef<int> Mask) {
11870 int NumElts = Mask.size();
11871
11872 // We need to detect various ways of spelling a rotation:
11873 // [11, 12, 13, 14, 15, 0, 1, 2]
11874 // [-1, 12, 13, 14, -1, -1, 1, -1]
11875 // [-1, -1, -1, -1, -1, -1, 1, 2]
11876 // [ 3, 4, 5, 6, 7, 8, 9, 10]
11877 // [-1, 4, 5, 6, -1, -1, 9, -1]
11878 // [-1, 4, 5, 6, -1, -1, -1, -1]
11879 int Rotation = 0;
11880 SDValue Lo, Hi;
11881 for (int i = 0; i < NumElts; ++i) {
11882 int M = Mask[i];
11883 assert((M == SM_SentinelUndef || (0 <= M && M < (2*NumElts))) &&
11884 "Unexpected mask index.");
11885 if (M < 0)
11886 continue;
11887
11888 // Determine where a rotated vector would have started.
11889 int StartIdx = i - (M % NumElts);
11890 if (StartIdx == 0)
11891 // The identity rotation isn't interesting, stop.
11892 return -1;
11893
11894 // If we found the tail of a vector the rotation must be the missing
11895 // front. If we found the head of a vector, it must be how much of the
11896 // head.
11897 int CandidateRotation = StartIdx < 0 ? -StartIdx : NumElts - StartIdx;
11898
11899 if (Rotation == 0)
11900 Rotation = CandidateRotation;
11901 else if (Rotation != CandidateRotation)
11902 // The rotations don't match, so we can't match this mask.
11903 return -1;
11904
11905 // Compute which value this mask is pointing at.
11906 SDValue MaskV = M < NumElts ? V1 : V2;
11907
11908 // Compute which of the two target values this index should be assigned
11909 // to. This reflects whether the high elements are remaining or the low
11910 // elements are remaining.
11911 SDValue &TargetV = StartIdx < 0 ? Hi : Lo;
11912
11913 // Either set up this value if we've not encountered it before, or check
11914 // that it remains consistent.
11915 if (!TargetV)
11916 TargetV = MaskV;
11917 else if (TargetV != MaskV)
11918 // This may be a rotation, but it pulls from the inputs in some
11919 // unsupported interleaving.
11920 return -1;
11921 }
11922
11923 // Check that we successfully analyzed the mask, and normalize the results.
11924 assert(Rotation != 0 && "Failed to locate a viable rotation!");
11925 assert((Lo || Hi) && "Failed to find a rotated input vector!");
11926 if (!Lo)
11927 Lo = Hi;
11928 else if (!Hi)
11929 Hi = Lo;
11930
11931 V1 = Lo;
11932 V2 = Hi;
11933
11934 return Rotation;
11935}
11936
11937/// Try to lower a vector shuffle as a byte rotation.
11938///
11939/// SSSE3 has a generic PALIGNR instruction in x86 that will do an arbitrary
11940/// byte-rotation of the concatenation of two vectors; pre-SSSE3 can use
11941/// a PSRLDQ/PSLLDQ/POR pattern to get a similar effect. This routine will
11942 /// try to generically lower a vector shuffle through such a pattern. It
11943/// does not check for the profitability of lowering either as PALIGNR or
11944/// PSRLDQ/PSLLDQ/POR, only whether the mask is valid to lower in that form.
11945/// This matches shuffle vectors that look like:
11946///
11947/// v8i16 [11, 12, 13, 14, 15, 0, 1, 2]
11948///
11949/// Essentially it concatenates V1 and V2, shifts right by some number of
11950/// elements, and takes the low elements as the result. Note that while this is
11951/// specified as a *right shift* because x86 is little-endian, it is a *left
11952/// rotate* of the vector lanes.
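/// For example, the v8i16 mask above matches an element rotation of 3, which
/// scales to a PALIGNR byte immediate of 3 * 2 == 6.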
11953 static int matchShuffleAsByteRotate(MVT VT, SDValue &V1, SDValue &V2,
11954 ArrayRef<int> Mask) {
11955 // Don't accept any shuffles with zero elements.
11956 if (isAnyZero(Mask))
11957 return -1;
11958
11959 // PALIGNR works on 128-bit lanes.
11960 SmallVector<int, 16> RepeatedMask;
11961 if (!is128BitLaneRepeatedShuffleMask(VT, Mask, RepeatedMask))
11962 return -1;
11963
11964 int Rotation = matchShuffleAsElementRotate(V1, V2, RepeatedMask);
11965 if (Rotation <= 0)
11966 return -1;
11967
11968 // PALIGNR rotates bytes, so we need to scale the
11969 // rotation based on how many bytes are in the vector lane.
11970 int NumElts = RepeatedMask.size();
11971 int Scale = 16 / NumElts;
11972 return Rotation * Scale;
11973}
11974
11975 static SDValue lowerShuffleAsByteRotate(const SDLoc &DL, MVT VT, SDValue V1,
11976 SDValue V2, ArrayRef<int> Mask,
11977 const X86Subtarget &Subtarget,
11978 SelectionDAG &DAG) {
11979 assert(!isNoopShuffleMask(Mask) && "We shouldn't lower no-op shuffles!");
11980
11981 SDValue Lo = V1, Hi = V2;
11982 int ByteRotation = matchShuffleAsByteRotate(VT, Lo, Hi, Mask);
11983 if (ByteRotation <= 0)
11984 return SDValue();
11985
11986 // Cast the inputs to i8 vector of correct length to match PALIGNR or
11987 // PSLLDQ/PSRLDQ.
11988 MVT ByteVT = MVT::getVectorVT(MVT::i8, VT.getSizeInBits() / 8);
11989 Lo = DAG.getBitcast(ByteVT, Lo);
11990 Hi = DAG.getBitcast(ByteVT, Hi);
11991
11992 // SSSE3 targets can use the palignr instruction.
11993 if (Subtarget.hasSSSE3()) {
11994 assert((!VT.is512BitVector() || Subtarget.hasBWI()) &&
11995 "512-bit PALIGNR requires BWI instructions");
11996 return DAG.getBitcast(
11997 VT, DAG.getNode(X86ISD::PALIGNR, DL, ByteVT, Lo, Hi,
11998 DAG.getTargetConstant(ByteRotation, DL, MVT::i8)));
11999 }
12000
12001 assert(VT.is128BitVector() &&
12002 "Rotate-based lowering only supports 128-bit lowering!");
12003 assert(Mask.size() <= 16 &&
12004 "Can shuffle at most 16 bytes in a 128-bit vector!");
12005 assert(ByteVT == MVT::v16i8 &&
12006 "SSE2 rotate lowering only needed for v16i8!");
12007
12008 // Default SSE2 implementation
12009 int LoByteShift = 16 - ByteRotation;
12010 int HiByteShift = ByteRotation;
12011
12012 SDValue LoShift =
12013 DAG.getNode(X86ISD::VSHLDQ, DL, MVT::v16i8, Lo,
12014 DAG.getTargetConstant(LoByteShift, DL, MVT::i8));
12015 SDValue HiShift =
12016 DAG.getNode(X86ISD::VSRLDQ, DL, MVT::v16i8, Hi,
12017 DAG.getTargetConstant(HiByteShift, DL, MVT::i8));
12018 return DAG.getBitcast(VT,
12019 DAG.getNode(ISD::OR, DL, MVT::v16i8, LoShift, HiShift));
12020}
12021
12022/// Try to lower a vector shuffle as a dword/qword rotation.
12023///
12024 /// AVX512 has VALIGND/VALIGNQ instructions that will do an arbitrary
12025 /// rotation of the concatenation of two vectors; this routine will
12026 /// try to generically lower a vector shuffle through such a pattern.
12027///
12028/// Essentially it concatenates V1 and V2, shifts right by some number of
12029/// elements, and takes the low elements as the result. Note that while this is
12030/// specified as a *right shift* because x86 is little-endian, it is a *left
12031/// rotate* of the vector lanes.
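/// For example (illustrative v8i32 mask): <3, 4, 5, 6, 7, 8, 9, 10> matches
/// an element rotation of 3 and (given AVX512VL for the 256-bit type) lowers
/// to a single VALIGND with immediate 3.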
12032 static SDValue lowerShuffleAsVALIGN(const SDLoc &DL, MVT VT, SDValue V1,
12033 SDValue V2, ArrayRef<int> Mask,
12034 const APInt &Zeroable,
12035 const X86Subtarget &Subtarget,
12036 SelectionDAG &DAG) {
12037 assert((VT.getScalarType() == MVT::i32 || VT.getScalarType() == MVT::i64) &&
12038 "Only 32-bit and 64-bit elements are supported!");
12039
12040 // 128/256-bit vectors are only supported with VLX.
12041 assert((Subtarget.hasVLX() || (!VT.is128BitVector() && !VT.is256BitVector()))
12042 && "VLX required for 128/256-bit vectors");
12043
12044 SDValue Lo = V1, Hi = V2;
12045 int Rotation = matchShuffleAsElementRotate(Lo, Hi, Mask);
12046 if (0 < Rotation)
12047 return DAG.getNode(X86ISD::VALIGN, DL, VT, Lo, Hi,
12048 DAG.getTargetConstant(Rotation, DL, MVT::i8));
12049
12050 // See if we can use VALIGN as a cross-lane version of VSHLDQ/VSRLDQ.
12051 // TODO: Pull this out as a matchShuffleAsElementShift helper?
12052 // TODO: We can probably make this more aggressive and use shift-pairs like
12053 // lowerShuffleAsByteShiftMask.
12054 unsigned NumElts = Mask.size();
12055 unsigned ZeroLo = Zeroable.countr_one();
12056 unsigned ZeroHi = Zeroable.countl_one();
12057 assert((ZeroLo + ZeroHi) < NumElts && "Zeroable shuffle detected");
12058 if (!ZeroLo && !ZeroHi)
12059 return SDValue();
12060
12061 if (ZeroLo) {
12062 SDValue Src = Mask[ZeroLo] < (int)NumElts ? V1 : V2;
12063 int Low = Mask[ZeroLo] < (int)NumElts ? 0 : NumElts;
12064 if (isSequentialOrUndefInRange(Mask, ZeroLo, NumElts - ZeroLo, Low))
12065 return DAG.getNode(X86ISD::VALIGN, DL, VT, Src,
12066 getZeroVector(VT, Subtarget, DAG, DL),
12067 DAG.getTargetConstant(NumElts - ZeroLo, DL, MVT::i8));
12068 }
12069
12070 if (ZeroHi) {
12071 SDValue Src = Mask[0] < (int)NumElts ? V1 : V2;
12072 int Low = Mask[0] < (int)NumElts ? 0 : NumElts;
12073 if (isSequentialOrUndefInRange(Mask, 0, NumElts - ZeroHi, Low + ZeroHi))
12074 return DAG.getNode(X86ISD::VALIGN, DL, VT,
12075 getZeroVector(VT, Subtarget, DAG, DL), Src,
12076 DAG.getTargetConstant(ZeroHi, DL, MVT::i8));
12077 }
12078
12079 return SDValue();
12080}
12081
12082/// Try to lower a vector shuffle as a byte shift sequence.
12083 static SDValue lowerShuffleAsByteShiftMask(const SDLoc &DL, MVT VT, SDValue V1,
12084 SDValue V2, ArrayRef<int> Mask,
12085 const APInt &Zeroable,
12086 const X86Subtarget &Subtarget,
12087 SelectionDAG &DAG) {
12088 assert(!isNoopShuffleMask(Mask) && "We shouldn't lower no-op shuffles!");
12089 assert(VT.is128BitVector() && "Only 128-bit vectors supported");
12090
12091 // We need a shuffle that has zeros at one/both ends and a sequential
12092 // shuffle from one source within.
12093 unsigned ZeroLo = Zeroable.countr_one();
12094 unsigned ZeroHi = Zeroable.countl_one();
12095 if (!ZeroLo && !ZeroHi)
12096 return SDValue();
12097
12098 unsigned NumElts = Mask.size();
12099 unsigned Len = NumElts - (ZeroLo + ZeroHi);
12100 if (!isSequentialOrUndefInRange(Mask, ZeroLo, Len, Mask[ZeroLo]))
12101 return SDValue();
12102
12103 unsigned Scale = VT.getScalarSizeInBits() / 8;
12104 ArrayRef<int> StubMask = Mask.slice(ZeroLo, Len);
12105 if (!isUndefOrInRange(StubMask, 0, NumElts) &&
12106 !isUndefOrInRange(StubMask, NumElts, 2 * NumElts))
12107 return SDValue();
12108
12109 SDValue Res = Mask[ZeroLo] < (int)NumElts ? V1 : V2;
12110 Res = DAG.getBitcast(MVT::v16i8, Res);
12111
12112 // Use VSHLDQ/VSRLDQ ops to zero the ends of a vector and leave an
12113 // inner sequential set of elements, possibly offset:
12114 // 01234567 --> zzzzzz01 --> 1zzzzzzz
12115 // 01234567 --> 4567zzzz --> zzzzz456
12116 // 01234567 --> z0123456 --> 3456zzzz --> zz3456zz
12117 if (ZeroLo == 0) {
12118 unsigned Shift = (NumElts - 1) - (Mask[ZeroLo + Len - 1] % NumElts);
12119 Res = DAG.getNode(X86ISD::VSHLDQ, DL, MVT::v16i8, Res,
12120 DAG.getTargetConstant(Scale * Shift, DL, MVT::i8));
12121 Res = DAG.getNode(X86ISD::VSRLDQ, DL, MVT::v16i8, Res,
12122 DAG.getTargetConstant(Scale * ZeroHi, DL, MVT::i8));
12123 } else if (ZeroHi == 0) {
12124 unsigned Shift = Mask[ZeroLo] % NumElts;
12125 Res = DAG.getNode(X86ISD::VSRLDQ, DL, MVT::v16i8, Res,
12126 DAG.getTargetConstant(Scale * Shift, DL, MVT::i8));
12127 Res = DAG.getNode(X86ISD::VSHLDQ, DL, MVT::v16i8, Res,
12128 DAG.getTargetConstant(Scale * ZeroLo, DL, MVT::i8));
12129 } else if (!Subtarget.hasSSSE3()) {
12130 // If we don't have PSHUFB then it's worth avoiding an AND constant mask
12131 // by performing 3 byte shifts. Shuffle combining can kick in above that.
12132 // TODO: There may be some cases where VSH{LR}DQ+PAND is still better.
12133 unsigned Shift = (NumElts - 1) - (Mask[ZeroLo + Len - 1] % NumElts);
12134 Res = DAG.getNode(X86ISD::VSHLDQ, DL, MVT::v16i8, Res,
12135 DAG.getTargetConstant(Scale * Shift, DL, MVT::i8));
12136 Shift += Mask[ZeroLo] % NumElts;
12137 Res = DAG.getNode(X86ISD::VSRLDQ, DL, MVT::v16i8, Res,
12138 DAG.getTargetConstant(Scale * Shift, DL, MVT::i8));
12139 Res = DAG.getNode(X86ISD::VSHLDQ, DL, MVT::v16i8, Res,
12140 DAG.getTargetConstant(Scale * ZeroLo, DL, MVT::i8));
12141 } else
12142 return SDValue();
12143
12144 return DAG.getBitcast(VT, Res);
12145}
12146
12147/// Try to lower a vector shuffle as a bit shift (shifts in zeros).
12148///
12149/// Attempts to match a shuffle mask against the PSLL(W/D/Q/DQ) and
12150/// PSRL(W/D/Q/DQ) SSE2 and AVX2 logical bit-shift instructions. The function
12151/// matches elements from one of the input vectors shuffled to the left or
12152/// right with zeroable elements 'shifted in'. It handles both the strictly
12153/// bit-wise element shifts and the byte shift across an entire 128-bit double
12154/// quad word lane.
12155///
12156/// PSHL : (little-endian) left bit shift.
12157/// [ zz, 0, zz, 2 ]
12158/// [ -1, 4, zz, -1 ]
12159/// PSRL : (little-endian) right bit shift.
12160/// [ 1, zz, 3, zz]
12161/// [ -1, -1, 7, zz]
12162/// PSLLDQ : (little-endian) left byte shift
12163/// [ zz, 0, 1, 2, 3, 4, 5, 6]
12164/// [ zz, zz, -1, -1, 2, 3, 4, -1]
12165/// [ zz, zz, zz, zz, zz, zz, -1, 1]
12166/// PSRLDQ : (little-endian) right byte shift
12167/// [ 5, 6, 7, zz, zz, zz, zz, zz]
12168/// [ -1, 5, 6, 7, zz, zz, zz, zz]
12169/// [ 1, 2, -1, -1, -1, -1, zz, zz]
12170static int matchShuffleAsShift(MVT &ShiftVT, unsigned &Opcode,
12171 unsigned ScalarSizeInBits, ArrayRef<int> Mask,
12172 int MaskOffset, const APInt &Zeroable,
12173 const X86Subtarget &Subtarget) {
12174 int Size = Mask.size();
12175 unsigned SizeInBits = Size * ScalarSizeInBits;
12176
12177 auto CheckZeros = [&](int Shift, int Scale, bool Left) {
12178 for (int i = 0; i < Size; i += Scale)
12179 for (int j = 0; j < Shift; ++j)
12180 if (!Zeroable[i + j + (Left ? 0 : (Scale - Shift))])
12181 return false;
12182
12183 return true;
12184 };
12185
12186 auto MatchShift = [&](int Shift, int Scale, bool Left) {
12187 for (int i = 0; i != Size; i += Scale) {
12188 unsigned Pos = Left ? i + Shift : i;
12189 unsigned Low = Left ? i : i + Shift;
12190 unsigned Len = Scale - Shift;
12191 if (!isSequentialOrUndefInRange(Mask, Pos, Len, Low + MaskOffset))
12192 return -1;
12193 }
12194
12195 int ShiftEltBits = ScalarSizeInBits * Scale;
12196 bool ByteShift = ShiftEltBits > 64;
12197 Opcode = Left ? (ByteShift ? X86ISD::VSHLDQ : X86ISD::VSHLI)
12198 : (ByteShift ? X86ISD::VSRLDQ : X86ISD::VSRLI);
12199 int ShiftAmt = Shift * ScalarSizeInBits / (ByteShift ? 8 : 1);
12200
12201 // Normalize the scale for byte shifts to still produce an i64 element
12202 // type.
12203 Scale = ByteShift ? Scale / 2 : Scale;
12204
12205 // We need to round trip through the appropriate type for the shift.
12206 MVT ShiftSVT = MVT::getIntegerVT(ScalarSizeInBits * Scale);
12207 ShiftVT = ByteShift ? MVT::getVectorVT(MVT::i8, SizeInBits / 8)
12208 : MVT::getVectorVT(ShiftSVT, Size / Scale);
12209 return (int)ShiftAmt;
12210 };
12211
12212 // SSE/AVX supports logical shifts up to 64-bit integers - so we can just
12213 // keep doubling the size of the integer elements up to that. We can
12214 // then shift the elements of the integer vector by whole multiples of
12215 // their width within the elements of the larger integer vector. Test each
12216 // multiple to see if we can find a match with the moved element indices
12217 // and that the shifted in elements are all zeroable.
12218 unsigned MaxWidth = ((SizeInBits == 512) && !Subtarget.hasBWI() ? 64 : 128);
12219 for (int Scale = 2; Scale * ScalarSizeInBits <= MaxWidth; Scale *= 2)
12220 for (int Shift = 1; Shift != Scale; ++Shift)
12221 for (bool Left : {true, false})
12222 if (CheckZeros(Shift, Scale, Left)) {
12223 int ShiftAmt = MatchShift(Shift, Scale, Left);
12224 if (0 < ShiftAmt)
12225 return ShiftAmt;
12226 }
12227
12228 // no match
12229 return -1;
12230}
12231
12232 static SDValue lowerShuffleAsShift(const SDLoc &DL, MVT VT, SDValue V1,
12233 SDValue V2, ArrayRef<int> Mask,
12234 const APInt &Zeroable,
12235 const X86Subtarget &Subtarget,
12236 SelectionDAG &DAG, bool BitwiseOnly) {
12237 int Size = Mask.size();
12238 assert(Size == (int)VT.getVectorNumElements() && "Unexpected mask size");
12239
12240 MVT ShiftVT;
12241 SDValue V = V1;
12242 unsigned Opcode;
12243
12244 // Try to match shuffle against V1 shift.
12245 int ShiftAmt = matchShuffleAsShift(ShiftVT, Opcode, VT.getScalarSizeInBits(),
12246 Mask, 0, Zeroable, Subtarget);
12247
12248 // If V1 failed, try to match shuffle against V2 shift.
12249 if (ShiftAmt < 0) {
12250 ShiftAmt = matchShuffleAsShift(ShiftVT, Opcode, VT.getScalarSizeInBits(),
12251 Mask, Size, Zeroable, Subtarget);
12252 V = V2;
12253 }
12254
12255 if (ShiftAmt < 0)
12256 return SDValue();
12257
12258 if (BitwiseOnly && (Opcode == X86ISD::VSHLDQ || Opcode == X86ISD::VSRLDQ))
12259 return SDValue();
12260
12261 assert(DAG.getTargetLoweringInfo().isTypeLegal(ShiftVT) &&
12262 "Illegal integer vector type");
12263 V = DAG.getBitcast(ShiftVT, V);
12264 V = DAG.getNode(Opcode, DL, ShiftVT, V,
12265 DAG.getTargetConstant(ShiftAmt, DL, MVT::i8));
12266 return DAG.getBitcast(VT, V);
12267}
12268
12269// EXTRQ: Extract Len elements from lower half of source, starting at Idx.
12270// Remainder of lower half result is zero and upper half is all undef.
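// For example (illustrative v8i16 mask): <1, 2, 3, zz, -1, -1, -1, -1> has an
// undef upper half, one zeroable element, and 3 sequential elements starting
// at index 1, so it matches with BitLen == 3 * 16 == 48 and BitIdx == 16.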
12271static bool matchShuffleAsEXTRQ(MVT VT, SDValue &V1, SDValue &V2,
12272 ArrayRef<int> Mask, uint64_t &BitLen,
12273 uint64_t &BitIdx, const APInt &Zeroable) {
12274 int Size = Mask.size();
12275 int HalfSize = Size / 2;
12276 assert(Size == (int)VT.getVectorNumElements() && "Unexpected mask size");
12277 assert(!Zeroable.isAllOnes() && "Fully zeroable shuffle mask");
12278
12279 // Upper half must be undefined.
12280 if (!isUndefUpperHalf(Mask))
12281 return false;
12282
12283 // Determine the extraction length from the part of the
12284 // lower half that isn't zeroable.
12285 int Len = HalfSize;
12286 for (; Len > 0; --Len)
12287 if (!Zeroable[Len - 1])
12288 break;
12289 assert(Len > 0 && "Zeroable shuffle mask");
12290
12291 // Attempt to match first Len sequential elements from the lower half.
12292 SDValue Src;
12293 int Idx = -1;
12294 for (int i = 0; i != Len; ++i) {
12295 int M = Mask[i];
12296 if (M == SM_SentinelUndef)
12297 continue;
12298 SDValue &V = (M < Size ? V1 : V2);
12299 M = M % Size;
12300
12301 // The extracted elements must start at a valid index and all mask
12302 // elements must be in the lower half.
12303 if (i > M || M >= HalfSize)
12304 return false;
12305
12306 if (Idx < 0 || (Src == V && Idx == (M - i))) {
12307 Src = V;
12308 Idx = M - i;
12309 continue;
12310 }
12311 return false;
12312 }
12313
12314 if (!Src || Idx < 0)
12315 return false;
12316
12317 assert((Idx + Len) <= HalfSize && "Illegal extraction mask");
12318 BitLen = (Len * VT.getScalarSizeInBits()) & 0x3f;
12319 BitIdx = (Idx * VT.getScalarSizeInBits()) & 0x3f;
12320 V1 = Src;
12321 return true;
12322}
12323
12324// INSERTQ: Extract lowest Len elements from lower half of second source and
12325// insert over first source, starting at Idx.
12326// { A[0], .., A[Idx-1], B[0], .., B[Len-1], A[Idx+Len], .., UNDEF, ... }
12327static bool matchShuffleAsINSERTQ(MVT VT, SDValue &V1, SDValue &V2,
12328 ArrayRef<int> Mask, uint64_t &BitLen,
12329 uint64_t &BitIdx) {
12330 int Size = Mask.size();
12331 int HalfSize = Size / 2;
12332 assert(Size == (int)VT.getVectorNumElements() && "Unexpected mask size");
12333
12334 // Upper half must be undefined.
12335 if (!isUndefUpperHalf(Mask))
12336 return false;
12337
12338 for (int Idx = 0; Idx != HalfSize; ++Idx) {
12339 SDValue Base;
12340
12341 // Attempt to match first source from mask before insertion point.
12342 if (isUndefInRange(Mask, 0, Idx)) {
12343 /* EMPTY */
12344 } else if (isSequentialOrUndefInRange(Mask, 0, Idx, 0)) {
12345 Base = V1;
12346 } else if (isSequentialOrUndefInRange(Mask, 0, Idx, Size)) {
12347 Base = V2;
12348 } else {
12349 continue;
12350 }
12351
12352 // Extend the extraction length looking to match both the insertion of
12353 // the second source and the remaining elements of the first.
12354 for (int Hi = Idx + 1; Hi <= HalfSize; ++Hi) {
12355 SDValue Insert;
12356 int Len = Hi - Idx;
12357
12358 // Match insertion.
12359 if (isSequentialOrUndefInRange(Mask, Idx, Len, 0)) {
12360 Insert = V1;
12361 } else if (isSequentialOrUndefInRange(Mask, Idx, Len, Size)) {
12362 Insert = V2;
12363 } else {
12364 continue;
12365 }
12366
12367 // Match the remaining elements of the lower half.
12368 if (isUndefInRange(Mask, Hi, HalfSize - Hi)) {
12369 /* EMPTY */
12370 } else if ((!Base || (Base == V1)) &&
12371 isSequentialOrUndefInRange(Mask, Hi, HalfSize - Hi, Hi)) {
12372 Base = V1;
12373 } else if ((!Base || (Base == V2)) &&
12374 isSequentialOrUndefInRange(Mask, Hi, HalfSize - Hi,
12375 Size + Hi)) {
12376 Base = V2;
12377 } else {
12378 continue;
12379 }
12380
12381 BitLen = (Len * VT.getScalarSizeInBits()) & 0x3f;
12382 BitIdx = (Idx * VT.getScalarSizeInBits()) & 0x3f;
12383 V1 = Base;
12384 V2 = Insert;
12385 return true;
12386 }
12387 }
12388
12389 return false;
12390}
12391
12392/// Try to lower a vector shuffle using SSE4a EXTRQ/INSERTQ.
12393 static SDValue lowerShuffleWithSSE4A(const SDLoc &DL, MVT VT, SDValue V1,
12394 SDValue V2, ArrayRef<int> Mask,
12395 const APInt &Zeroable, SelectionDAG &DAG) {
12396 uint64_t BitLen, BitIdx;
12397 if (matchShuffleAsEXTRQ(VT, V1, V2, Mask, BitLen, BitIdx, Zeroable))
12398 return DAG.getNode(X86ISD::EXTRQI, DL, VT, V1,
12399 DAG.getTargetConstant(BitLen, DL, MVT::i8),
12400 DAG.getTargetConstant(BitIdx, DL, MVT::i8));
12401
12402 if (matchShuffleAsINSERTQ(VT, V1, V2, Mask, BitLen, BitIdx))
12403 return DAG.getNode(X86ISD::INSERTQI, DL, VT, V1 ? V1 : DAG.getUNDEF(VT),
12404 V2 ? V2 : DAG.getUNDEF(VT),
12405 DAG.getTargetConstant(BitLen, DL, MVT::i8),
12406 DAG.getTargetConstant(BitIdx, DL, MVT::i8));
12407
12408 return SDValue();
12409}
12410
12411/// Lower a vector shuffle as an any/signed/zero extension.
12412///
12413/// Given a specific number of elements, element bit width, and extension
12414 /// stride, produce an extension based on the available
12415 /// features of the subtarget. The extended elements are consecutive and
12416 /// can start from an offsetted element index in the input; to
12417 /// avoid excess shuffling the offset must either be in the bottom lane
12418 /// or at the start of a higher lane. All extended elements must be from
12419/// the same lane.
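/// For example (illustrative v16i8 mask): <0, zz, zz, zz, 1, zz, zz, zz, 2,
/// zz, zz, zz, 3, zz, zz, zz> is a zero extension with Scale == 4 and, on
/// SSE4.1 or later, becomes a ZERO_EXTEND_VECTOR_INREG to v4i32 (typically a
/// single PMOVZXBD).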
12420 static SDValue lowerShuffleAsSpecificExtension(const SDLoc &DL, MVT VT,
12421 int Scale, int Offset,
12422 unsigned ExtOpc, SDValue InputV,
12423 ArrayRef<int> Mask,
12424 const X86Subtarget &Subtarget,
12425 SelectionDAG &DAG) {
12426 assert(Scale > 1 && "Need a scale to extend.");
12427 assert(ISD::isExtOpcode(ExtOpc) && "Unsupported extension");
12428 int EltBits = VT.getScalarSizeInBits();
12429 int NumElements = VT.getVectorNumElements();
12430 int NumEltsPerLane = 128 / EltBits;
12431 int OffsetLane = Offset / NumEltsPerLane;
12432 assert((EltBits == 8 || EltBits == 16 || EltBits == 32) &&
12433 "Only 8, 16, and 32 bit elements can be extended.");
12434 assert(Scale * EltBits <= 64 && "Cannot zero extend past 64 bits.");
12435 assert(0 <= Offset && "Extension offset must be positive.");
12436 assert((Offset < NumEltsPerLane || Offset % NumEltsPerLane == 0) &&
12437 "Extension offset must be in the first lane or start an upper lane.");
12438
12439 // Check that an index is in same lane as the base offset.
12440 auto SafeOffset = [&](int Idx) {
12441 return OffsetLane == (Idx / NumEltsPerLane);
12442 };
12443
12444 // Shift along an input so that the offset base moves to the first element.
12445 auto ShuffleOffset = [&](SDValue V) {
12446 if (!Offset)
12447 return V;
12448
12449 SmallVector<int, 8> ShMask((unsigned)NumElements, -1);
12450 for (int i = 0; i * Scale < NumElements; ++i) {
12451 int SrcIdx = i + Offset;
12452 ShMask[i] = SafeOffset(SrcIdx) ? SrcIdx : -1;
12453 }
12454 return DAG.getVectorShuffle(VT, DL, V, DAG.getUNDEF(VT), ShMask);
12455 };
12456
12457 // Found a valid a/zext mask! Try various lowering strategies based on the
12458 // input type and available ISA extensions.
12459 if (Subtarget.hasSSE41()) {
12460 // Not worth offsetting 128-bit vectors if scale == 2, a pattern using
12461 // PUNPCK will catch this in a later shuffle match.
12462 if (Offset && Scale == 2 && VT.is128BitVector())
12463 return SDValue();
12464 MVT ExtVT = MVT::getVectorVT(MVT::getIntegerVT(EltBits * Scale),
12465 NumElements / Scale);
12466 InputV = DAG.getBitcast(VT, InputV);
12467 InputV = ShuffleOffset(InputV);
12468 InputV = getEXTEND_VECTOR_INREG(ExtOpc, DL, ExtVT, InputV, DAG);
12469 return DAG.getBitcast(VT, InputV);
12470 }
12471
12472 assert(VT.is128BitVector() && "Only 128-bit vectors can be extended.");
12473 InputV = DAG.getBitcast(VT, InputV);
12474 bool AnyExt = ExtOpc == ISD::ANY_EXTEND;
12475
12476 // TODO: Add pre-SSE41 SIGN_EXTEND_VECTOR_INREG handling.
12477 if (ExtOpc == ISD::SIGN_EXTEND)
12478 return SDValue();
12479
12480 // For any extends we can cheat for larger element sizes and use shuffle
12481 // instructions that can fold with a load and/or copy.
12482 if (AnyExt && EltBits == 32) {
12483 int PSHUFDMask[4] = {Offset, -1, SafeOffset(Offset + 1) ? Offset + 1 : -1,
12484 -1};
12485 return DAG.getBitcast(
12486 VT, DAG.getNode(X86ISD::PSHUFD, DL, MVT::v4i32,
12487 DAG.getBitcast(MVT::v4i32, InputV),
12488 getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG)));
12489 }
12490 if (AnyExt && EltBits == 16 && Scale > 2) {
12491 int PSHUFDMask[4] = {Offset / 2, -1,
12492 SafeOffset(Offset + 1) ? (Offset + 1) / 2 : -1, -1};
12493 InputV = DAG.getNode(X86ISD::PSHUFD, DL, MVT::v4i32,
12494 DAG.getBitcast(MVT::v4i32, InputV),
12495 getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG));
12496 int PSHUFWMask[4] = {1, -1, -1, -1};
12497 unsigned OddEvenOp = (Offset & 1) ? X86ISD::PSHUFLW : X86ISD::PSHUFHW;
12498 return DAG.getBitcast(
12499 VT, DAG.getNode(OddEvenOp, DL, MVT::v8i16,
12500 DAG.getBitcast(MVT::v8i16, InputV),
12501 getV4X86ShuffleImm8ForMask(PSHUFWMask, DL, DAG)));
12502 }
12503
12504 // The SSE4A EXTRQ instruction can efficiently extend the first 2 lanes
12505 // to 64-bits.
12506 if ((Scale * EltBits) == 64 && EltBits < 32 && Subtarget.hasSSE4A()) {
12507 assert(NumElements == (int)Mask.size() && "Unexpected shuffle mask size!");
12508 assert(VT.is128BitVector() && "Unexpected vector width!");
12509
12510 int LoIdx = Offset * EltBits;
12511 SDValue Lo = DAG.getBitcast(
12512 MVT::v2i64, DAG.getNode(X86ISD::EXTRQI, DL, VT, InputV,
12513 DAG.getTargetConstant(EltBits, DL, MVT::i8),
12514 DAG.getTargetConstant(LoIdx, DL, MVT::i8)));
12515
12516 if (isUndefUpperHalf(Mask) || !SafeOffset(Offset + 1))
12517 return DAG.getBitcast(VT, Lo);
12518
12519 int HiIdx = (Offset + 1) * EltBits;
12520 SDValue Hi = DAG.getBitcast(
12521 MVT::v2i64, DAG.getNode(X86ISD::EXTRQI, DL, VT, InputV,
12522 DAG.getTargetConstant(EltBits, DL, MVT::i8),
12523 DAG.getTargetConstant(HiIdx, DL, MVT::i8)));
12524 return DAG.getBitcast(VT,
12525 DAG.getNode(X86ISD::UNPCKL, DL, MVT::v2i64, Lo, Hi));
12526 }
12527
12528 // If this would require more than 2 unpack instructions to expand, use
12529 // pshufb when available. We can only use more than 2 unpack instructions
12530 // when zero extending i8 elements which also makes it easier to use pshufb.
12531 if (Scale > 4 && EltBits == 8 && Subtarget.hasSSSE3()) {
12532 assert(NumElements == 16 && "Unexpected byte vector width!");
12533 SDValue PSHUFBMask[16];
12534 for (int i = 0; i < 16; ++i) {
12535 int Idx = Offset + (i / Scale);
12536 if ((i % Scale == 0 && SafeOffset(Idx))) {
12537 PSHUFBMask[i] = DAG.getConstant(Idx, DL, MVT::i8);
12538 continue;
12539 }
12540 PSHUFBMask[i] =
12541 AnyExt ? DAG.getUNDEF(MVT::i8) : DAG.getConstant(0x80, DL, MVT::i8);
12542 }
12543 InputV = DAG.getBitcast(MVT::v16i8, InputV);
12544 return DAG.getBitcast(
12545 VT, DAG.getNode(X86ISD::PSHUFB, DL, MVT::v16i8, InputV,
12546 DAG.getBuildVector(MVT::v16i8, DL, PSHUFBMask)));
12547 }
12548
12549 // If we are extending from an offset, ensure we start on a boundary that
12550 // we can unpack from.
12551 int AlignToUnpack = Offset % (NumElements / Scale);
12552 if (AlignToUnpack) {
12553 SmallVector<int, 8> ShMask((unsigned)NumElements, -1);
12554 for (int i = AlignToUnpack; i < NumElements; ++i)
12555 ShMask[i - AlignToUnpack] = i;
12556 InputV = DAG.getVectorShuffle(VT, DL, InputV, DAG.getUNDEF(VT), ShMask);
12557 Offset -= AlignToUnpack;
12558 }
12559
12560 // Otherwise emit a sequence of unpacks.
12561 do {
12562 unsigned UnpackLoHi = X86ISD::UNPCKL;
12563 if (Offset >= (NumElements / 2)) {
12564 UnpackLoHi = X86ISD::UNPCKH;
12565 Offset -= (NumElements / 2);
12566 }
12567
12568 MVT InputVT = MVT::getVectorVT(MVT::getIntegerVT(EltBits), NumElements);
12569 SDValue Ext = AnyExt ? DAG.getUNDEF(InputVT)
12570 : getZeroVector(InputVT, Subtarget, DAG, DL);
12571 InputV = DAG.getBitcast(InputVT, InputV);
12572 InputV = DAG.getNode(UnpackLoHi, DL, InputVT, InputV, Ext);
12573 Scale /= 2;
12574 EltBits *= 2;
12575 NumElements /= 2;
12576 } while (Scale > 1);
12577 return DAG.getBitcast(VT, InputV);
12578}
12579
12580/// Try to lower a vector shuffle as a zero extension on any microarch.
12581///
12582/// This routine will try to do everything in its power to cleverly lower
12583/// a shuffle which happens to match the pattern of a zero extend. It doesn't
12584/// check for the profitability of this lowering, it tries to aggressively
12585/// match this pattern. It will use all of the micro-architectural details it
12586/// can to emit an efficient lowering. It handles both blends with all-zero
12587/// inputs to explicitly zero-extend and undef-lanes (sometimes undef due to
12588/// masking out later).
12589///
12590/// The reason we have dedicated lowering for zext-style shuffles is that they
12591/// are both incredibly common and often quite performance sensitive.
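/// For example (illustrative v4i32 mask): <0, 1, zz, zz> does not match any
/// of the strided extension patterns, but the MOVQ fallback below recognizes
/// it and emits X86ISD::VZEXT_MOVL on v2i64, copying the low 64 bits and
/// zeroing the upper 64 bits.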
12592 static SDValue lowerShuffleAsZeroOrAnyExtend(
12593 const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
12594 const APInt &Zeroable, const X86Subtarget &Subtarget,
12595 SelectionDAG &DAG) {
12596 int Bits = VT.getSizeInBits();
12597 int NumLanes = Bits / 128;
12598 int NumElements = VT.getVectorNumElements();
12599 int NumEltsPerLane = NumElements / NumLanes;
12600 assert(VT.getScalarSizeInBits() <= 32 &&
12601 "Exceeds 32-bit integer zero extension limit");
12602 assert((int)Mask.size() == NumElements && "Unexpected shuffle mask size");
12603
12604 // Define a helper function to check a particular ext-scale and lower to it if
12605 // valid.
12606 auto Lower = [&](int Scale) -> SDValue {
12607 SDValue InputV;
12608 bool AnyExt = true;
12609 int Offset = 0;
12610 int Matches = 0;
12611 for (int i = 0; i < NumElements; ++i) {
12612 int M = Mask[i];
12613 if (M < 0)
12614 continue; // Valid anywhere but doesn't tell us anything.
12615 if (i % Scale != 0) {
12616 // Each of the extended elements needs to be zeroable.
12617 if (!Zeroable[i])
12618 return SDValue();
12619
12620 // We no longer are in the anyext case.
12621 AnyExt = false;
12622 continue;
12623 }
12624
12625 // The base elements need to be consecutive indices into the
12626 // same input vector.
12627 SDValue V = M < NumElements ? V1 : V2;
12628 M = M % NumElements;
12629 if (!InputV) {
12630 InputV = V;
12631 Offset = M - (i / Scale);
12632 } else if (InputV != V)
12633 return SDValue(); // Flip-flopping inputs.
12634
12635 // Offset must start in the lowest 128-bit lane or at the start of an
12636 // upper lane.
12637 // FIXME: Is it ever worth allowing a negative base offset?
12638 if (!((0 <= Offset && Offset < NumEltsPerLane) ||
12639 (Offset % NumEltsPerLane) == 0))
12640 return SDValue();
12641
12642 // If we are offsetting, all referenced entries must come from the same
12643 // lane.
12644 if (Offset && (Offset / NumEltsPerLane) != (M / NumEltsPerLane))
12645 return SDValue();
12646
12647 if ((M % NumElements) != (Offset + (i / Scale)))
12648 return SDValue(); // Non-consecutive strided elements.
12649 Matches++;
12650 }
12651
12652 // If we fail to find an input, we have a zero-shuffle which should always
12653 // have already been handled.
12654 // FIXME: Maybe handle this here in case during blending we end up with one?
12655 if (!InputV)
12656 return SDValue();
12657
12658 // If we are offsetting, don't extend if we only match a single input, we
12659 // can always do better by using a basic PSHUF or PUNPCK.
12660 if (Offset != 0 && Matches < 2)
12661 return SDValue();
12662
12663 unsigned ExtOpc = AnyExt ? ISD::ANY_EXTEND : ISD::ZERO_EXTEND;
12664 return lowerShuffleAsSpecificExtension(DL, VT, Scale, Offset, ExtOpc,
12665 InputV, Mask, Subtarget, DAG);
12666 };
12667
12668 // The widest scale possible for extending is to a 64-bit integer.
12669 assert(Bits % 64 == 0 &&
12670 "The number of bits in a vector must be divisible by 64 on x86!");
12671 int NumExtElements = Bits / 64;
12672
12673 // Each iteration, try extending the elements half as much, but into twice as
12674 // many elements.
12675 for (; NumExtElements < NumElements; NumExtElements *= 2) {
12676 assert(NumElements % NumExtElements == 0 &&
12677 "The input vector size must be divisible by the extended size.");
12678 if (SDValue V = Lower(NumElements / NumExtElements))
12679 return V;
12680 }
12681
12682 // General extends failed, but 128-bit vectors may be able to use MOVQ.
12683 if (Bits != 128)
12684 return SDValue();
12685
12686 // Returns one of the source operands if the shuffle can be reduced to a
12687 // MOVQ, copying the lower 64-bits and zero-extending to the upper 64-bits.
12688 auto CanZExtLowHalf = [&]() {
12689 for (int i = NumElements / 2; i != NumElements; ++i)
12690 if (!Zeroable[i])
12691 return SDValue();
12692 if (isSequentialOrUndefInRange(Mask, 0, NumElements / 2, 0))
12693 return V1;
12694 if (isSequentialOrUndefInRange(Mask, 0, NumElements / 2, NumElements))
12695 return V2;
12696 return SDValue();
12697 };
12698
12699 if (SDValue V = CanZExtLowHalf()) {
12700 V = DAG.getBitcast(MVT::v2i64, V);
12701 V = DAG.getNode(X86ISD::VZEXT_MOVL, DL, MVT::v2i64, V);
12702 return DAG.getBitcast(VT, V);
12703 }
12704
12705 // No viable ext lowering found.
12706 return SDValue();
12707}
12708
12709/// Try to get a scalar value for a specific element of a vector.
12710///
12711/// Looks through BUILD_VECTOR and SCALAR_TO_VECTOR nodes to find a scalar.
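/// For example, if \p V is (v4i32 build_vector a, b, c, d), then asking for
/// element 2 returns the scalar c (bitcast to the element type when the sizes
/// match).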
12712 static SDValue getScalarValueForVectorElement(SDValue V, int Idx,
12713 SelectionDAG &DAG) {
12714 MVT VT = V.getSimpleValueType();
12715 MVT EltVT = VT.getVectorElementType();
12716 V = peekThroughBitcasts(V);
12717
12718 // If the bitcasts shift the element size, we can't extract an equivalent
12719 // element from it.
12720 MVT NewVT = V.getSimpleValueType();
12721 if (!NewVT.isVector() || NewVT.getScalarSizeInBits() != VT.getScalarSizeInBits())
12722 return SDValue();
12723
12724 if (V.getOpcode() == ISD::BUILD_VECTOR ||
12725 (Idx == 0 && V.getOpcode() == ISD::SCALAR_TO_VECTOR)) {
12726 // Ensure the scalar operand is the same size as the destination.
12727 // FIXME: Add support for scalar truncation where possible.
12728 SDValue S = V.getOperand(Idx);
12729 if (EltVT.getSizeInBits() == S.getSimpleValueType().getSizeInBits())
12730 return DAG.getBitcast(EltVT, S);
12731 }
12732
12733 return SDValue();
12734}
12735
12736/// Helper to test for a load that can be folded with x86 shuffles.
12737///
12738/// This is particularly important because the set of instructions varies
12739/// significantly based on whether the operand is a load or not.
12740 static bool isShuffleFoldableLoad(SDValue V) {
12741 return V.hasOneUse() &&
12742 ISD::isNON_EXTLoad(peekThroughOneUseBitcasts(V).getNode());
12743 }
12744
12745template<typename T>
12746static bool isSoftF16(T VT, const X86Subtarget &Subtarget) {
12747 T EltVT = VT.getScalarType();
12748 return (EltVT == MVT::bf16 && !Subtarget.hasAVX10_2()) ||
12749 (EltVT == MVT::f16 && !Subtarget.hasFP16());
12750}
12751
12752/// Try to lower insertion of a single element into a zero vector.
12753///
12754 /// This is a common pattern for which we have especially efficient lowerings
12755 /// across all subtarget feature sets.
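/// For example (illustrative v4f32 mask): <4, 1, 2, 3> inserts V2's low
/// element into lane 0 of V1 and, for 128-bit floating point vectors, is
/// emitted as a single X86ISD::MOVSS.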
12756 static SDValue lowerShuffleAsElementInsertion(
12757 const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
12758 const APInt &Zeroable, const X86Subtarget &Subtarget,
12759 SelectionDAG &DAG) {
12760 MVT ExtVT = VT;
12761 MVT EltVT = VT.getVectorElementType();
12762 unsigned NumElts = VT.getVectorNumElements();
12763 unsigned EltBits = VT.getScalarSizeInBits();
12764
12765 if (isSoftF16(EltVT, Subtarget))
12766 return SDValue();
12767
12768 int V2Index =
12769 find_if(Mask, [&Mask](int M) { return M >= (int)Mask.size(); }) -
12770 Mask.begin();
12771 bool IsV1Constant = getTargetConstantFromNode(V1) != nullptr;
12772 bool IsV1Zeroable = true;
12773 for (int i = 0, Size = Mask.size(); i < Size; ++i)
12774 if (i != V2Index && !Zeroable[i]) {
12775 IsV1Zeroable = false;
12776 break;
12777 }
12778
12779 // Bail if a non-zero V1 isn't used in place.
12780 if (!IsV1Zeroable) {
12781 SmallVector<int, 8> V1Mask(Mask);
12782 V1Mask[V2Index] = -1;
12783 if (!isNoopShuffleMask(V1Mask))
12784 return SDValue();
12785 }
12786
12787 // Check for a single input from a SCALAR_TO_VECTOR node.
12788 // FIXME: All of this should be canonicalized into INSERT_VECTOR_ELT and
12789 // all the smarts here sunk into that routine. However, the current
12790 // lowering of BUILD_VECTOR makes that nearly impossible until the old
12791 // vector shuffle lowering is dead.
12792 SDValue V2S = getScalarValueForVectorElement(V2, Mask[V2Index] - Mask.size(),
12793 DAG);
12794 if (V2S && DAG.getTargetLoweringInfo().isTypeLegal(V2S.getValueType())) {
12795 // We need to zext the scalar if it is smaller than an i32.
12796 V2S = DAG.getBitcast(EltVT, V2S);
12797 if (EltVT == MVT::i8 || (EltVT == MVT::i16 && !Subtarget.hasFP16())) {
12798 // Using zext to expand a narrow element won't work for non-zero
12799 // insertions. But we can use a masked constant vector if we're
12800 // inserting V2 into the bottom of V1.
12801 if (!IsV1Zeroable && !(IsV1Constant && V2Index == 0))
12802 return SDValue();
12803
12804 // Zero-extend directly to i32.
12805 ExtVT = MVT::getVectorVT(MVT::i32, ExtVT.getSizeInBits() / 32);
12806 V2S = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i32, V2S);
12807
12808 // If we're inserting into a constant, mask off the inserted index
12809 // and OR with the zero-extended scalar.
12810 if (!IsV1Zeroable) {
12811 SmallVector<APInt> Bits(NumElts, APInt::getAllOnes(EltBits));
12812 Bits[V2Index] = APInt::getZero(EltBits);
12813 SDValue BitMask = getConstVector(Bits, VT, DAG, DL);
12814 V1 = DAG.getNode(ISD::AND, DL, VT, V1, BitMask);
12815 V2 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, ExtVT, V2S);
12816 V2 = DAG.getBitcast(VT, DAG.getNode(X86ISD::VZEXT_MOVL, DL, ExtVT, V2));
12817 return DAG.getNode(ISD::OR, DL, VT, V1, V2);
12818 }
12819 }
12820 V2 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, ExtVT, V2S);
12821 } else if (Mask[V2Index] != (int)Mask.size() || EltVT == MVT::i8 ||
12822 (EltVT == MVT::i16 && !Subtarget.hasAVX10_2())) {
12823 // Either not inserting from the low element of the input or the input
12824 // element size is too small to use VZEXT_MOVL to clear the high bits.
12825 return SDValue();
12826 }
12827
12828 if (!IsV1Zeroable) {
12829 // If V1 can't be treated as a zero vector we have fewer options to lower
12830 // this. We can't support integer vectors or non-zero targets cheaply.
12831 assert(VT == ExtVT && "Cannot change extended type when non-zeroable!");
12832 if (!VT.isFloatingPoint() || V2Index != 0)
12833 return SDValue();
12834 if (!VT.is128BitVector())
12835 return SDValue();
12836
12837 // Otherwise, use MOVSD, MOVSS or MOVSH.
12838 unsigned MovOpc = 0;
12839 if (EltVT == MVT::f16)
12840 MovOpc = X86ISD::MOVSH;
12841 else if (EltVT == MVT::f32)
12842 MovOpc = X86ISD::MOVSS;
12843 else if (EltVT == MVT::f64)
12844 MovOpc = X86ISD::MOVSD;
12845 else
12846 llvm_unreachable("Unsupported floating point element type to handle!");
12847 return DAG.getNode(MovOpc, DL, ExtVT, V1, V2);
12848 }
12849
12850 // This lowering only works for the low element with floating point vectors.
12851 if (VT.isFloatingPoint() && V2Index != 0)
12852 return SDValue();
12853
12854 V2 = DAG.getNode(X86ISD::VZEXT_MOVL, DL, ExtVT, V2);
12855 if (ExtVT != VT)
12856 V2 = DAG.getBitcast(VT, V2);
12857
12858 if (V2Index != 0) {
12859 // If we have 4 or fewer lanes we can cheaply shuffle the element into
12860 // the desired position. Otherwise it is more efficient to do a vector
12861 // shift left. We know that we can do a vector shift left because all
12862 // the inputs are zero.
12863 if (VT.isFloatingPoint() || NumElts <= 4) {
12864 SmallVector<int, 4> V2Shuffle(Mask.size(), 1);
12865 V2Shuffle[V2Index] = 0;
12866 V2 = DAG.getVectorShuffle(VT, DL, V2, DAG.getUNDEF(VT), V2Shuffle);
12867 } else {
12868 V2 = DAG.getBitcast(MVT::v16i8, V2);
12869 V2 = DAG.getNode(
12870 X86ISD::VSHLDQ, DL, MVT::v16i8, V2,
12871 DAG.getTargetConstant(V2Index * EltBits / 8, DL, MVT::i8));
12872 V2 = DAG.getBitcast(VT, V2);
12873 }
12874 }
12875 return V2;
12876}
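// Illustrative standalone sketch (an assumed example, not part of this file): a
// scalar-domain analogue of the "mask off the inserted lane and OR in the
// zero-extended scalar" path above, for a hypothetical 4 x 8-bit vector packed
// into one 32-bit word (assumes a 32-bit unsigned and Idx < 4; the helper name
// is illustrative only).
static unsigned exampleMaskedByteInsert(unsigned V1, unsigned char Scalar,
                                        unsigned Idx) {
  // Clear the destination lane (the AND with the all-ones-except-lane constant
  // above), then OR in the zero-extended scalar shifted into position (the
  // VZEXT_MOVL plus OR above).
  const unsigned LaneMask = 0xFFu << (8 * Idx);
  return (V1 & ~LaneMask) | (unsigned(Scalar) << (8 * Idx));
}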
12877
12878/// Try to lower broadcast of a single - truncated - integer element,
12879/// coming from a scalar_to_vector/build_vector node \p V0 with larger elements.
12880///
12881/// This assumes we have AVX2.
12882 static SDValue lowerShuffleAsTruncBroadcast(const SDLoc &DL, MVT VT, SDValue V0,
12883 int BroadcastIdx,
12884 const X86Subtarget &Subtarget,
12885 SelectionDAG &DAG) {
12886 assert(Subtarget.hasAVX2() &&
12887 "We can only lower integer broadcasts with AVX2!");
12888
12889 MVT EltVT = VT.getVectorElementType();
12890 MVT V0VT = V0.getSimpleValueType();
12891
12892 assert(VT.isInteger() && "Unexpected non-integer trunc broadcast!");
12893 assert(V0VT.isVector() && "Unexpected non-vector vector-sized value!");
12894
12895 MVT V0EltVT = V0VT.getVectorElementType();
12896 if (!V0EltVT.isInteger())
12897 return SDValue();
12898
12899 const unsigned EltSize = EltVT.getSizeInBits();
12900 const unsigned V0EltSize = V0EltVT.getSizeInBits();
12901
12902 // This is only a truncation if the original element type is larger.
12903 if (V0EltSize <= EltSize)
12904 return SDValue();
12905
12906 assert(((V0EltSize % EltSize) == 0) &&
12907 "Scalar type sizes must all be powers of 2 on x86!");
12908
12909 const unsigned V0Opc = V0.getOpcode();
12910 const unsigned Scale = V0EltSize / EltSize;
12911 const unsigned V0BroadcastIdx = BroadcastIdx / Scale;
12912
12913 if ((V0Opc != ISD::SCALAR_TO_VECTOR || V0BroadcastIdx != 0) &&
12914 V0Opc != ISD::BUILD_VECTOR)
12915 return SDValue();
12916
12917 SDValue Scalar = V0.getOperand(V0BroadcastIdx);
12918
12919 // If we're extracting non-least-significant bits, shift so we can truncate.
12920 // Hopefully, we can fold away the trunc/srl/load into the broadcast.
12921 // Even if we can't (and !isShuffleFoldableLoad(Scalar)), prefer
12922 // vpbroadcast+vmovd+shr to vpshufb(m)+vmovd.
12923 if (const int OffsetIdx = BroadcastIdx % Scale)
12924 Scalar = DAG.getNode(ISD::SRL, DL, Scalar.getValueType(), Scalar,
12925 DAG.getConstant(OffsetIdx * EltSize, DL, MVT::i8));
12926
12927 return DAG.getNode(X86ISD::VBROADCAST, DL, VT,
12928 DAG.getNode(ISD::TRUNCATE, DL, EltVT, Scalar));
12929}
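// Illustrative standalone sketch (an assumed example, not from this file) of
// the shift-then-truncate step above: broadcasting a 16-bit element out of
// 64-bit source scalars means shifting right by (BroadcastIdx % Scale) * 16
// bits and truncating, mirroring the SRL + TRUNCATE nodes built above
// (assumes 16-bit unsigned short and 64-bit unsigned long long).
static unsigned short exampleTruncBroadcastScalar(unsigned long long WideScalar,
                                                  unsigned BroadcastIdx) {
  const unsigned Scale = 64 / 16;                  // V0EltSize / EltSize
  const unsigned OffsetIdx = BroadcastIdx % Scale; // sub-element being kept
  return (unsigned short)(WideScalar >> (OffsetIdx * 16)); // SRL then TRUNCATE
}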
12930
12931/// Test whether this can be lowered with a single SHUFPS instruction.
12932///
12933/// This is used to disable more specialized lowerings when the shufps lowering
12934/// will happen to be efficient.
12935 static bool isSingleSHUFPSMask(ArrayRef<int> Mask) {
12936 // This routine only handles 128-bit shufps.
12937 assert(Mask.size() == 4 && "Unsupported mask size!");
12938 assert(Mask[0] >= -1 && Mask[0] < 8 && "Out of bound mask element!");
12939 assert(Mask[1] >= -1 && Mask[1] < 8 && "Out of bound mask element!");
12940 assert(Mask[2] >= -1 && Mask[2] < 8 && "Out of bound mask element!");
12941 assert(Mask[3] >= -1 && Mask[3] < 8 && "Out of bound mask element!");
12942
12943 // To lower with a single SHUFPS we need to have the low half and high half
12944 // each requiring a single input.
12945 if (Mask[0] >= 0 && Mask[1] >= 0 && (Mask[0] < 4) != (Mask[1] < 4))
12946 return false;
12947 if (Mask[2] >= 0 && Mask[3] >= 0 && (Mask[2] < 4) != (Mask[3] < 4))
12948 return false;
12949
12950 return true;
12951}
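// Illustrative standalone restatement (an assumed example, not from this file)
// of the check above: SHUFPS takes its two low result lanes from one source and
// its two high result lanes from the other, so each half of the mask may only
// reference a single input. {0, 1, 4, 5} and {4, 7, 0, 3} pass; {0, 4, 1, 5}
// does not.
static bool exampleIsSingleSHUFPSMask(const int Mask[4]) {
  auto SameInput = [](int A, int B) {
    return A < 0 || B < 0 || (A < 4) == (B < 4); // undef (-1) matches anything
  };
  return SameInput(Mask[0], Mask[1]) && SameInput(Mask[2], Mask[3]);
}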
12952
12953/// Test whether the specified input (0 or 1) is in-place blended by the
12954/// given mask.
12955///
12956/// This returns true if the elements from a particular input are already in the
12957/// slot required by the given mask and require no permutation.
12958 static bool isShuffleMaskInputInPlace(int Input, ArrayRef<int> Mask) {
12959 assert((Input == 0 || Input == 1) && "Only two inputs to shuffles.");
12960 int Size = Mask.size();
12961 for (int i = 0; i < Size; ++i)
12962 if (Mask[i] >= 0 && Mask[i] / Size == Input && Mask[i] % Size != i)
12963 return false;
12964
12965 return true;
12966}
12967
12968/// Test whether the specified input (0 or 1) is a broadcast/splat blended by
12969/// the given mask.
12970///
12971 static bool isShuffleMaskInputBroadcastable(int Input, ArrayRef<int> Mask,
12972 int BroadcastableElement = 0) {
12973 assert((Input == 0 || Input == 1) && "Only two inputs to shuffles.");
12974 int Size = Mask.size();
12975 for (int i = 0; i < Size; ++i)
12976 if (Mask[i] >= 0 && Mask[i] / Size == Input &&
12977 Mask[i] % Size != BroadcastableElement)
12978 return false;
12979 return true;
12980}
12981
12982/// If we are extracting two 128-bit halves of a vector and shuffling the
12983/// result, match that to a 256-bit AVX2 vperm* instruction to avoid a
12984/// multi-shuffle lowering.
12985 static SDValue lowerShuffleOfExtractsAsVperm(const SDLoc &DL, SDValue N0,
12986 SDValue N1, ArrayRef<int> Mask,
12987 SelectionDAG &DAG) {
12988 MVT VT = N0.getSimpleValueType();
12989 assert((VT.is128BitVector() &&
12990 (VT.getScalarSizeInBits() == 32 || VT.getScalarSizeInBits() == 64)) &&
12991 "VPERM* family of shuffles requires 32-bit or 64-bit elements");
12992
12993 // Check that both sources are extracts of the same source vector.
12994 if (N0.getOpcode() != ISD::EXTRACT_SUBVECTOR ||
12995 N1.getOpcode() != ISD::EXTRACT_SUBVECTOR ||
12996 N0.getOperand(0) != N1.getOperand(0) ||
12997 !N0.hasOneUse() || !N1.hasOneUse())
12998 return SDValue();
12999
13000 SDValue WideVec = N0.getOperand(0);
13001 MVT WideVT = WideVec.getSimpleValueType();
13002 if (!WideVT.is256BitVector())
13003 return SDValue();
13004
13005 // Match extracts of each half of the wide source vector. Commute the shuffle
13006 // if the extract of the low half is N1.
13007 unsigned NumElts = VT.getVectorNumElements();
13008 SmallVector<int, 4> NewMask(Mask);
13009 const APInt &ExtIndex0 = N0.getConstantOperandAPInt(1);
13010 const APInt &ExtIndex1 = N1.getConstantOperandAPInt(1);
13011 if (ExtIndex1 == 0 && ExtIndex0 == NumElts)
13012 ShuffleVectorSDNode::commuteMask(NewMask);
13013 else if (ExtIndex0 != 0 || ExtIndex1 != NumElts)
13014 return SDValue();
13015
13016 // Final bailout: if the mask is simple, we are better off using an extract
13017 // and a simple narrow shuffle. Prefer extract+unpack(h/l)ps to vpermps
13018 // because that avoids a constant load from memory.
13019 if (NumElts == 4 &&
13020 (isSingleSHUFPSMask(NewMask) || is128BitUnpackShuffleMask(NewMask, DAG)))
13021 return SDValue();
13022
13023 // Extend the shuffle mask with undef elements.
13024 NewMask.append(NumElts, -1);
13025
13026 // shuf (extract X, 0), (extract X, 4), M --> extract (shuf X, undef, M'), 0
13027 SDValue Shuf = DAG.getVectorShuffle(WideVT, DL, WideVec, DAG.getUNDEF(WideVT),
13028 NewMask);
13029 // This is free: ymm -> xmm.
13030 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Shuf,
13031 DAG.getVectorIdxConstant(0, DL));
13032}
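// Illustrative standalone sketch (an assumed example, not from this file) of
// the mask rewrite above: a shuffle of extract(X, 0) and extract(X, NumElts)
// with a narrow mask already indexes all 2*NumElts lanes of the wide vector,
// so the wide shuffle simply appends undef lanes. E.g. for v4f32 halves of a
// v8f32, the mask {0, 5, 2, 7} becomes {0, 5, 2, 7, -1, -1, -1, -1}.
static void exampleWidenExtractShuffleMask(const int Mask[4], int WideMask[8]) {
  for (int i = 0; i != 8; ++i)
    WideMask[i] = i < 4 ? Mask[i] : -1; // keep the narrow mask, pad with undef
}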
13033
13034/// Try to lower broadcast of a single element.
13035///
13036/// For convenience, this code also bundles all of the subtarget feature set
13037/// filtering. While a little annoying to re-dispatch on type here, there isn't
13038/// a convenient way to factor it out.
13039 static SDValue lowerShuffleAsBroadcast(const SDLoc &DL, MVT VT, SDValue V1,
13040 SDValue V2, ArrayRef<int> Mask,
13041 const X86Subtarget &Subtarget,
13042 SelectionDAG &DAG) {
13043 MVT EltVT = VT.getVectorElementType();
13044 if (!((Subtarget.hasSSE3() && VT == MVT::v2f64) ||
13045 (Subtarget.hasAVX() && (EltVT == MVT::f64 || EltVT == MVT::f32)) ||
13046 (Subtarget.hasAVX2() && (VT.isInteger() || EltVT == MVT::f16))))
13047 return SDValue();
13048
13049 // With MOVDDUP (v2f64) we can broadcast from a register or a load, otherwise
13050 // we can only broadcast from a register with AVX2.
13051 unsigned NumEltBits = VT.getScalarSizeInBits();
13052 unsigned Opcode = (VT == MVT::v2f64 && !Subtarget.hasAVX2())
13053 ? X86ISD::MOVDDUP
13054 : X86ISD::VBROADCAST;
13055 bool BroadcastFromReg = (Opcode == X86ISD::MOVDDUP) || Subtarget.hasAVX2();
13056
13057 // Check that the mask is a broadcast.
13058 int BroadcastIdx = getSplatIndex(Mask);
13059 if (BroadcastIdx < 0) {
13060 // Check for hidden broadcast.
13061 SmallVector<int, 16> BroadcastMask(VT.getVectorNumElements(), 0);
13062 if (!isShuffleEquivalent(Mask, BroadcastMask, V1, V2))
13063 return SDValue();
13064 BroadcastIdx = 0;
13065 }
13066 assert(BroadcastIdx < (int)Mask.size() && "We only expect to be called with "
13067 "a sorted mask where the broadcast "
13068 "comes from V1.");
13069 int NumActiveElts = count_if(Mask, [](int M) { return M >= 0; });
13070
13071 // Go up the chain of (vector) values to find a scalar load that we can
13072 // combine with the broadcast.
13073 // TODO: Combine this logic with findEltLoadSrc() used by
13074 // EltsFromConsecutiveLoads().
13075 int BitOffset = BroadcastIdx * NumEltBits;
13076 SDValue V = V1;
13077 for (;;) {
13078 switch (V.getOpcode()) {
13079 case ISD::BITCAST: {
13080 V = V.getOperand(0);
13081 continue;
13082 }
13083 case ISD::CONCAT_VECTORS: {
13084 int OpBitWidth = V.getOperand(0).getValueSizeInBits();
13085 int OpIdx = BitOffset / OpBitWidth;
13086 V = V.getOperand(OpIdx);
13087 BitOffset %= OpBitWidth;
13088 continue;
13089 }
13090 case ISD::EXTRACT_SUBVECTOR: {
13091 // The extraction index adds to the existing offset.
13092 unsigned EltBitWidth = V.getScalarValueSizeInBits();
13093 unsigned Idx = V.getConstantOperandVal(1);
13094 unsigned BeginOffset = Idx * EltBitWidth;
13095 BitOffset += BeginOffset;
13096 V = V.getOperand(0);
13097 continue;
13098 }
13099 case ISD::INSERT_SUBVECTOR: {
13100 SDValue VOuter = V.getOperand(0), VInner = V.getOperand(1);
13101 int EltBitWidth = VOuter.getScalarValueSizeInBits();
13102 int Idx = (int)V.getConstantOperandVal(2);
13103 int NumSubElts = (int)VInner.getSimpleValueType().getVectorNumElements();
13104 int BeginOffset = Idx * EltBitWidth;
13105 int EndOffset = BeginOffset + NumSubElts * EltBitWidth;
13106 if (BeginOffset <= BitOffset && BitOffset < EndOffset) {
13107 BitOffset -= BeginOffset;
13108 V = VInner;
13109 } else {
13110 V = VOuter;
13111 }
13112 continue;
13113 }
13114 }
13115 break;
13116 }
13117 assert((BitOffset % NumEltBits) == 0 && "Illegal bit-offset");
13118 BroadcastIdx = BitOffset / NumEltBits;
13119
13120 // Do we need to bitcast the source to retrieve the original broadcast index?
13121 bool BitCastSrc = V.getScalarValueSizeInBits() != NumEltBits;
13122
13123 // Check if this is a broadcast of a scalar. We special case lowering
13124 // for scalars so that we can more effectively fold with loads.
13125 // If the original value has a larger element type than the shuffle, the
13126 // broadcast element is in essence truncated. Make that explicit to ease
13127 // folding.
13128 if (BitCastSrc && VT.isInteger())
13129 if (SDValue TruncBroadcast = lowerShuffleAsTruncBroadcast(
13130 DL, VT, V, BroadcastIdx, Subtarget, DAG))
13131 return TruncBroadcast;
13132
13133 // Also check the simpler case, where we can directly reuse the scalar.
13134 if (!BitCastSrc &&
13135 ((V.getOpcode() == ISD::BUILD_VECTOR && V.hasOneUse()) ||
13136 (V.getOpcode() == ISD::SCALAR_TO_VECTOR && BroadcastIdx == 0))) {
13137 V = V.getOperand(BroadcastIdx);
13138
13139 // If we can't broadcast from a register, check that the input is a load.
13140 if (!BroadcastFromReg && !isShuffleFoldableLoad(V))
13141 return SDValue();
13142 } else if (ISD::isNormalLoad(V.getNode()) &&
13143 cast<LoadSDNode>(V)->isSimple()) {
13144 // We do not check for one-use of the vector load because a broadcast load
13145 // is expected to be a win for code size, register pressure, and possibly
13146 // uops even if the original vector load is not eliminated.
13147
13148 // Reduce the vector load and shuffle to a broadcasted scalar load.
13149 auto *Ld = cast<LoadSDNode>(V);
13150 SDValue BaseAddr = Ld->getBasePtr();
13151 MVT SVT = VT.getScalarType();
13152 unsigned Offset = BroadcastIdx * SVT.getStoreSize();
13153 assert((int)(Offset * 8) == BitOffset && "Unexpected bit-offset");
13154 SDValue NewAddr =
13155 DAG.getMemBasePlusOffset(BaseAddr, TypeSize::getFixed(Offset), DL);
13156
13157 // Directly form VBROADCAST_LOAD if we're using VBROADCAST opcode rather
13158 // than MOVDDUP.
13159 // FIXME: Should we add VBROADCAST_LOAD isel patterns for pre-AVX?
13160 if (Opcode == X86ISD::VBROADCAST) {
13161 SDVTList Tys = DAG.getVTList(VT, MVT::Other);
13162 SDValue Ops[] = {Ld->getChain(), NewAddr};
13163 V = DAG.getMemIntrinsicNode(
13164 X86ISD::VBROADCAST_LOAD, DL, Tys, Ops, SVT,
13165 DAG.getMachineFunction().getMachineMemOperand(
13166 Ld->getMemOperand(), Offset, SVT.getStoreSize()));
13167 DAG.makeEquivalentMemoryOrdering(Ld, V);
13168 return DAG.getBitcast(VT, V);
13169 }
13170 assert(SVT == MVT::f64 && "Unexpected VT!");
13171 V = DAG.getLoad(SVT, DL, Ld->getChain(), NewAddr,
13172 DAG.getMachineFunction().getMachineMemOperand(
13173 Ld->getMemOperand(), Offset, SVT.getStoreSize()));
13174 DAG.makeEquivalentMemoryOrdering(Ld, V);
13175 } else if (!BroadcastFromReg) {
13176 // We can't broadcast from a vector register.
13177 return SDValue();
13178 } else if (BitOffset != 0) {
13179 // We can only broadcast from the zero-element of a vector register,
13180 // but it can be advantageous to broadcast from the zero-element of a
13181 // subvector.
13182 if (!VT.is256BitVector() && !VT.is512BitVector())
13183 return SDValue();
13184
13185 // VPERMQ/VPERMPD can perform the cross-lane shuffle directly.
13186 if (VT == MVT::v4f64 || VT == MVT::v4i64)
13187 return SDValue();
13188
13189 // If we are broadcasting an element from the lowest 128-bit subvector, try
13190 // to move the element in position.
13191 if (BitOffset < 128 && NumActiveElts > 1 &&
13192 V.getScalarValueSizeInBits() == NumEltBits) {
13193 assert((BitOffset % V.getScalarValueSizeInBits()) == 0 &&
13194 "Unexpected bit-offset");
13195 SmallVector<int, 16> ExtractMask(128 / NumEltBits, SM_SentinelUndef);
13196 ExtractMask[0] = BitOffset / V.getScalarValueSizeInBits();
13197 V = extractSubVector(V, 0, DAG, DL, 128);
13198 V = DAG.getVectorShuffle(V.getValueType(), DL, V, V, ExtractMask);
13199 } else {
13200 // Only broadcast the zero-element of a 128-bit subvector.
13201 if ((BitOffset % 128) != 0)
13202 return SDValue();
13203
13204 assert((BitOffset % V.getScalarValueSizeInBits()) == 0 &&
13205 "Unexpected bit-offset");
13206 assert((V.getValueSizeInBits() == 256 || V.getValueSizeInBits() == 512) &&
13207 "Unexpected vector size");
13208 unsigned ExtractIdx = BitOffset / V.getScalarValueSizeInBits();
13209 V = extract128BitVector(V, ExtractIdx, DAG, DL);
13210 }
13211 }
13212
13213 // On AVX we can use VBROADCAST directly for scalar sources.
13214 if (Opcode == X86ISD::MOVDDUP && !V.getValueType().isVector()) {
13215 V = DAG.getBitcast(MVT::f64, V);
13216 if (Subtarget.hasAVX()) {
13217 V = DAG.getNode(X86ISD::VBROADCAST, DL, MVT::v2f64, V);
13218 return DAG.getBitcast(VT, V);
13219 }
13220 V = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v2f64, V);
13221 }
13222
13223 // If this is a scalar, do the broadcast on this type and bitcast.
13224 if (!V.getValueType().isVector()) {
13225 assert(V.getScalarValueSizeInBits() == NumEltBits &&
13226 "Unexpected scalar size");
13227 MVT BroadcastVT = MVT::getVectorVT(V.getSimpleValueType(),
13228 VT.getVectorNumElements());
13229 return DAG.getBitcast(VT, DAG.getNode(Opcode, DL, BroadcastVT, V));
13230 }
13231
13232 // We only support broadcasting from 128-bit vectors to minimize the
13233 // number of patterns we need to deal with in isel. So extract down to
13234 // 128-bits, removing as many bitcasts as possible.
13235 if (V.getValueSizeInBits() > 128)
13237
13238 // Otherwise cast V to a vector with the same element type as VT, but
13239 // possibly narrower than VT. Then perform the broadcast.
13240 unsigned NumSrcElts = V.getValueSizeInBits() / NumEltBits;
13241 MVT CastVT = MVT::getVectorVT(VT.getVectorElementType(), NumSrcElts);
13242 return DAG.getNode(Opcode, DL, VT, DAG.getBitcast(CastVT, V));
13243}
13244
13245// Check for whether we can use INSERTPS to perform the shuffle. We only use
13246// INSERTPS when the V1 elements are already in the correct locations
13247// because otherwise we can just always use two SHUFPS instructions which
13248// are much smaller to encode than a SHUFPS and an INSERTPS. We can also
13249// perform INSERTPS if a single V1 element is out of place and all V2
13250// elements are zeroable.
13251 static bool matchShuffleAsInsertPS(SDValue &V1, SDValue &V2,
13252 unsigned &InsertPSMask,
13253 const APInt &Zeroable,
13254 ArrayRef<int> Mask, SelectionDAG &DAG) {
13255 assert(V1.getSimpleValueType().is128BitVector() && "Bad operand type!");
13256 assert(V2.getSimpleValueType().is128BitVector() && "Bad operand type!");
13257 assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!");
13258
13259 // Attempt to match INSERTPS with one element from VA or VB being
13260 // inserted into VA (or undef). If successful, V1, V2 and InsertPSMask
13261 // are updated.
13262 auto matchAsInsertPS = [&](SDValue VA, SDValue VB,
13263 ArrayRef<int> CandidateMask) {
13264 unsigned ZMask = 0;
13265 int VADstIndex = -1;
13266 int VBDstIndex = -1;
13267 bool VAUsedInPlace = false;
13268
13269 for (int i = 0; i < 4; ++i) {
13270 // Synthesize a zero mask from the zeroable elements (includes undefs).
13271 if (Zeroable[i]) {
13272 ZMask |= 1 << i;
13273 continue;
13274 }
13275
13276 // Flag if we use any VA inputs in place.
13277 if (i == CandidateMask[i]) {
13278 VAUsedInPlace = true;
13279 continue;
13280 }
13281
13282 // We can only insert a single non-zeroable element.
13283 if (VADstIndex >= 0 || VBDstIndex >= 0)
13284 return false;
13285
13286 if (CandidateMask[i] < 4) {
13287 // VA input out of place for insertion.
13288 VADstIndex = i;
13289 } else {
13290 // VB input for insertion.
13291 VBDstIndex = i;
13292 }
13293 }
13294
13295 // Don't bother if we have no (non-zeroable) element for insertion.
13296 if (VADstIndex < 0 && VBDstIndex < 0)
13297 return false;
13298
13299 // Determine element insertion src/dst indices. The src index is from the
13300 // start of the inserted vector, not the start of the concatenated vector.
13301 unsigned VBSrcIndex = 0;
13302 if (VADstIndex >= 0) {
13303 // If we have a VA input out of place, we use VA as the V2 element
13304 // insertion and don't use the original V2 at all.
13305 VBSrcIndex = CandidateMask[VADstIndex];
13306 VBDstIndex = VADstIndex;
13307 VB = VA;
13308 } else {
13309 VBSrcIndex = CandidateMask[VBDstIndex] - 4;
13310 }
13311
13312 // If no V1 inputs are used in place, then the result is created only from
13313 // the zero mask and the V2 insertion - so remove V1 dependency.
13314 if (!VAUsedInPlace)
13315 VA = DAG.getUNDEF(MVT::v4f32);
13316
13317 // Update V1, V2 and InsertPSMask accordingly.
13318 V1 = VA;
13319 V2 = VB;
13320
13321 // Insert the V2 element into the desired position.
13322 InsertPSMask = VBSrcIndex << 6 | VBDstIndex << 4 | ZMask;
13323 assert((InsertPSMask & ~0xFFu) == 0 && "Invalid mask!");
13324 return true;
13325 };
13326
13327 if (matchAsInsertPS(V1, V2, Mask))
13328 return true;
13329
13330 // Commute and try again.
13331 SmallVector<int, 4> CommutedMask(Mask);
13332 ShuffleVectorSDNode::commuteMask(CommutedMask);
13333 if (matchAsInsertPS(V2, V1, CommutedMask))
13334 return true;
13335
13336 return false;
13337}
13338
13339 static SDValue lowerShuffleAsInsertPS(const SDLoc &DL, SDValue V1, SDValue V2,
13340 ArrayRef<int> Mask, const APInt &Zeroable,
13341 SelectionDAG &DAG) {
13342 assert(V1.getSimpleValueType() == MVT::v4f32 && "Bad operand type!");
13343 assert(V2.getSimpleValueType() == MVT::v4f32 && "Bad operand type!");
13344
13345 // Attempt to match the insertps pattern.
13346 unsigned InsertPSMask = 0;
13347 if (!matchShuffleAsInsertPS(V1, V2, InsertPSMask, Zeroable, Mask, DAG))
13348 return SDValue();
13349
13350 // Insert the V2 element into the desired position.
13351 return DAG.getNode(X86ISD::INSERTPS, DL, MVT::v4f32, V1, V2,
13352 DAG.getTargetConstant(InsertPSMask, DL, MVT::i8));
13353}
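// Illustrative standalone sketch (an assumed example, not from this file) of
// how the 8-bit INSERTPS immediate assembled above is laid out: bits [7:6]
// pick the source lane of V2, bits [5:4] the destination lane in V1, and bits
// [3:0] zero individual result lanes. Copying V2[2] into lane 1 while zeroing
// lane 3 encodes as (2 << 6) | (1 << 4) | 0b1000 == 0x98.
static unsigned exampleInsertPSImm(unsigned SrcIdx, unsigned DstIdx,
                                   unsigned ZMask) {
  return (SrcIdx << 6) | (DstIdx << 4) | (ZMask & 0xF);
}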
13354
13355/// Handle lowering of 2-lane 64-bit floating point shuffles.
13356///
13357/// This is the basis function for the 2-lane 64-bit shuffles as we have full
13358/// support for floating point shuffles but not integer shuffles. These
13359/// instructions will incur a domain crossing penalty on some chips though so
13360/// it is better to avoid lowering through this for integer vectors where
13361/// possible.
13362 static SDValue lowerV2F64Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
13363 const APInt &Zeroable, SDValue V1, SDValue V2,
13364 const X86Subtarget &Subtarget,
13365 SelectionDAG &DAG) {
13366 assert(V1.getSimpleValueType() == MVT::v2f64 && "Bad operand type!");
13367 assert(V2.getSimpleValueType() == MVT::v2f64 && "Bad operand type!");
13368 assert(Mask.size() == 2 && "Unexpected mask size for v2 shuffle!");
13369
13370 if (V2.isUndef()) {
13371 // Check for being able to broadcast a single element.
13372 if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v2f64, V1, V2,
13373 Mask, Subtarget, DAG))
13374 return Broadcast;
13375
13376 // Straight shuffle of a single input vector. Simulate this by using the
13377 // single input as both of the "inputs" to this instruction.
13378 unsigned SHUFPDMask = (Mask[0] == 1) | ((Mask[1] == 1) << 1);
13379
13380 if (Subtarget.hasAVX()) {
13381 // If we have AVX, we can use VPERMILPS which will allow folding a load
13382 // into the shuffle.
13383 return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v2f64, V1,
13384 DAG.getTargetConstant(SHUFPDMask, DL, MVT::i8));
13385 }
13386
13387 return DAG.getNode(
13388 X86ISD::SHUFP, DL, MVT::v2f64,
13389 Mask[0] == SM_SentinelUndef ? DAG.getUNDEF(MVT::v2f64) : V1,
13390 Mask[1] == SM_SentinelUndef ? DAG.getUNDEF(MVT::v2f64) : V1,
13391 DAG.getTargetConstant(SHUFPDMask, DL, MVT::i8));
13392 }
13393 assert(Mask[0] >= 0 && "No undef lanes in multi-input v2 shuffles!");
13394 assert(Mask[1] >= 0 && "No undef lanes in multi-input v2 shuffles!");
13395 assert(Mask[0] < 2 && "We sort V1 to be the first input.");
13396 assert(Mask[1] >= 2 && "We sort V2 to be the second input.");
13397
13398 if (Subtarget.hasAVX2())
13399 if (SDValue Extract = lowerShuffleOfExtractsAsVperm(DL, V1, V2, Mask, DAG))
13400 return Extract;
13401
13402 // When loading a scalar and then shuffling it into a vector we can often do
13403 // the insertion cheaply.
13404 if (SDValue Insertion = lowerShuffleAsElementInsertion(
13405 DL, MVT::v2f64, V1, V2, Mask, Zeroable, Subtarget, DAG))
13406 return Insertion;
13407 // Try inverting the insertion since for v2 masks it is easy to do and we
13408 // can't reliably sort the mask one way or the other.
13409 int InverseMask[2] = {Mask[0] < 0 ? -1 : (Mask[0] ^ 2),
13410 Mask[1] < 0 ? -1 : (Mask[1] ^ 2)};
13411 if (SDValue Insertion = lowerShuffleAsElementInsertion(
13412 DL, MVT::v2f64, V2, V1, InverseMask, Zeroable, Subtarget, DAG))
13413 return Insertion;
13414
13415 // Try to use one of the special instruction patterns to handle two common
13416 // blend patterns if a zero-blend above didn't work.
13417 if (isShuffleEquivalent(Mask, {0, 3}, V1, V2) ||
13418 isShuffleEquivalent(Mask, {1, 3}, V1, V2))
13419 if (SDValue V1S = getScalarValueForVectorElement(V1, Mask[0], DAG))
13420 // We can either use a special instruction to load over the low double or
13421 // to move just the low double.
13422 return DAG.getNode(
13423 X86ISD::MOVSD, DL, MVT::v2f64, V2,
13424 DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v2f64, V1S));
13425
13426 if (Subtarget.hasSSE41())
13427 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v2f64, V1, V2, Mask,
13428 Zeroable, Subtarget, DAG))
13429 return Blend;
13430
13431 // Use dedicated unpack instructions for masks that match their pattern.
13432 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v2f64, V1, V2, Mask, DAG))
13433 return V;
13434
13435 unsigned SHUFPDMask = (Mask[0] == 1) | (((Mask[1] - 2) == 1) << 1);
13436 return DAG.getNode(X86ISD::SHUFP, DL, MVT::v2f64, V1, V2,
13437 DAG.getTargetConstant(SHUFPDMask, DL, MVT::i8));
13438}
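// Illustrative standalone sketch (an assumed example, not from this file) of
// the two-input SHUFPD immediate computed at the end of the function above,
// with Mask already sorted so Mask[0] selects from V1 (0 or 1) and Mask[1]
// from V2 (2 or 3): bit 0 picks V1's element for result lane 0, bit 1 picks
// V2's element for result lane 1. The mask {1, 3} therefore encodes as 3.
static unsigned exampleSHUFPDImm(int Mask0, int Mask1) {
  return unsigned(Mask0 == 1) | (unsigned((Mask1 - 2) == 1) << 1);
}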
13439
13440/// Handle lowering of 2-lane 64-bit integer shuffles.
13441///
13442/// Tries to lower a 2-lane 64-bit shuffle using shuffle operations provided by
13443/// the integer unit to minimize domain crossing penalties. However, for blends
13444/// it falls back to the floating point shuffle operation with appropriate bit
13445/// casting.
13446 static SDValue lowerV2I64Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
13447 const APInt &Zeroable, SDValue V1, SDValue V2,
13448 const X86Subtarget &Subtarget,
13449 SelectionDAG &DAG) {
13450 assert(V1.getSimpleValueType() == MVT::v2i64 && "Bad operand type!");
13451 assert(V2.getSimpleValueType() == MVT::v2i64 && "Bad operand type!");
13452 assert(Mask.size() == 2 && "Unexpected mask size for v2 shuffle!");
13453
13454 if (V2.isUndef()) {
13455 // Check for being able to broadcast a single element.
13456 if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v2i64, V1, V2,
13457 Mask, Subtarget, DAG))
13458 return Broadcast;
13459
13460 // Straight shuffle of a single input vector. For everything from SSE2
13461 // onward this has a single fast instruction with no scary immediates.
13462 // We have to map the mask as it is actually a v4i32 shuffle instruction.
13463 V1 = DAG.getBitcast(MVT::v4i32, V1);
13464 int WidenedMask[4] = {Mask[0] < 0 ? -1 : (Mask[0] * 2),
13465 Mask[0] < 0 ? -1 : ((Mask[0] * 2) + 1),
13466 Mask[1] < 0 ? -1 : (Mask[1] * 2),
13467 Mask[1] < 0 ? -1 : ((Mask[1] * 2) + 1)};
13468 return DAG.getBitcast(
13469 MVT::v2i64,
13470 DAG.getNode(X86ISD::PSHUFD, DL, MVT::v4i32, V1,
13471 getV4X86ShuffleImm8ForMask(WidenedMask, DL, DAG)));
13472 }
13473 assert(Mask[0] != -1 && "No undef lanes in multi-input v2 shuffles!");
13474 assert(Mask[1] != -1 && "No undef lanes in multi-input v2 shuffles!");
13475 assert(Mask[0] < 2 && "We sort V1 to be the first input.");
13476 assert(Mask[1] >= 2 && "We sort V2 to be the second input.");
13477
13478 if (Subtarget.hasAVX2())
13479 if (SDValue Extract = lowerShuffleOfExtractsAsVperm(DL, V1, V2, Mask, DAG))
13480 return Extract;
13481
13482 // Try to use shift instructions.
13483 if (SDValue Shift =
13484 lowerShuffleAsShift(DL, MVT::v2i64, V1, V2, Mask, Zeroable, Subtarget,
13485 DAG, /*BitwiseOnly*/ false))
13486 return Shift;
13487
13488 // When loading a scalar and then shuffling it into a vector we can often do
13489 // the insertion cheaply.
13490 if (SDValue Insertion = lowerShuffleAsElementInsertion(
13491 DL, MVT::v2i64, V1, V2, Mask, Zeroable, Subtarget, DAG))
13492 return Insertion;
13493 // Try inverting the insertion since for v2 masks it is easy to do and we
13494 // can't reliably sort the mask one way or the other.
13495 int InverseMask[2] = {Mask[0] ^ 2, Mask[1] ^ 2};
13496 if (SDValue Insertion = lowerShuffleAsElementInsertion(
13497 DL, MVT::v2i64, V2, V1, InverseMask, Zeroable, Subtarget, DAG))
13498 return Insertion;
13499
13500 // We have different paths for blend lowering, but they all must use the
13501 // *exact* same predicate.
13502 bool IsBlendSupported = Subtarget.hasSSE41();
13503 if (IsBlendSupported)
13504 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v2i64, V1, V2, Mask,
13505 Zeroable, Subtarget, DAG))
13506 return Blend;
13507
13508 // Use dedicated unpack instructions for masks that match their pattern.
13509 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v2i64, V1, V2, Mask, DAG))
13510 return V;
13511
13512 // Try to use byte rotation instructions.
13513 // It's more profitable for pre-SSSE3 to use shuffles/unpacks.
13514 if (Subtarget.hasSSSE3()) {
13515 if (Subtarget.hasVLX())
13516 if (SDValue Rotate = lowerShuffleAsVALIGN(DL, MVT::v2i64, V1, V2, Mask,
13517 Zeroable, Subtarget, DAG))
13518 return Rotate;
13519
13520 if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v2i64, V1, V2, Mask,
13521 Subtarget, DAG))
13522 return Rotate;
13523 }
13524
13525 // If we have direct support for blends, we should lower by decomposing into
13526 // a permute. That will be faster than the domain cross.
13527 if (IsBlendSupported)
13528 return lowerShuffleAsDecomposedShuffleMerge(DL, MVT::v2i64, V1, V2, Mask,
13529 Zeroable, Subtarget, DAG);
13530
13531 // We implement this with SHUFPD which is pretty lame because it will likely
13532 // incur 2 cycles of stall for integer vectors on Nehalem and older chips.
13533 // However, all the alternatives are still more cycles and newer chips don't
13534 // have this problem. It would be really nice if x86 had better shuffles here.
13535 V1 = DAG.getBitcast(MVT::v2f64, V1);
13536 V2 = DAG.getBitcast(MVT::v2f64, V2);
13537 return DAG.getBitcast(MVT::v2i64,
13538 DAG.getVectorShuffle(MVT::v2f64, DL, V1, V2, Mask));
13539}
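// Illustrative standalone sketch (an assumed example, not from this file) of
// the mask widening used for the single-input case above: each v2i64 lane maps
// to a pair of v4i32 lanes, so mask element M expands to {2*M, 2*M+1}. The
// v2i64 mask {1, 0} becomes the v4i32 PSHUFD mask {2, 3, 0, 1}.
static void exampleWidenV2I64MaskToV4I32(const int Mask[2], int WidenedMask[4]) {
  for (int i = 0; i != 2; ++i) {
    WidenedMask[2 * i + 0] = Mask[i] < 0 ? -1 : 2 * Mask[i];
    WidenedMask[2 * i + 1] = Mask[i] < 0 ? -1 : 2 * Mask[i] + 1;
  }
}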
13540
13541/// Lower a vector shuffle using the SHUFPS instruction.
13542///
13543/// This is a helper routine dedicated to lowering vector shuffles using SHUFPS.
13544/// It makes no assumptions about whether this is the *best* lowering; it simply
13545/// uses it.
13546 static SDValue lowerShuffleWithSHUFPS(const SDLoc &DL, MVT VT,
13547 ArrayRef<int> Mask, SDValue V1,
13548 SDValue V2, SelectionDAG &DAG) {
13549 SDValue LowV = V1, HighV = V2;
13550 SmallVector<int, 4> NewMask(Mask);
13551 int NumV2Elements = count_if(Mask, [](int M) { return M >= 4; });
13552
13553 if (NumV2Elements == 1) {
13554 int V2Index = find_if(Mask, [](int M) { return M >= 4; }) - Mask.begin();
13555
13556 // Compute the index adjacent to V2Index and in the same half by toggling
13557 // the low bit.
13558 int V2AdjIndex = V2Index ^ 1;
13559
13560 if (Mask[V2AdjIndex] < 0) {
13561 // Handles all the cases where we have a single V2 element and an undef.
13562 // This will only ever happen in the high lanes because we commute the
13563 // vector otherwise.
13564 if (V2Index < 2)
13565 std::swap(LowV, HighV);
13566 NewMask[V2Index] -= 4;
13567 } else {
13568 // Handle the case where the V2 element ends up adjacent to a V1 element.
13569 // To make this work, blend them together as the first step.
13570 int V1Index = V2AdjIndex;
13571 int BlendMask[4] = {Mask[V2Index] - 4, 0, Mask[V1Index], 0};
13572 V2 = DAG.getNode(X86ISD::SHUFP, DL, VT, V2, V1,
13573 getV4X86ShuffleImm8ForMask(BlendMask, DL, DAG));
13574
13575 // Now proceed to reconstruct the final blend as we have the necessary
13576 // high or low half formed.
13577 if (V2Index < 2) {
13578 LowV = V2;
13579 HighV = V1;
13580 } else {
13581 HighV = V2;
13582 }
13583 NewMask[V1Index] = 2; // We put the V1 element in V2[2].
13584 NewMask[V2Index] = 0; // We shifted the V2 element into V2[0].
13585 }
13586 } else if (NumV2Elements == 2) {
13587 if (Mask[0] < 4 && Mask[1] < 4) {
13588 // Handle the easy case where we have V1 in the low lanes and V2 in the
13589 // high lanes.
13590 NewMask[2] -= 4;
13591 NewMask[3] -= 4;
13592 } else if (Mask[2] < 4 && Mask[3] < 4) {
13593 // We also handle the reversed case because this utility may get called
13594 // when we detect a SHUFPS pattern but can't easily commute the shuffle to
13595 // arrange things in the right direction.
13596 NewMask[0] -= 4;
13597 NewMask[1] -= 4;
13598 HighV = V1;
13599 LowV = V2;
13600 } else {
13601 // We have a mixture of V1 and V2 in both low and high lanes. Rather than
13602 // trying to place elements directly, just blend them and set up the final
13603 // shuffle to place them.
13604
13605 // The first two blend mask elements are for V1, the second two are for
13606 // V2.
13607 int BlendMask[4] = {Mask[0] < 4 ? Mask[0] : Mask[1],
13608 Mask[2] < 4 ? Mask[2] : Mask[3],
13609 (Mask[0] >= 4 ? Mask[0] : Mask[1]) - 4,
13610 (Mask[2] >= 4 ? Mask[2] : Mask[3]) - 4};
13611 V1 = DAG.getNode(X86ISD::SHUFP, DL, VT, V1, V2,
13612 getV4X86ShuffleImm8ForMask(BlendMask, DL, DAG));
13613
13614 // Now we do a normal shuffle of V1 by giving V1 as both operands to
13615 // a blend.
13616 LowV = HighV = V1;
13617 NewMask[0] = Mask[0] < 4 ? 0 : 2;
13618 NewMask[1] = Mask[0] < 4 ? 2 : 0;
13619 NewMask[2] = Mask[2] < 4 ? 1 : 3;
13620 NewMask[3] = Mask[2] < 4 ? 3 : 1;
13621 }
13622 } else if (NumV2Elements == 3) {
13623 // Ideally canonicalizeShuffleMaskWithCommute should have caught this, but
13624 // we can get here due to other paths (e.g. repeated mask matching) where we
13625 // don't want to do another round of lowerVECTOR_SHUFFLE.
13626 ShuffleVectorSDNode::commuteMask(NewMask);
13627 return lowerShuffleWithSHUFPS(DL, VT, NewMask, V2, V1, DAG);
13628 }
13629 return DAG.getNode(X86ISD::SHUFP, DL, VT, LowV, HighV,
13630 getV4X86ShuffleImm8ForMask(NewMask, DL, DAG));
13631}
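// Illustrative standalone sketch (an assumed example, not from this file) of
// the 4-lane imm8 layout produced for the SHUFP nodes above: two bits per
// result lane, lane 0 in the lowest bits; undef lanes are treated as 0 here
// purely for simplicity. {0, 0, 2, 2} encodes as 0xA0 and {3, 2, 1, 0} as 0x1B.
static unsigned exampleV4ShuffleImm8(const int Mask[4]) {
  unsigned Imm = 0;
  for (int i = 0; i != 4; ++i)
    Imm |= unsigned(Mask[i] < 0 ? 0 : Mask[i] & 3) << (2 * i);
  return Imm;
}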
13632
13633/// Lower 4-lane 32-bit floating point shuffles.
13634///
13635/// Uses instructions exclusively from the floating point unit to minimize
13636/// domain crossing penalties, as these are sufficient to implement all v4f32
13637/// shuffles.
13638 static SDValue lowerV4F32Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
13639 const APInt &Zeroable, SDValue V1, SDValue V2,
13640 const X86Subtarget &Subtarget,
13641 SelectionDAG &DAG) {
13642 assert(V1.getSimpleValueType() == MVT::v4f32 && "Bad operand type!");
13643 assert(V2.getSimpleValueType() == MVT::v4f32 && "Bad operand type!");
13644 assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!");
13645
13646 if (Subtarget.hasSSE41())
13647 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v4f32, V1, V2, Mask,
13648 Zeroable, Subtarget, DAG))
13649 return Blend;
13650
13651 int NumV2Elements = count_if(Mask, [](int M) { return M >= 4; });
13652
13653 if (NumV2Elements == 0) {
13654 // Check for being able to broadcast a single element.
13655 if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v4f32, V1, V2,
13656 Mask, Subtarget, DAG))
13657 return Broadcast;
13658
13659 // Use even/odd duplicate instructions for masks that match their pattern.
13660 if (Subtarget.hasSSE3()) {
13661 if (isShuffleEquivalent(Mask, {0, 0, 2, 2}, V1, V2))
13662 return DAG.getNode(X86ISD::MOVSLDUP, DL, MVT::v4f32, V1);
13663 if (isShuffleEquivalent(Mask, {1, 1, 3, 3}, V1, V2))
13664 return DAG.getNode(X86ISD::MOVSHDUP, DL, MVT::v4f32, V1);
13665 }
13666
13667 if (Subtarget.hasAVX()) {
13668 // If we have AVX, we can use VPERMILPS which will allow folding a load
13669 // into the shuffle.
13670 return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v4f32, V1,
13671 getV4X86ShuffleImm8ForMask(Mask, DL, DAG));
13672 }
13673
13674 // Use MOVLHPS/MOVHLPS to simulate unary shuffles. These are only valid
13675 // in SSE1 because otherwise they are widened to v2f64 and never get here.
13676 if (!Subtarget.hasSSE2()) {
13677 if (isShuffleEquivalent(Mask, {0, 1, 0, 1}, V1, V2))
13678 return DAG.getNode(X86ISD::MOVLHPS, DL, MVT::v4f32, V1, V1);
13679 if (isShuffleEquivalent(Mask, {2, 3, 2, 3}, V1, V2))
13680 return DAG.getNode(X86ISD::MOVHLPS, DL, MVT::v4f32, V1, V1);
13681 }
13682
13683 // Otherwise, use a straight shuffle of a single input vector. We pass the
13684 // input vector to both operands to simulate this with a SHUFPS.
13685 return DAG.getNode(X86ISD::SHUFP, DL, MVT::v4f32, V1, V1,
13686 getV4X86ShuffleImm8ForMask(Mask, DL, DAG));
13687 }
13688
13689 if (Subtarget.hasSSE2())
13690 if (SDValue ZExt = lowerShuffleAsZeroOrAnyExtend(
13691 DL, MVT::v4i32, V1, V2, Mask, Zeroable, Subtarget, DAG)) {
13692 ZExt = DAG.getBitcast(MVT::v4f32, ZExt);
13693 return ZExt;
13694 }
13695
13696 if (Subtarget.hasAVX2())
13697 if (SDValue Extract = lowerShuffleOfExtractsAsVperm(DL, V1, V2, Mask, DAG))
13698 return Extract;
13699
13700 // There are special ways we can lower some single-element blends. However, we
13701 // have custom ways we can lower more complex single-element blends below that
13702 // we defer to if both this and BLENDPS fail to match, so restrict this to
13703 // when the V2 input is targeting element 0 of the mask -- that is the fast
13704 // case here.
13705 if (NumV2Elements == 1 && Mask[0] >= 4)
13706 if (SDValue V = lowerShuffleAsElementInsertion(
13707 DL, MVT::v4f32, V1, V2, Mask, Zeroable, Subtarget, DAG))
13708 return V;
13709
13710 if (Subtarget.hasSSE41()) {
13711 // Use INSERTPS if we can complete the shuffle efficiently.
13712 if (SDValue V = lowerShuffleAsInsertPS(DL, V1, V2, Mask, Zeroable, DAG))
13713 return V;
13714
13715 if (!isSingleSHUFPSMask(Mask))
13716 if (SDValue BlendPerm = lowerShuffleAsBlendAndPermute(DL, MVT::v4f32, V1,
13717 V2, Mask, DAG))
13718 return BlendPerm;
13719 }
13720
13721 // Use low/high mov instructions. These are only valid in SSE1 because
13722 // otherwise they are widened to v2f64 and never get here.
13723 if (!Subtarget.hasSSE2()) {
13724 if (isShuffleEquivalent(Mask, {0, 1, 4, 5}, V1, V2))
13725 return DAG.getNode(X86ISD::MOVLHPS, DL, MVT::v4f32, V1, V2);
13726 if (isShuffleEquivalent(Mask, {2, 3, 6, 7}, V1, V2))
13727 return DAG.getNode(X86ISD::MOVHLPS, DL, MVT::v4f32, V2, V1);
13728 }
13729
13730 // Use dedicated unpack instructions for masks that match their pattern.
13731 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v4f32, V1, V2, Mask, DAG))
13732 return V;
13733
13734 // Otherwise fall back to a SHUFPS lowering strategy.
13735 return lowerShuffleWithSHUFPS(DL, MVT::v4f32, Mask, V1, V2, DAG);
13736}
13737
13738/// Lower 4-lane i32 vector shuffles.
13739///
13740/// We try to handle these with integer-domain shuffles where we can, but for
13741/// blends we use the floating point domain blend instructions.
13742 static SDValue lowerV4I32Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
13743 const APInt &Zeroable, SDValue V1, SDValue V2,
13744 const X86Subtarget &Subtarget,
13745 SelectionDAG &DAG) {
13746 assert(V1.getSimpleValueType() == MVT::v4i32 && "Bad operand type!");
13747 assert(V2.getSimpleValueType() == MVT::v4i32 && "Bad operand type!");
13748 assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!");
13749
13750 // Whenever we can lower this as a zext, that instruction is strictly faster
13751 // than any alternative. It also allows us to fold memory operands into the
13752 // shuffle in many cases.
13753 if (SDValue ZExt = lowerShuffleAsZeroOrAnyExtend(DL, MVT::v4i32, V1, V2, Mask,
13754 Zeroable, Subtarget, DAG))
13755 return ZExt;
13756
13757 int NumV2Elements = count_if(Mask, [](int M) { return M >= 4; });
13758
13759 // Try to use shift instructions if fast.
13760 if (Subtarget.preferLowerShuffleAsShift()) {
13761 if (SDValue Shift =
13762 lowerShuffleAsShift(DL, MVT::v4i32, V1, V2, Mask, Zeroable,
13763 Subtarget, DAG, /*BitwiseOnly*/ true))
13764 return Shift;
13765 if (NumV2Elements == 0)
13766 if (SDValue Rotate =
13767 lowerShuffleAsBitRotate(DL, MVT::v4i32, V1, Mask, Subtarget, DAG))
13768 return Rotate;
13769 }
13770
13771 if (NumV2Elements == 0) {
13772 // Try to use broadcast unless the mask only has one non-undef element.
13773 if (count_if(Mask, [](int M) { return M >= 0 && M < 4; }) > 1) {
13774 if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v4i32, V1, V2,
13775 Mask, Subtarget, DAG))
13776 return Broadcast;
13777 }
13778
13779 // Straight shuffle of a single input vector. For everything from SSE2
13780 // onward this has a single fast instruction with no scary immediates.
13781 // We coerce the shuffle pattern to be compatible with UNPCK instructions
13782 // but we aren't actually going to use the UNPCK instruction because doing
13783 // so prevents folding a load into this instruction or making a copy.
13784 const int UnpackLoMask[] = {0, 0, 1, 1};
13785 const int UnpackHiMask[] = {2, 2, 3, 3};
13786 if (isShuffleEquivalent(Mask, {0, 0, 1, 1}, V1, V2))
13787 Mask = UnpackLoMask;
13788 else if (isShuffleEquivalent(Mask, {2, 2, 3, 3}, V1, V2))
13789 Mask = UnpackHiMask;
13790
13791 return DAG.getNode(X86ISD::PSHUFD, DL, MVT::v4i32, V1,
13792 getV4X86ShuffleImm8ForMask(Mask, DL, DAG));
13793 }
13794
13795 if (Subtarget.hasAVX2())
13796 if (SDValue Extract = lowerShuffleOfExtractsAsVperm(DL, V1, V2, Mask, DAG))
13797 return Extract;
13798
13799 // Try to use shift instructions.
13800 if (SDValue Shift =
13801 lowerShuffleAsShift(DL, MVT::v4i32, V1, V2, Mask, Zeroable, Subtarget,
13802 DAG, /*BitwiseOnly*/ false))
13803 return Shift;
13804
13805 // There are special ways we can lower some single-element blends.
13806 if (NumV2Elements == 1)
13807 if (SDValue V = lowerShuffleAsElementInsertion(
13808 DL, MVT::v4i32, V1, V2, Mask, Zeroable, Subtarget, DAG))
13809 return V;
13810
13811 // We have different paths for blend lowering, but they all must use the
13812 // *exact* same predicate.
13813 bool IsBlendSupported = Subtarget.hasSSE41();
13814 if (IsBlendSupported)
13815 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v4i32, V1, V2, Mask,
13816 Zeroable, Subtarget, DAG))
13817 return Blend;
13818
13819 if (SDValue Masked = lowerShuffleAsBitMask(DL, MVT::v4i32, V1, V2, Mask,
13820 Zeroable, Subtarget, DAG))
13821 return Masked;
13822
13823 // Use dedicated unpack instructions for masks that match their pattern.
13824 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v4i32, V1, V2, Mask, DAG))
13825 return V;
13826
13827 // Try to use byte rotation instructions.
13828 // It's more profitable for pre-SSSE3 to use shuffles/unpacks.
13829 if (Subtarget.hasSSSE3()) {
13830 if (Subtarget.hasVLX())
13831 if (SDValue Rotate = lowerShuffleAsVALIGN(DL, MVT::v4i32, V1, V2, Mask,
13832 Zeroable, Subtarget, DAG))
13833 return Rotate;
13834
13835 if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v4i32, V1, V2, Mask,
13836 Subtarget, DAG))
13837 return Rotate;
13838 }
13839
13840 // Assume that a single SHUFPS is faster than an alternative sequence of
13841 // multiple instructions (even if the CPU has a domain penalty).
13842 // If some CPU is harmed by the domain switch, we can fix it in a later pass.
13843 if (!isSingleSHUFPSMask(Mask)) {
13844 // If we have direct support for blends, we should lower by decomposing into
13845 // a permute. That will be faster than the domain cross.
13846 if (IsBlendSupported)
13847 return lowerShuffleAsDecomposedShuffleMerge(DL, MVT::v4i32, V1, V2, Mask,
13848 Zeroable, Subtarget, DAG);
13849
13850 // Try to lower by permuting the inputs into an unpack instruction.
13851 if (SDValue Unpack = lowerShuffleAsPermuteAndUnpack(DL, MVT::v4i32, V1, V2,
13852 Mask, Subtarget, DAG))
13853 return Unpack;
13854 }
13855
13856 // We implement this with SHUFPS because it can blend from two vectors.
13857 // Because we're going to eventually use SHUFPS, we use SHUFPS even to build
13858 // up the inputs, bypassing domain shift penalties that we would incur if we
13859 // directly used PSHUFD on Nehalem and older. For newer chips, this isn't
13860 // relevant.
13861 SDValue CastV1 = DAG.getBitcast(MVT::v4f32, V1);
13862 SDValue CastV2 = DAG.getBitcast(MVT::v4f32, V2);
13863 SDValue ShufPS = DAG.getVectorShuffle(MVT::v4f32, DL, CastV1, CastV2, Mask);
13864 return DAG.getBitcast(MVT::v4i32, ShufPS);
13865}
13866
13867/// Lowering of single-input v8i16 shuffles is the cornerstone of SSE2
13868/// shuffle lowering, and the most complex part.
13869///
13870/// The lowering strategy is to try to form pairs of input lanes which are
13871/// targeted at the same half of the final vector, and then use a dword shuffle
13872/// to place them onto the right half, and finally unpack the paired lanes into
13873/// their final position.
13874///
13875/// The exact breakdown of how to form these dword pairs and align them on the
13876/// correct sides is really tricky. See the comments within the function for
13877/// more of the details.
13878///
13879/// This code also handles repeated 128-bit lanes of v8i16 shuffles, but each
13880/// lane must shuffle the *exact* same way. In fact, you must pass a v8 Mask to
13881/// this routine for it to work correctly. To shuffle a 256-bit or 512-bit i16
13882/// vector, form the analogous 128-bit 8-element Mask.
13883 static SDValue lowerV8I16GeneralSingleInputShuffle(
13884 const SDLoc &DL, MVT VT, SDValue V, MutableArrayRef<int> Mask,
13885 const X86Subtarget &Subtarget, SelectionDAG &DAG) {
13886 assert(VT.getVectorElementType() == MVT::i16 && "Bad input type!");
13887 MVT PSHUFDVT = MVT::getVectorVT(MVT::i32, VT.getVectorNumElements() / 2);
13888
13889 assert(Mask.size() == 8 && "Shuffle mask length doesn't match!");
13890 MutableArrayRef<int> LoMask = Mask.slice(0, 4);
13891 MutableArrayRef<int> HiMask = Mask.slice(4, 4);
13892
13893 // Attempt to directly match PSHUFLW or PSHUFHW.
13894 if (isUndefOrInRange(LoMask, 0, 4) &&
13895 isSequentialOrUndefInRange(HiMask, 0, 4, 4)) {
13896 return DAG.getNode(X86ISD::PSHUFLW, DL, VT, V,
13897 getV4X86ShuffleImm8ForMask(LoMask, DL, DAG));
13898 }
13899 if (isUndefOrInRange(HiMask, 4, 8) &&
13900 isSequentialOrUndefInRange(LoMask, 0, 4, 0)) {
13901 for (int i = 0; i != 4; ++i)
13902 HiMask[i] = (HiMask[i] < 0 ? HiMask[i] : (HiMask[i] - 4));
13903 return DAG.getNode(X86ISD::PSHUFHW, DL, VT, V,
13904 getV4X86ShuffleImm8ForMask(HiMask, DL, DAG));
13905 }
13906
13907 SmallVector<int, 4> LoInputs;
13908 copy_if(LoMask, std::back_inserter(LoInputs), [](int M) { return M >= 0; });
13909 array_pod_sort(LoInputs.begin(), LoInputs.end());
13910 LoInputs.erase(llvm::unique(LoInputs), LoInputs.end());
13911 SmallVector<int, 4> HiInputs;
13912 copy_if(HiMask, std::back_inserter(HiInputs), [](int M) { return M >= 0; });
13913 array_pod_sort(HiInputs.begin(), HiInputs.end());
13914 HiInputs.erase(llvm::unique(HiInputs), HiInputs.end());
13915 int NumLToL = llvm::lower_bound(LoInputs, 4) - LoInputs.begin();
13916 int NumHToL = LoInputs.size() - NumLToL;
13917 int NumLToH = llvm::lower_bound(HiInputs, 4) - HiInputs.begin();
13918 int NumHToH = HiInputs.size() - NumLToH;
13919 MutableArrayRef<int> LToLInputs(LoInputs.data(), NumLToL);
13920 MutableArrayRef<int> LToHInputs(HiInputs.data(), NumLToH);
13921 MutableArrayRef<int> HToLInputs(LoInputs.data() + NumLToL, NumHToL);
13922 MutableArrayRef<int> HToHInputs(HiInputs.data() + NumLToH, NumHToH);
13923
13924 // If we are shuffling values from one half - check how many different DWORD
13925 // pairs we need to create. If only 1 or 2 then we can perform this as a
13926 // PSHUFLW/PSHUFHW + PSHUFD instead of the PSHUFD+PSHUFLW+PSHUFHW chain below.
13927 auto ShuffleDWordPairs = [&](ArrayRef<int> PSHUFHalfMask,
13928 ArrayRef<int> PSHUFDMask, unsigned ShufWOp) {
13929 V = DAG.getNode(ShufWOp, DL, VT, V,
13930 getV4X86ShuffleImm8ForMask(PSHUFHalfMask, DL, DAG));
13931 V = DAG.getBitcast(PSHUFDVT, V);
13932 V = DAG.getNode(X86ISD::PSHUFD, DL, PSHUFDVT, V,
13933 getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG));
13934 return DAG.getBitcast(VT, V);
13935 };
13936
13937 if ((NumHToL + NumHToH) == 0 || (NumLToL + NumLToH) == 0) {
13938 int PSHUFDMask[4] = { -1, -1, -1, -1 };
13939 SmallVector<std::pair<int, int>, 4> DWordPairs;
13940 int DOffset = ((NumHToL + NumHToH) == 0 ? 0 : 2);
13941
13942 // Collect the different DWORD pairs.
13943 for (int DWord = 0; DWord != 4; ++DWord) {
13944 int M0 = Mask[2 * DWord + 0];
13945 int M1 = Mask[2 * DWord + 1];
13946 M0 = (M0 >= 0 ? M0 % 4 : M0);
13947 M1 = (M1 >= 0 ? M1 % 4 : M1);
13948 if (M0 < 0 && M1 < 0)
13949 continue;
13950
13951 bool Match = false;
13952 for (int j = 0, e = DWordPairs.size(); j < e; ++j) {
13953 auto &DWordPair = DWordPairs[j];
13954 if ((M0 < 0 || isUndefOrEqual(DWordPair.first, M0)) &&
13955 (M1 < 0 || isUndefOrEqual(DWordPair.second, M1))) {
13956 DWordPair.first = (M0 >= 0 ? M0 : DWordPair.first);
13957 DWordPair.second = (M1 >= 0 ? M1 : DWordPair.second);
13958 PSHUFDMask[DWord] = DOffset + j;
13959 Match = true;
13960 break;
13961 }
13962 }
13963 if (!Match) {
13964 PSHUFDMask[DWord] = DOffset + DWordPairs.size();
13965 DWordPairs.push_back(std::make_pair(M0, M1));
13966 }
13967 }
13968
13969 if (DWordPairs.size() <= 2) {
13970 DWordPairs.resize(2, std::make_pair(-1, -1));
13971 int PSHUFHalfMask[4] = {DWordPairs[0].first, DWordPairs[0].second,
13972 DWordPairs[1].first, DWordPairs[1].second};
13973 // For splat, ensure we widen the PSHUFDMask to allow vXi64 folds.
13974 if (ShuffleVectorSDNode::isSplatMask(PSHUFDMask) &&
13975 ShuffleVectorSDNode::isSplatMask(PSHUFHalfMask)) {
13976 int SplatIdx = ShuffleVectorSDNode::getSplatMaskIndex(PSHUFHalfMask);
13977 std::fill(PSHUFHalfMask, PSHUFHalfMask + 4, SplatIdx);
13978 PSHUFDMask[0] = PSHUFDMask[2] = DOffset + 0;
13979 PSHUFDMask[1] = PSHUFDMask[3] = DOffset + 1;
13980 }
13981 if ((NumHToL + NumHToH) == 0)
13982 return ShuffleDWordPairs(PSHUFHalfMask, PSHUFDMask, X86ISD::PSHUFLW);
13983 if ((NumLToL + NumLToH) == 0)
13984 return ShuffleDWordPairs(PSHUFHalfMask, PSHUFDMask, X86ISD::PSHUFHW);
13985 }
13986 }
13987
13988 // Simplify the 1-into-3 and 3-into-1 cases with a single pshufd. For all
13989 // such inputs we can swap two of the dwords across the half mark and end up
13990 // with <=2 inputs to each half in each half. Once there, we can fall through
13991 // to the generic code below. For example:
13992 //
13993 // Input: [a, b, c, d, e, f, g, h] -PSHUFD[0,2,1,3]-> [a, b, e, f, c, d, g, h]
13994 // Mask: [0, 1, 2, 7, 4, 5, 6, 3] -----------------> [0, 1, 4, 7, 2, 3, 6, 5]
13995 //
13996 // However in some very rare cases we have a 1-into-3 or 3-into-1 on one half
13997 // and an existing 2-into-2 on the other half. In this case we may have to
13998 // pre-shuffle the 2-into-2 half to avoid turning it into a 3-into-1 or
13999 // 1-into-3 which could cause us to cycle endlessly fixing each side in turn.
14000 // Fortunately, we don't have to handle anything but a 2-into-2 pattern
14001 // because any other situation (including a 3-into-1 or 1-into-3 in the other
14002 // half than the one we target for fixing) will be fixed when we re-enter this
14003 // path. We will also combine away any sequence of PSHUFD instructions that
14004 // result into a single instruction. Here is an example of the tricky case:
14005 //
14006 // Input: [a, b, c, d, e, f, g, h] -PSHUFD[0,2,1,3]-> [a, b, e, f, c, d, g, h]
14007 // Mask: [3, 7, 1, 0, 2, 7, 3, 5] -THIS-IS-BAD!!!!-> [5, 7, 1, 0, 4, 7, 5, 3]
14008 //
14009 // This now has a 1-into-3 in the high half! Instead, we do two shuffles:
14010 //
14011 // Input: [a, b, c, d, e, f, g, h] PSHUFHW[0,2,1,3]-> [a, b, c, d, e, g, f, h]
14012 // Mask: [3, 7, 1, 0, 2, 7, 3, 5] -----------------> [3, 7, 1, 0, 2, 7, 3, 6]
14013 //
14014 // Input: [a, b, c, d, e, g, f, h] -PSHUFD[0,2,1,3]-> [a, b, e, g, c, d, f, h]
14015 // Mask: [3, 7, 1, 0, 2, 7, 3, 6] -----------------> [5, 7, 1, 0, 4, 7, 5, 6]
14016 //
14017 // The result is fine to be handled by the generic logic.
14018 auto balanceSides = [&](ArrayRef<int> AToAInputs, ArrayRef<int> BToAInputs,
14019 ArrayRef<int> BToBInputs, ArrayRef<int> AToBInputs,
14020 int AOffset, int BOffset) {
14021 assert((AToAInputs.size() == 3 || AToAInputs.size() == 1) &&
14022 "Must call this with A having 3 or 1 inputs from the A half.");
14023 assert((BToAInputs.size() == 1 || BToAInputs.size() == 3) &&
14024 "Must call this with B having 1 or 3 inputs from the B half.");
14025 assert(AToAInputs.size() + BToAInputs.size() == 4 &&
14026 "Must call this with either 3:1 or 1:3 inputs (summing to 4).");
14027
14028 bool ThreeAInputs = AToAInputs.size() == 3;
14029
14030 // Compute the index of dword with only one word among the three inputs in
14031 // a half by taking the sum of the half with three inputs and subtracting
14032 // the sum of the actual three inputs. The difference is the remaining
14033 // slot.
14034 int ADWord = 0, BDWord = 0;
14035 int &TripleDWord = ThreeAInputs ? ADWord : BDWord;
14036 int &OneInputDWord = ThreeAInputs ? BDWord : ADWord;
14037 int TripleInputOffset = ThreeAInputs ? AOffset : BOffset;
14038 ArrayRef<int> TripleInputs = ThreeAInputs ? AToAInputs : BToAInputs;
14039 int OneInput = ThreeAInputs ? BToAInputs[0] : AToAInputs[0];
14040 int TripleInputSum = 0 + 1 + 2 + 3 + (4 * TripleInputOffset);
14041 int TripleNonInputIdx =
14042 TripleInputSum - std::accumulate(TripleInputs.begin(), TripleInputs.end(), 0);
14043 TripleDWord = TripleNonInputIdx / 2;
14044
14045 // We use xor with one to compute the adjacent DWord to whichever one the
14046 // OneInput is in.
14047 OneInputDWord = (OneInput / 2) ^ 1;
14048
14049 // Check for one tricky case: We're fixing a 3<-1 or a 1<-3 shuffle for AToA
14050 // and BToA inputs. If there is also such a problem with the BToB and AToB
14051 // inputs, we don't try to fix it necessarily -- we'll recurse and see it in
14052 // the next pass. However, if we have a 2<-2 in the BToB and AToB inputs, it
14053 // is essential that we don't *create* a 3<-1 as then we might oscillate.
14054 if (BToBInputs.size() == 2 && AToBInputs.size() == 2) {
14055 // Compute how many inputs will be flipped by swapping these DWords. We
14056 // need
14057 // to balance this to ensure we don't form a 3-1 shuffle in the other
14058 // half.
14059 int NumFlippedAToBInputs = llvm::count(AToBInputs, 2 * ADWord) +
14060 llvm::count(AToBInputs, 2 * ADWord + 1);
14061 int NumFlippedBToBInputs = llvm::count(BToBInputs, 2 * BDWord) +
14062 llvm::count(BToBInputs, 2 * BDWord + 1);
14063 if ((NumFlippedAToBInputs == 1 &&
14064 (NumFlippedBToBInputs == 0 || NumFlippedBToBInputs == 2)) ||
14065 (NumFlippedBToBInputs == 1 &&
14066 (NumFlippedAToBInputs == 0 || NumFlippedAToBInputs == 2))) {
14067 // We choose whether to fix the A half or B half based on whether that
14068 // half has zero flipped inputs. At zero, we may not be able to fix it
14069 // with that half. We also bias towards fixing the B half because that
14070 // will more commonly be the high half, and we have to bias one way.
14071 auto FixFlippedInputs = [&V, &DL, &Mask, &DAG](int PinnedIdx, int DWord,
14072 ArrayRef<int> Inputs) {
14073 int FixIdx = PinnedIdx ^ 1; // The adjacent slot to the pinned slot.
14074 bool IsFixIdxInput = is_contained(Inputs, PinnedIdx ^ 1);
14075 // Determine whether the free index is in the flipped dword or the
14076 // unflipped dword based on where the pinned index is. We use this bit
14077 // in an xor to conditionally select the adjacent dword.
14078 int FixFreeIdx = 2 * (DWord ^ (PinnedIdx / 2 == DWord));
14079 bool IsFixFreeIdxInput = is_contained(Inputs, FixFreeIdx);
14080 if (IsFixIdxInput == IsFixFreeIdxInput)
14081 FixFreeIdx += 1;
14082 IsFixFreeIdxInput = is_contained(Inputs, FixFreeIdx);
14083 assert(IsFixIdxInput != IsFixFreeIdxInput &&
14084 "We need to be changing the number of flipped inputs!");
14085 int PSHUFHalfMask[] = {0, 1, 2, 3};
14086 std::swap(PSHUFHalfMask[FixFreeIdx % 4], PSHUFHalfMask[FixIdx % 4]);
14087 V = DAG.getNode(
14088 FixIdx < 4 ? X86ISD::PSHUFLW : X86ISD::PSHUFHW, DL,
14089 MVT::getVectorVT(MVT::i16, V.getValueSizeInBits() / 16), V,
14090 getV4X86ShuffleImm8ForMask(PSHUFHalfMask, DL, DAG));
14091
14092 for (int &M : Mask)
14093 if (M >= 0 && M == FixIdx)
14094 M = FixFreeIdx;
14095 else if (M >= 0 && M == FixFreeIdx)
14096 M = FixIdx;
14097 };
14098 if (NumFlippedBToBInputs != 0) {
14099 int BPinnedIdx =
14100 BToAInputs.size() == 3 ? TripleNonInputIdx : OneInput;
14101 FixFlippedInputs(BPinnedIdx, BDWord, BToBInputs);
14102 } else {
14103 assert(NumFlippedAToBInputs != 0 && "Impossible given predicates!");
14104 int APinnedIdx = ThreeAInputs ? TripleNonInputIdx : OneInput;
14105 FixFlippedInputs(APinnedIdx, ADWord, AToBInputs);
14106 }
14107 }
14108 }
14109
14110 int PSHUFDMask[] = {0, 1, 2, 3};
14111 PSHUFDMask[ADWord] = BDWord;
14112 PSHUFDMask[BDWord] = ADWord;
14113 V = DAG.getBitcast(
14114 VT,
14115 DAG.getNode(X86ISD::PSHUFD, DL, PSHUFDVT, DAG.getBitcast(PSHUFDVT, V),
14116 getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG)));
14117
14118 // Adjust the mask to match the new locations of A and B.
14119 for (int &M : Mask)
14120 if (M >= 0 && M/2 == ADWord)
14121 M = 2 * BDWord + M % 2;
14122 else if (M >= 0 && M/2 == BDWord)
14123 M = 2 * ADWord + M % 2;
14124
14125 // Recurse back into this routine to re-compute state now that this isn't
14126 // a 3 and 1 problem.
14127 return lowerV8I16GeneralSingleInputShuffle(DL, VT, V, Mask, Subtarget, DAG);
14128 };
14129 if ((NumLToL == 3 && NumHToL == 1) || (NumLToL == 1 && NumHToL == 3))
14130 return balanceSides(LToLInputs, HToLInputs, HToHInputs, LToHInputs, 0, 4);
14131 if ((NumHToH == 3 && NumLToH == 1) || (NumHToH == 1 && NumLToH == 3))
14132 return balanceSides(HToHInputs, LToHInputs, LToLInputs, HToLInputs, 4, 0);
14133
14134 // At this point there are at most two inputs to the low and high halves from
14135 // each half. That means the inputs can always be grouped into dwords and
14136 // those dwords can then be moved to the correct half with a dword shuffle.
14137 // We use at most one low and one high word shuffle to collect these paired
14138 // inputs into dwords, and finally a dword shuffle to place them.
14139 int PSHUFLMask[4] = {-1, -1, -1, -1};
14140 int PSHUFHMask[4] = {-1, -1, -1, -1};
14141 int PSHUFDMask[4] = {-1, -1, -1, -1};
14142
14143 // First fix the masks for all the inputs that are staying in their
14144 // original halves. This will then dictate the targets of the cross-half
14145 // shuffles.
14146 auto fixInPlaceInputs =
14147 [&PSHUFDMask](ArrayRef<int> InPlaceInputs, ArrayRef<int> IncomingInputs,
14148 MutableArrayRef<int> SourceHalfMask,
14149 MutableArrayRef<int> HalfMask, int HalfOffset) {
14150 if (InPlaceInputs.empty())
14151 return;
14152 if (InPlaceInputs.size() == 1) {
14153 SourceHalfMask[InPlaceInputs[0] - HalfOffset] =
14154 InPlaceInputs[0] - HalfOffset;
14155 PSHUFDMask[InPlaceInputs[0] / 2] = InPlaceInputs[0] / 2;
14156 return;
14157 }
14158 if (IncomingInputs.empty()) {
14159 // Just fix all of the in place inputs.
14160 for (int Input : InPlaceInputs) {
14161 SourceHalfMask[Input - HalfOffset] = Input - HalfOffset;
14162 PSHUFDMask[Input / 2] = Input / 2;
14163 }
14164 return;
14165 }
14166
14167 assert(InPlaceInputs.size() == 2 && "Cannot handle 3 or 4 inputs!");
14168 SourceHalfMask[InPlaceInputs[0] - HalfOffset] =
14169 InPlaceInputs[0] - HalfOffset;
14170 // Put the second input next to the first so that they are packed into
14171 // a dword. We find the adjacent index by toggling the low bit.
14172 int AdjIndex = InPlaceInputs[0] ^ 1;
14173 SourceHalfMask[AdjIndex - HalfOffset] = InPlaceInputs[1] - HalfOffset;
14174 llvm::replace(HalfMask, InPlaceInputs[1], AdjIndex);
14175 PSHUFDMask[AdjIndex / 2] = AdjIndex / 2;
14176 };
14177 fixInPlaceInputs(LToLInputs, HToLInputs, PSHUFLMask, LoMask, 0);
14178 fixInPlaceInputs(HToHInputs, LToHInputs, PSHUFHMask, HiMask, 4);
14179
14180 // Now gather the cross-half inputs and place them into a free dword of
14181 // their target half.
14182 // FIXME: This operation could almost certainly be simplified dramatically to
14183 // look more like the 3-1 fixing operation.
14184 auto moveInputsToRightHalf = [&PSHUFDMask](
14185 MutableArrayRef<int> IncomingInputs, ArrayRef<int> ExistingInputs,
14186 MutableArrayRef<int> SourceHalfMask, MutableArrayRef<int> HalfMask,
14187 MutableArrayRef<int> FinalSourceHalfMask, int SourceOffset,
14188 int DestOffset) {
14189 auto isWordClobbered = [](ArrayRef<int> SourceHalfMask, int Word) {
14190 return SourceHalfMask[Word] >= 0 && SourceHalfMask[Word] != Word;
14191 };
14192 auto isDWordClobbered = [&isWordClobbered](ArrayRef<int> SourceHalfMask,
14193 int Word) {
14194 int LowWord = Word & ~1;
14195 int HighWord = Word | 1;
14196 return isWordClobbered(SourceHalfMask, LowWord) ||
14197 isWordClobbered(SourceHalfMask, HighWord);
14198 };
14199
14200 if (IncomingInputs.empty())
14201 return;
14202
14203 if (ExistingInputs.empty()) {
14204 // Map any dwords with inputs from them into the right half.
14205 for (int Input : IncomingInputs) {
14206 // If the source half mask maps over the inputs, turn those into
14207 // swaps and use the swapped lane.
14208 if (isWordClobbered(SourceHalfMask, Input - SourceOffset)) {
14209 if (SourceHalfMask[SourceHalfMask[Input - SourceOffset]] < 0) {
14210 SourceHalfMask[SourceHalfMask[Input - SourceOffset]] =
14211 Input - SourceOffset;
14212 // We have to swap the uses in our half mask in one sweep.
14213 for (int &M : HalfMask)
14214 if (M == SourceHalfMask[Input - SourceOffset] + SourceOffset)
14215 M = Input;
14216 else if (M == Input)
14217 M = SourceHalfMask[Input - SourceOffset] + SourceOffset;
14218 } else {
14219 assert(SourceHalfMask[SourceHalfMask[Input - SourceOffset]] ==
14220 Input - SourceOffset &&
14221 "Previous placement doesn't match!");
14222 }
14223 // Note that this correctly re-maps both when we do a swap and when
14224 // we observe the other side of the swap above. We rely on that to
14225 // avoid swapping the members of the input list directly.
14226 Input = SourceHalfMask[Input - SourceOffset] + SourceOffset;
14227 }
14228
14229 // Map the input's dword into the correct half.
14230 if (PSHUFDMask[(Input - SourceOffset + DestOffset) / 2] < 0)
14231 PSHUFDMask[(Input - SourceOffset + DestOffset) / 2] = Input / 2;
14232 else
14233 assert(PSHUFDMask[(Input - SourceOffset + DestOffset) / 2] ==
14234 Input / 2 &&
14235 "Previous placement doesn't match!");
14236 }
14237
14238 // And just directly shift any other-half mask elements to be same-half
14239 // as we will have mirrored the dword containing the element into the
14240 // same position within that half.
14241 for (int &M : HalfMask)
14242 if (M >= SourceOffset && M < SourceOffset + 4) {
14243 M = M - SourceOffset + DestOffset;
14244 assert(M >= 0 && "This should never wrap below zero!");
14245 }
14246 return;
14247 }
14248
14249 // Ensure we have the input in a viable dword of its current half. This
14250 // is particularly tricky because the original position may be clobbered
14251 // by inputs being moved and *staying* in that half.
14252 if (IncomingInputs.size() == 1) {
14253 if (isWordClobbered(SourceHalfMask, IncomingInputs[0] - SourceOffset)) {
14254 int InputFixed = find(SourceHalfMask, -1) - std::begin(SourceHalfMask) +
14255 SourceOffset;
14256 SourceHalfMask[InputFixed - SourceOffset] =
14257 IncomingInputs[0] - SourceOffset;
14258 llvm::replace(HalfMask, IncomingInputs[0], InputFixed);
14259 IncomingInputs[0] = InputFixed;
14260 }
14261 } else if (IncomingInputs.size() == 2) {
14262 if (IncomingInputs[0] / 2 != IncomingInputs[1] / 2 ||
14263 isDWordClobbered(SourceHalfMask, IncomingInputs[0] - SourceOffset)) {
14264 // We have two non-adjacent or clobbered inputs we need to extract from
14265 // the source half. To do this, we need to map them into some adjacent
14266 // dword slot in the source mask.
14267 int InputsFixed[2] = {IncomingInputs[0] - SourceOffset,
14268 IncomingInputs[1] - SourceOffset};
14269
14270 // If there is a free slot in the source half mask adjacent to one of
14271 // the inputs, place the other input in it. We use (Index XOR 1) to
14272 // compute an adjacent index.
14273 if (!isWordClobbered(SourceHalfMask, InputsFixed[0]) &&
14274 SourceHalfMask[InputsFixed[0] ^ 1] < 0) {
14275 SourceHalfMask[InputsFixed[0]] = InputsFixed[0];
14276 SourceHalfMask[InputsFixed[0] ^ 1] = InputsFixed[1];
14277 InputsFixed[1] = InputsFixed[0] ^ 1;
14278 } else if (!isWordClobbered(SourceHalfMask, InputsFixed[1]) &&
14279 SourceHalfMask[InputsFixed[1] ^ 1] < 0) {
14280 SourceHalfMask[InputsFixed[1]] = InputsFixed[1];
14281 SourceHalfMask[InputsFixed[1] ^ 1] = InputsFixed[0];
14282 InputsFixed[0] = InputsFixed[1] ^ 1;
14283 } else if (SourceHalfMask[2 * ((InputsFixed[0] / 2) ^ 1)] < 0 &&
14284 SourceHalfMask[2 * ((InputsFixed[0] / 2) ^ 1) + 1] < 0) {
14285 // The two inputs are in the same DWord but it is clobbered and the
14286 // adjacent DWord isn't used at all. Move both inputs to the free
14287 // slot.
14288 SourceHalfMask[2 * ((InputsFixed[0] / 2) ^ 1)] = InputsFixed[0];
14289 SourceHalfMask[2 * ((InputsFixed[0] / 2) ^ 1) + 1] = InputsFixed[1];
14290 InputsFixed[0] = 2 * ((InputsFixed[0] / 2) ^ 1);
14291 InputsFixed[1] = 2 * ((InputsFixed[0] / 2) ^ 1) + 1;
14292 } else {
14293 // The only way we hit this point is if there is no clobbering
14294 // (because there are no off-half inputs to this half) and there is no
14295 // free slot adjacent to one of the inputs. In this case, we have to
14296 // swap an input with a non-input.
14297 for (int i = 0; i < 4; ++i)
14298 assert((SourceHalfMask[i] < 0 || SourceHalfMask[i] == i) &&
14299 "We can't handle any clobbers here!");
14300 assert(InputsFixed[1] != (InputsFixed[0] ^ 1) &&
14301 "Cannot have adjacent inputs here!");
14302
14303 SourceHalfMask[InputsFixed[0] ^ 1] = InputsFixed[1];
14304 SourceHalfMask[InputsFixed[1]] = InputsFixed[0] ^ 1;
14305
14306 // We also have to update the final source mask in this case because
14307 // it may need to undo the above swap.
14308 for (int &M : FinalSourceHalfMask)
14309 if (M == (InputsFixed[0] ^ 1) + SourceOffset)
14310 M = InputsFixed[1] + SourceOffset;
14311 else if (M == InputsFixed[1] + SourceOffset)
14312 M = (InputsFixed[0] ^ 1) + SourceOffset;
14313
14314 InputsFixed[1] = InputsFixed[0] ^ 1;
14315 }
14316
14317 // Point everything at the fixed inputs.
14318 for (int &M : HalfMask)
14319 if (M == IncomingInputs[0])
14320 M = InputsFixed[0] + SourceOffset;
14321 else if (M == IncomingInputs[1])
14322 M = InputsFixed[1] + SourceOffset;
14323
14324 IncomingInputs[0] = InputsFixed[0] + SourceOffset;
14325 IncomingInputs[1] = InputsFixed[1] + SourceOffset;
14326 }
14327 } else {
14328 llvm_unreachable("Unhandled input size!");
14329 }
14330
14331 // Now hoist the DWord down to the right half.
14332 int FreeDWord = (PSHUFDMask[DestOffset / 2] < 0 ? 0 : 1) + DestOffset / 2;
14333 assert(PSHUFDMask[FreeDWord] < 0 && "DWord not free");
14334 PSHUFDMask[FreeDWord] = IncomingInputs[0] / 2;
14335 for (int &M : HalfMask)
14336 for (int Input : IncomingInputs)
14337 if (M == Input)
14338 M = FreeDWord * 2 + Input % 2;
14339 };
14340 moveInputsToRightHalf(HToLInputs, LToLInputs, PSHUFHMask, LoMask, HiMask,
14341 /*SourceOffset*/ 4, /*DestOffset*/ 0);
14342 moveInputsToRightHalf(LToHInputs, HToHInputs, PSHUFLMask, HiMask, LoMask,
14343 /*SourceOffset*/ 0, /*DestOffset*/ 4);
14344
14345 // Now enact all the shuffles we've computed to move the inputs into their
14346 // target half.
14347 if (!isNoopShuffleMask(PSHUFLMask))
14348 V = DAG.getNode(X86ISD::PSHUFLW, DL, VT, V,
14349 getV4X86ShuffleImm8ForMask(PSHUFLMask, DL, DAG));
14350 if (!isNoopShuffleMask(PSHUFHMask))
14351 V = DAG.getNode(X86ISD::PSHUFHW, DL, VT, V,
14352 getV4X86ShuffleImm8ForMask(PSHUFHMask, DL, DAG));
14353 if (!isNoopShuffleMask(PSHUFDMask))
14354 V = DAG.getBitcast(
14355 VT,
14356 DAG.getNode(X86ISD::PSHUFD, DL, PSHUFDVT, DAG.getBitcast(PSHUFDVT, V),
14357 getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG)));
14358
14359 // At this point, each half should contain all its inputs, and we can then
14360 // just shuffle them into their final position.
14361 assert(none_of(LoMask, [](int M) { return M >= 4; }) &&
14362 "Failed to lift all the high half inputs to the low mask!");
14363 assert(none_of(HiMask, [](int M) { return M >= 0 && M < 4; }) &&
14364 "Failed to lift all the low half inputs to the high mask!");
14365
14366 // Do a half shuffle for the low mask.
14367 if (!isNoopShuffleMask(LoMask))
14368 V = DAG.getNode(X86ISD::PSHUFLW, DL, VT, V,
14369 getV4X86ShuffleImm8ForMask(LoMask, DL, DAG));
14370
14371 // Do a half shuffle with the high mask after shifting its values down.
14372 for (int &M : HiMask)
14373 if (M >= 0)
14374 M -= 4;
14375 if (!isNoopShuffleMask(HiMask))
14376 V = DAG.getNode(X86ISD::PSHUFHW, DL, VT, V,
14377 getV4X86ShuffleImm8ForMask(HiMask, DL, DAG));
14378
14379 return V;
14380}
14381
14382/// Helper to form a PSHUFB-based shuffle+blend, opportunistically avoiding the
14383/// blend if only one input is used.
14384static SDValue lowerShuffleAsBlendOfPSHUFBs(
14385 const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
14386 const APInt &Zeroable, SelectionDAG &DAG, bool &V1InUse, bool &V2InUse) {
14387 assert(!is128BitLaneCrossingShuffleMask(VT, Mask) &&
14388 "Lane crossing shuffle masks not supported");
14389
14390 int NumBytes = VT.getSizeInBits() / 8;
14391 int Size = Mask.size();
14392 int Scale = NumBytes / Size;
14393
14394 SmallVector<SDValue, 64> V1Mask(NumBytes, DAG.getUNDEF(MVT::i8));
14395 SmallVector<SDValue, 64> V2Mask(NumBytes, DAG.getUNDEF(MVT::i8));
14396 V1InUse = false;
14397 V2InUse = false;
14398
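// Build per-byte PSHUFB control vectors. Each element of the original mask
// covers Scale consecutive bytes, so mask element M expands into the byte
// selects M*Scale .. M*Scale+Scale-1 (e.g. for v8i16, Scale == 2 and M == 3
// selects bytes 6 and 7). PSHUFB zeroes a destination byte whenever bit 7 of
// its control byte is set, which is why 0x80 is used both for bytes sourced
// from the other operand and for zeroable lanes.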
14399 for (int i = 0; i < NumBytes; ++i) {
14400 int M = Mask[i / Scale];
14401 if (M < 0)
14402 continue;
14403
14404 const int ZeroMask = 0x80;
14405 int V1Idx = M < Size ? M * Scale + i % Scale : ZeroMask;
14406 int V2Idx = M < Size ? ZeroMask : (M - Size) * Scale + i % Scale;
14407 if (Zeroable[i / Scale])
14408 V1Idx = V2Idx = ZeroMask;
14409
14410 V1Mask[i] = DAG.getConstant(V1Idx, DL, MVT::i8);
14411 V2Mask[i] = DAG.getConstant(V2Idx, DL, MVT::i8);
14412 V1InUse |= (ZeroMask != V1Idx);
14413 V2InUse |= (ZeroMask != V2Idx);
14414 }
14415
14416 MVT ShufVT = MVT::getVectorVT(MVT::i8, NumBytes);
14417 if (V1InUse)
14418 V1 = DAG.getNode(X86ISD::PSHUFB, DL, ShufVT, DAG.getBitcast(ShufVT, V1),
14419 DAG.getBuildVector(ShufVT, DL, V1Mask));
14420 if (V2InUse)
14421 V2 = DAG.getNode(X86ISD::PSHUFB, DL, ShufVT, DAG.getBitcast(ShufVT, V2),
14422 DAG.getBuildVector(ShufVT, DL, V2Mask));
14423
14424 // If we need shuffled inputs from both, blend the two.
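// A plain OR is a correct blend here: each PSHUFB above zeroed exactly the
// bytes it does not supply (via the 0x80 control bytes), so the two results
// never have overlapping non-zero lanes.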
14425 SDValue V;
14426 if (V1InUse && V2InUse)
14427 V = DAG.getNode(ISD::OR, DL, ShufVT, V1, V2);
14428 else
14429 V = V1InUse ? V1 : V2;
14430
14431 // Cast the result back to the correct type.
14432 return DAG.getBitcast(VT, V);
14433}
14434
14435/// Generic lowering of 8-lane i16 shuffles.
14436///
14437/// This handles both single-input shuffles and combined shuffle/blends with
14438/// two inputs. The single input shuffles are immediately delegated to
14439/// a dedicated lowering routine.
14440///
14441/// The blends are lowered in one of three fundamental ways. If there are few
14442/// enough inputs, it delegates to a basic UNPCK-based strategy. If the shuffle
14443/// of the input is significantly cheaper when lowered as an interleaving of
14444/// the two inputs, try to interleave them. Otherwise, blend the low and high
14445/// halves of the inputs separately (making them have relatively few inputs)
14446/// and then concatenate them.
14447static SDValue lowerV8I16Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
14448 const APInt &Zeroable, SDValue V1, SDValue V2,
14449 const X86Subtarget &Subtarget,
14450 SelectionDAG &DAG) {
14451 assert(V1.getSimpleValueType() == MVT::v8i16 && "Bad operand type!");
14452 assert(V2.getSimpleValueType() == MVT::v8i16 && "Bad operand type!");
14453 assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!");
14454
14455 // Whenever we can lower this as a zext, that instruction is strictly faster
14456 // than any alternative.
14457 if (SDValue ZExt = lowerShuffleAsZeroOrAnyExtend(DL, MVT::v8i16, V1, V2, Mask,
14458 Zeroable, Subtarget, DAG))
14459 return ZExt;
14460
14461 // Try to lower using a truncation.
14462 if (SDValue V = lowerShuffleWithVPMOV(DL, MVT::v8i16, V1, V2, Mask, Zeroable,
14463 Subtarget, DAG))
14464 return V;
14465
14466 int NumV2Inputs = count_if(Mask, [](int M) { return M >= 8; });
14467
14468 if (NumV2Inputs == 0) {
14469 // Try to use shift instructions.
14470 if (SDValue Shift =
14471 lowerShuffleAsShift(DL, MVT::v8i16, V1, V1, Mask, Zeroable,
14472 Subtarget, DAG, /*BitwiseOnly*/ false))
14473 return Shift;
14474
14475 // Check for being able to broadcast a single element.
14476 if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v8i16, V1, V2,
14477 Mask, Subtarget, DAG))
14478 return Broadcast;
14479
14480 // Try to use bit rotation instructions.
14481 if (SDValue Rotate = lowerShuffleAsBitRotate(DL, MVT::v8i16, V1, Mask,
14482 Subtarget, DAG))
14483 return Rotate;
14484
14485 // Use dedicated unpack instructions for masks that match their pattern.
14486 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v8i16, V1, V2, Mask, DAG))
14487 return V;
14488
14489 // Use dedicated pack instructions for masks that match their pattern.
14490 if (SDValue V =
14491 lowerShuffleWithPACK(DL, MVT::v8i16, V1, V2, Mask, Subtarget, DAG))
14492 return V;
14493
14494 // Try to use byte rotation instructions.
14495 if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v8i16, V1, V1, Mask,
14496 Subtarget, DAG))
14497 return Rotate;
14498
14499 // Make a copy of the mask so it can be modified.
14500 SmallVector<int, 8> MutableMask(Mask);
14501 return lowerV8I16GeneralSingleInputShuffle(DL, MVT::v8i16, V1, MutableMask,
14502 Subtarget, DAG);
14503 }
14504
14505 assert(llvm::any_of(Mask, [](int M) { return M >= 0 && M < 8; }) &&
14506 "All single-input shuffles should be canonicalized to be V1-input "
14507 "shuffles.");
14508
14509 // Try to use shift instructions.
14510 if (SDValue Shift =
14511 lowerShuffleAsShift(DL, MVT::v8i16, V1, V2, Mask, Zeroable, Subtarget,
14512 DAG, /*BitwiseOnly*/ false))
14513 return Shift;
14514
14515 // See if we can use SSE4A Extraction / Insertion.
14516 if (Subtarget.hasSSE4A())
14517 if (SDValue V = lowerShuffleWithSSE4A(DL, MVT::v8i16, V1, V2, Mask,
14518 Zeroable, DAG))
14519 return V;
14520
14521 // There are special ways we can lower some single-element blends.
14522 if (NumV2Inputs == 1)
14523 if (SDValue V = lowerShuffleAsElementInsertion(
14524 DL, MVT::v8i16, V1, V2, Mask, Zeroable, Subtarget, DAG))
14525 return V;
14526
14527 // We have different paths for blend lowering, but they all must use the
14528 // *exact* same predicate.
14529 bool IsBlendSupported = Subtarget.hasSSE41();
14530 if (IsBlendSupported)
14531 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v8i16, V1, V2, Mask,
14532 Zeroable, Subtarget, DAG))
14533 return Blend;
14534
14535 if (SDValue Masked = lowerShuffleAsBitMask(DL, MVT::v8i16, V1, V2, Mask,
14536 Zeroable, Subtarget, DAG))
14537 return Masked;
14538
14539 // Use dedicated unpack instructions for masks that match their pattern.
14540 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v8i16, V1, V2, Mask, DAG))
14541 return V;
14542
14543 // Use dedicated pack instructions for masks that match their pattern.
14544 if (SDValue V =
14545 lowerShuffleWithPACK(DL, MVT::v8i16, V1, V2, Mask, Subtarget, DAG))
14546 return V;
14547
14548 // Try to lower using a truncation.
14549 if (SDValue V = lowerShuffleAsVTRUNC(DL, MVT::v8i16, V1, V2, Mask, Zeroable,
14550 Subtarget, DAG))
14551 return V;
14552
14553 // Try to use byte rotation instructions.
14554 if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v8i16, V1, V2, Mask,
14555 Subtarget, DAG))
14556 return Rotate;
14557
14558 if (SDValue BitBlend =
14559 lowerShuffleAsBitBlend(DL, MVT::v8i16, V1, V2, Mask, DAG))
14560 return BitBlend;
14561
14562 // Try to use byte shift instructions to mask.
14563 if (SDValue V = lowerShuffleAsByteShiftMask(DL, MVT::v8i16, V1, V2, Mask,
14564 Zeroable, Subtarget, DAG))
14565 return V;
14566
14567 // Attempt to lower using compaction; SSE41 is necessary for PACKUSDW.
14568 int NumEvenDrops = canLowerByDroppingElements(Mask, true, false);
14569 if ((NumEvenDrops == 1 || (NumEvenDrops == 2 && Subtarget.hasSSE41())) &&
14570 !Subtarget.hasVLX()) {
14571 // Check if this is part of a 256-bit vector truncation.
14572 unsigned PackOpc = 0;
14573 if (NumEvenDrops == 2 && Subtarget.hasAVX2() &&
14574 V1.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
14575 V2.getOpcode() == ISD::EXTRACT_SUBVECTOR) {
14576 SDValue V1V2 = concatSubVectors(V1, V2, DAG, DL);
14577 V1V2 = DAG.getNode(X86ISD::BLENDI, DL, MVT::v16i16, V1V2,
14578 getZeroVector(MVT::v16i16, Subtarget, DAG, DL),
14579 DAG.getTargetConstant(0xEE, DL, MVT::i8));
14580 V1V2 = DAG.getBitcast(MVT::v8i32, V1V2);
14581 V1 = extract128BitVector(V1V2, 0, DAG, DL);
14582 V2 = extract128BitVector(V1V2, 4, DAG, DL);
14583 PackOpc = X86ISD::PACKUS;
14584 } else if (Subtarget.hasSSE41()) {
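// Mask each kept dword down to its low 16 bits (dropped dwords become 0),
// so every lane holds a value in [0, 0xFFFF]; PACKUSDW's unsigned saturation
// then passes the kept values through unchanged.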
14585 SmallVector<SDValue, 4> DWordClearOps(4,
14586 DAG.getConstant(0, DL, MVT::i32));
14587 for (unsigned i = 0; i != 4; i += 1 << (NumEvenDrops - 1))
14588 DWordClearOps[i] = DAG.getConstant(0xFFFF, DL, MVT::i32);
14589 SDValue DWordClearMask =
14590 DAG.getBuildVector(MVT::v4i32, DL, DWordClearOps);
14591 V1 = DAG.getNode(ISD::AND, DL, MVT::v4i32, DAG.getBitcast(MVT::v4i32, V1),
14592 DWordClearMask);
14593 V2 = DAG.getNode(ISD::AND, DL, MVT::v4i32, DAG.getBitcast(MVT::v4i32, V2),
14594 DWordClearMask);
14595 PackOpc = X86ISD::PACKUS;
14596 } else if (!Subtarget.hasSSSE3()) {
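// Without PACKUSDW, sign-extend the low word of each dword in place by
// shifting left then arithmetically right by 16; PACKSSDW's signed
// saturation then reproduces those words exactly.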
14597 SDValue ShAmt = DAG.getTargetConstant(16, DL, MVT::i8);
14598 V1 = DAG.getBitcast(MVT::v4i32, V1);
14599 V2 = DAG.getBitcast(MVT::v4i32, V2);
14600 V1 = DAG.getNode(X86ISD::VSHLI, DL, MVT::v4i32, V1, ShAmt);
14601 V2 = DAG.getNode(X86ISD::VSHLI, DL, MVT::v4i32, V2, ShAmt);
14602 V1 = DAG.getNode(X86ISD::VSRAI, DL, MVT::v4i32, V1, ShAmt);
14603 V2 = DAG.getNode(X86ISD::VSRAI, DL, MVT::v4i32, V2, ShAmt);
14604 PackOpc = X86ISD::PACKSS;
14605 }
14606 if (PackOpc) {
14607 // Now pack things back together.
14608 SDValue Result = DAG.getNode(PackOpc, DL, MVT::v8i16, V1, V2);
14609 if (NumEvenDrops == 2) {
14610 Result = DAG.getBitcast(MVT::v4i32, Result);
14611 Result = DAG.getNode(PackOpc, DL, MVT::v8i16, Result, Result);
14612 }
14613 return Result;
14614 }
14615 }
14616
14617 // When compacting odd (upper) elements, use PACKSS pre-SSE41.
14618 int NumOddDrops = canLowerByDroppingElements(Mask, false, false);
14619 if (NumOddDrops == 1) {
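// Shift the odd words down into the even positions, then pack. The shift
// kind is matched to the pack's saturation: a logical shift feeds PACKUSDW
// (unsigned) on SSE4.1, an arithmetic shift feeds PACKSSDW (signed) before
// that, so the surviving words are copied through unmodified.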
14620 bool HasSSE41 = Subtarget.hasSSE41();
14621 V1 = DAG.getNode(HasSSE41 ? X86ISD::VSRLI : X86ISD::VSRAI, DL, MVT::v4i32,
14622 DAG.getBitcast(MVT::v4i32, V1),
14623 DAG.getTargetConstant(16, DL, MVT::i8));
14624 V2 = DAG.getNode(HasSSE41 ? X86ISD::VSRLI : X86ISD::VSRAI, DL, MVT::v4i32,
14625 DAG.getBitcast(MVT::v4i32, V2),
14626 DAG.getTargetConstant(16, DL, MVT::i8));
14627 return DAG.getNode(HasSSE41 ? X86ISD::PACKUS : X86ISD::PACKSS, DL,
14628 MVT::v8i16, V1, V2);
14629 }
14630
14631 // Try to lower by permuting the inputs into an unpack instruction.
14632 if (SDValue Unpack = lowerShuffleAsPermuteAndUnpack(DL, MVT::v8i16, V1, V2,
14633 Mask, Subtarget, DAG))
14634 return Unpack;
14635
14636 // If we can't directly blend but can use PSHUFB, that will be better as it
14637 // can both shuffle and set up the inefficient blend.
14638 if (!IsBlendSupported && Subtarget.hasSSSE3()) {
14639 bool V1InUse, V2InUse;
14640 return lowerShuffleAsBlendOfPSHUFBs(DL, MVT::v8i16, V1, V2, Mask,
14641 Zeroable, DAG, V1InUse, V2InUse);
14642 }
14643
14644 // We can always bit-blend if we have to, so the fallback strategy is to
14645 // decompose into single-input permutes and blends/unpacks.
14646 return lowerShuffleAsDecomposedShuffleMerge(DL, MVT::v8i16, V1, V2, Mask,
14647 Zeroable, Subtarget, DAG);
14648}
14649
14650/// Lower 8-lane 16-bit floating point shuffles.
14651static SDValue lowerV8F16Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
14652 const APInt &Zeroable, SDValue V1, SDValue V2,
14653 const X86Subtarget &Subtarget,
14654 SelectionDAG &DAG) {
14655 assert(V1.getSimpleValueType() == MVT::v8f16 && "Bad operand type!");
14656 assert(V2.getSimpleValueType() == MVT::v8f16 && "Bad operand type!");
14657 assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!");
14658 int NumV2Elements = count_if(Mask, [](int M) { return M >= 8; });
14659
14660 if (Subtarget.hasFP16()) {
14661 if (NumV2Elements == 0) {
14662 // Check for being able to broadcast a single element.
14663 if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v8f16, V1, V2,
14664 Mask, Subtarget, DAG))
14665 return Broadcast;
14666 }
14667 if (NumV2Elements == 1 && Mask[0] >= 8)
14668 if (SDValue V = lowerShuffleAsElementInsertion(
14669 DL, MVT::v8f16, V1, V2, Mask, Zeroable, Subtarget, DAG))
14670 return V;
14671 }
14672
14673 V1 = DAG.getBitcast(MVT::v8i16, V1);
14674 V2 = DAG.getBitcast(MVT::v8i16, V2);
14675 return DAG.getBitcast(MVT::v8f16,
14676 DAG.getVectorShuffle(MVT::v8i16, DL, V1, V2, Mask));
14677}
14678
14679// Lowers a unary/binary shuffle as VPERMV/VPERMV3. For non-VLX targets,
14680// sub-512-bit shuffles are padded to 512 bits for the shuffle and then
14681// the active subvector is extracted.
14682static SDValue lowerShuffleWithPERMV(const SDLoc &DL, MVT VT,
14683 ArrayRef<int> OriginalMask, SDValue V1,
14684 SDValue V2, const X86Subtarget &Subtarget,
14685 SelectionDAG &DAG) {
14686 // Commute binary inputs so V2 is a load to simplify VPERMI2/T2 folds.
14687 SmallVector<int, 32> Mask(OriginalMask);
14688 if (!V2.isUndef() && isShuffleFoldableLoad(V1) &&
14689 !isShuffleFoldableLoad(V2)) {
14690 ShuffleVectorSDNode::commuteMask(Mask);
14691 std::swap(V1, V2);
14692 }
14693
14694 MVT MaskVT = VT.changeTypeToInteger();
14695 SDValue MaskNode;
14696 MVT ShuffleVT = VT;
14697 if (!VT.is512BitVector() && !Subtarget.hasVLX()) {
14698 V1 = widenSubVector(V1, false, Subtarget, DAG, DL, 512);
14699 V2 = widenSubVector(V2, false, Subtarget, DAG, DL, 512);
14700 ShuffleVT = V1.getSimpleValueType();
14701
14702 // Adjust mask to correct indices for the second input.
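// After widening, V1 occupies Scale * NumElts lanes, so indices that used to
// refer to V2 (>= NumElts) must be rebased past the widened first operand.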
14703 int NumElts = VT.getVectorNumElements();
14704 unsigned Scale = 512 / VT.getSizeInBits();
14705 SmallVector<int, 32> AdjustedMask(Mask);
14706 for (int &M : AdjustedMask)
14707 if (NumElts <= M)
14708 M += (Scale - 1) * NumElts;
14709 MaskNode = getConstVector(AdjustedMask, MaskVT, DAG, DL, true);
14710 MaskNode = widenSubVector(MaskNode, false, Subtarget, DAG, DL, 512);
14711 } else {
14712 MaskNode = getConstVector(Mask, MaskVT, DAG, DL, true);
14713 }
14714
14715 SDValue Result;
14716 if (V2.isUndef())
14717 Result = DAG.getNode(X86ISD::VPERMV, DL, ShuffleVT, MaskNode, V1);
14718 else
14719 Result = DAG.getNode(X86ISD::VPERMV3, DL, ShuffleVT, V1, MaskNode, V2);
14720
14721 if (VT != ShuffleVT)
14722 Result = extractSubVector(Result, 0, DAG, DL, VT.getSizeInBits());
14723
14724 return Result;
14725}
14726
14727/// Generic lowering of v16i8 shuffles.
14728///
14729/// This is a hybrid strategy to lower v16i8 vectors. It first attempts to
14730/// detect any complexity reducing interleaving. If that doesn't help, it uses
14731/// UNPCK to spread the i8 elements across two i16-element vectors, and uses
14732/// the existing lowering for v8i16 blends on each half, finally PACK-ing them
14733/// back together.
14734static SDValue lowerV16I8Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
14735 const APInt &Zeroable, SDValue V1, SDValue V2,
14736 const X86Subtarget &Subtarget,
14737 SelectionDAG &DAG) {
14738 assert(V1.getSimpleValueType() == MVT::v16i8 && "Bad operand type!");
14739 assert(V2.getSimpleValueType() == MVT::v16i8 && "Bad operand type!");
14740 assert(Mask.size() == 16 && "Unexpected mask size for v16 shuffle!");
14741
14742 // Try to use shift instructions.
14743 if (SDValue Shift =
14744 lowerShuffleAsShift(DL, MVT::v16i8, V1, V2, Mask, Zeroable, Subtarget,
14745 DAG, /*BitwiseOnly*/ false))
14746 return Shift;
14747
14748 // Try to use byte rotation instructions.
14749 if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v16i8, V1, V2, Mask,
14750 Subtarget, DAG))
14751 return Rotate;
14752
14753 // Use dedicated pack instructions for masks that match their pattern.
14754 if (SDValue V =
14755 lowerShuffleWithPACK(DL, MVT::v16i8, V1, V2, Mask, Subtarget, DAG))
14756 return V;
14757
14758 // Try to use a zext lowering.
14759 if (SDValue ZExt = lowerShuffleAsZeroOrAnyExtend(DL, MVT::v16i8, V1, V2, Mask,
14760 Zeroable, Subtarget, DAG))
14761 return ZExt;
14762
14763 // Try to lower using a truncation.
14764 if (SDValue V = lowerShuffleWithVPMOV(DL, MVT::v16i8, V1, V2, Mask, Zeroable,
14765 Subtarget, DAG))
14766 return V;
14767
14768 if (SDValue V = lowerShuffleAsVTRUNC(DL, MVT::v16i8, V1, V2, Mask, Zeroable,
14769 Subtarget, DAG))
14770 return V;
14771
14772 // See if we can use SSE4A Extraction / Insertion.
14773 if (Subtarget.hasSSE4A())
14774 if (SDValue V = lowerShuffleWithSSE4A(DL, MVT::v16i8, V1, V2, Mask,
14775 Zeroable, DAG))
14776 return V;
14777
14778 int NumV2Elements = count_if(Mask, [](int M) { return M >= 16; });
14779
14780 // For single-input shuffles, there are some nicer lowering tricks we can use.
14781 if (NumV2Elements == 0) {
14782 // Check for being able to broadcast a single element.
14783 if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v16i8, V1, V2,
14784 Mask, Subtarget, DAG))
14785 return Broadcast;
14786
14787 // Try to use bit rotation instructions.
14788 if (SDValue Rotate = lowerShuffleAsBitRotate(DL, MVT::v16i8, V1, Mask,
14789 Subtarget, DAG))
14790 return Rotate;
14791
14792 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v16i8, V1, V2, Mask, DAG))
14793 return V;
14794
14795 // Check whether we can widen this to an i16 shuffle by duplicating bytes.
14796 // Notably, this handles splat and partial-splat shuffles more efficiently.
14797 // However, it only makes sense if the pre-duplication shuffle simplifies
14798 // things significantly. Currently, this means we need to be able to
14799 // express the pre-duplication shuffle as an i16 shuffle.
14800 //
14801 // FIXME: We should check for other patterns which can be widened into an
14802 // i16 shuffle as well.
14803 auto canWidenViaDuplication = [](ArrayRef<int> Mask) {
14804 for (int i = 0; i < 16; i += 2)
14805 if (Mask[i] >= 0 && Mask[i + 1] >= 0 && Mask[i] != Mask[i + 1])
14806 return false;
14807
14808 return true;
14809 };
14810 auto tryToWidenViaDuplication = [&]() -> SDValue {
14811 if (!canWidenViaDuplication(Mask))
14812 return SDValue();
14813 SmallVector<int, 4> LoInputs;
14814 copy_if(Mask, std::back_inserter(LoInputs),
14815 [](int M) { return M >= 0 && M < 8; });
14816 array_pod_sort(LoInputs.begin(), LoInputs.end());
14817 LoInputs.erase(llvm::unique(LoInputs), LoInputs.end());
14818 SmallVector<int, 4> HiInputs;
14819 copy_if(Mask, std::back_inserter(HiInputs), [](int M) { return M >= 8; });
14820 array_pod_sort(HiInputs.begin(), HiInputs.end());
14821 HiInputs.erase(llvm::unique(HiInputs), HiInputs.end());
14822
14823 bool TargetLo = LoInputs.size() >= HiInputs.size();
14824 ArrayRef<int> InPlaceInputs = TargetLo ? LoInputs : HiInputs;
14825 ArrayRef<int> MovingInputs = TargetLo ? HiInputs : LoInputs;
14826
14827 int PreDupI16Shuffle[] = {-1, -1, -1, -1, -1, -1, -1, -1};
14828 SmallDenseMap<int, int, 8> LaneMap;
14829 for (int I : InPlaceInputs) {
14830 PreDupI16Shuffle[I/2] = I/2;
14831 LaneMap[I] = I;
14832 }
14833 int j = TargetLo ? 0 : 4, je = j + 4;
14834 for (int i = 0, ie = MovingInputs.size(); i < ie; ++i) {
14835 // Check if j is already a shuffle of this input. This happens when
14836 // there are two adjacent bytes after we move the low one.
14837 if (PreDupI16Shuffle[j] != MovingInputs[i] / 2) {
14838 // If we haven't yet mapped the input, search for a slot into which
14839 // we can map it.
14840 while (j < je && PreDupI16Shuffle[j] >= 0)
14841 ++j;
14842
14843 if (j == je)
14844 // We can't place the inputs into a single half with a simple i16 shuffle, so bail.
14845 return SDValue();
14846
14847 // Map this input with the i16 shuffle.
14848 PreDupI16Shuffle[j] = MovingInputs[i] / 2;
14849 }
14850
14851 // Update the lane map based on the mapping we ended up with.
14852 LaneMap[MovingInputs[i]] = 2 * j + MovingInputs[i] % 2;
14853 }
14854 V1 = DAG.getBitcast(
14855 MVT::v16i8,
14856 DAG.getVectorShuffle(MVT::v8i16, DL, DAG.getBitcast(MVT::v8i16, V1),
14857 DAG.getUNDEF(MVT::v8i16), PreDupI16Shuffle));
14858
14859 // Unpack the bytes to form the i16s that will be shuffled into place.
14860 bool EvenInUse = false, OddInUse = false;
14861 for (int i = 0; i < 16; i += 2) {
14862 EvenInUse |= (Mask[i + 0] >= 0);
14863 OddInUse |= (Mask[i + 1] >= 0);
14864 if (EvenInUse && OddInUse)
14865 break;
14866 }
14867 V1 = DAG.getNode(TargetLo ? X86ISD::UNPCKL : X86ISD::UNPCKH, DL,
14868 MVT::v16i8, EvenInUse ? V1 : DAG.getUNDEF(MVT::v16i8),
14869 OddInUse ? V1 : DAG.getUNDEF(MVT::v16i8));
14870
14871 int PostDupI16Shuffle[8] = {-1, -1, -1, -1, -1, -1, -1, -1};
14872 for (int i = 0; i < 16; ++i)
14873 if (Mask[i] >= 0) {
14874 int MappedMask = LaneMap[Mask[i]] - (TargetLo ? 0 : 8);
14875 assert(MappedMask < 8 && "Invalid v8 shuffle mask!");
14876 if (PostDupI16Shuffle[i / 2] < 0)
14877 PostDupI16Shuffle[i / 2] = MappedMask;
14878 else
14879 assert(PostDupI16Shuffle[i / 2] == MappedMask &&
14880 "Conflicting entries in the original shuffle!");
14881 }
14882 return DAG.getBitcast(
14883 MVT::v16i8,
14884 DAG.getVectorShuffle(MVT::v8i16, DL, DAG.getBitcast(MVT::v8i16, V1),
14885 DAG.getUNDEF(MVT::v8i16), PostDupI16Shuffle));
14886 };
14887 if (SDValue V = tryToWidenViaDuplication())
14888 return V;
14889 }
14890
14891 if (SDValue Masked = lowerShuffleAsBitMask(DL, MVT::v16i8, V1, V2, Mask,
14892 Zeroable, Subtarget, DAG))
14893 return Masked;
14894
14895 // Use dedicated unpack instructions for masks that match their pattern.
14896 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v16i8, V1, V2, Mask, DAG))
14897 return V;
14898
14899 // Try to use byte shift instructions to mask.
14900 if (SDValue V = lowerShuffleAsByteShiftMask(DL, MVT::v16i8, V1, V2, Mask,
14901 Zeroable, Subtarget, DAG))
14902 return V;
14903
14904 // Check for compaction patterns.
14905 bool IsSingleInput = V2.isUndef();
14906 int NumEvenDrops = canLowerByDroppingElements(Mask, true, IsSingleInput);
14907
14908 // Check for SSSE3 which lets us lower all v16i8 shuffles much more directly
14909 // with PSHUFB. It is important to do this before we attempt to generate any
14910 // blends but after all of the single-input lowerings. If the single input
14911 // lowerings can find an instruction sequence that is faster than a PSHUFB, we
14912 // want to preserve that and we can DAG combine any longer sequences into
14913 // a PSHUFB in the end. But once we start blending from multiple inputs,
14914 // the complexity of DAG combining bad patterns back into PSHUFB is too high,
14915 // and there are *very* few patterns that would actually be faster than the
14916 // PSHUFB approach because of its ability to zero lanes.
14917 //
14918 // If the mask is a binary compaction, we can more efficiently perform this
14919 // as a PACKUS(AND(),AND()) - which is quicker than UNPACK(PSHUFB(),PSHUFB()).
14920 //
14921 // FIXME: The only exceptions to the above are blends which are exact
14922 // interleavings with direct instructions supporting them. We currently don't
14923 // handle those well here.
14924 if (Subtarget.hasSSSE3() && (IsSingleInput || NumEvenDrops != 1)) {
14925 bool V1InUse = false;
14926 bool V2InUse = false;
14927
14928 SDValue PSHUFB = lowerShuffleAsBlendOfPSHUFBs(
14929 DL, MVT::v16i8, V1, V2, Mask, Zeroable, DAG, V1InUse, V2InUse);
14930
14931 // If both V1 and V2 are in use and we can use a direct blend or an unpack,
14932 // do so. This avoids using them to handle blends-with-zero which is
14933 // important as a single pshufb is significantly faster for that.
14934 if (V1InUse && V2InUse) {
14935 if (Subtarget.hasSSE41())
14936 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v16i8, V1, V2, Mask,
14937 Zeroable, Subtarget, DAG))
14938 return Blend;
14939
14940 // We can use an unpack to do the blending rather than an or in some
14941 // cases. Even though the or may be (very slightly) more efficient, we
14942 // prefer this lowering because there are common cases where part of
14943 // the complexity of the shuffles goes away when we do the final blend as
14944 // an unpack.
14945 // FIXME: It might be worth trying to detect if the unpack-feeding
14946 // shuffles will both be pshufb, in which case we shouldn't bother with
14947 // this.
14948 if (SDValue Unpack = lowerShuffleAsPermuteAndUnpack(
14949 DL, MVT::v16i8, V1, V2, Mask, Subtarget, DAG))
14950 return Unpack;
14951
14952 // AVX512VBMI can lower to VPERMB (non-VLX will pad to v64i8).
14953 if (Subtarget.hasVBMI())
14954 return lowerShuffleWithPERMV(DL, MVT::v16i8, Mask, V1, V2, Subtarget,
14955 DAG);
14956
14957 // If we have XOP we can use one VPPERM instead of multiple PSHUFBs.
14958 if (Subtarget.hasXOP()) {
14959 SDValue MaskNode = getConstVector(Mask, MVT::v16i8, DAG, DL, true);
14960 return DAG.getNode(X86ISD::VPPERM, DL, MVT::v16i8, V1, V2, MaskNode);
14961 }
14962
14963 // Use PALIGNR+Permute if possible - permute might become PSHUFB but the
14964 // PALIGNR will be cheaper than the second PSHUFB+OR.
14965 if (SDValue V = lowerShuffleAsByteRotateAndPermute(
14966 DL, MVT::v16i8, V1, V2, Mask, Subtarget, DAG))
14967 return V;
14968 }
14969
14970 return PSHUFB;
14971 }
14972
14973 // There are special ways we can lower some single-element blends.
14974 if (NumV2Elements == 1)
14975 if (SDValue V = lowerShuffleAsElementInsertion(
14976 DL, MVT::v16i8, V1, V2, Mask, Zeroable, Subtarget, DAG))
14977 return V;
14978
14979 if (SDValue Blend = lowerShuffleAsBitBlend(DL, MVT::v16i8, V1, V2, Mask, DAG))
14980 return Blend;
14981
14982 // Check whether a compaction lowering can be done. This handles shuffles
14983 // which take every Nth element for some even N. See the helper function for
14984 // details.
14985 //
14986 // We special case these as they can be particularly efficiently handled with
14987 // the PACKUSWB instruction on x86 and they show up in common patterns of
14988 // rearranging bytes to truncate wide elements.
14989 if (NumEvenDrops) {
14990 // NumEvenDrops is the power of two stride of the elements. Another way of
14991 // thinking about it is that we need to drop the even elements this many
14992 // times to get the original input.
14993
14994 // First we need to zero all the dropped bytes.
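// Zeroing the dropped bytes keeps every 16-bit lane in [0, 0xFF], so the
// PACKUSWB nodes below copy the surviving bytes through without saturating.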
14995 assert(NumEvenDrops <= 3 &&
14996 "No support for dropping even elements more than 3 times.");
14997 SmallVector<SDValue, 8> WordClearOps(8, DAG.getConstant(0, DL, MVT::i16));
14998 for (unsigned i = 0; i != 8; i += 1 << (NumEvenDrops - 1))
14999 WordClearOps[i] = DAG.getConstant(0xFF, DL, MVT::i16);
15000 SDValue WordClearMask = DAG.getBuildVector(MVT::v8i16, DL, WordClearOps);
15001 V1 = DAG.getNode(ISD::AND, DL, MVT::v8i16, DAG.getBitcast(MVT::v8i16, V1),
15002 WordClearMask);
15003 if (!IsSingleInput)
15004 V2 = DAG.getNode(ISD::AND, DL, MVT::v8i16, DAG.getBitcast(MVT::v8i16, V2),
15005 WordClearMask);
15006
15007 // Now pack things back together.
15008 SDValue Result = DAG.getNode(X86ISD::PACKUS, DL, MVT::v16i8, V1,
15009 IsSingleInput ? V1 : V2);
15010 for (int i = 1; i < NumEvenDrops; ++i) {
15011 Result = DAG.getBitcast(MVT::v8i16, Result);
15012 Result = DAG.getNode(X86ISD::PACKUS, DL, MVT::v16i8, Result, Result);
15013 }
15014 return Result;
15015 }
15016
15017 int NumOddDrops = canLowerByDroppingElements(Mask, false, IsSingleInput);
15018 if (NumOddDrops == 1) {
15019 V1 = DAG.getNode(X86ISD::VSRLI, DL, MVT::v8i16,
15020 DAG.getBitcast(MVT::v8i16, V1),
15021 DAG.getTargetConstant(8, DL, MVT::i8));
15022 if (!IsSingleInput)
15023 V2 = DAG.getNode(X86ISD::VSRLI, DL, MVT::v8i16,
15024 DAG.getBitcast(MVT::v8i16, V2),
15025 DAG.getTargetConstant(8, DL, MVT::i8));
15026 return DAG.getNode(X86ISD::PACKUS, DL, MVT::v16i8, V1,
15027 IsSingleInput ? V1 : V2);
15028 }
15029
15030 // Handle multi-input cases by blending/unpacking single-input shuffles.
15031 if (NumV2Elements > 0)
15032 return lowerShuffleAsDecomposedShuffleMerge(DL, MVT::v16i8, V1, V2, Mask,
15033 Zeroable, Subtarget, DAG);
15034
15035 // The fallback path for single-input shuffles widens this into two v8i16
15036 // vectors with unpacks, shuffles those, and then pulls them back together
15037 // with a pack.
15038 SDValue V = V1;
15039
15040 std::array<int, 8> LoBlendMask = {{-1, -1, -1, -1, -1, -1, -1, -1}};
15041 std::array<int, 8> HiBlendMask = {{-1, -1, -1, -1, -1, -1, -1, -1}};
15042 for (int i = 0; i < 16; ++i)
15043 if (Mask[i] >= 0)
15044 (i < 8 ? LoBlendMask[i] : HiBlendMask[i % 8]) = Mask[i];
15045
15046 SDValue VLoHalf, VHiHalf;
15047 // Check if any of the odd lanes in the v16i8 are used. If not, we can mask
15048 // them out and avoid using UNPCK{L,H} to extract the elements of V as
15049 // i16s.
15050 if (none_of(LoBlendMask, [](int M) { return M >= 0 && M % 2 == 1; }) &&
15051 none_of(HiBlendMask, [](int M) { return M >= 0 && M % 2 == 1; })) {
15052 // Use a mask to drop the high bytes.
15053 VLoHalf = DAG.getBitcast(MVT::v8i16, V);
15054 VLoHalf = DAG.getNode(ISD::AND, DL, MVT::v8i16, VLoHalf,
15055 DAG.getConstant(0x00FF, DL, MVT::v8i16));
15056
15057 // This will be a single vector shuffle instead of a blend so nuke VHiHalf.
15058 VHiHalf = DAG.getUNDEF(MVT::v8i16);
15059
15060 // Squash the masks to point directly into VLoHalf.
15061 for (int &M : LoBlendMask)
15062 if (M >= 0)
15063 M /= 2;
15064 for (int &M : HiBlendMask)
15065 if (M >= 0)
15066 M /= 2;
15067 } else {
15068 // Otherwise just unpack the low half of V into VLoHalf and the high half into
15069 // VHiHalf so that we can blend them as i16s.
15070 SDValue Zero = getZeroVector(MVT::v16i8, Subtarget, DAG, DL);
15071
15072 VLoHalf = DAG.getBitcast(
15073 MVT::v8i16, DAG.getNode(X86ISD::UNPCKL, DL, MVT::v16i8, V, Zero));
15074 VHiHalf = DAG.getBitcast(
15075 MVT::v8i16, DAG.getNode(X86ISD::UNPCKH, DL, MVT::v16i8, V, Zero));
15076 }
15077
15078 SDValue LoV = DAG.getVectorShuffle(MVT::v8i16, DL, VLoHalf, VHiHalf, LoBlendMask);
15079 SDValue HiV = DAG.getVectorShuffle(MVT::v8i16, DL, VLoHalf, VHiHalf, HiBlendMask);
15080
15081 return DAG.getNode(X86ISD::PACKUS, DL, MVT::v16i8, LoV, HiV);
15082}
15083
15084/// Dispatching routine to lower various 128-bit x86 vector shuffles.
15085///
15086/// This routine breaks down the specific type of 128-bit shuffle and
15087/// dispatches to the lowering routines accordingly.
15088static SDValue lower128BitShuffle(const SDLoc &DL, ArrayRef<int> Mask,
15089 MVT VT, SDValue V1, SDValue V2,
15090 const APInt &Zeroable,
15091 const X86Subtarget &Subtarget,
15092 SelectionDAG &DAG) {
15093 if (VT == MVT::v8bf16) {
15094 V1 = DAG.getBitcast(MVT::v8i16, V1);
15095 V2 = DAG.getBitcast(MVT::v8i16, V2);
15096 return DAG.getBitcast(VT,
15097 DAG.getVectorShuffle(MVT::v8i16, DL, V1, V2, Mask));
15098 }
15099
15100 switch (VT.SimpleTy) {
15101 case MVT::v2i64:
15102 return lowerV2I64Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
15103 case MVT::v2f64:
15104 return lowerV2F64Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
15105 case MVT::v4i32:
15106 return lowerV4I32Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
15107 case MVT::v4f32:
15108 return lowerV4F32Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
15109 case MVT::v8i16:
15110 return lowerV8I16Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
15111 case MVT::v8f16:
15112 return lowerV8F16Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
15113 case MVT::v16i8:
15114 return lowerV16I8Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
15115
15116 default:
15117 llvm_unreachable("Unimplemented!");
15118 }
15119}
15120
15121/// Generic routine to split a vector shuffle into half-sized shuffles.
15122///
15123/// This routine just extracts two subvectors, shuffles them independently, and
15124/// then concatenates them back together. This should work effectively with all
15125/// AVX vector shuffle types.
15126static SDValue splitAndLowerShuffle(const SDLoc &DL, MVT VT, SDValue V1,
15127 SDValue V2, ArrayRef<int> Mask,
15128 SelectionDAG &DAG, bool SimpleOnly) {
15129 assert(VT.getSizeInBits() >= 256 &&
15130 "Only for 256-bit or wider vector shuffles!");
15131 assert(V1.getSimpleValueType() == VT && "Bad operand type!");
15132 assert(V2.getSimpleValueType() == VT && "Bad operand type!");
15133
15134 // If this came from the AVX1 v8i32 -> v8f32 bitcast, split using v4i32.
15135 if (VT == MVT::v8f32) {
15136 SDValue BC1 = peekThroughBitcasts(V1);
15137 SDValue BC2 = peekThroughBitcasts(V2);
15138 if (BC1.getValueType() == MVT::v8i32 && BC2.getValueType() == MVT::v8i32) {
15139 if (SDValue Split = splitAndLowerShuffle(DL, MVT::v8i32, BC1, BC2, Mask,
15140 DAG, SimpleOnly))
15141 return DAG.getBitcast(VT, Split);
15142 }
15143 }
15144
15145 ArrayRef<int> LoMask = Mask.slice(0, Mask.size() / 2);
15146 ArrayRef<int> HiMask = Mask.slice(Mask.size() / 2);
15147
15148 int NumElements = VT.getVectorNumElements();
15149 int SplitNumElements = NumElements / 2;
15150 MVT ScalarVT = VT.getVectorElementType();
15151 MVT SplitVT = MVT::getVectorVT(ScalarVT, SplitNumElements);
15152
15153 // Use splitVector/extractSubVector so that split build-vectors just build two
15154 // narrower build vectors. This helps shuffling with splats and zeros.
15155 auto SplitVector = [&](SDValue V) {
15156 SDValue LoV, HiV;
15157 std::tie(LoV, HiV) = splitVector(peekThroughBitcasts(V), DAG, DL);
15158 return std::make_pair(DAG.getBitcast(SplitVT, LoV),
15159 DAG.getBitcast(SplitVT, HiV));
15160 };
15161
15162 SDValue LoV1, HiV1, LoV2, HiV2;
15163 std::tie(LoV1, HiV1) = SplitVector(V1);
15164 std::tie(LoV2, HiV2) = SplitVector(V2);
15165
15166 // Now create two 4-way blends of these half-width vectors.
15167 auto GetHalfBlendPiecesReq = [&](const ArrayRef<int> &HalfMask, bool &UseLoV1,
15168 bool &UseHiV1, bool &UseLoV2,
15169 bool &UseHiV2) {
15170 UseLoV1 = UseHiV1 = UseLoV2 = UseHiV2 = false;
15171 for (int i = 0; i < SplitNumElements; ++i) {
15172 int M = HalfMask[i];
15173 if (M >= NumElements) {
15174 if (M >= NumElements + SplitNumElements)
15175 UseHiV2 = true;
15176 else
15177 UseLoV2 = true;
15178 } else if (M >= 0) {
15179 if (M >= SplitNumElements)
15180 UseHiV1 = true;
15181 else
15182 UseLoV1 = true;
15183 }
15184 }
15185 };
15186
15187 auto CheckHalfBlendUsable = [&](const ArrayRef<int> &HalfMask) -> bool {
15188 if (!SimpleOnly)
15189 return true;
15190
15191 bool UseLoV1, UseHiV1, UseLoV2, UseHiV2;
15192 GetHalfBlendPiecesReq(HalfMask, UseLoV1, UseHiV1, UseLoV2, UseHiV2);
15193
15194 return !(UseHiV1 || UseHiV2);
15195 };
15196
15197 auto HalfBlend = [&](ArrayRef<int> HalfMask) {
15198 SmallVector<int, 32> V1BlendMask((unsigned)SplitNumElements, -1);
15199 SmallVector<int, 32> V2BlendMask((unsigned)SplitNumElements, -1);
15200 SmallVector<int, 32> BlendMask((unsigned)SplitNumElements, -1);
15201 for (int i = 0; i < SplitNumElements; ++i) {
15202 int M = HalfMask[i];
15203 if (M >= NumElements) {
15204 V2BlendMask[i] = M - NumElements;
15205 BlendMask[i] = SplitNumElements + i;
15206 } else if (M >= 0) {
15207 V1BlendMask[i] = M;
15208 BlendMask[i] = i;
15209 }
15210 }
15211
15212 bool UseLoV1, UseHiV1, UseLoV2, UseHiV2;
15213 GetHalfBlendPiecesReq(HalfMask, UseLoV1, UseHiV1, UseLoV2, UseHiV2);
15214
15215 // Because the lowering happens after all combining takes place, we need to
15216 // manually combine these blend masks as much as possible so that we create
15217 // a minimal number of high-level vector shuffle nodes.
15218 assert((!SimpleOnly || (!UseHiV1 && !UseHiV2)) && "Shuffle isn't simple");
15219
15220 // First try just blending the halves of V1 or V2.
15221 if (!UseLoV1 && !UseHiV1 && !UseLoV2 && !UseHiV2)
15222 return DAG.getUNDEF(SplitVT);
15223 if (!UseLoV2 && !UseHiV2)
15224 return DAG.getVectorShuffle(SplitVT, DL, LoV1, HiV1, V1BlendMask);
15225 if (!UseLoV1 && !UseHiV1)
15226 return DAG.getVectorShuffle(SplitVT, DL, LoV2, HiV2, V2BlendMask);
15227
15228 SDValue V1Blend, V2Blend;
15229 if (UseLoV1 && UseHiV1) {
15230 V1Blend = DAG.getVectorShuffle(SplitVT, DL, LoV1, HiV1, V1BlendMask);
15231 } else {
15232 // We only use half of V1 so map the usage down into the final blend mask.
15233 V1Blend = UseLoV1 ? LoV1 : HiV1;
15234 for (int i = 0; i < SplitNumElements; ++i)
15235 if (BlendMask[i] >= 0 && BlendMask[i] < SplitNumElements)
15236 BlendMask[i] = V1BlendMask[i] - (UseLoV1 ? 0 : SplitNumElements);
15237 }
15238 if (UseLoV2 && UseHiV2) {
15239 V2Blend = DAG.getVectorShuffle(SplitVT, DL, LoV2, HiV2, V2BlendMask);
15240 } else {
15241 // We only use half of V2 so map the usage down into the final blend mask.
15242 V2Blend = UseLoV2 ? LoV2 : HiV2;
15243 for (int i = 0; i < SplitNumElements; ++i)
15244 if (BlendMask[i] >= SplitNumElements)
15245 BlendMask[i] = V2BlendMask[i] + (UseLoV2 ? SplitNumElements : 0);
15246 }
15247 return DAG.getVectorShuffle(SplitVT, DL, V1Blend, V2Blend, BlendMask);
15248 };
15249
15250 if (!CheckHalfBlendUsable(LoMask) || !CheckHalfBlendUsable(HiMask))
15251 return SDValue();
15252
15253 SDValue Lo = HalfBlend(LoMask);
15254 SDValue Hi = HalfBlend(HiMask);
15255 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Lo, Hi);
15256}
15257
15258/// Either split a vector in halves or decompose the shuffles and the
15259/// blend/unpack.
15260///
15261/// This is provided as a good fallback for many lowerings of non-single-input
15262/// shuffles with more than one 128-bit lane. In those cases, we want to select
15263/// between splitting the shuffle into 128-bit components and stitching those
15264/// back together vs. extracting the single-input shuffles and blending those
15265/// results.
15266static SDValue lowerShuffleAsSplitOrBlend(const SDLoc &DL, MVT VT, SDValue V1,
15267 SDValue V2, ArrayRef<int> Mask,
15268 const APInt &Zeroable,
15269 const X86Subtarget &Subtarget,
15270 SelectionDAG &DAG) {
15271 assert(!V2.isUndef() && "This routine must not be used to lower single-input "
15272 "shuffles as it could then recurse on itself.");
15273 int Size = Mask.size();
15274
15275 // If this can be modeled as a broadcast of two elements followed by a blend,
15276 // prefer that lowering. This is especially important because broadcasts can
15277 // often fold with memory operands.
15278 auto DoBothBroadcast = [&] {
15279 int V1BroadcastIdx = -1, V2BroadcastIdx = -1;
15280 for (int M : Mask)
15281 if (M >= Size) {
15282 if (V2BroadcastIdx < 0)
15283 V2BroadcastIdx = M - Size;
15284 else if ((M - Size) != V2BroadcastIdx &&
15285 !IsElementEquivalent(Size, V2, V2, M - Size, V2BroadcastIdx))
15286 return false;
15287 } else if (M >= 0) {
15288 if (V1BroadcastIdx < 0)
15289 V1BroadcastIdx = M;
15290 else if (M != V1BroadcastIdx &&
15291 !IsElementEquivalent(Size, V1, V1, M, V1BroadcastIdx))
15292 return false;
15293 }
15294 return true;
15295 };
15296 if (DoBothBroadcast())
15297 return lowerShuffleAsDecomposedShuffleMerge(DL, VT, V1, V2, Mask, Zeroable,
15298 Subtarget, DAG);
15299
15300 // If the inputs all stem from a single 128-bit lane of each input, then we
15301 // split them rather than blending because the split will decompose to
15302 // unusually few instructions.
15303 int LaneCount = VT.getSizeInBits() / 128;
15304 int LaneSize = Size / LaneCount;
15305 SmallBitVector LaneInputs[2];
15306 LaneInputs[0].resize(LaneCount, false);
15307 LaneInputs[1].resize(LaneCount, false);
15308 for (int i = 0; i < Size; ++i)
15309 if (Mask[i] >= 0)
15310 LaneInputs[Mask[i] / Size][(Mask[i] % Size) / LaneSize] = true;
15311 if (LaneInputs[0].count() <= 1 && LaneInputs[1].count() <= 1)
15312 return splitAndLowerShuffle(DL, VT, V1, V2, Mask, DAG,
15313 /*SimpleOnly*/ false);
15314
15315 // Without AVX2, if we can freely split the subvectors then we're better off
15316 // performing half width shuffles.
15317 if (!Subtarget.hasAVX2()) {
15318 SDValue BC1 = peekThroughBitcasts(V1);
15319 SDValue BC2 = peekThroughBitcasts(V2);
15320 bool SplatOrSplitV1 = isFreeToSplitVector(BC1, DAG) ||
15321 DAG.isSplatValue(BC1, /*AllowUndefs=*/true);
15322 bool SplatOrSplitV2 = isFreeToSplitVector(BC2, DAG) ||
15323 DAG.isSplatValue(BC2, /*AllowUndefs=*/true);
15324 if (SplatOrSplitV1 && SplatOrSplitV2)
15325 return splitAndLowerShuffle(DL, VT, V1, V2, Mask, DAG,
15326 /*SimpleOnly*/ false);
15327 }
15328
15329 // Otherwise, just fall back to decomposed shuffles and a blend/unpack. This
15330 // requires that the decomposed single-input shuffles don't end up here.
15331 return lowerShuffleAsDecomposedShuffleMerge(DL, VT, V1, V2, Mask, Zeroable,
15332 Subtarget, DAG);
15333}
15334
15335// Lower as SHUFPD(VPERM2F128(V1, V2), VPERM2F128(V1, V2)).
15336// TODO: Extend to support v8f32 (+ 512-bit shuffles).
15337static SDValue lowerShuffleAsLanePermuteAndSHUFP(const SDLoc &DL, MVT VT,
15338 SDValue V1, SDValue V2,
15339 ArrayRef<int> Mask,
15340 SelectionDAG &DAG) {
15341 assert(VT == MVT::v4f64 && "Only for v4f64 shuffles");
15342
15343 int LHSMask[4] = {-1, -1, -1, -1};
15344 int RHSMask[4] = {-1, -1, -1, -1};
15345 int SHUFPDMask[4] = {-1, -1, -1, -1};
15346
15347 // As SHUFPD uses a single LHS/RHS element per lane, we can always
15348 // perform the shuffle once the lanes have been shuffled in place.
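// SHUFPD takes even result elements from its first operand and odd result
// elements from its second, with one immediate bit per element selecting the
// low or high double within that 128-bit lane; the loop builds the two
// cross-lane pre-shuffles (expected to become VPERM2F128) and the immediate
// to match.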
15349 for (int i = 0; i != 4; ++i) {
15350 int M = Mask[i];
15351 if (M < 0)
15352 continue;
15353 int LaneBase = i & ~1;
15354 auto &LaneMask = (i & 1) ? RHSMask : LHSMask;
15355 LaneMask[LaneBase + (M & 1)] = M;
15356 SHUFPDMask[i] = M & 1;
15357 }
15358
15359 SDValue LHS = DAG.getVectorShuffle(VT, DL, V1, V2, LHSMask);
15360 SDValue RHS = DAG.getVectorShuffle(VT, DL, V1, V2, RHSMask);
15361 return DAG.getNode(X86ISD::SHUFP, DL, VT, LHS, RHS,
15362 getSHUFPDImmForMask(SHUFPDMask, DL, DAG));
15363}
15364
15365/// Lower a vector shuffle crossing multiple 128-bit lanes as
15366/// a lane permutation followed by a per-lane permutation.
15367///
15368/// This is mainly for cases where we can have non-repeating permutes
15369/// in each lane.
15370///
15371/// TODO: This is very similar to lowerShuffleAsLanePermuteAndRepeatedMask,
15372/// we should investigate merging them.
15373static SDValue lowerShuffleAsLanePermuteAndPermute(
15374 const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
15375 SelectionDAG &DAG, const X86Subtarget &Subtarget) {
15376 int NumElts = VT.getVectorNumElements();
15377 int NumLanes = VT.getSizeInBits() / 128;
15378 int NumEltsPerLane = NumElts / NumLanes;
15379 bool CanUseSublanes = Subtarget.hasAVX2() && V2.isUndef();
15380
15381 /// Attempts to find a sublane permute with the given size
15382 /// that gets all elements into their target lanes.
15383 ///
15384 /// If successful, fills CrossLaneMask and InLaneMask and returns the
15385 /// lowered shuffle; otherwise returns SDValue() and may overwrite InLaneMask.
15386 auto getSublanePermute = [&](int NumSublanes) -> SDValue {
15387 int NumSublanesPerLane = NumSublanes / NumLanes;
15388 int NumEltsPerSublane = NumElts / NumSublanes;
15389
15390 SmallVector<int, 16> CrossLaneMask;
15391 SmallVector<int, 16> InLaneMask(NumElts, SM_SentinelUndef);
15392 // CrossLaneMask but one entry == one sublane.
15393 SmallVector<int, 16> CrossLaneMaskLarge(NumSublanes, SM_SentinelUndef);
15394 APInt DemandedCrossLane = APInt::getZero(NumElts);
15395
15396 for (int i = 0; i != NumElts; ++i) {
15397 int M = Mask[i];
15398 if (M < 0)
15399 continue;
15400
15401 int SrcSublane = M / NumEltsPerSublane;
15402 int DstLane = i / NumEltsPerLane;
15403
15404 // We only need to get the elements into the right lane, not sublane.
15405 // So search all sublanes that make up the destination lane.
15406 bool Found = false;
15407 int DstSubStart = DstLane * NumSublanesPerLane;
15408 int DstSubEnd = DstSubStart + NumSublanesPerLane;
15409 for (int DstSublane = DstSubStart; DstSublane < DstSubEnd; ++DstSublane) {
15410 if (!isUndefOrEqual(CrossLaneMaskLarge[DstSublane], SrcSublane))
15411 continue;
15412
15413 Found = true;
15414 CrossLaneMaskLarge[DstSublane] = SrcSublane;
15415 int DstSublaneOffset = DstSublane * NumEltsPerSublane;
15416 InLaneMask[i] = DstSublaneOffset + M % NumEltsPerSublane;
15417 DemandedCrossLane.setBit(InLaneMask[i]);
15418 break;
15419 }
15420 if (!Found)
15421 return SDValue();
15422 }
15423
15424 // Fill CrossLaneMask using CrossLaneMaskLarge.
15425 narrowShuffleMaskElts(NumEltsPerSublane, CrossLaneMaskLarge, CrossLaneMask);
15426
15427 if (!CanUseSublanes) {
15428 // If we're only shuffling a single lowest lane and the rest are identity
15429 // then don't bother.
15430 // TODO - isShuffleMaskInputInPlace could be extended to something like
15431 // this.
15432 int NumIdentityLanes = 0;
15433 bool OnlyShuffleLowestLane = true;
15434 for (int i = 0; i != NumLanes; ++i) {
15435 int LaneOffset = i * NumEltsPerLane;
15436 if (isSequentialOrUndefInRange(InLaneMask, LaneOffset, NumEltsPerLane,
15437 i * NumEltsPerLane))
15438 NumIdentityLanes++;
15439 else if (CrossLaneMask[LaneOffset] != 0)
15440 OnlyShuffleLowestLane = false;
15441 }
15442 if (OnlyShuffleLowestLane && NumIdentityLanes == (NumLanes - 1))
15443 return SDValue();
15444 }
15445
15446 // Simplify CrossLaneMask based on the actual demanded elements.
15447 if (V1.hasOneUse())
15448 for (int i = 0; i != NumElts; ++i)
15449 if (!DemandedCrossLane[i])
15450 CrossLaneMask[i] = SM_SentinelUndef;
15451
15452 // Avoid returning the same shuffle operation. For example,
15453 // t7: v16i16 = vector_shuffle<8,9,10,11,4,5,6,7,0,1,2,3,12,13,14,15> t5,
15454 // undef:v16i16
15455 if (CrossLaneMask == Mask || InLaneMask == Mask)
15456 return SDValue();
15457
15458 SDValue CrossLane = DAG.getVectorShuffle(VT, DL, V1, V2, CrossLaneMask);
15459 return DAG.getVectorShuffle(VT, DL, CrossLane, DAG.getUNDEF(VT),
15460 InLaneMask);
15461 };
15462
15463 // First attempt a solution with full lanes.
15464 if (SDValue V = getSublanePermute(/*NumSublanes=*/NumLanes))
15465 return V;
15466
15467 // The rest of the solutions use sublanes.
15468 if (!CanUseSublanes)
15469 return SDValue();
15470
15471 // Then attempt a solution with 64-bit sublanes (vpermq).
15472 if (SDValue V = getSublanePermute(/*NumSublanes=*/NumLanes * 2))
15473 return V;
15474
15475 // If that doesn't work and we have fast variable cross-lane shuffle,
15476 // attempt 32-bit sublanes (vpermd).
15477 if (!Subtarget.hasFastVariableCrossLaneShuffle())
15478 return SDValue();
15479
15480 return getSublanePermute(/*NumSublanes=*/NumLanes * 4);
15481}
15482
15483/// Helper to compute the in-lane shuffle mask for a complete shuffle mask.
15484static void computeInLaneShuffleMask(const ArrayRef<int> &Mask, int LaneSize,
15485 SmallVector<int> &InLaneMask) {
15486 int Size = Mask.size();
15487 InLaneMask.assign(Mask.begin(), Mask.end());
15488 for (int i = 0; i < Size; ++i) {
15489 int &M = InLaneMask[i];
15490 if (M < 0)
15491 continue;
15492 if (((M % Size) / LaneSize) != (i / LaneSize))
15493 M = (M % LaneSize) + ((i / LaneSize) * LaneSize) + Size;
15494 }
15495}
15496
15497/// Lower a vector shuffle crossing multiple 128-bit lanes by shuffling one
15498/// source with a lane permutation.
15499///
15500/// This lowering strategy results in four instructions in the worst case for a
15501/// single-input cross lane shuffle which is lower than any other fully general
15502/// cross-lane shuffle strategy I'm aware of. Special cases for each particular
15503/// shuffle pattern should be handled prior to trying this lowering.
15504 static SDValue lowerShuffleAsLanePermuteAndShuffle(
15505     const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
15506 SelectionDAG &DAG, const X86Subtarget &Subtarget) {
15507 // FIXME: This should probably be generalized for 512-bit vectors as well.
15508 assert(VT.is256BitVector() && "Only for 256-bit vector shuffles!");
15509 int Size = Mask.size();
15510 int LaneSize = Size / 2;
15511
15512 // Fold to SHUFPD(VPERM2F128(V1, V2), VPERM2F128(V1, V2)).
15513 // Only do this if the elements aren't all from the lower lane,
15514 // otherwise we're (probably) better off doing a split.
15515 if (VT == MVT::v4f64 &&
15516 !all_of(Mask, [LaneSize](int M) { return M < LaneSize; }))
15517 return lowerShuffleAsLanePermuteAndSHUFP(DL, VT, V1, V2, Mask, DAG);
15518
15519 // If there are only inputs from one 128-bit lane, splitting will in fact be
15520 // less expensive. The flags track whether the given lane contains an element
15521 // that crosses to another lane.
15522 bool AllLanes;
15523 if (!Subtarget.hasAVX2()) {
15524 bool LaneCrossing[2] = {false, false};
15525 for (int i = 0; i < Size; ++i)
15526 if (Mask[i] >= 0 && ((Mask[i] % Size) / LaneSize) != (i / LaneSize))
15527 LaneCrossing[(Mask[i] % Size) / LaneSize] = true;
15528 AllLanes = LaneCrossing[0] && LaneCrossing[1];
15529 } else {
15530 bool LaneUsed[2] = {false, false};
15531 for (int i = 0; i < Size; ++i)
15532 if (Mask[i] >= 0)
15533 LaneUsed[(Mask[i] % Size) / LaneSize] = true;
15534 AllLanes = LaneUsed[0] && LaneUsed[1];
15535 }
15536
15537 // TODO - we could support shuffling V2 in the Flipped input.
15538 assert(V2.isUndef() &&
15539 "This last part of this routine only works on single input shuffles");
15540
15541 SmallVector<int> InLaneMask;
15542 computeInLaneShuffleMask(Mask, Mask.size() / 2, InLaneMask);
15543
15544 assert(!is128BitLaneCrossingShuffleMask(VT, InLaneMask) &&
15545 "In-lane shuffle mask expected");
15546
15547 // If we're not using both lanes in each lane and the inlane mask is not
15548 // repeating, then we're better off splitting.
15549 if (!AllLanes && !is128BitLaneRepeatedShuffleMask(VT, InLaneMask))
15550 return splitAndLowerShuffle(DL, VT, V1, V2, Mask, DAG,
15551 /*SimpleOnly*/ false);
15552
15553 // Flip the lanes, and shuffle the results which should now be in-lane.
15554 MVT PVT = VT.isFloatingPoint() ? MVT::v4f64 : MVT::v4i64;
15555 SDValue Flipped = DAG.getBitcast(PVT, V1);
15556 Flipped =
15557 DAG.getVectorShuffle(PVT, DL, Flipped, DAG.getUNDEF(PVT), {2, 3, 0, 1});
15558 Flipped = DAG.getBitcast(VT, Flipped);
15559 return DAG.getVectorShuffle(VT, DL, V1, Flipped, InLaneMask);
15560}
15561
15562/// Handle lowering 2-lane 128-bit shuffles.
15563 static SDValue lowerV2X128Shuffle(const SDLoc &DL, MVT VT, SDValue V1,
15564                                   SDValue V2, ArrayRef<int> Mask,
15565 const APInt &Zeroable,
15566 const X86Subtarget &Subtarget,
15567 SelectionDAG &DAG) {
15568 if (V2.isUndef()) {
15569 // Attempt to match VBROADCAST*128 subvector broadcast load.
15570 bool SplatLo = isShuffleEquivalent(Mask, {0, 1, 0, 1}, V1);
15571 bool SplatHi = isShuffleEquivalent(Mask, {2, 3, 2, 3}, V1);
15572 if ((SplatLo || SplatHi) && !Subtarget.hasAVX512() && V1.hasOneUse() &&
15573         X86::mayFoldLoad(peekThroughOneUseBitcasts(V1), Subtarget)) {
15574       MVT MemVT = VT.getHalfNumVectorElementsVT();
15575       unsigned Ofs = SplatLo ? 0 : MemVT.getStoreSize();
15576       auto *Ld = cast<LoadSDNode>(peekThroughOneUseBitcasts(V1));
15577       if (SDValue BcstLd = getBROADCAST_LOAD(X86ISD::SUBV_BROADCAST_LOAD, DL,
15578                                              VT, MemVT, Ld, Ofs, DAG))
15579 return BcstLd;
15580 }
15581
15582 // With AVX2, use VPERMQ/VPERMPD for unary shuffles to allow memory folding.
15583 if (Subtarget.hasAVX2())
15584 return SDValue();
15585 }
15586
15587 bool V2IsZero = !V2.isUndef() && ISD::isBuildVectorAllZeros(V2.getNode());
15588
15589 SmallVector<int, 4> WidenedMask;
15590 if (!canWidenShuffleElements(Mask, Zeroable, V2IsZero, WidenedMask))
15591 return SDValue();
15592
15593 bool IsLowZero = (Zeroable & 0x3) == 0x3;
15594 bool IsHighZero = (Zeroable & 0xc) == 0xc;
15595
15596 // Try to use an insert into a zero vector.
15597 if (WidenedMask[0] == 0 && IsHighZero) {
15598 MVT SubVT = MVT::getVectorVT(VT.getVectorElementType(), 2);
15599 SDValue LoV = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT, V1,
15600 DAG.getVectorIdxConstant(0, DL));
15601 return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT,
15602 getZeroVector(VT, Subtarget, DAG, DL), LoV,
15603 DAG.getVectorIdxConstant(0, DL));
15604 }
15605
15606   // TODO: If minimizing size and one of the inputs is a zero vector and the
15607   // zero vector has only one use, we could use a VPERM2X128 to save the
15608 // instruction bytes needed to explicitly generate the zero vector.
15609
15610 // Blends are faster and handle all the non-lane-crossing cases.
15611 if (SDValue Blend = lowerShuffleAsBlend(DL, VT, V1, V2, Mask, Zeroable,
15612 Subtarget, DAG))
15613 return Blend;
15614
15615 // If either input operand is a zero vector, use VPERM2X128 because its mask
15616 // allows us to replace the zero input with an implicit zero.
15617 if (!IsLowZero && !IsHighZero) {
15618 // Check for patterns which can be matched with a single insert of a 128-bit
15619 // subvector.
15620 bool OnlyUsesV1 = isShuffleEquivalent(Mask, {0, 1, 0, 1}, V1, V2);
15621 if (OnlyUsesV1 || isShuffleEquivalent(Mask, {0, 1, 4, 5}, V1, V2)) {
15622
15623 // With AVX1, use vperm2f128 (below) to allow load folding. Otherwise,
15624 // this will likely become vinsertf128 which can't fold a 256-bit memop.
15625       if (!isa<LoadSDNode>(peekThroughBitcasts(V1))) {
15626         MVT SubVT = MVT::getVectorVT(VT.getVectorElementType(), 2);
15627 SDValue SubVec =
15628 DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT, OnlyUsesV1 ? V1 : V2,
15629 DAG.getVectorIdxConstant(0, DL));
15630 return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, V1, SubVec,
15631 DAG.getVectorIdxConstant(2, DL));
15632 }
15633 }
15634
15635 // Try to use SHUF128 if possible.
15636 if (Subtarget.hasVLX()) {
15637 if (WidenedMask[0] < 2 && WidenedMask[1] >= 2) {
15638 unsigned PermMask = ((WidenedMask[0] % 2) << 0) |
15639 ((WidenedMask[1] % 2) << 1);
15640 return DAG.getNode(X86ISD::SHUF128, DL, VT, V1, V2,
15641 DAG.getTargetConstant(PermMask, DL, MVT::i8));
15642 }
15643 }
15644 }
15645
15646 // Otherwise form a 128-bit permutation. After accounting for undefs,
15647 // convert the 64-bit shuffle mask selection values into 128-bit
15648 // selection bits by dividing the indexes by 2 and shifting into positions
15649 // defined by a vperm2*128 instruction's immediate control byte.
15650
15651 // The immediate permute control byte looks like this:
15652 // [1:0] - select 128 bits from sources for low half of destination
15653 // [2] - ignore
15654 // [3] - zero low half of destination
15655 // [5:4] - select 128 bits from sources for high half of destination
15656 // [6] - ignore
15657 // [7] - zero high half of destination
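  // Worked example (illustrative): the 4-element mask <2,3,4,5> widens to
  // {1,2}, giving PermMask = (1 << 0) | (2 << 4) = 0x21: V1's high 128 bits
  // form the low half of the result and V2's low 128 bits form the high half.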
15658
15659 assert((WidenedMask[0] >= 0 || IsLowZero) &&
15660 (WidenedMask[1] >= 0 || IsHighZero) && "Undef half?");
15661
15662 unsigned PermMask = 0;
15663 PermMask |= IsLowZero ? 0x08 : (WidenedMask[0] << 0);
15664 PermMask |= IsHighZero ? 0x80 : (WidenedMask[1] << 4);
15665
15666 // Check the immediate mask and replace unused sources with undef.
15667 if ((PermMask & 0x0a) != 0x00 && (PermMask & 0xa0) != 0x00)
15668 V1 = DAG.getUNDEF(VT);
15669 if ((PermMask & 0x0a) != 0x02 && (PermMask & 0xa0) != 0x20)
15670 V2 = DAG.getUNDEF(VT);
15671
15672 return DAG.getNode(X86ISD::VPERM2X128, DL, VT, V1, V2,
15673 DAG.getTargetConstant(PermMask, DL, MVT::i8));
15674}
15675
15676/// Lower a vector shuffle by first fixing the 128-bit lanes and then
15677/// shuffling each lane.
15678///
15679/// This attempts to create a repeated lane shuffle where each lane uses one
15680/// or two of the lanes of the inputs. The lanes of the input vectors are
15681/// shuffled in one or two independent shuffles to get the lanes into the
15682/// position needed by the final shuffle.
15683 static SDValue lowerShuffleAsLanePermuteAndRepeatedMask(
15684     const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
15685 const X86Subtarget &Subtarget, SelectionDAG &DAG) {
15686 assert(!V2.isUndef() && "This is only useful with multiple inputs.");
15687
15688 if (is128BitLaneRepeatedShuffleMask(VT, Mask))
15689 return SDValue();
15690
15691 int NumElts = Mask.size();
15692 int NumLanes = VT.getSizeInBits() / 128;
15693 int NumLaneElts = 128 / VT.getScalarSizeInBits();
15694 SmallVector<int, 16> RepeatMask(NumLaneElts, -1);
15695 SmallVector<std::array<int, 2>, 2> LaneSrcs(NumLanes, {{-1, -1}});
15696
15697 // First pass will try to fill in the RepeatMask from lanes that need two
15698 // sources.
15699 for (int Lane = 0; Lane != NumLanes; ++Lane) {
15700 int Srcs[2] = {-1, -1};
15701 SmallVector<int, 16> InLaneMask(NumLaneElts, -1);
15702 for (int i = 0; i != NumLaneElts; ++i) {
15703 int M = Mask[(Lane * NumLaneElts) + i];
15704 if (M < 0)
15705 continue;
15706 // Determine which of the possible input lanes (NumLanes from each source)
15707 // this element comes from. Assign that as one of the sources for this
15708       // lane. We can assign up to 2 sources for this lane. If we run out of
15709       // sources we can't do anything.
15710 int LaneSrc = M / NumLaneElts;
15711 int Src;
15712 if (Srcs[0] < 0 || Srcs[0] == LaneSrc)
15713 Src = 0;
15714 else if (Srcs[1] < 0 || Srcs[1] == LaneSrc)
15715 Src = 1;
15716 else
15717 return SDValue();
15718
15719 Srcs[Src] = LaneSrc;
15720 InLaneMask[i] = (M % NumLaneElts) + Src * NumElts;
15721 }
15722
15723 // If this lane has two sources, see if it fits with the repeat mask so far.
15724 if (Srcs[1] < 0)
15725 continue;
15726
15727 LaneSrcs[Lane][0] = Srcs[0];
15728 LaneSrcs[Lane][1] = Srcs[1];
15729
15730 auto MatchMasks = [](ArrayRef<int> M1, ArrayRef<int> M2) {
15731 assert(M1.size() == M2.size() && "Unexpected mask size");
15732 for (int i = 0, e = M1.size(); i != e; ++i)
15733 if (M1[i] >= 0 && M2[i] >= 0 && M1[i] != M2[i])
15734 return false;
15735 return true;
15736 };
15737
15738 auto MergeMasks = [](ArrayRef<int> Mask, MutableArrayRef<int> MergedMask) {
15739 assert(Mask.size() == MergedMask.size() && "Unexpected mask size");
15740 for (int i = 0, e = MergedMask.size(); i != e; ++i) {
15741 int M = Mask[i];
15742 if (M < 0)
15743 continue;
15744 assert((MergedMask[i] < 0 || MergedMask[i] == M) &&
15745 "Unexpected mask element");
15746 MergedMask[i] = M;
15747 }
15748 };
15749
15750 if (MatchMasks(InLaneMask, RepeatMask)) {
15751 // Merge this lane mask into the final repeat mask.
15752 MergeMasks(InLaneMask, RepeatMask);
15753 continue;
15754 }
15755
15756 // Didn't find a match. Swap the operands and try again.
15757 std::swap(LaneSrcs[Lane][0], LaneSrcs[Lane][1]);
15758     ShuffleVectorSDNode::commuteMask(InLaneMask);
15759
15760 if (MatchMasks(InLaneMask, RepeatMask)) {
15761 // Merge this lane mask into the final repeat mask.
15762 MergeMasks(InLaneMask, RepeatMask);
15763 continue;
15764 }
15765
15766 // Couldn't find a match with the operands in either order.
15767 return SDValue();
15768 }
15769
15770 // Now handle any lanes with only one source.
15771 for (int Lane = 0; Lane != NumLanes; ++Lane) {
15772 // If this lane has already been processed, skip it.
15773 if (LaneSrcs[Lane][0] >= 0)
15774 continue;
15775
15776 for (int i = 0; i != NumLaneElts; ++i) {
15777 int M = Mask[(Lane * NumLaneElts) + i];
15778 if (M < 0)
15779 continue;
15780
15781       // If RepeatMask isn't defined yet, we can define it ourselves.
15782 if (RepeatMask[i] < 0)
15783 RepeatMask[i] = M % NumLaneElts;
15784
15785 if (RepeatMask[i] < NumElts) {
15786 if (RepeatMask[i] != M % NumLaneElts)
15787 return SDValue();
15788 LaneSrcs[Lane][0] = M / NumLaneElts;
15789 } else {
15790 if (RepeatMask[i] != ((M % NumLaneElts) + NumElts))
15791 return SDValue();
15792 LaneSrcs[Lane][1] = M / NumLaneElts;
15793 }
15794 }
15795
15796 if (LaneSrcs[Lane][0] < 0 && LaneSrcs[Lane][1] < 0)
15797 return SDValue();
15798 }
15799
15800 SmallVector<int, 16> NewMask(NumElts, -1);
15801 for (int Lane = 0; Lane != NumLanes; ++Lane) {
15802 int Src = LaneSrcs[Lane][0];
15803 for (int i = 0; i != NumLaneElts; ++i) {
15804 int M = -1;
15805 if (Src >= 0)
15806 M = Src * NumLaneElts + i;
15807 NewMask[Lane * NumLaneElts + i] = M;
15808 }
15809 }
15810 SDValue NewV1 = DAG.getVectorShuffle(VT, DL, V1, V2, NewMask);
15811 // Ensure we didn't get back the shuffle we started with.
15812 // FIXME: This is a hack to make up for some splat handling code in
15813 // getVectorShuffle.
15814 if (isa<ShuffleVectorSDNode>(NewV1) &&
15815 cast<ShuffleVectorSDNode>(NewV1)->getMask() == Mask)
15816 return SDValue();
15817
15818 for (int Lane = 0; Lane != NumLanes; ++Lane) {
15819 int Src = LaneSrcs[Lane][1];
15820 for (int i = 0; i != NumLaneElts; ++i) {
15821 int M = -1;
15822 if (Src >= 0)
15823 M = Src * NumLaneElts + i;
15824 NewMask[Lane * NumLaneElts + i] = M;
15825 }
15826 }
15827 SDValue NewV2 = DAG.getVectorShuffle(VT, DL, V1, V2, NewMask);
15828 // Ensure we didn't get back the shuffle we started with.
15829 // FIXME: This is a hack to make up for some splat handling code in
15830 // getVectorShuffle.
15831 if (isa<ShuffleVectorSDNode>(NewV2) &&
15832 cast<ShuffleVectorSDNode>(NewV2)->getMask() == Mask)
15833 return SDValue();
15834
15835 for (int i = 0; i != NumElts; ++i) {
15836 if (Mask[i] < 0) {
15837 NewMask[i] = -1;
15838 continue;
15839 }
15840 NewMask[i] = RepeatMask[i % NumLaneElts];
15841 if (NewMask[i] < 0)
15842 continue;
15843
15844 NewMask[i] += (i / NumLaneElts) * NumLaneElts;
15845 }
15846 return DAG.getVectorShuffle(VT, DL, NewV1, NewV2, NewMask);
15847}
15848
15849/// If the input shuffle mask results in a vector that is undefined in all upper
15850/// or lower half elements and that mask accesses only 2 halves of the
15851/// shuffle's operands, return true. A mask of half the width with mask indexes
15852/// adjusted to access the extracted halves of the original shuffle operands is
15853/// returned in HalfMask. HalfIdx1 and HalfIdx2 return whether the upper or
15854/// lower half of each input operand is accessed.
15855static bool
15856 getHalfShuffleMask(ArrayRef<int> Mask, MutableArrayRef<int> HalfMask,
15857                    int &HalfIdx1, int &HalfIdx2) {
15858 assert((Mask.size() == HalfMask.size() * 2) &&
15859 "Expected input mask to be twice as long as output");
15860
15861 // Exactly one half of the result must be undef to allow narrowing.
15862 bool UndefLower = isUndefLowerHalf(Mask);
15863 bool UndefUpper = isUndefUpperHalf(Mask);
15864 if (UndefLower == UndefUpper)
15865 return false;
15866
15867 unsigned HalfNumElts = HalfMask.size();
15868 unsigned MaskIndexOffset = UndefLower ? HalfNumElts : 0;
15869 HalfIdx1 = -1;
15870 HalfIdx2 = -1;
15871 for (unsigned i = 0; i != HalfNumElts; ++i) {
15872 int M = Mask[i + MaskIndexOffset];
15873 if (M < 0) {
15874 HalfMask[i] = M;
15875 continue;
15876 }
15877
15878 // Determine which of the 4 half vectors this element is from.
15879 // i.e. 0 = Lower V1, 1 = Upper V1, 2 = Lower V2, 3 = Upper V2.
15880 int HalfIdx = M / HalfNumElts;
15881
15882 // Determine the element index into its half vector source.
15883 int HalfElt = M % HalfNumElts;
15884
15885 // We can shuffle with up to 2 half vectors, set the new 'half'
15886 // shuffle mask accordingly.
15887 if (HalfIdx1 < 0 || HalfIdx1 == HalfIdx) {
15888 HalfMask[i] = HalfElt;
15889 HalfIdx1 = HalfIdx;
15890 continue;
15891 }
15892 if (HalfIdx2 < 0 || HalfIdx2 == HalfIdx) {
15893 HalfMask[i] = HalfElt + HalfNumElts;
15894 HalfIdx2 = HalfIdx;
15895 continue;
15896 }
15897
15898 // Too many half vectors referenced.
15899 return false;
15900 }
15901
15902 return true;
15903}
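// Worked example (illustrative): for the v8 mask <u,u,u,u,0,1,12,13> the
// lower half of the result is undef, so HalfIdx1 = 0 (lower half of V1),
// HalfIdx2 = 3 (upper half of V2) and HalfMask = <0,1,4,5>, which
// getShuffleHalfVectors below turns into a half-width shuffle of the two
// extracted halves inserted into the upper half of the result.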
15904
15905/// Given the output values from getHalfShuffleMask(), create a half width
15906/// shuffle of extracted vectors followed by an insert back to full width.
15907 static SDValue getShuffleHalfVectors(const SDLoc &DL, SDValue V1, SDValue V2,
15908                                      ArrayRef<int> HalfMask, int HalfIdx1,
15909 int HalfIdx2, bool UndefLower,
15910 SelectionDAG &DAG, bool UseConcat = false) {
15911 assert(V1.getValueType() == V2.getValueType() && "Different sized vectors?");
15912 assert(V1.getValueType().isSimple() && "Expecting only simple types");
15913
15914 MVT VT = V1.getSimpleValueType();
15915 MVT HalfVT = VT.getHalfNumVectorElementsVT();
15916 unsigned HalfNumElts = HalfVT.getVectorNumElements();
15917
15918 auto getHalfVector = [&](int HalfIdx) {
15919 if (HalfIdx < 0)
15920 return DAG.getUNDEF(HalfVT);
15921 SDValue V = (HalfIdx < 2 ? V1 : V2);
15922 HalfIdx = (HalfIdx % 2) * HalfNumElts;
15923 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, HalfVT, V,
15924 DAG.getVectorIdxConstant(HalfIdx, DL));
15925 };
15926
15927 // ins undef, (shuf (ext V1, HalfIdx1), (ext V2, HalfIdx2), HalfMask), Offset
15928 SDValue Half1 = getHalfVector(HalfIdx1);
15929 SDValue Half2 = getHalfVector(HalfIdx2);
15930 SDValue V = DAG.getVectorShuffle(HalfVT, DL, Half1, Half2, HalfMask);
15931 if (UseConcat) {
15932 SDValue Op0 = V;
15933 SDValue Op1 = DAG.getUNDEF(HalfVT);
15934 if (UndefLower)
15935 std::swap(Op0, Op1);
15936 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Op0, Op1);
15937 }
15938
15939 unsigned Offset = UndefLower ? HalfNumElts : 0;
15940 return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, DAG.getUNDEF(VT), V,
15941                      DAG.getVectorIdxConstant(Offset, DL));
15942 }
15943
15944/// Lower shuffles where an entire half of a 256 or 512-bit vector is UNDEF.
15945/// This allows for fast cases such as subvector extraction/insertion
15946/// or shuffling smaller vector types which can lower more efficiently.
15947 static SDValue lowerShuffleWithUndefHalf(const SDLoc &DL, MVT VT, SDValue V1,
15948                                          SDValue V2, ArrayRef<int> Mask,
15949 const X86Subtarget &Subtarget,
15950 SelectionDAG &DAG) {
15951 assert((VT.is256BitVector() || VT.is512BitVector()) &&
15952 "Expected 256-bit or 512-bit vector");
15953
15954 bool UndefLower = isUndefLowerHalf(Mask);
15955 if (!UndefLower && !isUndefUpperHalf(Mask))
15956 return SDValue();
15957
15958 assert((!UndefLower || !isUndefUpperHalf(Mask)) &&
15959 "Completely undef shuffle mask should have been simplified already");
15960
15961 // Upper half is undef and lower half is whole upper subvector.
15962 // e.g. vector_shuffle <4, 5, 6, 7, u, u, u, u> or <2, 3, u, u>
15963 MVT HalfVT = VT.getHalfNumVectorElementsVT();
15964 unsigned HalfNumElts = HalfVT.getVectorNumElements();
15965 if (!UndefLower &&
15966 isSequentialOrUndefInRange(Mask, 0, HalfNumElts, HalfNumElts)) {
15967 SDValue Hi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, HalfVT, V1,
15968 DAG.getVectorIdxConstant(HalfNumElts, DL));
15969 return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, DAG.getUNDEF(VT), Hi,
15970 DAG.getVectorIdxConstant(0, DL));
15971 }
15972
15973 // Lower half is undef and upper half is whole lower subvector.
15974 // e.g. vector_shuffle <u, u, u, u, 0, 1, 2, 3> or <u, u, 0, 1>
15975 if (UndefLower &&
15976 isSequentialOrUndefInRange(Mask, HalfNumElts, HalfNumElts, 0)) {
15977 SDValue Hi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, HalfVT, V1,
15978 DAG.getVectorIdxConstant(0, DL));
15979 return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, DAG.getUNDEF(VT), Hi,
15980 DAG.getVectorIdxConstant(HalfNumElts, DL));
15981 }
15982
15983 int HalfIdx1, HalfIdx2;
15984 SmallVector<int, 8> HalfMask(HalfNumElts);
15985 if (!getHalfShuffleMask(Mask, HalfMask, HalfIdx1, HalfIdx2))
15986 return SDValue();
15987
15988 assert(HalfMask.size() == HalfNumElts && "Unexpected shuffle mask length");
15989
15990 // Only shuffle the halves of the inputs when useful.
15991 unsigned NumLowerHalves =
15992 (HalfIdx1 == 0 || HalfIdx1 == 2) + (HalfIdx2 == 0 || HalfIdx2 == 2);
15993 unsigned NumUpperHalves =
15994 (HalfIdx1 == 1 || HalfIdx1 == 3) + (HalfIdx2 == 1 || HalfIdx2 == 3);
15995 assert(NumLowerHalves + NumUpperHalves <= 2 && "Only 1 or 2 halves allowed");
15996
15997 // Determine the larger pattern of undef/halves, then decide if it's worth
15998 // splitting the shuffle based on subtarget capabilities and types.
15999 unsigned EltWidth = VT.getVectorElementType().getSizeInBits();
16000 if (!UndefLower) {
16001 // XXXXuuuu: no insert is needed.
16002 // Always extract lowers when setting lower - these are all free subreg ops.
16003 if (NumUpperHalves == 0)
16004 return getShuffleHalfVectors(DL, V1, V2, HalfMask, HalfIdx1, HalfIdx2,
16005 UndefLower, DAG);
16006
16007 if (NumUpperHalves == 1) {
16008 // AVX2 has efficient 32/64-bit element cross-lane shuffles.
16009 if (Subtarget.hasAVX2()) {
16010 // extract128 + vunpckhps/vshufps, is better than vblend + vpermps.
16011 if (EltWidth == 32 && NumLowerHalves && HalfVT.is128BitVector() &&
16012 !is128BitUnpackShuffleMask(HalfMask, DAG) &&
16013 (!isSingleSHUFPSMask(HalfMask) ||
16014 Subtarget.hasFastVariableCrossLaneShuffle()))
16015 return SDValue();
16016       // If this is a unary shuffle (assume that the 2nd operand is
16017 // canonicalized to undef), then we can use vpermpd. Otherwise, we
16018 // are better off extracting the upper half of 1 operand and using a
16019 // narrow shuffle.
16020 if (EltWidth == 64 && V2.isUndef())
16021 return SDValue();
16022       // If this is a unary vXi8 shuffle with in-place halves, then perform as
16023 // full width pshufb, and then merge.
16024 if (EltWidth == 8 && HalfIdx1 == 0 && HalfIdx2 == 1)
16025 return SDValue();
16026 }
16027 // AVX512 has efficient cross-lane shuffles for all legal 512-bit types.
16028 if (Subtarget.hasAVX512() && VT.is512BitVector())
16029 return SDValue();
16030 // Extract + narrow shuffle is better than the wide alternative.
16031 return getShuffleHalfVectors(DL, V1, V2, HalfMask, HalfIdx1, HalfIdx2,
16032 UndefLower, DAG);
16033 }
16034
16035 // Don't extract both uppers, instead shuffle and then extract.
16036 assert(NumUpperHalves == 2 && "Half vector count went wrong");
16037 return SDValue();
16038 }
16039
16040 // UndefLower - uuuuXXXX: an insert to high half is required if we split this.
16041 if (NumUpperHalves == 0) {
16042 // AVX2 has efficient 64-bit element cross-lane shuffles.
16043 // TODO: Refine to account for unary shuffle, splat, and other masks?
16044 if (Subtarget.hasAVX2() && EltWidth == 64)
16045 return SDValue();
16046 // AVX512 has efficient cross-lane shuffles for all legal 512-bit types.
16047 if (Subtarget.hasAVX512() && VT.is512BitVector())
16048 return SDValue();
16049 // Narrow shuffle + insert is better than the wide alternative.
16050 return getShuffleHalfVectors(DL, V1, V2, HalfMask, HalfIdx1, HalfIdx2,
16051 UndefLower, DAG);
16052 }
16053
16054 // NumUpperHalves != 0: don't bother with extract, shuffle, and then insert.
16055 return SDValue();
16056}
16057
16058/// Handle case where shuffle sources are coming from the same 128-bit lane and
16059/// every lane can be represented as the same repeating mask - allowing us to
16060/// shuffle the sources with the repeating shuffle and then permute the result
16061/// to the destination lanes.
16062 static SDValue lowerShuffleAsRepeatedMaskAndLanePermute(
16063     const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
16064 const X86Subtarget &Subtarget, SelectionDAG &DAG) {
16065 int NumElts = VT.getVectorNumElements();
16066 int NumLanes = VT.getSizeInBits() / 128;
16067 int NumLaneElts = NumElts / NumLanes;
16068
16069 // On AVX2 we may be able to just shuffle the lowest elements and then
16070 // broadcast the result.
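  // For example (illustrative): the v8i32 mask <1,0,1,0,1,0,1,0> is handled
  // by shuffling <1,0> into the lowest 64 bits and then broadcasting that
  // 64-bit element with the <0,1,0,1,0,1,0,1> mask built below.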
16071 if (Subtarget.hasAVX2()) {
16072 for (unsigned BroadcastSize : {16, 32, 64}) {
16073 if (BroadcastSize <= VT.getScalarSizeInBits())
16074 continue;
16075 int NumBroadcastElts = BroadcastSize / VT.getScalarSizeInBits();
16076
16077       // Attempt to match a repeating pattern every NumBroadcastElts,
16078       // accounting for UNDEFs, that only references the lowest 128-bit
16079       // lane of the inputs.
16080 auto FindRepeatingBroadcastMask = [&](SmallVectorImpl<int> &RepeatMask) {
16081 for (int i = 0; i != NumElts; i += NumBroadcastElts)
16082 for (int j = 0; j != NumBroadcastElts; ++j) {
16083 int M = Mask[i + j];
16084 if (M < 0)
16085 continue;
16086 int &R = RepeatMask[j];
16087 if (0 != ((M % NumElts) / NumLaneElts))
16088 return false;
16089 if (0 <= R && R != M)
16090 return false;
16091 R = M;
16092 }
16093 return true;
16094 };
16095
16096 SmallVector<int, 8> RepeatMask((unsigned)NumElts, -1);
16097 if (!FindRepeatingBroadcastMask(RepeatMask))
16098 continue;
16099
16100 // Shuffle the (lowest) repeated elements in place for broadcast.
16101 SDValue RepeatShuf = DAG.getVectorShuffle(VT, DL, V1, V2, RepeatMask);
16102
16103 // Shuffle the actual broadcast.
16104 SmallVector<int, 8> BroadcastMask((unsigned)NumElts, -1);
16105 for (int i = 0; i != NumElts; i += NumBroadcastElts)
16106 for (int j = 0; j != NumBroadcastElts; ++j)
16107 BroadcastMask[i + j] = j;
16108
16109 // Avoid returning the same shuffle operation. For example,
16110 // v8i32 = vector_shuffle<0,1,0,1,0,1,0,1> t5, undef:v8i32
16111 if (BroadcastMask == Mask)
16112 return SDValue();
16113
16114 return DAG.getVectorShuffle(VT, DL, RepeatShuf, DAG.getUNDEF(VT),
16115 BroadcastMask);
16116 }
16117 }
16118
16119 // Bail if the shuffle mask doesn't cross 128-bit lanes.
16120 if (!is128BitLaneCrossingShuffleMask(VT, Mask))
16121 return SDValue();
16122
16123 // Bail if we already have a repeated lane shuffle mask.
16124 if (is128BitLaneRepeatedShuffleMask(VT, Mask))
16125 return SDValue();
16126
16127 // Helper to look for repeated mask in each split sublane, and that those
16128 // sublanes can then be permuted into place.
16129 auto ShuffleSubLanes = [&](int SubLaneScale) {
16130 int NumSubLanes = NumLanes * SubLaneScale;
16131 int NumSubLaneElts = NumLaneElts / SubLaneScale;
16132
16133 // Check that all the sources are coming from the same lane and see if we
16134 // can form a repeating shuffle mask (local to each sub-lane). At the same
16135 // time, determine the source sub-lane for each destination sub-lane.
16136 int TopSrcSubLane = -1;
16137 SmallVector<int, 8> Dst2SrcSubLanes((unsigned)NumSubLanes, -1);
16138 SmallVector<SmallVector<int, 8>> RepeatedSubLaneMasks(
16139 SubLaneScale,
16140 SmallVector<int, 8>((unsigned)NumSubLaneElts, SM_SentinelUndef));
16141
16142 for (int DstSubLane = 0; DstSubLane != NumSubLanes; ++DstSubLane) {
16143 // Extract the sub-lane mask, check that it all comes from the same lane
16144 // and normalize the mask entries to come from the first lane.
16145 int SrcLane = -1;
16146 SmallVector<int, 8> SubLaneMask((unsigned)NumSubLaneElts, -1);
16147 for (int Elt = 0; Elt != NumSubLaneElts; ++Elt) {
16148 int M = Mask[(DstSubLane * NumSubLaneElts) + Elt];
16149 if (M < 0)
16150 continue;
16151 int Lane = (M % NumElts) / NumLaneElts;
16152 if ((0 <= SrcLane) && (SrcLane != Lane))
16153 return SDValue();
16154 SrcLane = Lane;
16155 int LocalM = (M % NumLaneElts) + (M < NumElts ? 0 : NumElts);
16156 SubLaneMask[Elt] = LocalM;
16157 }
16158
16159 // Whole sub-lane is UNDEF.
16160 if (SrcLane < 0)
16161 continue;
16162
16163 // Attempt to match against the candidate repeated sub-lane masks.
16164 for (int SubLane = 0; SubLane != SubLaneScale; ++SubLane) {
16165 auto MatchMasks = [NumSubLaneElts](ArrayRef<int> M1, ArrayRef<int> M2) {
16166 for (int i = 0; i != NumSubLaneElts; ++i) {
16167 if (M1[i] < 0 || M2[i] < 0)
16168 continue;
16169 if (M1[i] != M2[i])
16170 return false;
16171 }
16172 return true;
16173 };
16174
16175 auto &RepeatedSubLaneMask = RepeatedSubLaneMasks[SubLane];
16176 if (!MatchMasks(SubLaneMask, RepeatedSubLaneMask))
16177 continue;
16178
16179 // Merge the sub-lane mask into the matching repeated sub-lane mask.
16180 for (int i = 0; i != NumSubLaneElts; ++i) {
16181 int M = SubLaneMask[i];
16182 if (M < 0)
16183 continue;
16184 assert((RepeatedSubLaneMask[i] < 0 || RepeatedSubLaneMask[i] == M) &&
16185 "Unexpected mask element");
16186 RepeatedSubLaneMask[i] = M;
16187 }
16188
16189 // Track the top most source sub-lane - by setting the remaining to
16190 // UNDEF we can greatly simplify shuffle matching.
16191 int SrcSubLane = (SrcLane * SubLaneScale) + SubLane;
16192 TopSrcSubLane = std::max(TopSrcSubLane, SrcSubLane);
16193 Dst2SrcSubLanes[DstSubLane] = SrcSubLane;
16194 break;
16195 }
16196
16197 // Bail if we failed to find a matching repeated sub-lane mask.
16198 if (Dst2SrcSubLanes[DstSubLane] < 0)
16199 return SDValue();
16200 }
16201 assert(0 <= TopSrcSubLane && TopSrcSubLane < NumSubLanes &&
16202 "Unexpected source lane");
16203
16204 // Create a repeating shuffle mask for the entire vector.
16205 SmallVector<int, 8> RepeatedMask((unsigned)NumElts, -1);
16206 for (int SubLane = 0; SubLane <= TopSrcSubLane; ++SubLane) {
16207 int Lane = SubLane / SubLaneScale;
16208 auto &RepeatedSubLaneMask = RepeatedSubLaneMasks[SubLane % SubLaneScale];
16209 for (int Elt = 0; Elt != NumSubLaneElts; ++Elt) {
16210 int M = RepeatedSubLaneMask[Elt];
16211 if (M < 0)
16212 continue;
16213 int Idx = (SubLane * NumSubLaneElts) + Elt;
16214 RepeatedMask[Idx] = M + (Lane * NumLaneElts);
16215 }
16216 }
16217
16218 // Shuffle each source sub-lane to its destination.
16219 SmallVector<int, 8> SubLaneMask((unsigned)NumElts, -1);
16220 for (int i = 0; i != NumElts; i += NumSubLaneElts) {
16221 int SrcSubLane = Dst2SrcSubLanes[i / NumSubLaneElts];
16222 if (SrcSubLane < 0)
16223 continue;
16224 for (int j = 0; j != NumSubLaneElts; ++j)
16225 SubLaneMask[i + j] = j + (SrcSubLane * NumSubLaneElts);
16226 }
16227
16228 // Avoid returning the same shuffle operation.
16229 // v8i32 = vector_shuffle<0,1,4,5,2,3,6,7> t5, undef:v8i32
16230 if (RepeatedMask == Mask || SubLaneMask == Mask)
16231 return SDValue();
16232
16233 SDValue RepeatedShuffle =
16234 DAG.getVectorShuffle(VT, DL, V1, V2, RepeatedMask);
16235
16236 return DAG.getVectorShuffle(VT, DL, RepeatedShuffle, DAG.getUNDEF(VT),
16237 SubLaneMask);
16238 };
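  // Worked example (illustrative): for the v8f32 mask <6,7,6,7,2,3,2,3> with
  // SubLaneScale = 2, every 64-bit sub-lane repeats <2,3> within its source
  // lane, so the repeated shuffle <2,3,2,3,6,7,u,u> is followed by the
  // sub-lane permute <4,5,4,5,0,1,0,1>.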
16239
16240 // On AVX2 targets we can permute 256-bit vectors as 64-bit sub-lanes
16241 // (with PERMQ/PERMPD). On AVX2/AVX512BW targets, permuting 32-bit sub-lanes,
16242 // even with a variable shuffle, can be worth it for v32i8/v64i8 vectors.
16243 // Otherwise we can only permute whole 128-bit lanes.
16244 int MinSubLaneScale = 1, MaxSubLaneScale = 1;
16245 if (Subtarget.hasAVX2() && VT.is256BitVector()) {
16246 bool OnlyLowestElts = isUndefOrInRange(Mask, 0, NumLaneElts);
16247 MinSubLaneScale = 2;
16248 MaxSubLaneScale =
16249 (!OnlyLowestElts && V2.isUndef() && VT == MVT::v32i8) ? 4 : 2;
16250 }
16251 if (Subtarget.hasBWI() && VT == MVT::v64i8)
16252 MinSubLaneScale = MaxSubLaneScale = 4;
16253
16254 for (int Scale = MinSubLaneScale; Scale <= MaxSubLaneScale; Scale *= 2)
16255 if (SDValue Shuffle = ShuffleSubLanes(Scale))
16256 return Shuffle;
16257
16258 return SDValue();
16259}
16260
16261 static bool matchShuffleWithSHUFPD(MVT VT, SDValue &V1, SDValue &V2,
16262                                    bool &ForceV1Zero, bool &ForceV2Zero,
16263 unsigned &ShuffleImm, ArrayRef<int> Mask,
16264 const APInt &Zeroable) {
16265 int NumElts = VT.getVectorNumElements();
16266 assert(VT.getScalarSizeInBits() == 64 &&
16267 (NumElts == 2 || NumElts == 4 || NumElts == 8) &&
16268 "Unexpected data type for VSHUFPD");
16269 assert(isUndefOrZeroOrInRange(Mask, 0, 2 * NumElts) &&
16270 "Illegal shuffle mask");
16271
16272 bool ZeroLane[2] = { true, true };
16273 for (int i = 0; i < NumElts; ++i)
16274 ZeroLane[i & 1] &= Zeroable[i];
16275
16276   // Mask for V8F64: 0/1, 8/9, 2/3, 10/11, 4/5, ..
16277   // Mask for V4F64: 0/1, 4/5, 2/3, 6/7, ..
16278 bool IsSHUFPD = true;
16279 bool IsCommutable = true;
16280 SmallVector<int, 8> SHUFPDMask(NumElts, -1);
16281 for (int i = 0; i < NumElts; ++i) {
16282 if (Mask[i] == SM_SentinelUndef || ZeroLane[i & 1])
16283 continue;
16284 if (Mask[i] < 0)
16285 return false;
16286 int Val = (i & 6) + NumElts * (i & 1);
16287 int CommutVal = (i & 0xe) + NumElts * ((i & 1) ^ 1);
16288 if (Mask[i] < Val || Mask[i] > Val + 1)
16289 IsSHUFPD = false;
16290 if (Mask[i] < CommutVal || Mask[i] > CommutVal + 1)
16291 IsCommutable = false;
16292 SHUFPDMask[i] = Mask[i] % 2;
16293 }
16294
16295 if (!IsSHUFPD && !IsCommutable)
16296 return false;
16297
16298 if (!IsSHUFPD && IsCommutable)
16299 std::swap(V1, V2);
16300
16301 ForceV1Zero = ZeroLane[0];
16302 ForceV2Zero = ZeroLane[1];
16303 ShuffleImm = getSHUFPDImm(SHUFPDMask);
16304 return true;
16305}
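// Worked example (illustrative): the v4f64 mask <0,5,2,7> takes the even
// result elements from V1 and the odd result elements from V2 within each
// 128-bit lane, so SHUFPDMask = <0,1,0,1> and the SHUFPD immediate is 0b1010.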
16306
16307 static SDValue lowerShuffleWithSHUFPD(const SDLoc &DL, MVT VT, SDValue V1,
16308                                       SDValue V2, ArrayRef<int> Mask,
16309 const APInt &Zeroable,
16310 const X86Subtarget &Subtarget,
16311 SelectionDAG &DAG) {
16312 assert((VT == MVT::v2f64 || VT == MVT::v4f64 || VT == MVT::v8f64) &&
16313 "Unexpected data type for VSHUFPD");
16314
16315 unsigned Immediate = 0;
16316 bool ForceV1Zero = false, ForceV2Zero = false;
16317 if (!matchShuffleWithSHUFPD(VT, V1, V2, ForceV1Zero, ForceV2Zero, Immediate,
16318 Mask, Zeroable))
16319 return SDValue();
16320
16321 // Create a REAL zero vector - ISD::isBuildVectorAllZeros allows UNDEFs.
16322 if (ForceV1Zero)
16323 V1 = getZeroVector(VT, Subtarget, DAG, DL);
16324 if (ForceV2Zero)
16325 V2 = getZeroVector(VT, Subtarget, DAG, DL);
16326
16327 return DAG.getNode(X86ISD::SHUFP, DL, VT, V1, V2,
16328 DAG.getTargetConstant(Immediate, DL, MVT::i8));
16329}
16330
16331 // Look for {0, 8, 16, 24, 32, 40, 48, 56} in the first 8 elements, followed
16332 // by zeroable elements in the remaining 24 elements. Turn this into two
16333// vmovqb instructions shuffled together.
16334 static SDValue lowerShuffleAsVTRUNCAndUnpack(const SDLoc &DL, MVT VT,
16335                                              SDValue V1, SDValue V2,
16336 ArrayRef<int> Mask,
16337 const APInt &Zeroable,
16338 SelectionDAG &DAG) {
16339 assert(VT == MVT::v32i8 && "Unexpected type!");
16340
16341 // The first 8 indices should be every 8th element.
16342 if (!isSequentialOrUndefInRange(Mask, 0, 8, 0, 8))
16343 return SDValue();
16344
16345 // Remaining elements need to be zeroable.
16346 if (Zeroable.countl_one() < (Mask.size() - 8))
16347 return SDValue();
16348
16349 V1 = DAG.getBitcast(MVT::v4i64, V1);
16350 V2 = DAG.getBitcast(MVT::v4i64, V2);
16351
16352 V1 = DAG.getNode(X86ISD::VTRUNC, DL, MVT::v16i8, V1);
16353 V2 = DAG.getNode(X86ISD::VTRUNC, DL, MVT::v16i8, V2);
16354
16355 // The VTRUNCs will put 0s in the upper 12 bytes. Use them to put zeroes in
16356 // the upper bits of the result using an unpckldq.
16357 SDValue Unpack = DAG.getVectorShuffle(MVT::v16i8, DL, V1, V2,
16358 { 0, 1, 2, 3, 16, 17, 18, 19,
16359 4, 5, 6, 7, 20, 21, 22, 23 });
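  // Net effect (illustrative): each VTRUNC keeps byte 0 of every i64 element,
  // i.e. bytes {0,8,16,24} of the original v32i8 operand, so the interleave
  // above produces V1{0,8,16,24}, V2{0,8,16,24} followed by zeros, matching
  // the {0,8,16,24,32,40,48,56} + zeroable pattern checked at the top.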
16360 // Insert the unpckldq into a zero vector to widen to v32i8.
16361 return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, MVT::v32i8,
16362 DAG.getConstant(0, DL, MVT::v32i8), Unpack,
16363 DAG.getVectorIdxConstant(0, DL));
16364}
16365
16366// a = shuffle v1, v2, mask1 ; interleaving lower lanes of v1 and v2
16367// b = shuffle v1, v2, mask2 ; interleaving higher lanes of v1 and v2
16368// =>
16369// ul = unpckl v1, v2
16370// uh = unpckh v1, v2
16371// a = vperm ul, uh
16372// b = vperm ul, uh
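// e.g. (illustrative) for v8f32 with a = <0,8,1,9,2,10,3,11> and
//      b = <4,12,5,13,6,14,7,15>:
//        ul = <0,8,1,9,4,12,5,13>, uh = <2,10,3,11,6,14,7,15>
//        a = vperm2f128(ul, uh, 0x20), b = vperm2f128(ul, uh, 0x31)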
16373//
16374// Pattern-match interleave(256b v1, 256b v2) -> 512b v3 and lower it into unpck
16375// and permute. We cannot directly match v3 because it is split into two
16376// 256-bit vectors in earlier isel stages. Therefore, this function matches a
16377// pair of 256-bit shuffles and makes sure the masks are consecutive.
16378//
16379// Once unpck and permute nodes are created, the permute corresponding to this
16380// shuffle is returned, while the other permute replaces the other half of the
16381// shuffle in the selection dag.
16382 static SDValue lowerShufflePairAsUNPCKAndPermute(const SDLoc &DL, MVT VT,
16383                                                  SDValue V1, SDValue V2,
16384 ArrayRef<int> Mask,
16385 SelectionDAG &DAG) {
16386 if (VT != MVT::v8f32 && VT != MVT::v8i32 && VT != MVT::v16i16 &&
16387 VT != MVT::v32i8)
16388 return SDValue();
16389 // <B0, B1, B0+1, B1+1, ..., >
16390 auto IsInterleavingPattern = [&](ArrayRef<int> Mask, unsigned Begin0,
16391 unsigned Begin1) {
16392 size_t Size = Mask.size();
16393 assert(Size % 2 == 0 && "Expected even mask size");
16394 for (unsigned I = 0; I < Size; I += 2) {
16395 if (Mask[I] != (int)(Begin0 + I / 2) ||
16396 Mask[I + 1] != (int)(Begin1 + I / 2))
16397 return false;
16398 }
16399 return true;
16400 };
16401   // Check which half this shuffle node is.
16402 int NumElts = VT.getVectorNumElements();
16403 size_t FirstQtr = NumElts / 2;
16404 size_t ThirdQtr = NumElts + NumElts / 2;
16405 bool IsFirstHalf = IsInterleavingPattern(Mask, 0, NumElts);
16406 bool IsSecondHalf = IsInterleavingPattern(Mask, FirstQtr, ThirdQtr);
16407 if (!IsFirstHalf && !IsSecondHalf)
16408 return SDValue();
16409
16410 // Find the intersection between shuffle users of V1 and V2.
16411 SmallVector<SDNode *, 2> Shuffles;
16412 for (SDNode *User : V1->users())
16413 if (User->getOpcode() == ISD::VECTOR_SHUFFLE && User->getOperand(0) == V1 &&
16414 User->getOperand(1) == V2)
16415 Shuffles.push_back(User);
16416 // Limit user size to two for now.
16417 if (Shuffles.size() != 2)
16418 return SDValue();
16419   // Find out which half of the 512-bit shuffle each smaller shuffle is.
16420 auto *SVN1 = cast<ShuffleVectorSDNode>(Shuffles[0]);
16421 auto *SVN2 = cast<ShuffleVectorSDNode>(Shuffles[1]);
16422 SDNode *FirstHalf;
16423 SDNode *SecondHalf;
16424 if (IsInterleavingPattern(SVN1->getMask(), 0, NumElts) &&
16425 IsInterleavingPattern(SVN2->getMask(), FirstQtr, ThirdQtr)) {
16426 FirstHalf = Shuffles[0];
16427 SecondHalf = Shuffles[1];
16428 } else if (IsInterleavingPattern(SVN1->getMask(), FirstQtr, ThirdQtr) &&
16429 IsInterleavingPattern(SVN2->getMask(), 0, NumElts)) {
16430 FirstHalf = Shuffles[1];
16431 SecondHalf = Shuffles[0];
16432 } else {
16433 return SDValue();
16434 }
16435 // Lower into unpck and perm. Return the perm of this shuffle and replace
16436 // the other.
16437 SDValue Unpckl = DAG.getNode(X86ISD::UNPCKL, DL, VT, V1, V2);
16438 SDValue Unpckh = DAG.getNode(X86ISD::UNPCKH, DL, VT, V1, V2);
16439 SDValue Perm1 = DAG.getNode(X86ISD::VPERM2X128, DL, VT, Unpckl, Unpckh,
16440 DAG.getTargetConstant(0x20, DL, MVT::i8));
16441 SDValue Perm2 = DAG.getNode(X86ISD::VPERM2X128, DL, VT, Unpckl, Unpckh,
16442 DAG.getTargetConstant(0x31, DL, MVT::i8));
16443 if (IsFirstHalf) {
16444 DAG.ReplaceAllUsesWith(SecondHalf, &Perm2);
16445 return Perm1;
16446 }
16447 DAG.ReplaceAllUsesWith(FirstHalf, &Perm1);
16448 return Perm2;
16449}
16450
16451/// Handle lowering of 4-lane 64-bit floating point shuffles.
16452///
16453/// Also ends up handling lowering of 4-lane 64-bit integer shuffles when AVX2
16454/// isn't available.
16455 static SDValue lowerV4F64Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
16456                                  const APInt &Zeroable, SDValue V1, SDValue V2,
16457 const X86Subtarget &Subtarget,
16458 SelectionDAG &DAG) {
16459 assert(V1.getSimpleValueType() == MVT::v4f64 && "Bad operand type!");
16460 assert(V2.getSimpleValueType() == MVT::v4f64 && "Bad operand type!");
16461 assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!");
16462
16463 if (SDValue V = lowerV2X128Shuffle(DL, MVT::v4f64, V1, V2, Mask, Zeroable,
16464 Subtarget, DAG))
16465 return V;
16466
16467 if (V2.isUndef()) {
16468 // Check for being able to broadcast a single element.
16469 if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v4f64, V1, V2,
16470 Mask, Subtarget, DAG))
16471 return Broadcast;
16472
16473 // Use low duplicate instructions for masks that match their pattern.
16474 if (isShuffleEquivalent(Mask, {0, 0, 2, 2}, V1, V2))
16475 return DAG.getNode(X86ISD::MOVDDUP, DL, MVT::v4f64, V1);
16476
16477 if (!is128BitLaneCrossingShuffleMask(MVT::v4f64, Mask)) {
16478 // Non-half-crossing single input shuffles can be lowered with an
16479 // interleaved permutation.
16480 unsigned VPERMILPMask = (Mask[0] == 1) | ((Mask[1] == 1) << 1) |
16481 ((Mask[2] == 3) << 2) | ((Mask[3] == 3) << 3);
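      // For example (illustrative): Mask <1,0,3,2> gives the immediate
      // 0b0101, i.e. swap the two doubles within each 128-bit lane.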
16482 return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v4f64, V1,
16483 DAG.getTargetConstant(VPERMILPMask, DL, MVT::i8));
16484 }
16485
16486 // With AVX2 we have direct support for this permutation.
16487 if (Subtarget.hasAVX2())
16488 return DAG.getNode(X86ISD::VPERMI, DL, MVT::v4f64, V1,
16489 getV4X86ShuffleImm8ForMask(Mask, DL, DAG));
16490
16491 // Try to create an in-lane repeating shuffle mask and then shuffle the
16492 // results into the target lanes.
16493     if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
16494             DL, MVT::v4f64, V1, V2, Mask, Subtarget, DAG))
16495 return V;
16496
16497 // Try to permute the lanes and then use a per-lane permute.
16498 if (SDValue V = lowerShuffleAsLanePermuteAndPermute(DL, MVT::v4f64, V1, V2,
16499 Mask, DAG, Subtarget))
16500 return V;
16501
16502 // Otherwise, fall back.
16503 return lowerShuffleAsLanePermuteAndShuffle(DL, MVT::v4f64, V1, V2, Mask,
16504 DAG, Subtarget);
16505 }
16506
16507 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v4f64, V1, V2, Mask,
16508 Zeroable, Subtarget, DAG))
16509 return Blend;
16510
16511 // Use dedicated unpack instructions for masks that match their pattern.
16512 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v4f64, V1, V2, Mask, DAG))
16513 return V;
16514
16515 if (SDValue Op = lowerShuffleWithSHUFPD(DL, MVT::v4f64, V1, V2, Mask,
16516 Zeroable, Subtarget, DAG))
16517 return Op;
16518
16519 bool V1IsInPlace = isShuffleMaskInputInPlace(0, Mask);
16520 bool V2IsInPlace = isShuffleMaskInputInPlace(1, Mask);
16521 bool V1IsSplat = isShuffleMaskInputBroadcastable(0, Mask);
16522 bool V2IsSplat = isShuffleMaskInputBroadcastable(1, Mask);
16523
16524 // If we have lane crossing shuffles AND they don't all come from the lower
16525 // lane elements, lower to SHUFPD(VPERM2F128(V1, V2), VPERM2F128(V1, V2)).
16526 // TODO: Handle BUILD_VECTOR sources which getVectorShuffle currently
16527 // canonicalize to a blend of splat which isn't necessary for this combine.
16528 if (is128BitLaneCrossingShuffleMask(MVT::v4f64, Mask) &&
16529 !all_of(Mask, [](int M) { return M < 2 || (4 <= M && M < 6); }) &&
16530 (V1.getOpcode() != ISD::BUILD_VECTOR) &&
16531 (V2.getOpcode() != ISD::BUILD_VECTOR) &&
16532 (!Subtarget.hasAVX2() ||
16533 !((V1IsInPlace || V1IsSplat) && (V2IsInPlace || V2IsSplat))))
16534 return lowerShuffleAsLanePermuteAndSHUFP(DL, MVT::v4f64, V1, V2, Mask, DAG);
16535
16536 // If we have one input in place, then we can permute the other input and
16537 // blend the result.
16538 if (V1IsInPlace || V2IsInPlace)
16539 return lowerShuffleAsDecomposedShuffleMerge(DL, MVT::v4f64, V1, V2, Mask,
16540 Zeroable, Subtarget, DAG);
16541
16542 // Try to create an in-lane repeating shuffle mask and then shuffle the
16543 // results into the target lanes.
16544   if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
16545           DL, MVT::v4f64, V1, V2, Mask, Subtarget, DAG))
16546 return V;
16547
16548 // Try to simplify this by merging 128-bit lanes to enable a lane-based
16549   // shuffle. However, if we have AVX2 and either input is already in place,
16550   // we will be able to shuffle the other input even across lanes in a single
16551   // instruction, so skip this pattern.
16552 if (!(Subtarget.hasAVX2() && (V1IsInPlace || V2IsInPlace)))
16553     if (SDValue V = lowerShuffleAsLanePermuteAndRepeatedMask(
16554             DL, MVT::v4f64, V1, V2, Mask, Subtarget, DAG))
16555 return V;
16556
16557 // If we have VLX support, we can use VEXPAND.
16558 if (Subtarget.hasVLX())
16559 if (SDValue V = lowerShuffleWithEXPAND(DL, MVT::v4f64, V1, V2, Mask,
16560 Zeroable, Subtarget, DAG))
16561 return V;
16562
16563   // If we have AVX2 then we always want to lower with a blend because at v4 we
16564 // can fully permute the elements.
16565 if (Subtarget.hasAVX2())
16566 return lowerShuffleAsDecomposedShuffleMerge(DL, MVT::v4f64, V1, V2, Mask,
16567 Zeroable, Subtarget, DAG);
16568
16569 // Otherwise fall back on generic lowering.
16570 return lowerShuffleAsSplitOrBlend(DL, MVT::v4f64, V1, V2, Mask, Zeroable,
16571 Subtarget, DAG);
16572}
16573
16574/// Handle lowering of 4-lane 64-bit integer shuffles.
16575///
16576/// This routine is only called when we have AVX2 and thus a reasonable
16577 /// instruction set for v4i64 shuffling.
16578 static SDValue lowerV4I64Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
16579                                  const APInt &Zeroable, SDValue V1, SDValue V2,
16580 const X86Subtarget &Subtarget,
16581 SelectionDAG &DAG) {
16582 assert(V1.getSimpleValueType() == MVT::v4i64 && "Bad operand type!");
16583 assert(V2.getSimpleValueType() == MVT::v4i64 && "Bad operand type!");
16584 assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!");
16585 assert(Subtarget.hasAVX2() && "We can only lower v4i64 with AVX2!");
16586
16587 if (SDValue V = lowerV2X128Shuffle(DL, MVT::v4i64, V1, V2, Mask, Zeroable,
16588 Subtarget, DAG))
16589 return V;
16590
16591 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v4i64, V1, V2, Mask,
16592 Zeroable, Subtarget, DAG))
16593 return Blend;
16594
16595 // Check for being able to broadcast a single element.
16596 if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v4i64, V1, V2, Mask,
16597 Subtarget, DAG))
16598 return Broadcast;
16599
16600 // Try to use shift instructions if fast.
16601 if (Subtarget.preferLowerShuffleAsShift())
16602 if (SDValue Shift =
16603 lowerShuffleAsShift(DL, MVT::v4i64, V1, V2, Mask, Zeroable,
16604 Subtarget, DAG, /*BitwiseOnly*/ true))
16605 return Shift;
16606
16607 if (V2.isUndef()) {
16608 // When the shuffle is mirrored between the 128-bit lanes of the unit, we
16609 // can use lower latency instructions that will operate on both lanes.
16610 SmallVector<int, 2> RepeatedMask;
16611 if (is128BitLaneRepeatedShuffleMask(MVT::v4i64, Mask, RepeatedMask)) {
16612 SmallVector<int, 4> PSHUFDMask;
16613 narrowShuffleMaskElts(2, RepeatedMask, PSHUFDMask);
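      // For example (illustrative): the v4i64 mask <1,0,3,2> repeats as <1,0>
      // per 128-bit lane, which narrows to the v8i32 PSHUFD mask <2,3,0,1>.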
16614 return DAG.getBitcast(
16615 MVT::v4i64,
16616 DAG.getNode(X86ISD::PSHUFD, DL, MVT::v8i32,
16617 DAG.getBitcast(MVT::v8i32, V1),
16618 getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG)));
16619 }
16620
16621 // AVX2 provides a direct instruction for permuting a single input across
16622 // lanes.
16623 return DAG.getNode(X86ISD::VPERMI, DL, MVT::v4i64, V1,
16624 getV4X86ShuffleImm8ForMask(Mask, DL, DAG));
16625 }
16626
16627 // Try to use shift instructions.
16628 if (SDValue Shift =
16629 lowerShuffleAsShift(DL, MVT::v4i64, V1, V2, Mask, Zeroable, Subtarget,
16630 DAG, /*BitwiseOnly*/ false))
16631 return Shift;
16632
16633 // If we have VLX support, we can use VALIGN or VEXPAND.
16634 if (Subtarget.hasVLX()) {
16635 if (SDValue Rotate = lowerShuffleAsVALIGN(DL, MVT::v4i64, V1, V2, Mask,
16636 Zeroable, Subtarget, DAG))
16637 return Rotate;
16638
16639 if (SDValue V = lowerShuffleWithEXPAND(DL, MVT::v4i64, V1, V2, Mask,
16640 Zeroable, Subtarget, DAG))
16641 return V;
16642 }
16643
16644 // Try to use PALIGNR.
16645 if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v4i64, V1, V2, Mask,
16646 Subtarget, DAG))
16647 return Rotate;
16648
16649 // Use dedicated unpack instructions for masks that match their pattern.
16650 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v4i64, V1, V2, Mask, DAG))
16651 return V;
16652
16653 bool V1IsInPlace = isShuffleMaskInputInPlace(0, Mask);
16654 bool V2IsInPlace = isShuffleMaskInputInPlace(1, Mask);
16655
16656 // If we have one input in place, then we can permute the other input and
16657 // blend the result.
16658 if (V1IsInPlace || V2IsInPlace)
16659 return lowerShuffleAsDecomposedShuffleMerge(DL, MVT::v4i64, V1, V2, Mask,
16660 Zeroable, Subtarget, DAG);
16661
16662 // Try to create an in-lane repeating shuffle mask and then shuffle the
16663 // results into the target lanes.
16664   if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
16665           DL, MVT::v4i64, V1, V2, Mask, Subtarget, DAG))
16666 return V;
16667
16668 // Try to lower to PERMQ(BLENDD(V1,V2)).
16669 if (SDValue V =
16670 lowerShuffleAsBlendAndPermute(DL, MVT::v4i64, V1, V2, Mask, DAG))
16671 return V;
16672
16673 // Try to simplify this by merging 128-bit lanes to enable a lane-based
16674   // shuffle. However, if we have AVX2 and either input is already in place,
16675   // we will be able to shuffle the other input even across lanes in a single
16676   // instruction, so skip this pattern.
16677 if (!V1IsInPlace && !V2IsInPlace)
16678     if (SDValue Result = lowerShuffleAsLanePermuteAndRepeatedMask(
16679             DL, MVT::v4i64, V1, V2, Mask, Subtarget, DAG))
16680 return Result;
16681
16682 // Otherwise fall back on generic blend lowering.
16683 return lowerShuffleAsDecomposedShuffleMerge(DL, MVT::v4i64, V1, V2, Mask,
16684 Zeroable, Subtarget, DAG);
16685}
16686
16687/// Handle lowering of 8-lane 32-bit floating point shuffles.
16688///
16689/// Also ends up handling lowering of 8-lane 32-bit integer shuffles when AVX2
16690/// isn't available.
16691 static SDValue lowerV8F32Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
16692                                  const APInt &Zeroable, SDValue V1, SDValue V2,
16693 const X86Subtarget &Subtarget,
16694 SelectionDAG &DAG) {
16695 assert(V1.getSimpleValueType() == MVT::v8f32 && "Bad operand type!");
16696 assert(V2.getSimpleValueType() == MVT::v8f32 && "Bad operand type!");
16697 assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!");
16698
16699 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v8f32, V1, V2, Mask,
16700 Zeroable, Subtarget, DAG))
16701 return Blend;
16702
16703 // Check for being able to broadcast a single element.
16704 if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v8f32, V1, V2, Mask,
16705 Subtarget, DAG))
16706 return Broadcast;
16707
16708 if (!Subtarget.hasAVX2()) {
16709 SmallVector<int> InLaneMask;
16710 computeInLaneShuffleMask(Mask, Mask.size() / 2, InLaneMask);
16711
16712 if (!is128BitLaneRepeatedShuffleMask(MVT::v8f32, InLaneMask))
16713 if (SDValue R = splitAndLowerShuffle(DL, MVT::v8f32, V1, V2, Mask, DAG,
16714 /*SimpleOnly*/ true))
16715 return R;
16716 }
16717 if (SDValue ZExt = lowerShuffleAsZeroOrAnyExtend(DL, MVT::v8i32, V1, V2, Mask,
16718 Zeroable, Subtarget, DAG))
16719 return DAG.getBitcast(MVT::v8f32, ZExt);
16720
16721 // If the shuffle mask is repeated in each 128-bit lane, we have many more
16722 // options to efficiently lower the shuffle.
16723 SmallVector<int, 4> RepeatedMask;
16724 if (is128BitLaneRepeatedShuffleMask(MVT::v8f32, Mask, RepeatedMask)) {
16725 assert(RepeatedMask.size() == 4 &&
16726 "Repeated masks must be half the mask width!");
16727
16728 // Use even/odd duplicate instructions for masks that match their pattern.
16729 if (isShuffleEquivalent(RepeatedMask, {0, 0, 2, 2}, V1, V2))
16730 return DAG.getNode(X86ISD::MOVSLDUP, DL, MVT::v8f32, V1);
16731 if (isShuffleEquivalent(RepeatedMask, {1, 1, 3, 3}, V1, V2))
16732 return DAG.getNode(X86ISD::MOVSHDUP, DL, MVT::v8f32, V1);
16733
16734 if (V2.isUndef())
16735 return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v8f32, V1,
16736 getV4X86ShuffleImm8ForMask(RepeatedMask, DL, DAG));
16737
16738 // Use dedicated unpack instructions for masks that match their pattern.
16739 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v8f32, V1, V2, Mask, DAG))
16740 return V;
16741
16742 // Otherwise, fall back to a SHUFPS sequence. Here it is important that we
16743 // have already handled any direct blends.
16744 return lowerShuffleWithSHUFPS(DL, MVT::v8f32, RepeatedMask, V1, V2, DAG);
16745 }
16746
16747 // Try to create an in-lane repeating shuffle mask and then shuffle the
16748 // results into the target lanes.
16749   if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
16750           DL, MVT::v8f32, V1, V2, Mask, Subtarget, DAG))
16751 return V;
16752
16753 // If we have a single input shuffle with different shuffle patterns in the
16754 // two 128-bit lanes use the variable mask to VPERMILPS.
16755 if (V2.isUndef()) {
16756 if (!is128BitLaneCrossingShuffleMask(MVT::v8f32, Mask)) {
16757 SDValue VPermMask = getConstVector(Mask, MVT::v8i32, DAG, DL, true);
16758 return DAG.getNode(X86ISD::VPERMILPV, DL, MVT::v8f32, V1, VPermMask);
16759 }
16760 if (Subtarget.hasAVX2()) {
16761 SDValue VPermMask = getConstVector(Mask, MVT::v8i32, DAG, DL, true);
16762 return DAG.getNode(X86ISD::VPERMV, DL, MVT::v8f32, VPermMask, V1);
16763 }
16764 // Otherwise, fall back.
16765 return lowerShuffleAsLanePermuteAndShuffle(DL, MVT::v8f32, V1, V2, Mask,
16766 DAG, Subtarget);
16767 }
16768
16769 // Try to simplify this by merging 128-bit lanes to enable a lane-based
16770 // shuffle.
16771   if (SDValue Result = lowerShuffleAsLanePermuteAndRepeatedMask(
16772           DL, MVT::v8f32, V1, V2, Mask, Subtarget, DAG))
16773 return Result;
16774
16775 // If we have VLX support, we can use VEXPAND.
16776 if (Subtarget.hasVLX())
16777 if (SDValue V = lowerShuffleWithEXPAND(DL, MVT::v8f32, V1, V2, Mask,
16778 Zeroable, Subtarget, DAG))
16779 return V;
16780
16781 // Try to match an interleave of two v8f32s and lower them as unpck and
16782 // permutes using ymms. This needs to go before we try to split the vectors.
16783 // Don't attempt on AVX1 if we're likely to split vectors anyway.
16784 if ((Subtarget.hasAVX2() ||
16785        !(isFreeToSplitVector(peekThroughBitcasts(V1), DAG) ||
16786          isFreeToSplitVector(peekThroughBitcasts(V2), DAG))) &&
16787       !Subtarget.hasAVX512())
16788 if (SDValue V = lowerShufflePairAsUNPCKAndPermute(DL, MVT::v8f32, V1, V2,
16789 Mask, DAG))
16790 return V;
16791
16792   // For non-AVX512, if the mask is of 16-bit elements in-lane, then try to
16793   // split since after the split we get more efficient code using vpunpcklwd
16794   // and vpunpckhwd instrs than vblend.
16795 if (!Subtarget.hasAVX512() && isUnpackWdShuffleMask(Mask, MVT::v8f32, DAG))
16796 return lowerShuffleAsSplitOrBlend(DL, MVT::v8f32, V1, V2, Mask, Zeroable,
16797 Subtarget, DAG);
16798
16799 // If we have AVX2 then we always want to lower with a blend because at v8 we
16800 // can fully permute the elements.
16801 if (Subtarget.hasAVX2())
16802 return lowerShuffleAsDecomposedShuffleMerge(DL, MVT::v8f32, V1, V2, Mask,
16803 Zeroable, Subtarget, DAG);
16804
16805 // Otherwise fall back on generic lowering.
16806 return lowerShuffleAsSplitOrBlend(DL, MVT::v8f32, V1, V2, Mask, Zeroable,
16807 Subtarget, DAG);
16808}
16809
16810/// Handle lowering of 8-lane 32-bit integer shuffles.
16811///
16812/// This routine is only called when we have AVX2 and thus a reasonable
16813 /// instruction set for v8i32 shuffling.
16814 static SDValue lowerV8I32Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
16815                                  const APInt &Zeroable, SDValue V1, SDValue V2,
16816 const X86Subtarget &Subtarget,
16817 SelectionDAG &DAG) {
16818 assert(V1.getSimpleValueType() == MVT::v8i32 && "Bad operand type!");
16819 assert(V2.getSimpleValueType() == MVT::v8i32 && "Bad operand type!");
16820 assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!");
16821 assert(Subtarget.hasAVX2() && "We can only lower v8i32 with AVX2!");
16822
16823 int NumV2Elements = count_if(Mask, [](int M) { return M >= 8; });
16824
16825 // Whenever we can lower this as a zext, that instruction is strictly faster
16826 // than any alternative. It also allows us to fold memory operands into the
16827 // shuffle in many cases.
16828 if (SDValue ZExt = lowerShuffleAsZeroOrAnyExtend(DL, MVT::v8i32, V1, V2, Mask,
16829 Zeroable, Subtarget, DAG))
16830 return ZExt;
16831
16832 // Try to match an interleave of two v8i32s and lower them as unpck and
16833 // permutes using ymms. This needs to go before we try to split the vectors.
16834 if (!Subtarget.hasAVX512())
16835 if (SDValue V = lowerShufflePairAsUNPCKAndPermute(DL, MVT::v8i32, V1, V2,
16836 Mask, DAG))
16837 return V;
16838
16839   // For non-AVX512, if the mask is of 16-bit elements in-lane, then try to
16840   // split since after the split we get more efficient code than vblend by
16841   // using vpunpcklwd and vpunpckhwd instrs.
16842 if (isUnpackWdShuffleMask(Mask, MVT::v8i32, DAG) && !V2.isUndef() &&
16843 !Subtarget.hasAVX512())
16844 return lowerShuffleAsSplitOrBlend(DL, MVT::v8i32, V1, V2, Mask, Zeroable,
16845 Subtarget, DAG);
16846
16847 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v8i32, V1, V2, Mask,
16848 Zeroable, Subtarget, DAG))
16849 return Blend;
16850
16851 // Check for being able to broadcast a single element.
16852 if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v8i32, V1, V2, Mask,
16853 Subtarget, DAG))
16854 return Broadcast;
16855
16856 // Try to use shift instructions if fast.
16857 if (Subtarget.preferLowerShuffleAsShift()) {
16858 if (SDValue Shift =
16859 lowerShuffleAsShift(DL, MVT::v8i32, V1, V2, Mask, Zeroable,
16860 Subtarget, DAG, /*BitwiseOnly*/ true))
16861 return Shift;
16862 if (NumV2Elements == 0)
16863 if (SDValue Rotate =
16864 lowerShuffleAsBitRotate(DL, MVT::v8i32, V1, Mask, Subtarget, DAG))
16865 return Rotate;
16866 }
16867
16868 // If the shuffle mask is repeated in each 128-bit lane we can use more
16869 // efficient instructions that mirror the shuffles across the two 128-bit
16870 // lanes.
16871 SmallVector<int, 4> RepeatedMask;
16872 bool Is128BitLaneRepeatedShuffle =
16873 is128BitLaneRepeatedShuffleMask(MVT::v8i32, Mask, RepeatedMask);
16874 if (Is128BitLaneRepeatedShuffle) {
16875 assert(RepeatedMask.size() == 4 && "Unexpected repeated mask size!");
16876 if (V2.isUndef())
16877 return DAG.getNode(X86ISD::PSHUFD, DL, MVT::v8i32, V1,
16878 getV4X86ShuffleImm8ForMask(RepeatedMask, DL, DAG));
16879
16880 // Use dedicated unpack instructions for masks that match their pattern.
16881 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v8i32, V1, V2, Mask, DAG))
16882 return V;
16883 }
16884
16885 // Try to use shift instructions.
16886 if (SDValue Shift =
16887 lowerShuffleAsShift(DL, MVT::v8i32, V1, V2, Mask, Zeroable, Subtarget,
16888 DAG, /*BitwiseOnly*/ false))
16889 return Shift;
16890
16891 if (!Subtarget.preferLowerShuffleAsShift() && NumV2Elements == 0)
16892 if (SDValue Rotate =
16893 lowerShuffleAsBitRotate(DL, MVT::v8i32, V1, Mask, Subtarget, DAG))
16894 return Rotate;
16895
16896 // If we have VLX support, we can use VALIGN or EXPAND.
16897 if (Subtarget.hasVLX()) {
16898 if (SDValue Rotate = lowerShuffleAsVALIGN(DL, MVT::v8i32, V1, V2, Mask,
16899 Zeroable, Subtarget, DAG))
16900 return Rotate;
16901
16902 if (SDValue V = lowerShuffleWithEXPAND(DL, MVT::v8i32, V1, V2, Mask,
16903 Zeroable, Subtarget, DAG))
16904 return V;
16905 }
16906
16907 // Try to use byte rotation instructions.
16908 if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v8i32, V1, V2, Mask,
16909 Subtarget, DAG))
16910 return Rotate;
16911
16912 // Try to create an in-lane repeating shuffle mask and then shuffle the
16913 // results into the target lanes.
16914   if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
16915           DL, MVT::v8i32, V1, V2, Mask, Subtarget, DAG))
16916 return V;
16917
16918 if (V2.isUndef()) {
16919 // Try to produce a fixed cross-128-bit lane permute followed by unpack
16920 // because that should be faster than the variable permute alternatives.
16921 if (SDValue V = lowerShuffleWithUNPCK256(DL, MVT::v8i32, V1, V2, Mask, DAG))
16922 return V;
16923
16924 // If the shuffle patterns aren't repeated but it's a single input, directly
16925 // generate a cross-lane VPERMD instruction.
16926 SDValue VPermMask = getConstVector(Mask, MVT::v8i32, DAG, DL, true);
16927 return DAG.getNode(X86ISD::VPERMV, DL, MVT::v8i32, VPermMask, V1);
16928 }
16929
16930 // Assume that a single SHUFPS is faster than an alternative sequence of
16931 // multiple instructions (even if the CPU has a domain penalty).
16932 // If some CPU is harmed by the domain switch, we can fix it in a later pass.
16933 if (Is128BitLaneRepeatedShuffle && isSingleSHUFPSMask(RepeatedMask)) {
16934 SDValue CastV1 = DAG.getBitcast(MVT::v8f32, V1);
16935 SDValue CastV2 = DAG.getBitcast(MVT::v8f32, V2);
16936 SDValue ShufPS = lowerShuffleWithSHUFPS(DL, MVT::v8f32, RepeatedMask,
16937 CastV1, CastV2, DAG);
16938 return DAG.getBitcast(MVT::v8i32, ShufPS);
16939 }
16940
16941 // Try to simplify this by merging 128-bit lanes to enable a lane-based
16942 // shuffle.
16943 if (SDValue Result = lowerShuffleAsLanePermuteAndRepeatedMask(
16944 DL, MVT::v8i32, V1, V2, Mask, Subtarget, DAG))
16945 return Result;
16946
16947 // Otherwise fall back on generic blend lowering.
16948 return lowerShuffleAsDecomposedShuffleMerge(DL, MVT::v8i32, V1, V2, Mask,
16949 Zeroable, Subtarget, DAG);
16950}
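
// A minimal illustrative sketch, not called by the lowering above: both the
// PSHUFD and SHUFPS paths encode a repeated 4-element in-lane mask as an
// 8-bit immediate, two bits per destination element (the real code uses
// getV4X86ShuffleImm8ForMask for this). The helper name below is assumed.
static unsigned encodeV4ShuffleImm8(ArrayRef<int> RepeatedMask) {
  assert(RepeatedMask.size() == 4 && "Expected a 4-element repeated mask");
  unsigned Imm = 0;
  for (int i = 0; i != 4; ++i) {
    // Undef elements (-1) may take any source, so encode them as 0.
    int M = RepeatedMask[i] < 0 ? 0 : (RepeatedMask[i] & 3);
    Imm |= unsigned(M) << (i * 2);
  }
  return Imm; // e.g. {0, 0, 2, 2} encodes as 0xA0.
}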
16951
16952/// Handle lowering of 16-lane 16-bit integer shuffles.
16953///
16954/// This routine is only called when we have AVX2 and thus a reasonable
16955 /// instruction set for v16i16 shuffling.
16956 static SDValue lowerV16I16Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
16957 const APInt &Zeroable, SDValue V1, SDValue V2,
16958 const X86Subtarget &Subtarget,
16959 SelectionDAG &DAG) {
16960 assert(V1.getSimpleValueType() == MVT::v16i16 && "Bad operand type!");
16961 assert(V2.getSimpleValueType() == MVT::v16i16 && "Bad operand type!");
16962 assert(Mask.size() == 16 && "Unexpected mask size for v16 shuffle!");
16963 assert(Subtarget.hasAVX2() && "We can only lower v16i16 with AVX2!");
16964
16965 // Whenever we can lower this as a zext, that instruction is strictly faster
16966 // than any alternative. It also allows us to fold memory operands into the
16967 // shuffle in many cases.
16968 if (SDValue ZExt = lowerShuffleAsZeroOrAnyExtend(
16969 DL, MVT::v16i16, V1, V2, Mask, Zeroable, Subtarget, DAG))
16970 return ZExt;
16971
16972 // Check for being able to broadcast a single element.
16973 if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v16i16, V1, V2, Mask,
16974 Subtarget, DAG))
16975 return Broadcast;
16976
16977 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v16i16, V1, V2, Mask,
16978 Zeroable, Subtarget, DAG))
16979 return Blend;
16980
16981 // Use dedicated unpack instructions for masks that match their pattern.
16982 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v16i16, V1, V2, Mask, DAG))
16983 return V;
16984
16985 // Use dedicated pack instructions for masks that match their pattern.
16986 if (SDValue V =
16987 lowerShuffleWithPACK(DL, MVT::v16i16, V1, V2, Mask, Subtarget, DAG))
16988 return V;
16989
16990 // Try to lower using a truncation.
16991 if (SDValue V = lowerShuffleAsVTRUNC(DL, MVT::v16i16, V1, V2, Mask, Zeroable,
16992 Subtarget, DAG))
16993 return V;
16994
16995 // Try to use shift instructions.
16996 if (SDValue Shift =
16997 lowerShuffleAsShift(DL, MVT::v16i16, V1, V2, Mask, Zeroable,
16998 Subtarget, DAG, /*BitwiseOnly*/ false))
16999 return Shift;
17000
17001 // Try to use byte rotation instructions.
17002 if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v16i16, V1, V2, Mask,
17003 Subtarget, DAG))
17004 return Rotate;
17005
17006 // Try to create an in-lane repeating shuffle mask and then shuffle the
17007 // results into the target lanes.
17008 if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
17009 DL, MVT::v16i16, V1, V2, Mask, Subtarget, DAG))
17010 return V;
17011
17012 if (V2.isUndef()) {
17013 // Try to use bit rotation instructions.
17014 if (SDValue Rotate =
17015 lowerShuffleAsBitRotate(DL, MVT::v16i16, V1, Mask, Subtarget, DAG))
17016 return Rotate;
17017
17018 // Try to produce a fixed cross-128-bit lane permute followed by unpack
17019 // because that should be faster than the variable permute alternatives.
17020 if (SDValue V = lowerShuffleWithUNPCK256(DL, MVT::v16i16, V1, V2, Mask, DAG))
17021 return V;
17022
17023 // There are no generalized cross-lane shuffle operations available on i16
17024 // element types.
17025 if (is128BitLaneCrossingShuffleMask(MVT::v16i16, Mask)) {
17026 if (SDValue V = lowerShuffleAsLanePermuteAndPermute(
17027 DL, MVT::v16i16, V1, V2, Mask, DAG, Subtarget))
17028 return V;
17029
17030 return lowerShuffleAsLanePermuteAndShuffle(DL, MVT::v16i16, V1, V2, Mask,
17031 DAG, Subtarget);
17032 }
17033
17034 SmallVector<int, 8> RepeatedMask;
17035 if (is128BitLaneRepeatedShuffleMask(MVT::v16i16, Mask, RepeatedMask)) {
17036 // As this is a single-input shuffle, the repeated mask should be
17037 // a strictly valid v8i16 mask that we can pass through to the v8i16
17038 // lowering to handle even the v16 case.
17039 return lowerV8I16GeneralSingleInputShuffle(
17040 DL, MVT::v16i16, V1, RepeatedMask, Subtarget, DAG);
17041 }
17042 }
17043
17044 if (SDValue PSHUFB = lowerShuffleWithPSHUFB(DL, MVT::v16i16, Mask, V1, V2,
17045 Zeroable, Subtarget, DAG))
17046 return PSHUFB;
17047
17048 // AVX512BW can lower to VPERMW (non-VLX will pad to v32i16).
17049 if (Subtarget.hasBWI())
17050 return lowerShuffleWithPERMV(DL, MVT::v16i16, Mask, V1, V2, Subtarget, DAG);
17051
17052 // Try to simplify this by merging 128-bit lanes to enable a lane-based
17053 // shuffle.
17054 if (SDValue Result = lowerShuffleAsLanePermuteAndRepeatedMask(
17055 DL, MVT::v16i16, V1, V2, Mask, Subtarget, DAG))
17056 return Result;
17057
17058 // Try to permute the lanes and then use a per-lane permute.
17059 if (SDValue V = lowerShuffleAsLanePermuteAndPermute(
17060 DL, MVT::v16i16, V1, V2, Mask, DAG, Subtarget))
17061 return V;
17062
17063 // Try to match an interleave of two v16i16s and lower them as unpck and
17064 // permutes using ymms.
17065 if (!Subtarget.hasAVX512())
17066 if (SDValue V = lowerShufflePairAsUNPCKAndPermute(DL, MVT::v16i16, V1, V2,
17067 Mask, DAG))
17068 return V;
17069
17070 // Otherwise fall back on generic lowering.
17071 return lowerShuffleAsSplitOrBlend(DL, MVT::v16i16, V1, V2, Mask, Zeroable,
17072 Subtarget, DAG);
17073}
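
// A simplified illustrative sketch, not called by the lowering above: the
// zero-extend path tried first in these routines matches masks where every
// Scale-th element reads the next low element of V1 and everything in between
// is known zero. The real matcher also handles offsets and any-extend cases;
// the helper name and the reduced rules here are assumptions.
static bool sketchIsZeroExtendMask(ArrayRef<int> Mask, const APInt &Zeroable,
                                   unsigned Scale) {
  for (unsigned i = 0, e = Mask.size(); i != e; ++i) {
    if (i % Scale == 0) {
      // Each Scale-th position must read the matching low element of V1.
      if (Mask[i] >= 0 && Mask[i] != int(i / Scale))
        return false;
    } else if (Mask[i] >= 0 && !Zeroable[i]) {
      // All filler positions in between must be zeroable (or undef).
      return false;
    }
  }
  return true;
}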
17074
17075/// Handle lowering of 32-lane 8-bit integer shuffles.
17076///
17077/// This routine is only called when we have AVX2 and thus a reasonable
17078 /// instruction set for v32i8 shuffling.
17079 static SDValue lowerV32I8Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
17080 const APInt &Zeroable, SDValue V1, SDValue V2,
17081 const X86Subtarget &Subtarget,
17082 SelectionDAG &DAG) {
17083 assert(V1.getSimpleValueType() == MVT::v32i8 && "Bad operand type!");
17084 assert(V2.getSimpleValueType() == MVT::v32i8 && "Bad operand type!");
17085 assert(Mask.size() == 32 && "Unexpected mask size for v32 shuffle!");
17086 assert(Subtarget.hasAVX2() && "We can only lower v32i8 with AVX2!");
17087
17088 // Whenever we can lower this as a zext, that instruction is strictly faster
17089 // than any alternative. It also allows us to fold memory operands into the
17090 // shuffle in many cases.
17091 if (SDValue ZExt = lowerShuffleAsZeroOrAnyExtend(DL, MVT::v32i8, V1, V2, Mask,
17092 Zeroable, Subtarget, DAG))
17093 return ZExt;
17094
17095 // Check for being able to broadcast a single element.
17096 if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v32i8, V1, V2, Mask,
17097 Subtarget, DAG))
17098 return Broadcast;
17099
17100 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v32i8, V1, V2, Mask,
17101 Zeroable, Subtarget, DAG))
17102 return Blend;
17103
17104 // Use dedicated unpack instructions for masks that match their pattern.
17105 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v32i8, V1, V2, Mask, DAG))
17106 return V;
17107
17108 // Use dedicated pack instructions for masks that match their pattern.
17109 if (SDValue V =
17110 lowerShuffleWithPACK(DL, MVT::v32i8, V1, V2, Mask, Subtarget, DAG))
17111 return V;
17112
17113 // Try to lower using a truncation.
17114 if (SDValue V = lowerShuffleAsVTRUNC(DL, MVT::v32i8, V1, V2, Mask, Zeroable,
17115 Subtarget, DAG))
17116 return V;
17117
17118 // Try to use shift instructions.
17119 if (SDValue Shift =
17120 lowerShuffleAsShift(DL, MVT::v32i8, V1, V2, Mask, Zeroable, Subtarget,
17121 DAG, /*BitwiseOnly*/ false))
17122 return Shift;
17123
17124 // Try to use byte rotation instructions.
17125 if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v32i8, V1, V2, Mask,
17126 Subtarget, DAG))
17127 return Rotate;
17128
17129 // Try to use bit rotation instructions.
17130 if (V2.isUndef())
17131 if (SDValue Rotate =
17132 lowerShuffleAsBitRotate(DL, MVT::v32i8, V1, Mask, Subtarget, DAG))
17133 return Rotate;
17134
17135 // Try to create an in-lane repeating shuffle mask and then shuffle the
17136 // results into the target lanes.
17137 if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
17138 DL, MVT::v32i8, V1, V2, Mask, Subtarget, DAG))
17139 return V;
17140
17141 // There are no generalized cross-lane shuffle operations available on i8
17142 // element types.
17143 if (V2.isUndef() && is128BitLaneCrossingShuffleMask(MVT::v32i8, Mask)) {
17144 // Try to produce a fixed cross-128-bit lane permute followed by unpack
17145 // because that should be faster than the variable permute alternatives.
17146 if (SDValue V = lowerShuffleWithUNPCK256(DL, MVT::v32i8, V1, V2, Mask, DAG))
17147 return V;
17148
17149 if (SDValue V = lowerShuffleAsLanePermuteAndPermute(
17150 DL, MVT::v32i8, V1, V2, Mask, DAG, Subtarget))
17151 return V;
17152
17153 return lowerShuffleAsLanePermuteAndShuffle(DL, MVT::v32i8, V1, V2, Mask,
17154 DAG, Subtarget);
17155 }
17156
17157 if (SDValue PSHUFB = lowerShuffleWithPSHUFB(DL, MVT::v32i8, Mask, V1, V2,
17158 Zeroable, Subtarget, DAG))
17159 return PSHUFB;
17160
17161 // AVX512VBMI can lower to VPERMB (non-VLX will pad to v64i8).
17162 if (Subtarget.hasVBMI())
17163 return lowerShuffleWithPERMV(DL, MVT::v32i8, Mask, V1, V2, Subtarget, DAG);
17164
17165 // Try to simplify this by merging 128-bit lanes to enable a lane-based
17166 // shuffle.
17167 if (SDValue Result = lowerShuffleAsLanePermuteAndRepeatedMask(
17168 DL, MVT::v32i8, V1, V2, Mask, Subtarget, DAG))
17169 return Result;
17170
17171 // Try to permute the lanes and then use a per-lane permute.
17172 if (SDValue V = lowerShuffleAsLanePermuteAndPermute(
17173 DL, MVT::v32i8, V1, V2, Mask, DAG, Subtarget))
17174 return V;
17175
17176 // Look for {0, 8, 16, 24, 32, 40, 48, 56} in the first 8 elements, followed
17177 // by zeroable elements in the remaining 24 elements. Turn this into two
17178 // vmovqb instructions shuffled together.
17179 if (Subtarget.hasVLX())
17180 if (SDValue V = lowerShuffleAsVTRUNCAndUnpack(DL, MVT::v32i8, V1, V2,
17181 Mask, Zeroable, DAG))
17182 return V;
17183
17184 // Try to match an interleave of two v32i8s and lower them as unpck and
17185 // permutes using ymms.
17186 if (!Subtarget.hasAVX512())
17187 if (SDValue V = lowerShufflePairAsUNPCKAndPermute(DL, MVT::v32i8, V1, V2,
17188 Mask, DAG))
17189 return V;
17190
17191 // Otherwise fall back on generic lowering.
17192 return lowerShuffleAsSplitOrBlend(DL, MVT::v32i8, V1, V2, Mask, Zeroable,
17193 Subtarget, DAG);
17194}
17195
17196/// High-level routine to lower various 256-bit x86 vector shuffles.
17197///
17198/// This routine either breaks down the specific type of a 256-bit x86 vector
17199/// shuffle or splits it into two 128-bit shuffles and fuses the results back
17200/// together based on the available instructions.
17201 static SDValue lower256BitShuffle(const SDLoc &DL, ArrayRef<int> Mask, MVT VT,
17202 SDValue V1, SDValue V2, const APInt &Zeroable,
17203 const X86Subtarget &Subtarget,
17204 SelectionDAG &DAG) {
17205 // If we have a single input to the zero element, insert that into V1 if we
17206 // can do so cheaply.
17207 int NumElts = VT.getVectorNumElements();
17208 int NumV2Elements = count_if(Mask, [NumElts](int M) { return M >= NumElts; });
17209
17210 if (NumV2Elements == 1 && Mask[0] >= NumElts)
17211 if (SDValue Insertion = lowerShuffleAsElementInsertion(
17212 DL, VT, V1, V2, Mask, Zeroable, Subtarget, DAG))
17213 return Insertion;
17214
17215 // Handle special cases where the lower or upper half is UNDEF.
17216 if (SDValue V =
17217 lowerShuffleWithUndefHalf(DL, VT, V1, V2, Mask, Subtarget, DAG))
17218 return V;
17219
17220 // There is a really nice hard cut-over between AVX1 and AVX2 that means we
17221 // can check for those subtargets here and avoid much of the subtarget
17222 // querying in the per-vector-type lowering routines. With AVX1 we have
17223 // essentially *zero* ability to manipulate a 256-bit vector with integer
17224 // types. Since we'll use floating point types there eventually, just
17225 // immediately cast everything to a float and operate entirely in that domain.
17226 if (VT.isInteger() && !Subtarget.hasAVX2()) {
17227 int ElementBits = VT.getScalarSizeInBits();
17228 if (ElementBits < 32) {
17229 // No floating point type available, if we can't use the bit operations
17230 // for masking/blending then decompose into 128-bit vectors.
17231 if (SDValue V = lowerShuffleAsBitMask(DL, VT, V1, V2, Mask, Zeroable,
17232 Subtarget, DAG))
17233 return V;
17234 if (SDValue V = lowerShuffleAsBitBlend(DL, VT, V1, V2, Mask, DAG))
17235 return V;
17236 return splitAndLowerShuffle(DL, VT, V1, V2, Mask, DAG, /*SimpleOnly*/ false);
17237 }
17238
17239 MVT FpVT = MVT::getVectorVT(MVT::getFloatingPointVT(ElementBits),
17240 VT.getVectorNumElements());
17241 V1 = DAG.getBitcast(FpVT, V1);
17242 V2 = DAG.getBitcast(FpVT, V2);
17243 return DAG.getBitcast(VT, DAG.getVectorShuffle(FpVT, DL, V1, V2, Mask));
17244 }
17245
17246 if (VT == MVT::v16f16 || VT == MVT::v16bf16) {
17247 V1 = DAG.getBitcast(MVT::v16i16, V1);
17248 V2 = DAG.getBitcast(MVT::v16i16, V2);
17249 return DAG.getBitcast(VT,
17250 DAG.getVectorShuffle(MVT::v16i16, DL, V1, V2, Mask));
17251 }
17252
17253 switch (VT.SimpleTy) {
17254 case MVT::v4f64:
17255 return lowerV4F64Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
17256 case MVT::v4i64:
17257 return lowerV4I64Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
17258 case MVT::v8f32:
17259 return lowerV8F32Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
17260 case MVT::v8i32:
17261 return lowerV8I32Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
17262 case MVT::v16i16:
17263 return lowerV16I16Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
17264 case MVT::v32i8:
17265 return lowerV32I8Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
17266
17267 default:
17268 llvm_unreachable("Not a valid 256-bit x86 vector type!");
17269 }
17270}
17271
17272/// Try to lower a vector shuffle as a 128-bit shuffles.
17273 static SDValue lowerV4X128Shuffle(const SDLoc &DL, MVT VT, ArrayRef<int> Mask,
17274 const APInt &Zeroable, SDValue V1, SDValue V2,
17275 const X86Subtarget &Subtarget,
17276 SelectionDAG &DAG) {
17277 assert(VT.getScalarSizeInBits() == 64 &&
17278 "Unexpected element type size for 128bit shuffle.");
17279
17280 // Handling a 256-bit vector would require VLX, and lowerV2X128VectorShuffle()
17281 // is most probably the better solution for that case.
17282 assert(VT.is512BitVector() && "Unexpected vector size for 512bit shuffle.");
17283
17284 // TODO - use Zeroable like we do for lowerV2X128VectorShuffle?
17285 SmallVector<int, 4> Widened128Mask;
17286 if (!canWidenShuffleElements(Mask, Widened128Mask))
17287 return SDValue();
17288 assert(Widened128Mask.size() == 4 && "Shuffle widening mismatch");
17289
17290 // Try to use an insert into a zero vector.
17291 if (Widened128Mask[0] == 0 && (Zeroable & 0xf0) == 0xf0 &&
17292 (Widened128Mask[1] == 1 || (Zeroable & 0x0c) == 0x0c)) {
17293 unsigned NumElts = ((Zeroable & 0x0c) == 0x0c) ? 2 : 4;
17294 MVT SubVT = MVT::getVectorVT(VT.getVectorElementType(), NumElts);
17295 SDValue LoV = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT, V1,
17296 DAG.getVectorIdxConstant(0, DL));
17297 return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT,
17298 getZeroVector(VT, Subtarget, DAG, DL), LoV,
17299 DAG.getVectorIdxConstant(0, DL));
17300 }
17301
17302 // Check for patterns which can be matched with a single insert of a 256-bit
17303 // subvector.
17304 bool OnlyUsesV1 = isShuffleEquivalent(Mask, {0, 1, 2, 3, 0, 1, 2, 3}, V1, V2);
17305 if (OnlyUsesV1 ||
17306 isShuffleEquivalent(Mask, {0, 1, 2, 3, 8, 9, 10, 11}, V1, V2)) {
17307 MVT SubVT = MVT::getVectorVT(VT.getVectorElementType(), 4);
17308 SDValue SubVec =
17309 DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT, OnlyUsesV1 ? V1 : V2,
17310 DAG.getVectorIdxConstant(0, DL));
17311 return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, V1, SubVec,
17312 DAG.getVectorIdxConstant(4, DL));
17313 }
17314
17315 // See if this is an insertion of the lower 128-bits of V2 into V1.
17316 bool IsInsert = true;
17317 int V2Index = -1;
17318 for (int i = 0; i < 4; ++i) {
17319 assert(Widened128Mask[i] >= -1 && "Illegal shuffle sentinel value");
17320 if (Widened128Mask[i] < 0)
17321 continue;
17322
17323 // Make sure all V1 subvectors are in place.
17324 if (Widened128Mask[i] < 4) {
17325 if (Widened128Mask[i] != i) {
17326 IsInsert = false;
17327 break;
17328 }
17329 } else {
17330 // Make sure we only have a single V2 index and it's the lowest 128 bits.
17331 if (V2Index >= 0 || Widened128Mask[i] != 4) {
17332 IsInsert = false;
17333 break;
17334 }
17335 V2Index = i;
17336 }
17337 }
17338 if (IsInsert && V2Index >= 0) {
17339 MVT SubVT = MVT::getVectorVT(VT.getVectorElementType(), 2);
17340 SDValue Subvec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT, V2,
17341 DAG.getVectorIdxConstant(0, DL));
17342 return insert128BitVector(V1, Subvec, V2Index * 2, DAG, DL);
17343 }
17344
17345 // See if we can widen to a 256-bit lane shuffle; we're going to lose 128-bit
17346 // lane UNDEF info by lowering to X86ISD::SHUF128 anyway, so by widening where
17347 // possible we at least ensure the lanes stay sequential to help later
17348 // combines.
17349 SmallVector<int, 2> Widened256Mask;
17350 if (canWidenShuffleElements(Widened128Mask, Widened256Mask)) {
17351 Widened128Mask.clear();
17352 narrowShuffleMaskElts(2, Widened256Mask, Widened128Mask);
17353 }
17354
17355 // Try to lower to vshuf64x2/vshuf32x4.
17356 SDValue Ops[2] = {DAG.getUNDEF(VT), DAG.getUNDEF(VT)};
17357 int PermMask[4] = {-1, -1, -1, -1};
17358 // Ensure elements came from the same Op.
17359 for (int i = 0; i < 4; ++i) {
17360 assert(Widened128Mask[i] >= -1 && "Illegal shuffle sentinel value");
17361 if (Widened128Mask[i] < 0)
17362 continue;
17363
17364 SDValue Op = Widened128Mask[i] >= 4 ? V2 : V1;
17365 unsigned OpIndex = i / 2;
17366 if (Ops[OpIndex].isUndef())
17367 Ops[OpIndex] = Op;
17368 else if (Ops[OpIndex] != Op)
17369 return SDValue();
17370
17371 PermMask[i] = Widened128Mask[i] % 4;
17372 }
17373
17374 return DAG.getNode(X86ISD::SHUF128, DL, VT, Ops[0], Ops[1],
17375 getV4X86ShuffleImm8ForMask(PermMask, DL, DAG));
17376}
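
// An illustrative standalone sketch, not called by the code above: it mirrors
// the final SHUF128 matching loop for a widened 4 x 128-bit lane mask. Imm8
// positions 0-1 must read one operand and positions 2-3 the other; lanes that
// mix operands cannot be expressed. For the widened mask {0, 2, 4, 6} this
// yields UseV2 = {false, true} and Imm = 0x88. The helper name is assumed.
static bool sketchMatchShuf128(ArrayRef<int> Widened128Mask, bool UseV2[2],
                               unsigned &Imm) {
  assert(Widened128Mask.size() == 4 && "Expected a widened 4-lane mask");
  bool OpSet[2] = {false, false};
  Imm = 0;
  for (int i = 0; i != 4; ++i) {
    if (Widened128Mask[i] < 0)
      continue; // An undef lane accepts any encoding.
    bool FromV2 = Widened128Mask[i] >= 4;
    unsigned OpIndex = i / 2;
    if (!OpSet[OpIndex]) {
      OpSet[OpIndex] = true;
      UseV2[OpIndex] = FromV2;
    } else if (UseV2[OpIndex] != FromV2) {
      return false; // Both lanes of one operand slot must use the same input.
    }
    Imm |= unsigned(Widened128Mask[i] % 4) << (i * 2);
  }
  return true;
}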
17377
17378/// Handle lowering of 8-lane 64-bit floating point shuffles.
17379 static SDValue lowerV8F64Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
17380 const APInt &Zeroable, SDValue V1, SDValue V2,
17381 const X86Subtarget &Subtarget,
17382 SelectionDAG &DAG) {
17383 assert(V1.getSimpleValueType() == MVT::v8f64 && "Bad operand type!");
17384 assert(V2.getSimpleValueType() == MVT::v8f64 && "Bad operand type!");
17385 assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!");
17386
17387 if (V2.isUndef()) {
17388 // Use low duplicate instructions for masks that match their pattern.
17389 if (isShuffleEquivalent(Mask, {0, 0, 2, 2, 4, 4, 6, 6}, V1, V2))
17390 return DAG.getNode(X86ISD::MOVDDUP, DL, MVT::v8f64, V1);
17391
17392 if (!is128BitLaneCrossingShuffleMask(MVT::v8f64, Mask)) {
17393 // Non-half-crossing single input shuffles can be lowered with an
17394 // interleaved permutation.
17395 unsigned VPERMILPMask = (Mask[0] == 1) | ((Mask[1] == 1) << 1) |
17396 ((Mask[2] == 3) << 2) | ((Mask[3] == 3) << 3) |
17397 ((Mask[4] == 5) << 4) | ((Mask[5] == 5) << 5) |
17398 ((Mask[6] == 7) << 6) | ((Mask[7] == 7) << 7);
17399 return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v8f64, V1,
17400 DAG.getTargetConstant(VPERMILPMask, DL, MVT::i8));
17401 }
17402
17403 SmallVector<int, 4> RepeatedMask;
17404 if (is256BitLaneRepeatedShuffleMask(MVT::v8f64, Mask, RepeatedMask))
17405 return DAG.getNode(X86ISD::VPERMI, DL, MVT::v8f64, V1,
17406 getV4X86ShuffleImm8ForMask(RepeatedMask, DL, DAG));
17407 }
17408
17409 if (SDValue Shuf128 = lowerV4X128Shuffle(DL, MVT::v8f64, Mask, Zeroable, V1,
17410 V2, Subtarget, DAG))
17411 return Shuf128;
17412
17413 if (SDValue Unpck = lowerShuffleWithUNPCK(DL, MVT::v8f64, V1, V2, Mask, DAG))
17414 return Unpck;
17415
17416 // Check if the blend happens to exactly fit that of SHUFPD.
17417 if (SDValue Op = lowerShuffleWithSHUFPD(DL, MVT::v8f64, V1, V2, Mask,
17418 Zeroable, Subtarget, DAG))
17419 return Op;
17420
17421 if (SDValue V = lowerShuffleWithEXPAND(DL, MVT::v8f64, V1, V2, Mask, Zeroable,
17422 Subtarget, DAG))
17423 return V;
17424
17425 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v8f64, V1, V2, Mask,
17426 Zeroable, Subtarget, DAG))
17427 return Blend;
17428
17429 return lowerShuffleWithPERMV(DL, MVT::v8f64, Mask, V1, V2, Subtarget, DAG);
17430}
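
// An illustrative sketch, not called by the lowering above: for a
// non-lane-crossing v8f64 shuffle, the VPERMILPD immediate holds one bit per
// element, set when that element takes the high (odd) half of its own 128-bit
// pair. E.g. the mask {1, 0, 2, 3, 5, 5, 6, 6} encodes as 0b00111001 = 0x39.
// The helper name below is assumed.
static unsigned sketchVPermilPDImm(ArrayRef<int> Mask) {
  assert(Mask.size() == 8 && "Expected a v8f64 mask");
  unsigned Imm = 0;
  for (int i = 0; i != 8; ++i)
    if (Mask[i] >= 0 && (Mask[i] & 1))
      Imm |= 1u << i;
  return Imm;
}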
17431
17432/// Handle lowering of 16-lane 32-bit floating point shuffles.
17433 static SDValue lowerV16F32Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
17434 const APInt &Zeroable, SDValue V1, SDValue V2,
17435 const X86Subtarget &Subtarget,
17436 SelectionDAG &DAG) {
17437 assert(V1.getSimpleValueType() == MVT::v16f32 && "Bad operand type!");
17438 assert(V2.getSimpleValueType() == MVT::v16f32 && "Bad operand type!");
17439 assert(Mask.size() == 16 && "Unexpected mask size for v16 shuffle!");
17440
17441 // If the shuffle mask is repeated in each 128-bit lane, we have many more
17442 // options to efficiently lower the shuffle.
17443 SmallVector<int, 4> RepeatedMask;
17444 if (is128BitLaneRepeatedShuffleMask(MVT::v16f32, Mask, RepeatedMask)) {
17445 assert(RepeatedMask.size() == 4 && "Unexpected repeated mask size!");
17446
17447 // Use even/odd duplicate instructions for masks that match their pattern.
17448 if (isShuffleEquivalent(RepeatedMask, {0, 0, 2, 2}, V1, V2))
17449 return DAG.getNode(X86ISD::MOVSLDUP, DL, MVT::v16f32, V1);
17450 if (isShuffleEquivalent(RepeatedMask, {1, 1, 3, 3}, V1, V2))
17451 return DAG.getNode(X86ISD::MOVSHDUP, DL, MVT::v16f32, V1);
17452
17453 if (V2.isUndef())
17454 return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v16f32, V1,
17455 getV4X86ShuffleImm8ForMask(RepeatedMask, DL, DAG));
17456
17457 // Use dedicated unpack instructions for masks that match their pattern.
17458 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v16f32, V1, V2, Mask, DAG))
17459 return V;
17460
17461 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v16f32, V1, V2, Mask,
17462 Zeroable, Subtarget, DAG))
17463 return Blend;
17464
17465 // Otherwise, fall back to a SHUFPS sequence.
17466 return lowerShuffleWithSHUFPS(DL, MVT::v16f32, RepeatedMask, V1, V2, DAG);
17467 }
17468
17469 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v16f32, V1, V2, Mask,
17470 Zeroable, Subtarget, DAG))
17471 return Blend;
17472
17473 if (SDValue ZExt = lowerShuffleAsZeroOrAnyExtend(
17474 DL, MVT::v16i32, V1, V2, Mask, Zeroable, Subtarget, DAG))
17475 return DAG.getBitcast(MVT::v16f32, ZExt);
17476
17477 // Try to create an in-lane repeating shuffle mask and then shuffle the
17478 // results into the target lanes.
17479 if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
17480 DL, MVT::v16f32, V1, V2, Mask, Subtarget, DAG))
17481 return V;
17482
17483 // If we have a single input shuffle with different shuffle patterns in the
17484 // 128-bit lanes and don't lane cross, use variable mask VPERMILPS.
17485 if (V2.isUndef() &&
17486 !is128BitLaneCrossingShuffleMask(MVT::v16f32, Mask)) {
17487 SDValue VPermMask = getConstVector(Mask, MVT::v16i32, DAG, DL, true);
17488 return DAG.getNode(X86ISD::VPERMILPV, DL, MVT::v16f32, V1, VPermMask);
17489 }
17490
17491 // If we have AVX512F support, we can use VEXPAND.
17492 if (SDValue V = lowerShuffleWithEXPAND(DL, MVT::v16f32, V1, V2, Mask,
17493 Zeroable, Subtarget, DAG))
17494 return V;
17495
17496 return lowerShuffleWithPERMV(DL, MVT::v16f32, Mask, V1, V2, Subtarget, DAG);
17497}
17498
17499/// Handle lowering of 8-lane 64-bit integer shuffles.
17500 static SDValue lowerV8I64Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
17501 const APInt &Zeroable, SDValue V1, SDValue V2,
17502 const X86Subtarget &Subtarget,
17503 SelectionDAG &DAG) {
17504 assert(V1.getSimpleValueType() == MVT::v8i64 && "Bad operand type!");
17505 assert(V2.getSimpleValueType() == MVT::v8i64 && "Bad operand type!");
17506 assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!");
17507
17508 // Try to use shift instructions if fast.
17509 if (Subtarget.preferLowerShuffleAsShift())
17510 if (SDValue Shift =
17511 lowerShuffleAsShift(DL, MVT::v8i64, V1, V2, Mask, Zeroable,
17512 Subtarget, DAG, /*BitwiseOnly*/ true))
17513 return Shift;
17514
17515 if (V2.isUndef()) {
17516 // When the shuffle is mirrored between the 128-bit lanes of the unit, we
17517 // can use lower latency instructions that will operate on all four
17518 // 128-bit lanes.
17519 SmallVector<int, 2> Repeated128Mask;
17520 if (is128BitLaneRepeatedShuffleMask(MVT::v8i64, Mask, Repeated128Mask)) {
17521 SmallVector<int, 4> PSHUFDMask;
17522 narrowShuffleMaskElts(2, Repeated128Mask, PSHUFDMask);
17523 return DAG.getBitcast(
17524 MVT::v8i64,
17525 DAG.getNode(X86ISD::PSHUFD, DL, MVT::v16i32,
17526 DAG.getBitcast(MVT::v16i32, V1),
17527 getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG)));
17528 }
17529
17530 SmallVector<int, 4> Repeated256Mask;
17531 if (is256BitLaneRepeatedShuffleMask(MVT::v8i64, Mask, Repeated256Mask))
17532 return DAG.getNode(X86ISD::VPERMI, DL, MVT::v8i64, V1,
17533 getV4X86ShuffleImm8ForMask(Repeated256Mask, DL, DAG));
17534 }
17535
17536 if (SDValue Shuf128 = lowerV4X128Shuffle(DL, MVT::v8i64, Mask, Zeroable, V1,
17537 V2, Subtarget, DAG))
17538 return Shuf128;
17539
17540 // Try to use shift instructions.
17541 if (SDValue Shift =
17542 lowerShuffleAsShift(DL, MVT::v8i64, V1, V2, Mask, Zeroable, Subtarget,
17543 DAG, /*BitwiseOnly*/ false))
17544 return Shift;
17545
17546 // Try to use VALIGN.
17547 if (SDValue Rotate = lowerShuffleAsVALIGN(DL, MVT::v8i64, V1, V2, Mask,
17548 Zeroable, Subtarget, DAG))
17549 return Rotate;
17550
17551 // Try to use PALIGNR.
17552 if (Subtarget.hasBWI())
17553 if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v8i64, V1, V2, Mask,
17554 Subtarget, DAG))
17555 return Rotate;
17556
17557 if (SDValue Unpck = lowerShuffleWithUNPCK(DL, MVT::v8i64, V1, V2, Mask, DAG))
17558 return Unpck;
17559
17560 // If we have AVX512F support, we can use VEXPAND.
17561 if (SDValue V = lowerShuffleWithEXPAND(DL, MVT::v8i64, V1, V2, Mask, Zeroable,
17562 Subtarget, DAG))
17563 return V;
17564
17565 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v8i64, V1, V2, Mask,
17566 Zeroable, Subtarget, DAG))
17567 return Blend;
17568
17569 return lowerShuffleWithPERMV(DL, MVT::v8i64, Mask, V1, V2, Subtarget, DAG);
17570}
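
// An illustrative sketch of the mask-narrowing used by the PSHUFD path above
// (roughly what narrowShuffleMaskElts produces); it is not called by the code
// above and the helper name is assumed. Each 64-bit index expands into
// 'Scale' consecutive 32-bit indices, so the repeated lane mask {1, 0}
// becomes {2, 3, 0, 1}, i.e. PSHUFD immediate 0x4E.
static void sketchNarrowMaskElts(int Scale, ArrayRef<int> WideMask,
                                 SmallVectorImpl<int> &NarrowMask) {
  NarrowMask.clear();
  for (int WideElt : WideMask) {
    if (WideElt < 0) {
      NarrowMask.append(Scale, -1); // Undef widens to Scale undef elements.
      continue;
    }
    for (int i = 0; i != Scale; ++i)
      NarrowMask.push_back(WideElt * Scale + i);
  }
}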
17571
17572/// Handle lowering of 16-lane 32-bit integer shuffles.
17573 static SDValue lowerV16I32Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
17574 const APInt &Zeroable, SDValue V1, SDValue V2,
17575 const X86Subtarget &Subtarget,
17576 SelectionDAG &DAG) {
17577 assert(V1.getSimpleValueType() == MVT::v16i32 && "Bad operand type!");
17578 assert(V2.getSimpleValueType() == MVT::v16i32 && "Bad operand type!");
17579 assert(Mask.size() == 16 && "Unexpected mask size for v16 shuffle!");
17580
17581 int NumV2Elements = count_if(Mask, [](int M) { return M >= 16; });
17582
17583 // Whenever we can lower this as a zext, that instruction is strictly faster
17584 // than any alternative. It also allows us to fold memory operands into the
17585 // shuffle in many cases.
17586 if (SDValue ZExt = lowerShuffleAsZeroOrAnyExtend(
17587 DL, MVT::v16i32, V1, V2, Mask, Zeroable, Subtarget, DAG))
17588 return ZExt;
17589
17590 // Try to use shift instructions if fast.
17591 if (Subtarget.preferLowerShuffleAsShift()) {
17592 if (SDValue Shift =
17593 lowerShuffleAsShift(DL, MVT::v16i32, V1, V2, Mask, Zeroable,
17594 Subtarget, DAG, /*BitwiseOnly*/ true))
17595 return Shift;
17596 if (NumV2Elements == 0)
17597 if (SDValue Rotate = lowerShuffleAsBitRotate(DL, MVT::v16i32, V1, Mask,
17598 Subtarget, DAG))
17599 return Rotate;
17600 }
17601
17602 // If the shuffle mask is repeated in each 128-bit lane we can use more
17603 // efficient instructions that mirror the shuffles across the four 128-bit
17604 // lanes.
17605 SmallVector<int, 4> RepeatedMask;
17606 bool Is128BitLaneRepeatedShuffle =
17607 is128BitLaneRepeatedShuffleMask(MVT::v16i32, Mask, RepeatedMask);
17608 if (Is128BitLaneRepeatedShuffle) {
17609 assert(RepeatedMask.size() == 4 && "Unexpected repeated mask size!");
17610 if (V2.isUndef())
17611 return DAG.getNode(X86ISD::PSHUFD, DL, MVT::v16i32, V1,
17612 getV4X86ShuffleImm8ForMask(RepeatedMask, DL, DAG));
17613
17614 // Use dedicated unpack instructions for masks that match their pattern.
17615 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v16i32, V1, V2, Mask, DAG))
17616 return V;
17617 }
17618
17619 // Try to use shift instructions.
17620 if (SDValue Shift =
17621 lowerShuffleAsShift(DL, MVT::v16i32, V1, V2, Mask, Zeroable,
17622 Subtarget, DAG, /*BitwiseOnly*/ false))
17623 return Shift;
17624
17625 if (!Subtarget.preferLowerShuffleAsShift() && NumV2Elements == 0)
17626 if (SDValue Rotate =
17627 lowerShuffleAsBitRotate(DL, MVT::v16i32, V1, Mask, Subtarget, DAG))
17628 return Rotate;
17629
17630 // Try to use VALIGN.
17631 if (SDValue Rotate = lowerShuffleAsVALIGN(DL, MVT::v16i32, V1, V2, Mask,
17632 Zeroable, Subtarget, DAG))
17633 return Rotate;
17634
17635 // Try to use byte rotation instructions.
17636 if (Subtarget.hasBWI())
17637 if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v16i32, V1, V2, Mask,
17638 Subtarget, DAG))
17639 return Rotate;
17640
17641 // Assume that a single SHUFPS is faster than using a permv shuffle.
17642 // If some CPU is harmed by the domain switch, we can fix it in a later pass.
17643 if (Is128BitLaneRepeatedShuffle && isSingleSHUFPSMask(RepeatedMask)) {
17644 SDValue CastV1 = DAG.getBitcast(MVT::v16f32, V1);
17645 SDValue CastV2 = DAG.getBitcast(MVT::v16f32, V2);
17646 SDValue ShufPS = lowerShuffleWithSHUFPS(DL, MVT::v16f32, RepeatedMask,
17647 CastV1, CastV2, DAG);
17648 return DAG.getBitcast(MVT::v16i32, ShufPS);
17649 }
17650
17651 // Try to create an in-lane repeating shuffle mask and then shuffle the
17652 // results into the target lanes.
17653 if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
17654 DL, MVT::v16i32, V1, V2, Mask, Subtarget, DAG))
17655 return V;
17656
17657 // If we have AVX512F support, we can use VEXPAND.
17658 if (SDValue V = lowerShuffleWithEXPAND(DL, MVT::v16i32, V1, V2, Mask,
17659 Zeroable, Subtarget, DAG))
17660 return V;
17661
17662 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v16i32, V1, V2, Mask,
17663 Zeroable, Subtarget, DAG))
17664 return Blend;
17665
17666 return lowerShuffleWithPERMV(DL, MVT::v16i32, Mask, V1, V2, Subtarget, DAG);
17667}
17668
17669/// Handle lowering of 32-lane 16-bit integer shuffles.
17670 static SDValue lowerV32I16Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
17671 const APInt &Zeroable, SDValue V1, SDValue V2,
17672 const X86Subtarget &Subtarget,
17673 SelectionDAG &DAG) {
17674 assert(V1.getSimpleValueType() == MVT::v32i16 && "Bad operand type!");
17675 assert(V2.getSimpleValueType() == MVT::v32i16 && "Bad operand type!");
17676 assert(Mask.size() == 32 && "Unexpected mask size for v32 shuffle!");
17677 assert(Subtarget.hasBWI() && "We can only lower v32i16 with AVX-512-BWI!");
17678
17679 // Whenever we can lower this as a zext, that instruction is strictly faster
17680 // than any alternative. It also allows us to fold memory operands into the
17681 // shuffle in many cases.
17682 if (SDValue ZExt = lowerShuffleAsZeroOrAnyExtend(
17683 DL, MVT::v32i16, V1, V2, Mask, Zeroable, Subtarget, DAG))
17684 return ZExt;
17685
17686 // Use dedicated unpack instructions for masks that match their pattern.
17687 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v32i16, V1, V2, Mask, DAG))
17688 return V;
17689
17690 // Use dedicated pack instructions for masks that match their pattern.
17691 if (SDValue V =
17692 lowerShuffleWithPACK(DL, MVT::v32i16, V1, V2, Mask, Subtarget, DAG))
17693 return V;
17694
17695 // Try to use shift instructions.
17696 if (SDValue Shift =
17697 lowerShuffleAsShift(DL, MVT::v32i16, V1, V2, Mask, Zeroable,
17698 Subtarget, DAG, /*BitwiseOnly*/ false))
17699 return Shift;
17700
17701 // Try to use byte rotation instructions.
17702 if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v32i16, V1, V2, Mask,
17703 Subtarget, DAG))
17704 return Rotate;
17705
17706 if (V2.isUndef()) {
17707 // Try to use bit rotation instructions.
17708 if (SDValue Rotate =
17709 lowerShuffleAsBitRotate(DL, MVT::v32i16, V1, Mask, Subtarget, DAG))
17710 return Rotate;
17711
17712 SmallVector<int, 8> RepeatedMask;
17713 if (is128BitLaneRepeatedShuffleMask(MVT::v32i16, Mask, RepeatedMask)) {
17714 // As this is a single-input shuffle, the repeated mask should be
17715 // a strictly valid v8i16 mask that we can pass through to the v8i16
17716 // lowering to handle even the v32 case.
17717 return lowerV8I16GeneralSingleInputShuffle(DL, MVT::v32i16, V1,
17718 RepeatedMask, Subtarget, DAG);
17719 }
17720 }
17721
17722 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v32i16, V1, V2, Mask,
17723 Zeroable, Subtarget, DAG))
17724 return Blend;
17725
17726 if (SDValue PSHUFB = lowerShuffleWithPSHUFB(DL, MVT::v32i16, Mask, V1, V2,
17727 Zeroable, Subtarget, DAG))
17728 return PSHUFB;
17729
17730 // Try to simplify this by merging 128-bit lanes to enable a lane-based
17731 // shuffle.
17732 if (!V2.isUndef())
17733 if (SDValue Result = lowerShuffleAsLanePermuteAndRepeatedMask(
17734 DL, MVT::v32i16, V1, V2, Mask, Subtarget, DAG))
17735 return Result;
17736
17737 return lowerShuffleWithPERMV(DL, MVT::v32i16, Mask, V1, V2, Subtarget, DAG);
17738}
17739
17740/// Handle lowering of 64-lane 8-bit integer shuffles.
17741 static SDValue lowerV64I8Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
17742 const APInt &Zeroable, SDValue V1, SDValue V2,
17743 const X86Subtarget &Subtarget,
17744 SelectionDAG &DAG) {
17745 assert(V1.getSimpleValueType() == MVT::v64i8 && "Bad operand type!");
17746 assert(V2.getSimpleValueType() == MVT::v64i8 && "Bad operand type!");
17747 assert(Mask.size() == 64 && "Unexpected mask size for v64 shuffle!");
17748 assert(Subtarget.hasBWI() && "We can only lower v64i8 with AVX-512-BWI!");
17749
17750 // Whenever we can lower this as a zext, that instruction is strictly faster
17751 // than any alternative. It also allows us to fold memory operands into the
17752 // shuffle in many cases.
17753 if (SDValue ZExt = lowerShuffleAsZeroOrAnyExtend(
17754 DL, MVT::v64i8, V1, V2, Mask, Zeroable, Subtarget, DAG))
17755 return ZExt;
17756
17757 // Use dedicated unpack instructions for masks that match their pattern.
17758 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v64i8, V1, V2, Mask, DAG))
17759 return V;
17760
17761 // Use dedicated pack instructions for masks that match their pattern.
17762 if (SDValue V =
17763 lowerShuffleWithPACK(DL, MVT::v64i8, V1, V2, Mask, Subtarget, DAG))
17764 return V;
17765
17766 // Try to use shift instructions.
17767 if (SDValue Shift =
17768 lowerShuffleAsShift(DL, MVT::v64i8, V1, V2, Mask, Zeroable, Subtarget,
17769 DAG, /*BitwiseOnly*/ false))
17770 return Shift;
17771
17772 // Try to use byte rotation instructions.
17773 if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v64i8, V1, V2, Mask,
17774 Subtarget, DAG))
17775 return Rotate;
17776
17777 // Try to use bit rotation instructions.
17778 if (V2.isUndef())
17779 if (SDValue Rotate =
17780 lowerShuffleAsBitRotate(DL, MVT::v64i8, V1, Mask, Subtarget, DAG))
17781 return Rotate;
17782
17783 // Lower as AND if possible.
17784 if (SDValue Masked = lowerShuffleAsBitMask(DL, MVT::v64i8, V1, V2, Mask,
17785 Zeroable, Subtarget, DAG))
17786 return Masked;
17787
17788 if (SDValue PSHUFB = lowerShuffleWithPSHUFB(DL, MVT::v64i8, Mask, V1, V2,
17789 Zeroable, Subtarget, DAG))
17790 return PSHUFB;
17791
17792 // Try to create an in-lane repeating shuffle mask and then shuffle the
17793 // results into the target lanes.
17794 if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
17795 DL, MVT::v64i8, V1, V2, Mask, Subtarget, DAG))
17796 return V;
17797
17798 if (SDValue Result = lowerShuffleAsLanePermuteAndPermute(
17799 DL, MVT::v64i8, V1, V2, Mask, DAG, Subtarget))
17800 return Result;
17801
17802 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v64i8, V1, V2, Mask,
17803 Zeroable, Subtarget, DAG))
17804 return Blend;
17805
17806 if (!is128BitLaneCrossingShuffleMask(MVT::v64i8, Mask)) {
17807 // Use PALIGNR+Permute if possible - permute might become PSHUFB but the
17808 // PALIGNR will be cheaper than the second PSHUFB+OR.
17809 if (SDValue V = lowerShuffleAsByteRotateAndPermute(DL, MVT::v64i8, V1, V2,
17810 Mask, Subtarget, DAG))
17811 return V;
17812
17813 // If we can't directly blend but can use PSHUFB, that will be better as it
17814 // can both shuffle and set up the inefficient blend.
17815 bool V1InUse, V2InUse;
17816 return lowerShuffleAsBlendOfPSHUFBs(DL, MVT::v64i8, V1, V2, Mask, Zeroable,
17817 DAG, V1InUse, V2InUse);
17818 }
17819
17820 // Try to simplify this by merging 128-bit lanes to enable a lane-based
17821 // shuffle.
17822 if (!V2.isUndef())
17823 if (SDValue Result = lowerShuffleAsLanePermuteAndRepeatedMask(
17824 DL, MVT::v64i8, V1, V2, Mask, Subtarget, DAG))
17825 return Result;
17826
17827 // VBMI can use VPERMV/VPERMV3 byte shuffles.
17828 if (Subtarget.hasVBMI())
17829 return lowerShuffleWithPERMV(DL, MVT::v64i8, Mask, V1, V2, Subtarget, DAG);
17830
17831 return splitAndLowerShuffle(DL, MVT::v64i8, V1, V2, Mask, DAG, /*SimpleOnly*/ false);
17832}
17833
17834/// High-level routine to lower various 512-bit x86 vector shuffles.
17835///
17836/// This routine either breaks down the specific type of a 512-bit x86 vector
17837/// shuffle or splits it into two 256-bit shuffles and fuses the results back
17838/// together based on the available instructions.
17839 static SDValue lower512BitShuffle(const SDLoc &DL, ArrayRef<int> Mask,
17840 MVT VT, SDValue V1, SDValue V2,
17841 const APInt &Zeroable,
17842 const X86Subtarget &Subtarget,
17843 SelectionDAG &DAG) {
17844 assert(Subtarget.hasAVX512() &&
17845 "Cannot lower 512-bit vectors w/o basic ISA!");
17846
17847 // If we have a single input to the zero element, insert that into V1 if we
17848 // can do so cheaply.
17849 int NumElts = Mask.size();
17850 int NumV2Elements = count_if(Mask, [NumElts](int M) { return M >= NumElts; });
17851
17852 if (NumV2Elements == 1 && Mask[0] >= NumElts)
17853 if (SDValue Insertion = lowerShuffleAsElementInsertion(
17854 DL, VT, V1, V2, Mask, Zeroable, Subtarget, DAG))
17855 return Insertion;
17856
17857 // Handle special cases where the lower or upper half is UNDEF.
17858 if (SDValue V =
17859 lowerShuffleWithUndefHalf(DL, VT, V1, V2, Mask, Subtarget, DAG))
17860 return V;
17861
17862 // Check for being able to broadcast a single element.
17863 if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, VT, V1, V2, Mask,
17864 Subtarget, DAG))
17865 return Broadcast;
17866
17867 if ((VT == MVT::v32i16 || VT == MVT::v64i8) && !Subtarget.hasBWI()) {
17868 // Try using bit ops for masking and blending before falling back to
17869 // splitting.
17870 if (SDValue V = lowerShuffleAsBitMask(DL, VT, V1, V2, Mask, Zeroable,
17871 Subtarget, DAG))
17872 return V;
17873 if (SDValue V = lowerShuffleAsBitBlend(DL, VT, V1, V2, Mask, DAG))
17874 return V;
17875
17876 return splitAndLowerShuffle(DL, VT, V1, V2, Mask, DAG, /*SimpleOnly*/ false);
17877 }
17878
17879 if (VT == MVT::v32f16 || VT == MVT::v32bf16) {
17880 if (!Subtarget.hasBWI())
17881 return splitAndLowerShuffle(DL, VT, V1, V2, Mask, DAG,
17882 /*SimpleOnly*/ false);
17883
17884 V1 = DAG.getBitcast(MVT::v32i16, V1);
17885 V2 = DAG.getBitcast(MVT::v32i16, V2);
17886 return DAG.getBitcast(VT,
17887 DAG.getVectorShuffle(MVT::v32i16, DL, V1, V2, Mask));
17888 }
17889
17890 // Dispatch to each element type for lowering. If we don't have support for
17891 // specific element type shuffles at 512 bits, immediately split them and
17892 // lower them. Each lowering routine of a given type is allowed to assume that
17893 // the requisite ISA extensions for that element type are available.
17894 switch (VT.SimpleTy) {
17895 case MVT::v8f64:
17896 return lowerV8F64Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
17897 case MVT::v16f32:
17898 return lowerV16F32Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
17899 case MVT::v8i64:
17900 return lowerV8I64Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
17901 case MVT::v16i32:
17902 return lowerV16I32Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
17903 case MVT::v32i16:
17904 return lowerV32I16Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
17905 case MVT::v64i8:
17906 return lowerV64I8Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
17907
17908 default:
17909 llvm_unreachable("Not a valid 512-bit x86 vector type!");
17910 }
17911}
17912
17913 static SDValue lower1BitShuffleAsKSHIFTR(const SDLoc &DL, ArrayRef<int> Mask,
17914 MVT VT, SDValue V1, SDValue V2,
17915 const X86Subtarget &Subtarget,
17916 SelectionDAG &DAG) {
17917 // Shuffle should be unary.
17918 if (!V2.isUndef())
17919 return SDValue();
17920
17921 int ShiftAmt = -1;
17922 int NumElts = Mask.size();
17923 for (int i = 0; i != NumElts; ++i) {
17924 int M = Mask[i];
17925 assert((M == SM_SentinelUndef || (0 <= M && M < NumElts)) &&
17926 "Unexpected mask index.");
17927 if (M < 0)
17928 continue;
17929
17930 // The first non-undef element determines our shift amount.
17931 if (ShiftAmt < 0) {
17932 ShiftAmt = M - i;
17933 // Need to be shifting right.
17934 if (ShiftAmt <= 0)
17935 return SDValue();
17936 }
17937 // All non-undef elements must shift by the same amount.
17938 if (ShiftAmt != M - i)
17939 return SDValue();
17940 }
17941 assert(ShiftAmt >= 0 && "All undef?");
17942
17943 // Great we found a shift right.
17944 SDValue Res = widenMaskVector(V1, false, Subtarget, DAG, DL);
17945 Res = DAG.getNode(X86ISD::KSHIFTR, DL, Res.getValueType(), Res,
17946 DAG.getTargetConstant(ShiftAmt, DL, MVT::i8));
17947 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Res,
17948 DAG.getVectorIdxConstant(0, DL));
17949}
17950
17951// Determine if this shuffle can be implemented with a KSHIFT instruction.
17952// Returns the shift amount if possible or -1 if not. This is a simplified
17953// version of matchShuffleAsShift.
17954static int match1BitShuffleAsKSHIFT(unsigned &Opcode, ArrayRef<int> Mask,
17955 int MaskOffset, const APInt &Zeroable) {
17956 int Size = Mask.size();
17957
17958 auto CheckZeros = [&](int Shift, bool Left) {
17959 for (int j = 0; j < Shift; ++j)
17960 if (!Zeroable[j + (Left ? 0 : (Size - Shift))])
17961 return false;
17962
17963 return true;
17964 };
17965
17966 auto MatchShift = [&](int Shift, bool Left) {
17967 unsigned Pos = Left ? Shift : 0;
17968 unsigned Low = Left ? 0 : Shift;
17969 unsigned Len = Size - Shift;
17970 return isSequentialOrUndefInRange(Mask, Pos, Len, Low + MaskOffset);
17971 };
17972
17973 for (int Shift = 1; Shift != Size; ++Shift)
17974 for (bool Left : {true, false})
17975 if (CheckZeros(Shift, Left) && MatchShift(Shift, Left)) {
17976 Opcode = Left ? X86ISD::KSHIFTL : X86ISD::KSHIFTR;
17977 return Shift;
17978 }
17979
17980 return -1;
17981}
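
// An illustrative sketch, not called by the matcher above: for a v8i1 shuffle
// with mask {2, 3, 4, 5, 6, 7, -1, -1} whose top two elements are zeroable,
// the matcher succeeds with Left = false and Shift = 2, i.e. a KSHIFTR by two
// mask bits. The right-shift case it checks reduces to the two conditions
// below (the helper name is assumed).
static bool sketchIsKShiftRight(ArrayRef<int> Mask, const APInt &Zeroable,
                                int Shift) {
  int Size = Mask.size();
  // The top 'Shift' elements must be zeroable...
  for (int j = Size - Shift; j != Size; ++j)
    if (!Zeroable[j])
      return false;
  // ...and the rest must be a sequential (or undef) run starting at 'Shift'.
  return isSequentialOrUndefInRange(Mask, 0, Size - Shift, Shift);
}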
17982
17983
17984// Lower vXi1 vector shuffles.
17985 // There is no dedicated instruction on AVX-512 that shuffles the masks.
17986// The only way to shuffle bits is to sign-extend the mask vector to SIMD
17987// vector, shuffle and then truncate it back.
17988 static SDValue lower1BitShuffle(const SDLoc &DL, ArrayRef<int> Mask,
17989 MVT VT, SDValue V1, SDValue V2,
17990 const APInt &Zeroable,
17991 const X86Subtarget &Subtarget,
17992 SelectionDAG &DAG) {
17993 assert(Subtarget.hasAVX512() &&
17994 "Cannot lower 512-bit vectors w/o basic ISA!");
17995
17996 int NumElts = Mask.size();
17997 int NumV2Elements = count_if(Mask, [NumElts](int M) { return M >= NumElts; });
17998
17999 // Try to recognize shuffles that are just padding a subvector with zeros.
18000 int SubvecElts = 0;
18001 int Src = -1;
18002 for (int i = 0; i != NumElts; ++i) {
18003 if (Mask[i] >= 0) {
18004 // Grab the source from the first valid mask. All subsequent elements need
18005 // to use this same source.
18006 if (Src < 0)
18007 Src = Mask[i] / NumElts;
18008 if (Src != (Mask[i] / NumElts) || (Mask[i] % NumElts) != i)
18009 break;
18010 }
18011
18012 ++SubvecElts;
18013 }
18014 assert(SubvecElts != NumElts && "Identity shuffle?");
18015
18016 // Clip to a power of 2.
18017 SubvecElts = llvm::bit_floor<uint32_t>(SubvecElts);
18018
18019 // Make sure the number of zeroable bits in the top at least covers the bits
18020 // not covered by the subvector.
18021 if ((int)Zeroable.countl_one() >= (NumElts - SubvecElts)) {
18022 assert(Src >= 0 && "Expected a source!");
18023 MVT ExtractVT = MVT::getVectorVT(MVT::i1, SubvecElts);
18024 SDValue Extract =
18025 DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, ExtractVT, Src == 0 ? V1 : V2,
18026 DAG.getVectorIdxConstant(0, DL));
18027 return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT,
18028 DAG.getConstant(0, DL, VT), Extract,
18029 DAG.getVectorIdxConstant(0, DL));
18030 }
18031
18032 // Try a simple shift right with undef elements. Later we'll try with zeros.
18033 if (SDValue Shift =
18034 lower1BitShuffleAsKSHIFTR(DL, Mask, VT, V1, V2, Subtarget, DAG))
18035 return Shift;
18036
18037 // Try to match KSHIFTs.
18038 unsigned Offset = 0;
18039 for (SDValue V : {V1, V2}) {
18040 unsigned Opcode;
18041 int ShiftAmt = match1BitShuffleAsKSHIFT(Opcode, Mask, Offset, Zeroable);
18042 if (ShiftAmt >= 0) {
18043 SDValue Res = widenMaskVector(V, false, Subtarget, DAG, DL);
18044 MVT WideVT = Res.getSimpleValueType();
18045 // Widened right shifts need two shifts to ensure we shift in zeroes.
18046 if (Opcode == X86ISD::KSHIFTR && WideVT != VT) {
18047 int WideElts = WideVT.getVectorNumElements();
18048 // Shift left to put the original vector in the MSBs of the new size.
18049 Res =
18050 DAG.getNode(X86ISD::KSHIFTL, DL, WideVT, Res,
18051 DAG.getTargetConstant(WideElts - NumElts, DL, MVT::i8));
18052 // Increase the shift amount to account for the left shift.
18053 ShiftAmt += WideElts - NumElts;
18054 }
18055
18056 Res = DAG.getNode(Opcode, DL, WideVT, Res,
18057 DAG.getTargetConstant(ShiftAmt, DL, MVT::i8));
18058 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Res,
18059 DAG.getVectorIdxConstant(0, DL));
18060 }
18061 Offset += NumElts; // Increment for next iteration.
18062 }
18063
18064 // If we're performing a unary shuffle on a SETCC result, try to shuffle the
18065 // ops instead.
18066 // TODO: What other unary shuffles would benefit from this?
18067 if (NumV2Elements == 0 && V1.getOpcode() == ISD::SETCC && V1->hasOneUse()) {
18068 SDValue Op0 = V1.getOperand(0);
18069 SDValue Op1 = V1.getOperand(1);
18070 ISD::CondCode CC = cast<CondCodeSDNode>(V1.getOperand(2))->get();
18071 EVT OpVT = Op0.getValueType();
18072 if (OpVT.getScalarSizeInBits() >= 32 || isBroadcastShuffleMask(Mask))
18073 return DAG.getSetCC(
18074 DL, VT, DAG.getVectorShuffle(OpVT, DL, Op0, DAG.getUNDEF(OpVT), Mask),
18075 DAG.getVectorShuffle(OpVT, DL, Op1, DAG.getUNDEF(OpVT), Mask), CC);
18076 }
18077
18078 MVT ExtVT;
18079 switch (VT.SimpleTy) {
18080 default:
18081 llvm_unreachable("Expected a vector of i1 elements");
18082 case MVT::v2i1:
18083 ExtVT = MVT::v2i64;
18084 break;
18085 case MVT::v4i1:
18086 ExtVT = MVT::v4i32;
18087 break;
18088 case MVT::v8i1:
18089 // Take 512-bit type, more shuffles on KNL. If we have VLX use a 256-bit
18090 // shuffle.
18091 ExtVT = Subtarget.hasVLX() ? MVT::v8i32 : MVT::v8i64;
18092 break;
18093 case MVT::v16i1:
18094 // Take 512-bit type, unless we are avoiding 512-bit types and have the
18095 // 256-bit operation available.
18096 ExtVT = Subtarget.canExtendTo512DQ() ? MVT::v16i32 : MVT::v16i16;
18097 break;
18098 case MVT::v32i1:
18099 // Take 512-bit type, unless we are avoiding 512-bit types and have the
18100 // 256-bit operation available.
18101 assert(Subtarget.hasBWI() && "Expected AVX512BW support");
18102 ExtVT = Subtarget.canExtendTo512BW() ? MVT::v32i16 : MVT::v32i8;
18103 break;
18104 case MVT::v64i1:
18105 // Fall back to scalarization. FIXME: We can do better if the shuffle
18106 // can be partitioned cleanly.
18107 if (!Subtarget.useBWIRegs())
18108 return SDValue();
18109 ExtVT = MVT::v64i8;
18110 break;
18111 }
18112
18113 V1 = DAG.getNode(ISD::SIGN_EXTEND, DL, ExtVT, V1);
18114 V2 = DAG.getNode(ISD::SIGN_EXTEND, DL, ExtVT, V2);
18115
18116 SDValue Shuffle = DAG.getVectorShuffle(ExtVT, DL, V1, V2, Mask);
18117 // Since the i1 elements were sign extended, a signed compare against zero
18118 // converts the result back to a mask.
18118 int NumElems = VT.getVectorNumElements();
18119 if ((Subtarget.hasBWI() && (NumElems >= 32)) ||
18120 (Subtarget.hasDQI() && (NumElems < 32)))
18121 return DAG.getSetCC(DL, VT, DAG.getConstant(0, DL, ExtVT),
18122 Shuffle, ISD::SETGT);
18123
18124 return DAG.getNode(ISD::TRUNCATE, DL, VT, Shuffle);
18125}
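
// Worked example for the widened KSHIFT case above (illustrative only): with
// AVX512F but no VLX, a v8i1 mask vector is widened to v16i1 before shifting.
// A plain right shift would then pull the widened vector's upper bits into
// the result, so the code first shifts left by WideElts - NumElts to park the
// original bits in the MSBs and adds that amount to the right-shift count.
// For v8i1 widened to v16i1 with ShiftAmt = 2, that is KSHIFTL by 8 followed
// by KSHIFTR by 10, which shifts in zeroes as required.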
18126
18127/// Helper function that returns true if the shuffle mask should be
18128/// commuted to improve canonicalization.
18129 static bool canonicalizeShuffleMaskWithCommute(ArrayRef<int> Mask) {
18130 int NumElements = Mask.size();
18131
18132 int NumV1Elements = 0, NumV2Elements = 0;
18133 for (int M : Mask)
18134 if (M < 0)
18135 continue;
18136 else if (M < NumElements)
18137 ++NumV1Elements;
18138 else
18139 ++NumV2Elements;
18140
18141 // Commute the shuffle as needed such that more elements come from V1 than
18142 // V2. This allows us to match the shuffle pattern strictly on how many
18143 // elements come from V1 without handling the symmetric cases.
18144 if (NumV2Elements > NumV1Elements)
18145 return true;
18146
18147 assert(NumV1Elements > 0 && "No V1 indices");
18148
18149 if (NumV2Elements == 0)
18150 return false;
18151
18152 // When the number of V1 and V2 elements are the same, try to minimize the
18153 // number of uses of V2 in the low half of the vector. When that is tied,
18154 // ensure that the sum of indices for V1 is equal to or lower than the sum
18155 // ensure that the sum of indices for V1 is equal to or lower than the sum of
18156 // indices for V2. When those are equal, try to ensure that the number of odd
18157 if (NumV1Elements == NumV2Elements) {
18158 int LowV1Elements = 0, LowV2Elements = 0;
18159 for (int M : Mask.slice(0, NumElements / 2))
18160 if (M >= NumElements)
18161 ++LowV2Elements;
18162 else if (M >= 0)
18163 ++LowV1Elements;
18164 if (LowV2Elements > LowV1Elements)
18165 return true;
18166 if (LowV2Elements == LowV1Elements) {
18167 int SumV1Indices = 0, SumV2Indices = 0;
18168 for (int i = 0, Size = Mask.size(); i < Size; ++i)
18169 if (Mask[i] >= NumElements)
18170 SumV2Indices += i;
18171 else if (Mask[i] >= 0)
18172 SumV1Indices += i;
18173 if (SumV2Indices < SumV1Indices)
18174 return true;
18175 if (SumV2Indices == SumV1Indices) {
18176 int NumV1OddIndices = 0, NumV2OddIndices = 0;
18177 for (int i = 0, Size = Mask.size(); i < Size; ++i)
18178 if (Mask[i] >= NumElements)
18179 NumV2OddIndices += i % 2;
18180 else if (Mask[i] >= 0)
18181 NumV1OddIndices += i % 2;
18182 if (NumV2OddIndices < NumV1OddIndices)
18183 return true;
18184 }
18185 }
18186 }
18187
18188 return false;
18189}
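
// An illustrative sketch of what commuting means for the mask, not called by
// the code above (the real lowering commutes via the generic shuffle-node
// helpers); the function name here is assumed. Commuting swaps V1/V2 and
// toggles which half of the concatenated input each index points at, so for
// four elements {4, 5, 0, 1} (low half from V2) becomes {0, 1, 4, 5}.
static void sketchCommuteShuffleMask(MutableArrayRef<int> Mask, int NumElts) {
  for (int &M : Mask)
    if (M >= 0)
      M = M < NumElts ? M + NumElts : M - NumElts;
}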
18190
18191 static bool canCombineAsMaskOperation(SDValue V,
18192 const X86Subtarget &Subtarget) {
18193 if (!Subtarget.hasAVX512())
18194 return false;
18195
18196 if (!V.getValueType().isSimple())
18197 return false;
18198
18199 MVT VT = V.getSimpleValueType().getScalarType();
18200 if ((VT == MVT::i16 || VT == MVT::i8) && !Subtarget.hasBWI())
18201 return false;
18202
18203 // If vec width < 512, widen i8/i16 even with BWI as blendd/blendps/blendpd
18204 // are preferable to blendw/blendvb/masked-mov.
18205 if ((VT == MVT::i16 || VT == MVT::i8) &&
18206 V.getSimpleValueType().getSizeInBits() < 512)
18207 return false;
18208
18209 auto HasMaskOperation = [&](SDValue V) {
18210 // TODO: Currently we only check a limited set of opcodes. We should probably
18211 // extend this to all binary operations by checking TLI.isBinOp().
18212 switch (V->getOpcode()) {
18213 default:
18214 return false;
18215 case ISD::ADD:
18216 case ISD::SUB:
18217 case ISD::AND:
18218 case ISD::XOR:
18219 case ISD::OR:
18220 case ISD::SMAX:
18221 case ISD::SMIN:
18222 case ISD::UMAX:
18223 case ISD::UMIN:
18224 case ISD::ABS:
18225 case ISD::SHL:
18226 case ISD::SRL:
18227 case ISD::SRA:
18228 case ISD::MUL:
18229 break;
18230 }
18231 if (!V->hasOneUse())
18232 return false;
18233
18234 return true;
18235 };
18236
18237 if (HasMaskOperation(V))
18238 return true;
18239
18240 return false;
18241}
18242
18243// Forward declaration.
18244 static SDValue canonicalizeShuffleMaskWithHorizOp(
18245 MutableArrayRef<SDValue> Ops, MutableArrayRef<int> Mask,
18246 unsigned RootSizeInBits, const SDLoc &DL, SelectionDAG &DAG,
18247 const X86Subtarget &Subtarget);
18248
18249 /// Top-level lowering for x86 vector shuffles.
18250///
18251/// This handles decomposition, canonicalization, and lowering of all x86
18252/// vector shuffles. Most of the specific lowering strategies are encapsulated
18253/// above in helper routines. The canonicalization attempts to widen shuffles
18254/// to involve fewer lanes of wider elements, consolidate symmetric patterns
18255/// s.t. only one of the two inputs needs to be tested, etc.
18256 static SDValue lowerVECTOR_SHUFFLE(SDValue Op, const X86Subtarget &Subtarget,
18257 SelectionDAG &DAG) {
18258 auto *SVOp = cast<ShuffleVectorSDNode>(Op);
18259 ArrayRef<int> OrigMask = SVOp->getMask();
18260 SDValue V1 = Op.getOperand(0);
18261 SDValue V2 = Op.getOperand(1);
18262 MVT VT = Op.getSimpleValueType();
18263 int NumElements = VT.getVectorNumElements();
18264 SDLoc DL(Op);
18265 bool Is1BitVector = (VT.getVectorElementType() == MVT::i1);
18266
18267 assert((VT.getSizeInBits() != 64 || Is1BitVector) &&
18268 "Can't lower MMX shuffles");
18269
18270 bool V1IsUndef = V1.isUndef();
18271 bool V2IsUndef = V2.isUndef();
18272 if (V1IsUndef && V2IsUndef)
18273 return DAG.getUNDEF(VT);
18274
18275 // When we create a shuffle node we put the UNDEF node as the second operand,
18276 // but in some cases the first operand may be transformed to UNDEF.
18277 // In this case we should just commute the node.
18278 if (V1IsUndef)
18279 return DAG.getCommutedVectorShuffle(*SVOp);
18280
18281 // Check for non-undef masks pointing at an undef vector and make the masks
18282 // undef as well. This makes it easier to match the shuffle based solely on
18283 // the mask.
18284 if (V2IsUndef &&
18285 any_of(OrigMask, [NumElements](int M) { return M >= NumElements; })) {
18286 SmallVector<int, 8> NewMask(OrigMask);
18287 for (int &M : NewMask)
18288 if (M >= NumElements)
18289 M = -1;
18290 return DAG.getVectorShuffle(VT, DL, V1, V2, NewMask);
18291 }
18292
18293 // Check for illegal shuffle mask element index values.
18294 int MaskUpperLimit = OrigMask.size() * (V2IsUndef ? 1 : 2);
18295 (void)MaskUpperLimit;
18296 assert(llvm::all_of(OrigMask,
18297 [&](int M) { return -1 <= M && M < MaskUpperLimit; }) &&
18298 "Out of bounds shuffle index");
18299
18300 // We actually see shuffles that are entirely re-arrangements of a set of
18301 // zero inputs. This mostly happens while decomposing complex shuffles into
18302 // simple ones. Directly lower these as a buildvector of zeros.
18303 APInt KnownUndef, KnownZero;
18304 computeZeroableShuffleElements(OrigMask, V1, V2, KnownUndef, KnownZero);
18305
18306 APInt Zeroable = KnownUndef | KnownZero;
18307 if (Zeroable.isAllOnes())
18308 return getZeroVector(VT, Subtarget, DAG, DL);
18309
18310 bool V2IsZero = !V2IsUndef && ISD::isBuildVectorAllZeros(V2.getNode());
18311
18312 // Try to collapse shuffles into using a vector type with fewer elements but
18313 // wider element types. We cap this to not form integers or floating point
18314 // elements wider than 64 bits. It does not seem beneficial to form i128
18315 // integers to handle flipping the low and high halves of AVX 256-bit vectors.
18316 SmallVector<int, 16> WidenedMask;
18317 if (VT.getScalarSizeInBits() < 64 && !Is1BitVector &&
18318 !canCombineAsMaskOperation(V1, Subtarget) &&
18319 !canCombineAsMaskOperation(V2, Subtarget) &&
18320 canWidenShuffleElements(OrigMask, Zeroable, V2IsZero, WidenedMask)) {
18321 // Shuffle mask widening should not interfere with a broadcast opportunity
18322 // by obfuscating the operands with bitcasts.
18323 // TODO: Avoid lowering directly from this top-level function: make this
18324 // a query (canLowerAsBroadcast) and defer lowering to the type-based calls.
18325 if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, VT, V1, V2, OrigMask,
18326 Subtarget, DAG))
18327 return Broadcast;
18328
18329 MVT NewEltVT = VT.isFloatingPoint()
18330 ? MVT::getFloatingPointVT(VT.getScalarSizeInBits() * 2)
18331 : MVT::getIntegerVT(VT.getScalarSizeInBits() * 2);
18332 int NewNumElts = NumElements / 2;
18333 MVT NewVT = MVT::getVectorVT(NewEltVT, NewNumElts);
18334 // Make sure that the new vector type is legal. For example, v2f64 isn't
18335 // legal on SSE1.
18336 if (DAG.getTargetLoweringInfo().isTypeLegal(NewVT)) {
18337 if (V2IsZero) {
18338 // Modify the new Mask to take all zeros from the all-zero vector.
18339 // Choose indices that are blend-friendly.
18340 bool UsedZeroVector = false;
18341 assert(is_contained(WidenedMask, SM_SentinelZero) &&
18342 "V2's non-undef elements are used?!");
18343 for (int i = 0; i != NewNumElts; ++i)
18344 if (WidenedMask[i] == SM_SentinelZero) {
18345 WidenedMask[i] = i + NewNumElts;
18346 UsedZeroVector = true;
18347 }
18348 // Ensure all elements of V2 are zero - isBuildVectorAllZeros permits
18349 // some elements to be undef.
18350 if (UsedZeroVector)
18351 V2 = getZeroVector(NewVT, Subtarget, DAG, DL);
18352 }
18353 V1 = DAG.getBitcast(NewVT, V1);
18354 V2 = DAG.getBitcast(NewVT, V2);
18355 return DAG.getBitcast(
18356 VT, DAG.getVectorShuffle(NewVT, DL, V1, V2, WidenedMask));
18357 }
18358 }
18359
18360 SmallVector<SDValue> Ops = {V1, V2};
18361 SmallVector<int> Mask(OrigMask);
18362
18363 // Canonicalize the shuffle with any horizontal ops inputs.
18364 // NOTE: This may update Ops and Mask.
18365 if (SDValue HOp = canonicalizeShuffleMaskWithHorizOp(
18366 Ops, Mask, VT.getSizeInBits(), DL, DAG, Subtarget))
18367 return DAG.getBitcast(VT, HOp);
18368
18369 V1 = DAG.getBitcast(VT, Ops[0]);
18370 V2 = DAG.getBitcast(VT, Ops[1]);
18371 assert(NumElements == (int)Mask.size() &&
18372 "canonicalizeShuffleMaskWithHorizOp "
18373 "shouldn't alter the shuffle mask size");
18374
18375 // Canonicalize zeros/ones/fp splat constants to ensure no undefs.
18376 // These will be materialized uniformly anyway, so make splat matching easier.
18377 // TODO: Allow all int constants?
18378 auto CanonicalizeConstant = [VT, &DL, &DAG](SDValue V) {
18379 if (auto *BV = dyn_cast<BuildVectorSDNode>(V)) {
18380 BitVector Undefs;
18381 if (SDValue Splat = BV->getSplatValue(&Undefs)) {
18382 if (Undefs.any() &&
18383 (isNullConstant(Splat) || isAllOnesConstant(Splat) ||
18384 isa<ConstantFPSDNode>(Splat))) {
18385 V = DAG.getBitcast(VT, DAG.getSplat(BV->getValueType(0), DL, Splat));
18386 }
18387 }
18388 }
18389 return V;
18390 };
18391 V1 = CanonicalizeConstant(V1);
18392 V2 = CanonicalizeConstant(V2);
18393
18394 // Commute the shuffle if it will improve canonicalization.
18395 if (canonicalizeShuffleMaskWithCommute(Mask, NumElements, V2IsZero)) {
18396 ShuffleVectorSDNode::commuteMask(Mask);
18397 std::swap(V1, V2);
18398 }
18399
18400 // For each vector width, delegate to a specialized lowering routine.
18401 if (VT.is128BitVector())
18402 return lower128BitShuffle(DL, Mask, VT, V1, V2, Zeroable, Subtarget, DAG);
18403
18404 if (VT.is256BitVector())
18405 return lower256BitShuffle(DL, Mask, VT, V1, V2, Zeroable, Subtarget, DAG);
18406
18407 if (VT.is512BitVector())
18408 return lower512BitShuffle(DL, Mask, VT, V1, V2, Zeroable, Subtarget, DAG);
18409
18410 if (Is1BitVector)
18411 return lower1BitShuffle(DL, Mask, VT, V1, V2, Zeroable, Subtarget, DAG);
18412
18413 llvm_unreachable("Unimplemented!");
18414}
18415
18416// As legal vpcompress instructions depend on various AVX512 extensions, try to
18417// convert illegal vector sizes to legal ones to avoid expansion.
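// For example (illustrative): a v4f32 VECTOR_COMPRESS is widened to v16f32 with
// a zero-extended v16i1 mask, compressed as a 512-bit operation, and the low
// 128 bits are extracted back out; a small integer type such as v16i8 is
// instead any-extended to v16i32, compressed, and truncated back.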
18418 static SDValue lowerVECTOR_COMPRESS(SDValue Op, const X86Subtarget &Subtarget,
18419 SelectionDAG &DAG) {
18420 assert(Subtarget.hasAVX512() &&
18421 "Need AVX512 for custom VECTOR_COMPRESS lowering.");
18422
18423 SDLoc DL(Op);
18424 SDValue Vec = Op.getOperand(0);
18425 SDValue Mask = Op.getOperand(1);
18426 SDValue Passthru = Op.getOperand(2);
18427
18428 EVT VecVT = Vec.getValueType();
18429 EVT ElementVT = VecVT.getVectorElementType();
18430 unsigned NumElements = VecVT.getVectorNumElements();
18431 unsigned NumVecBits = VecVT.getFixedSizeInBits();
18432 unsigned NumElementBits = ElementVT.getFixedSizeInBits();
18433
18434 // 128- and 256-bit vectors with <= 16 elements can be converted to and
18435 // compressed as 512-bit vectors in AVX512F.
18436 if (NumVecBits != 128 && NumVecBits != 256)
18437 return SDValue();
18438
18439 if (NumElementBits == 32 || NumElementBits == 64) {
18440 unsigned NumLargeElements = 512 / NumElementBits;
18441 MVT LargeVecVT =
18442 MVT::getVectorVT(ElementVT.getSimpleVT(), NumLargeElements);
18443 MVT LargeMaskVT = MVT::getVectorVT(MVT::i1, NumLargeElements);
18444
18445 Vec = widenSubVector(LargeVecVT, Vec, /*ZeroNewElements=*/false, Subtarget,
18446 DAG, DL);
18447 Mask = widenSubVector(LargeMaskVT, Mask, /*ZeroNewElements=*/true,
18448 Subtarget, DAG, DL);
18449 Passthru = Passthru.isUndef() ? DAG.getUNDEF(LargeVecVT)
18450 : widenSubVector(LargeVecVT, Passthru,
18451 /*ZeroNewElements=*/false,
18452 Subtarget, DAG, DL);
18453
18454 SDValue Compressed =
18455 DAG.getNode(ISD::VECTOR_COMPRESS, DL, LargeVecVT, Vec, Mask, Passthru);
18456 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VecVT, Compressed,
18457 DAG.getConstant(0, DL, MVT::i64));
18458 }
18459
18460 if (VecVT == MVT::v8i16 || VecVT == MVT::v8i8 || VecVT == MVT::v16i8 ||
18461 VecVT == MVT::v16i16) {
18462 MVT LargeElementVT = MVT::getIntegerVT(512 / NumElements);
18463 EVT LargeVecVT = MVT::getVectorVT(LargeElementVT, NumElements);
18464
18465 Vec = DAG.getNode(ISD::ANY_EXTEND, DL, LargeVecVT, Vec);
18466 Passthru = Passthru.isUndef()
18467 ? DAG.getUNDEF(LargeVecVT)
18468 : DAG.getNode(ISD::ANY_EXTEND, DL, LargeVecVT, Passthru);
18469
18470 SDValue Compressed =
18471 DAG.getNode(ISD::VECTOR_COMPRESS, DL, LargeVecVT, Vec, Mask, Passthru);
18472 return DAG.getNode(ISD::TRUNCATE, DL, VecVT, Compressed);
18473 }
18474
18475 return SDValue();
18476}
18477
18478/// Try to lower a VSELECT instruction to a vector shuffle.
18479 static SDValue lowerVSELECTtoVectorShuffle(SDValue Op,
18480 const X86Subtarget &Subtarget,
18481 SelectionDAG &DAG) {
18482 SDValue Cond = Op.getOperand(0);
18483 SDValue LHS = Op.getOperand(1);
18484 SDValue RHS = Op.getOperand(2);
18485 MVT VT = Op.getSimpleValueType();
18486
18487 // Only non-legal VSELECTs reach this lowering; convert those into generic
18488 // shuffles and re-use the shuffle lowering path for blends.
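// For example (illustrative): a v4i32 vselect with the constant condition
// <0,-1,0,-1> becomes the shuffle (LHS, RHS, <4,1,6,3>): true lanes pick from
// LHS (indices 0..3) and false lanes pick from RHS (indices 4..7).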
18489 if (ISD::isBuildVectorOfConstantSDNodes(Cond.getNode())) {
18490 SmallVector<int, 32> Mask;
18491 if (createShuffleMaskFromVSELECT(Mask, Cond))
18492 return DAG.getVectorShuffle(VT, SDLoc(Op), LHS, RHS, Mask);
18493 }
18494
18495 return SDValue();
18496}
18497
18498SDValue X86TargetLowering::LowerVSELECT(SDValue Op, SelectionDAG &DAG) const {
18499 SDValue Cond = Op.getOperand(0);
18500 SDValue LHS = Op.getOperand(1);
18501 SDValue RHS = Op.getOperand(2);
18502
18503 SDLoc dl(Op);
18504 MVT VT = Op.getSimpleValueType();
18505 if (isSoftF16(VT, Subtarget)) {
18506 MVT NVT = VT.changeVectorElementTypeToInteger();
18507 return DAG.getBitcast(VT, DAG.getNode(ISD::VSELECT, dl, NVT, Cond,
18508 DAG.getBitcast(NVT, LHS),
18509 DAG.getBitcast(NVT, RHS)));
18510 }
18511
18512 // A vselect where all conditions and data are constants can be optimized into
18513 // a single vector load by SelectionDAGLegalize::ExpandBUILD_VECTOR().
18514 if (ISD::isBuildVectorOfConstantSDNodes(Cond.getNode()) &&
18515 ISD::isBuildVectorOfConstantSDNodes(LHS.getNode()) &&
18516 ISD::isBuildVectorOfConstantSDNodes(RHS.getNode()))
18517 return SDValue();
18518
18519 // Try to lower this to a blend-style vector shuffle. This can handle all
18520 // constant condition cases.
18521 if (SDValue BlendOp = lowerVSELECTtoVectorShuffle(Op, Subtarget, DAG))
18522 return BlendOp;
18523
18524 // If this VSELECT has a vector of i1 as a mask, it will be directly matched
18525 // with patterns on the mask registers on AVX-512.
18526 MVT CondVT = Cond.getSimpleValueType();
18527 unsigned CondEltSize = Cond.getScalarValueSizeInBits();
18528 if (CondEltSize == 1)
18529 return Op;
18530
18531 // Variable blends are only legal from SSE4.1 onward.
18532 if (!Subtarget.hasSSE41())
18533 return SDValue();
18534
18535 unsigned EltSize = VT.getScalarSizeInBits();
18536 unsigned NumElts = VT.getVectorNumElements();
18537
18538 // Expand v32i16/v64i8 without BWI.
18539 if ((VT == MVT::v32i16 || VT == MVT::v64i8) && !Subtarget.hasBWI())
18540 return SDValue();
18541
18542 // If the VSELECT is on a 512-bit type, we have to convert a non-i1 condition
18543 // into an i1 condition so that we can use the mask-based 512-bit blend
18544 // instructions.
18545 if (VT.getSizeInBits() == 512) {
18546 // Build a mask by testing the condition against zero.
18547 MVT MaskVT = MVT::getVectorVT(MVT::i1, NumElts);
18548 SDValue Mask = DAG.getSetCC(dl, MaskVT, Cond,
18549 DAG.getConstant(0, dl, CondVT),
18550 ISD::SETNE);
18551 // Now return a new VSELECT using the mask.
18552 return DAG.getSelect(dl, VT, Mask, LHS, RHS);
18553 }
18554
18555 // SEXT/TRUNC cases where the mask doesn't match the destination size.
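// For example (illustrative): a v4i64 select whose condition is v4i32 is handled
// by sign-extending the condition to v4i64, provided the condition is already a
// sign splat (every bit of each element equal to the sign bit).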
18556 if (CondEltSize != EltSize) {
18557 // If we don't have a sign splat, rely on the expansion.
18558 if (CondEltSize != DAG.ComputeNumSignBits(Cond))
18559 return SDValue();
18560
18561 MVT NewCondSVT = MVT::getIntegerVT(EltSize);
18562 MVT NewCondVT = MVT::getVectorVT(NewCondSVT, NumElts);
18563 Cond = DAG.getSExtOrTrunc(Cond, dl, NewCondVT);
18564 return DAG.getNode(ISD::VSELECT, dl, VT, Cond, LHS, RHS);
18565 }
18566
18567 // For v16i16/v32i8 selects without AVX2, if the condition and another operand
18568 // are free to split, it is better to split before expanding the
18569 // select. Don't bother with XOP as it has the fast VPCMOV instruction.
18570 // TODO: This is very similar to narrowVectorSelect.
18571 // TODO: Add Load splitting to isFreeToSplitVector ?
18572 if (EltSize < 32 && VT.is256BitVector() && !Subtarget.hasAVX2() &&
18573 !Subtarget.hasXOP()) {
18574 bool FreeCond = isFreeToSplitVector(Cond, DAG);
18575 bool FreeLHS = isFreeToSplitVector(LHS, DAG) ||
18576 (ISD::isNormalLoad(LHS.getNode()) && LHS.hasOneUse());
18577 bool FreeRHS = isFreeToSplitVector(RHS, DAG) ||
18578 (ISD::isNormalLoad(RHS.getNode()) && RHS.hasOneUse());
18579 if (FreeCond && (FreeLHS || FreeRHS))
18580 return splitVectorOp(Op, DAG, dl);
18581 }
18582
18583 // Only some types will be legal on some subtargets. If we can emit a legal
18584 // VSELECT-matching blend, return Op, but if we need to expand, return
18585 // a null value.
18586 switch (VT.SimpleTy) {
18587 default:
18588 // Most of the vector types have blends past SSE4.1.
18589 return Op;
18590
18591 case MVT::v32i8:
18592 // The byte blends for AVX vectors were introduced only in AVX2.
18593 if (Subtarget.hasAVX2())
18594 return Op;
18595
18596 return SDValue();
18597
18598 case MVT::v8i16:
18599 case MVT::v16i16:
18600 case MVT::v8f16:
18601 case MVT::v16f16: {
18602 // Bitcast everything to the vXi8 type and use a vXi8 vselect.
18603 MVT CastVT = MVT::getVectorVT(MVT::i8, NumElts * 2);
18604 Cond = DAG.getBitcast(CastVT, Cond);
18605 LHS = DAG.getBitcast(CastVT, LHS);
18606 RHS = DAG.getBitcast(CastVT, RHS);
18607 SDValue Select = DAG.getNode(ISD::VSELECT, dl, CastVT, Cond, LHS, RHS);
18608 return DAG.getBitcast(VT, Select);
18609 }
18610 }
18611}
18612
18613 static SDValue LowerEXTRACT_VECTOR_ELT_SSE4(SDValue Op, SelectionDAG &DAG) {
18614 MVT VT = Op.getSimpleValueType();
18615 SDValue Vec = Op.getOperand(0);
18616 SDValue Idx = Op.getOperand(1);
18617 assert(isa<ConstantSDNode>(Idx) && "Constant index expected");
18618 SDLoc dl(Op);
18619
18620 if (!Vec.getSimpleValueType().is128BitVector())
18621 return SDValue();
18622
18623 if (VT.getSizeInBits() == 8) {
18624 // If IdxVal is 0, it's cheaper to do a move instead of a pextrb, unless
18625 // we're going to zero extend the register or fold the store.
18626 if (isNullConstant(Idx) && !X86::mayFoldIntoZeroExtend(Op) &&
18627 !X86::mayFoldIntoStore(Op))
18628 return DAG.getNode(ISD::TRUNCATE, dl, MVT::i8,
18629 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32,
18630 DAG.getBitcast(MVT::v4i32, Vec), Idx));
18631
18632 unsigned IdxVal = Idx->getAsZExtVal();
18633 SDValue Extract = DAG.getNode(X86ISD::PEXTRB, dl, MVT::i32, Vec,
18634 DAG.getTargetConstant(IdxVal, dl, MVT::i8));
18635 return DAG.getNode(ISD::TRUNCATE, dl, VT, Extract);
18636 }
18637
18638 if (VT == MVT::f32) {
18639 // EXTRACTPS outputs to a GPR32 register which will require a movd to copy
18640 // the result back to FR32 register. It's only worth matching if the
18641 // result has a single use which is a store or a bitcast to i32. And in
18642 // the case of a store, it's not worth it if the index is a constant 0,
18643 // because a MOVSSmr can be used instead, which is smaller and faster.
18644 if (!Op.hasOneUse())
18645 return SDValue();
18646 SDNode *User = *Op.getNode()->user_begin();
18647 if ((User->getOpcode() != ISD::STORE || isNullConstant(Idx)) &&
18648 (User->getOpcode() != ISD::BITCAST ||
18649 User->getValueType(0) != MVT::i32))
18650 return SDValue();
18651 SDValue Extract = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32,
18652 DAG.getBitcast(MVT::v4i32, Vec), Idx);
18653 return DAG.getBitcast(MVT::f32, Extract);
18654 }
18655
18656 if (VT == MVT::i32 || VT == MVT::i64)
18657 return Op;
18658
18659 return SDValue();
18660}
18661
18662/// Extract one bit from mask vector, like v16i1 or v8i1.
18663/// AVX-512 feature.
18664 static SDValue ExtractBitFromMaskVector(SDValue Op, SelectionDAG &DAG,
18665 const X86Subtarget &Subtarget) {
18666 SDValue Vec = Op.getOperand(0);
18667 SDLoc dl(Vec);
18668 MVT VecVT = Vec.getSimpleValueType();
18669 SDValue Idx = Op.getOperand(1);
18670 auto* IdxC = dyn_cast<ConstantSDNode>(Idx);
18671 MVT EltVT = Op.getSimpleValueType();
18672
18673 assert((VecVT.getVectorNumElements() <= 16 || Subtarget.hasBWI()) &&
18674 "Unexpected vector type in ExtractBitFromMaskVector");
18675
18676 // A variable index can't be handled in mask registers,
18677 // so extend the vector to VR512/VR128.
18678 if (!IdxC) {
18679 unsigned NumElts = VecVT.getVectorNumElements();
18680 // Extending v8i1/v16i1 to 512-bit gets better performance on KNL
18681 // than extending to 128/256-bit.
18682 if (NumElts == 1) {
18683 Vec = widenMaskVector(Vec, false, Subtarget, DAG, dl);
18684 MVT IntVT = MVT::getIntegerVT(Vec.getValueType().getVectorNumElements());
18685 return DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, DAG.getBitcast(IntVT, Vec));
18686 }
18687 MVT ExtEltVT = (NumElts <= 8) ? MVT::getIntegerVT(128 / NumElts) : MVT::i8;
18688 MVT ExtVecVT = MVT::getVectorVT(ExtEltVT, NumElts);
18689 SDValue Ext = DAG.getNode(ISD::SIGN_EXTEND, dl, ExtVecVT, Vec);
18690 SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, ExtEltVT, Ext, Idx);
18691 return DAG.getNode(ISD::TRUNCATE, dl, EltVT, Elt);
18692 }
18693
18694 unsigned IdxVal = IdxC->getZExtValue();
18695 if (IdxVal == 0) // the operation is legal
18696 return Op;
18697
18698 // Extend to natively supported kshift.
18699 Vec = widenMaskVector(Vec, false, Subtarget, DAG, dl);
18700
18701 // Use kshiftr instruction to move to the lower element.
18702 Vec = DAG.getNode(X86ISD::KSHIFTR, dl, Vec.getSimpleValueType(), Vec,
18703 DAG.getTargetConstant(IdxVal, dl, MVT::i8));
18704
18705 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, Op.getValueType(), Vec,
18706 DAG.getVectorIdxConstant(0, dl));
18707}
18708
18709// Helper to find all the extracted elements from a vector.
18710 static APInt getExtractedDemandedElts(SDNode *N) {
18711 MVT VT = N->getSimpleValueType(0);
18712 unsigned NumElts = VT.getVectorNumElements();
18713 APInt DemandedElts = APInt::getZero(NumElts);
18714 for (SDNode *User : N->users()) {
18715 switch (User->getOpcode()) {
18716 case X86ISD::PEXTRB:
18717 case X86ISD::PEXTRW:
18718 case ISD::EXTRACT_VECTOR_ELT:
18719 if (!isa<ConstantSDNode>(User->getOperand(1))) {
18720 DemandedElts.setAllBits();
18721 return DemandedElts;
18722 }
18723 DemandedElts.setBit(User->getConstantOperandVal(1));
18724 break;
18725 case ISD::BITCAST: {
18726 if (!User->getValueType(0).isSimple() ||
18727 !User->getValueType(0).isVector()) {
18728 DemandedElts.setAllBits();
18729 return DemandedElts;
18730 }
18731 APInt DemandedSrcElts = getExtractedDemandedElts(User);
18732 DemandedElts |= APIntOps::ScaleBitMask(DemandedSrcElts, NumElts);
18733 break;
18734 }
18735 default:
18736 DemandedElts.setAllBits();
18737 return DemandedElts;
18738 }
18739 }
18740 return DemandedElts;
18741}
18742
18743SDValue
18744X86TargetLowering::LowerEXTRACT_VECTOR_ELT(SDValue Op,
18745 SelectionDAG &DAG) const {
18746 SDLoc dl(Op);
18747 SDValue Vec = Op.getOperand(0);
18748 MVT VecVT = Vec.getSimpleValueType();
18749 SDValue Idx = Op.getOperand(1);
18750 auto* IdxC = dyn_cast<ConstantSDNode>(Idx);
18751
18752 if (VecVT.getVectorElementType() == MVT::i1)
18753 return ExtractBitFromMaskVector(Op, DAG, Subtarget);
18754
18755 if (!IdxC) {
18756 // It's more profitable to go through memory (1 cycle throughput)
18757 // than to use a VMOVD + VPERMV/PSHUFB sequence (2/3 cycles throughput).
18758 // The IACA tool was used to get these performance estimates
18759 // (https://software.intel.com/en-us/articles/intel-architecture-code-analyzer)
18760 //
18761 // example : extractelement <16 x i8> %a, i32 %i
18762 //
18763 // Block Throughput: 3.00 Cycles
18764 // Throughput Bottleneck: Port5
18765 //
18766 // | Num Of | Ports pressure in cycles | |
18767 // | Uops | 0 - DV | 5 | 6 | 7 | |
18768 // ---------------------------------------------
18769 // | 1 | | 1.0 | | | CP | vmovd xmm1, edi
18770 // | 1 | | 1.0 | | | CP | vpshufb xmm0, xmm0, xmm1
18771 // | 2 | 1.0 | 1.0 | | | CP | vpextrb eax, xmm0, 0x0
18772 // Total Num Of Uops: 4
18773 //
18774 //
18775 // Block Throughput: 1.00 Cycles
18776 // Throughput Bottleneck: PORT2_AGU, PORT3_AGU, Port4
18777 //
18778 // | | Ports pressure in cycles | |
18779 // |Uops| 1 | 2 - D |3 - D | 4 | 5 | |
18780 // ---------------------------------------------------------
18781 // |2^ | | 0.5 | 0.5 |1.0| |CP| vmovaps xmmword ptr [rsp-0x18], xmm0
18782 // |1 |0.5| | | |0.5| | lea rax, ptr [rsp-0x18]
18783 // |1 | |0.5, 0.5|0.5, 0.5| | |CP| mov al, byte ptr [rdi+rax*1]
18784 // Total Num Of Uops: 4
18785
18786 return SDValue();
18787 }
18788
18789 unsigned IdxVal = IdxC->getZExtValue();
18790
18791 // If this is a 256-bit vector result, first extract the 128-bit vector and
18792 // then extract the element from the 128-bit vector.
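// For example (illustrative): extracting element 5 of a v8i32 extracts the upper
// 128-bit half (elements 4-7) and then extracts element 5 & 3 == 1 from it.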
18793 if (VecVT.is256BitVector() || VecVT.is512BitVector()) {
18794 // Get the 128-bit vector.
18795 Vec = extract128BitVector(Vec, IdxVal, DAG, dl);
18796 MVT EltVT = VecVT.getVectorElementType();
18797
18798 unsigned ElemsPerChunk = 128 / EltVT.getSizeInBits();
18799 assert(isPowerOf2_32(ElemsPerChunk) && "Elements per chunk not power of 2");
18800
18801 // Find IdxVal modulo ElemsPerChunk. Since ElemsPerChunk is a power of 2
18802 // this can be done with a mask.
18803 IdxVal &= ElemsPerChunk - 1;
18804 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, Op.getValueType(), Vec,
18805 DAG.getVectorIdxConstant(IdxVal, dl));
18806 }
18807
18808 assert(VecVT.is128BitVector() && "Unexpected vector length");
18809
18810 MVT VT = Op.getSimpleValueType();
18811
18812 if (VT == MVT::i16) {
18813 // If IdxVal is 0, it's cheaper to do a move instead of a pextrw, unless
18814 // we're going to zero extend the register or fold the store (SSE41 only).
18815 if (IdxVal == 0 && !X86::mayFoldIntoZeroExtend(Op) &&
18816 !(Subtarget.hasSSE41() && X86::mayFoldIntoStore(Op))) {
18817 if (Subtarget.hasFP16())
18818 return Op;
18819
18820 return DAG.getNode(ISD::TRUNCATE, dl, MVT::i16,
18821 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32,
18822 DAG.getBitcast(MVT::v4i32, Vec), Idx));
18823 }
18824
18825 SDValue Extract = DAG.getNode(X86ISD::PEXTRW, dl, MVT::i32, Vec,
18826 DAG.getTargetConstant(IdxVal, dl, MVT::i8));
18827 return DAG.getNode(ISD::TRUNCATE, dl, VT, Extract);
18828 }
18829
18830 if (Subtarget.hasSSE41())
18831 if (SDValue Res = LowerEXTRACT_VECTOR_ELT_SSE4(Op, DAG))
18832 return Res;
18833
18834 // Only extract a single element from a v16i8 source - determine the common
18835 // DWORD/WORD that all extractions share, and extract the sub-byte.
18836 // TODO: Add QWORD MOVQ extraction?
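// For example (illustrative): if the only extracted lanes of a v16i8 are bytes 4
// and 5, both can be serviced by one word extraction (WordIdx == 2, e.g. PEXTRW)
// followed by a shift for the odd byte.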
18837 if (VT == MVT::i8) {
18838 APInt DemandedElts = getExtractedDemandedElts(Vec.getNode());
18839 assert(DemandedElts.getBitWidth() == 16 && "Vector width mismatch");
18840
18841 // Extract either the lowest i32 or any i16, and extract the sub-byte.
18842 int DWordIdx = IdxVal / 4;
18843 if (DWordIdx == 0 && DemandedElts == (DemandedElts & 15)) {
18844 SDValue Res = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32,
18845 DAG.getBitcast(MVT::v4i32, Vec),
18846 DAG.getVectorIdxConstant(DWordIdx, dl));
18847 int ShiftVal = (IdxVal % 4) * 8;
18848 if (ShiftVal != 0)
18849 Res = DAG.getNode(ISD::SRL, dl, MVT::i32, Res,
18850 DAG.getConstant(ShiftVal, dl, MVT::i8));
18851 return DAG.getNode(ISD::TRUNCATE, dl, VT, Res);
18852 }
18853
18854 int WordIdx = IdxVal / 2;
18855 if (DemandedElts == (DemandedElts & (3 << (WordIdx * 2)))) {
18856 SDValue Res = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i16,
18857 DAG.getBitcast(MVT::v8i16, Vec),
18858 DAG.getVectorIdxConstant(WordIdx, dl));
18859 int ShiftVal = (IdxVal % 2) * 8;
18860 if (ShiftVal != 0)
18861 Res = DAG.getNode(ISD::SRL, dl, MVT::i16, Res,
18862 DAG.getConstant(ShiftVal, dl, MVT::i8));
18863 return DAG.getNode(ISD::TRUNCATE, dl, VT, Res);
18864 }
18865 }
18866
18867 if (VT == MVT::f16 || VT.getSizeInBits() == 32) {
18868 if (IdxVal == 0)
18869 return Op;
18870
18871 // Shuffle the element to the lowest element, then movss or movsh.
18872 SmallVector<int, 8> Mask(VecVT.getVectorNumElements(), -1);
18873 Mask[0] = static_cast<int>(IdxVal);
18874 Vec = DAG.getVectorShuffle(VecVT, dl, Vec, DAG.getUNDEF(VecVT), Mask);
18875 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, Vec,
18876 DAG.getVectorIdxConstant(0, dl));
18877 }
18878
18879 if (VT.getSizeInBits() == 64) {
18880 // FIXME: .td only matches this for <2 x f64>, not <2 x i64> on 32b
18881 // FIXME: seems like this should be unnecessary if mov{h,l}pd were taught
18882 // to match extract_elt for f64.
18883 if (IdxVal == 0)
18884 return Op;
18885
18886 // UNPCKHPD the element to the lowest double word, then movsd.
18887 // Note if the lower 64 bits of the result of the UNPCKHPD is then stored
18888 // to a f64mem, the whole operation is folded into a single MOVHPDmr.
18889 int Mask[2] = { 1, -1 };
18890 Vec = DAG.getVectorShuffle(VecVT, dl, Vec, DAG.getUNDEF(VecVT), Mask);
18891 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, Vec,
18892 DAG.getVectorIdxConstant(0, dl));
18893 }
18894
18895 return SDValue();
18896}
18897
18898/// Insert one bit to mask vector, like v16i1 or v8i1.
18899/// AVX-512 feature.
18900 static SDValue InsertBitToMaskVector(SDValue Op, SelectionDAG &DAG,
18901 const X86Subtarget &Subtarget) {
18902 SDLoc dl(Op);
18903 SDValue Vec = Op.getOperand(0);
18904 SDValue Elt = Op.getOperand(1);
18905 SDValue Idx = Op.getOperand(2);
18906 MVT VecVT = Vec.getSimpleValueType();
18907
18908 if (!isa<ConstantSDNode>(Idx)) {
18909 // Non-constant index. Extend source and destination,
18910 // insert the element and then truncate the result.
18911 unsigned NumElts = VecVT.getVectorNumElements();
18912 MVT ExtEltVT = (NumElts <= 8) ? MVT::getIntegerVT(128 / NumElts) : MVT::i8;
18913 MVT ExtVecVT = MVT::getVectorVT(ExtEltVT, NumElts);
18914 SDValue ExtOp = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, ExtVecVT,
18915 DAG.getNode(ISD::SIGN_EXTEND, dl, ExtVecVT, Vec),
18916 DAG.getNode(ISD::SIGN_EXTEND, dl, ExtEltVT, Elt), Idx);
18917 return DAG.getNode(ISD::TRUNCATE, dl, VecVT, ExtOp);
18918 }
18919
18920 // Copy into a k-register, extract to v1i1 and insert_subvector.
18921 SDValue EltInVec = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v1i1, Elt);
18922 return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, VecVT, Vec, EltInVec, Idx);
18923}
18924
18925SDValue X86TargetLowering::LowerINSERT_VECTOR_ELT(SDValue Op,
18926 SelectionDAG &DAG) const {
18927 MVT VT = Op.getSimpleValueType();
18928 MVT EltVT = VT.getVectorElementType();
18929 unsigned NumElts = VT.getVectorNumElements();
18930 unsigned EltSizeInBits = EltVT.getScalarSizeInBits();
18931
18932 if (EltVT == MVT::i1)
18933 return InsertBitToMaskVector(Op, DAG, Subtarget);
18934
18935 SDLoc dl(Op);
18936 SDValue N0 = Op.getOperand(0);
18937 SDValue N1 = Op.getOperand(1);
18938 SDValue N2 = Op.getOperand(2);
18939 auto *N2C = dyn_cast<ConstantSDNode>(N2);
18940
18941 if (EltVT == MVT::bf16) {
18942 MVT IVT = VT.changeVectorElementTypeToInteger();
18943 SDValue Res = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, IVT,
18944 DAG.getBitcast(IVT, N0),
18945 DAG.getBitcast(MVT::i16, N1), N2);
18946 return DAG.getBitcast(VT, Res);
18947 }
18948
18949 if (!N2C) {
18950 // For variable insertion indices we're usually better off spilling to stack,
18951 // but AVX512 can use a variable compare+select by comparing against all
18952 // possible vector indices, and FP insertion has less gpr->simd traffic.
18953 if (!(Subtarget.hasBWI() ||
18954 (Subtarget.hasAVX512() && EltSizeInBits >= 32) ||
18955 (Subtarget.hasSSE41() && (EltVT == MVT::f32 || EltVT == MVT::f64))))
18956 return SDValue();
18957
18958 MVT IdxSVT = MVT::getIntegerVT(EltSizeInBits);
18959 MVT IdxVT = MVT::getVectorVT(IdxSVT, NumElts);
18960 if (!isTypeLegal(IdxSVT) || !isTypeLegal(IdxVT))
18961 return SDValue();
18962
18963 SDValue IdxExt = DAG.getZExtOrTrunc(N2, dl, IdxSVT);
18964 SDValue IdxSplat = DAG.getSplatBuildVector(IdxVT, dl, IdxExt);
18965 SDValue EltSplat = DAG.getSplatBuildVector(VT, dl, N1);
18966
18967 SmallVector<SDValue, 16> RawIndices;
18968 for (unsigned I = 0; I != NumElts; ++I)
18969 RawIndices.push_back(DAG.getConstant(I, dl, IdxSVT));
18970 SDValue Indices = DAG.getBuildVector(IdxVT, dl, RawIndices);
18971
18972 // inselt N0, N1, N2 --> select (SplatN2 == {0,1,2...}) ? SplatN1 : N0.
18973 return DAG.getSelectCC(dl, IdxSplat, Indices, EltSplat, N0,
18974 ISD::CondCode::SETEQ);
18975 }
18976
18977 if (N2C->getAPIntValue().uge(NumElts))
18978 return SDValue();
18979 uint64_t IdxVal = N2C->getZExtValue();
18980
18981 bool IsZeroElt = X86::isZeroNode(N1);
18982 bool IsAllOnesElt = VT.isInteger() && llvm::isAllOnesConstant(N1);
18983
18984 if (IsZeroElt || IsAllOnesElt) {
18985 // Lower insertion of v16i8/v32i8/v16i16 -1 elts as an 'OR' blend.
18986 // We don't deal with i8 0 since it appears to be handled elsewhere.
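// For example (illustrative): inserting -1 into lane 5 of a v16i8 without SSE4.1
// becomes an OR with the constant vector <0,0,0,0,0,255,0,0,0,0,0,0,0,0,0,0>.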
18987 if (IsAllOnesElt &&
18988 ((VT == MVT::v16i8 && !Subtarget.hasSSE41()) ||
18989 ((VT == MVT::v32i8 || VT == MVT::v16i16) && !Subtarget.hasInt256()))) {
18990 SDValue ZeroCst = DAG.getConstant(0, dl, VT.getScalarType());
18991 SDValue OnesCst = DAG.getAllOnesConstant(dl, VT.getScalarType());
18992 SmallVector<SDValue, 8> CstVectorElts(NumElts, ZeroCst);
18993 CstVectorElts[IdxVal] = OnesCst;
18994 SDValue CstVector = DAG.getBuildVector(VT, dl, CstVectorElts);
18995 return DAG.getNode(ISD::OR, dl, VT, N0, CstVector);
18996 }
18997 // See if we can do this more efficiently with a blend shuffle with a
18998 // rematerializable vector.
18999 if (Subtarget.hasSSE41() &&
19000 (EltSizeInBits >= 16 || (IsZeroElt && !VT.is128BitVector()))) {
19001 SmallVector<int, 8> BlendMask;
19002 for (unsigned i = 0; i != NumElts; ++i)
19003 BlendMask.push_back(i == IdxVal ? i + NumElts : i);
19004 SDValue CstVector = IsZeroElt ? getZeroVector(VT, Subtarget, DAG, dl)
19005 : getOnesVector(VT, DAG, dl);
19006 return DAG.getVectorShuffle(VT, dl, N0, CstVector, BlendMask);
19007 }
19008 }
19009
19010 // If the vector is wider than 128 bits, extract the 128-bit subvector, insert
19011 // into that, and then insert the subvector back into the result.
19012 if (VT.is256BitVector() || VT.is512BitVector()) {
19013 // With a 256-bit vector, we can insert into the zero element efficiently
19014 // using a blend if we have AVX or AVX2 and the right data type.
19015 if (VT.is256BitVector() && IdxVal == 0) {
19016 // TODO: It is worthwhile to cast integer to floating point and back
19017 // and incur a domain crossing penalty if that's what we'll end up
19018 // doing anyway after extracting to a 128-bit vector.
19019 if ((Subtarget.hasAVX() && (EltVT == MVT::f64 || EltVT == MVT::f32)) ||
19020 (Subtarget.hasAVX2() && (EltVT == MVT::i32 || EltVT == MVT::i64))) {
19021 SDValue N1Vec = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, N1);
19022 return DAG.getNode(X86ISD::BLENDI, dl, VT, N0, N1Vec,
19023 DAG.getTargetConstant(1, dl, MVT::i8));
19024 }
19025 }
19026
19027 unsigned NumEltsIn128 = 128 / EltSizeInBits;
19028 assert(isPowerOf2_32(NumEltsIn128) &&
19029 "Vectors will always have power-of-two number of elements.");
19030
19031 // If we are not inserting into the low 128-bit vector chunk,
19032 // then prefer the broadcast+blend sequence.
19033 // FIXME: relax the profitability check iff all N1 uses are insertions.
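// For example (illustrative): inserting a scalar into lane 9 of a v16i16 on AVX2
// splats the scalar and blends it in with the shuffle mask
// <0,1,2,3,4,5,6,7,8,25,10,11,12,13,14,15>, where 25 selects lane 9 of the splat.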
19034 if (IdxVal >= NumEltsIn128 &&
19035 ((Subtarget.hasAVX2() && EltSizeInBits != 8) ||
19036 (Subtarget.hasAVX() && (EltSizeInBits >= 32) &&
19037 X86::mayFoldLoad(N1, Subtarget)))) {
19038 SDValue N1SplatVec = DAG.getSplatBuildVector(VT, dl, N1);
19039 SmallVector<int, 8> BlendMask;
19040 for (unsigned i = 0; i != NumElts; ++i)
19041 BlendMask.push_back(i == IdxVal ? i + NumElts : i);
19042 return DAG.getVectorShuffle(VT, dl, N0, N1SplatVec, BlendMask);
19043 }
19044
19045 // Get the desired 128-bit vector chunk.
19046 SDValue V = extract128BitVector(N0, IdxVal, DAG, dl);
19047
19048 // Insert the element into the desired chunk.
19049 // Since NumEltsIn128 is a power of 2 we can use mask instead of modulo.
19050 unsigned IdxIn128 = IdxVal & (NumEltsIn128 - 1);
19051
19052 V = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, V.getValueType(), V, N1,
19053 DAG.getVectorIdxConstant(IdxIn128, dl));
19054
19055 // Insert the changed part back into the bigger vector
19056 return insert128BitVector(N0, V, IdxVal, DAG, dl);
19057 }
19058 assert(VT.is128BitVector() && "Only 128-bit vector types should be left!");
19059
19060 // This will be just movw/movd/movq/movsh/movss/movsd.
19061 if (IdxVal == 0 && ISD::isBuildVectorAllZeros(N0.getNode())) {
19062 if (EltVT == MVT::i32 || EltVT == MVT::f32 || EltVT == MVT::f64 ||
19063 EltVT == MVT::f16 || EltVT == MVT::i64) {
19064 N1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, N1);
19065 return getShuffleVectorZeroOrUndef(N1, 0, true, Subtarget, DAG);
19066 }
19067
19068 // We can't directly insert an i8 or i16 into a vector, so zero extend
19069 // it to i32 first.
19070 if (EltVT == MVT::i16 || EltVT == MVT::i8) {
19071 N1 = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, N1);
19072 MVT ShufVT = MVT::getVectorVT(MVT::i32, VT.getSizeInBits() / 32);
19073 N1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, ShufVT, N1);
19074 N1 = getShuffleVectorZeroOrUndef(N1, 0, true, Subtarget, DAG);
19075 return DAG.getBitcast(VT, N1);
19076 }
19077 }
19078
19079 // Transform it so it matches pinsr{b,w}, which expect a GR32 as their second
19080 // argument. SSE41 required for pinsrb.
19081 if (VT == MVT::v8i16 || (VT == MVT::v16i8 && Subtarget.hasSSE41())) {
19082 unsigned Opc;
19083 if (VT == MVT::v8i16) {
19084 assert(Subtarget.hasSSE2() && "SSE2 required for PINSRW");
19085 Opc = X86ISD::PINSRW;
19086 } else {
19087 assert(VT == MVT::v16i8 && "PINSRB requires v16i8 vector");
19088 assert(Subtarget.hasSSE41() && "SSE41 required for PINSRB");
19089 Opc = X86ISD::PINSRB;
19090 }
19091
19092 assert(N1.getValueType() != MVT::i32 && "Unexpected VT");
19093 N1 = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, N1);
19094 N2 = DAG.getTargetConstant(IdxVal, dl, MVT::i8);
19095 return DAG.getNode(Opc, dl, VT, N0, N1, N2);
19096 }
19097
19098 if (Subtarget.hasSSE41()) {
19099 if (EltVT == MVT::f32) {
19100 // Bits [7:6] of the constant are the source select. This will always be
19101 // zero here. The DAG Combiner may combine an extract_elt index into
19102 // these bits. For example (insert (extract, 3), 2) could be matched by
19103 // putting the '3' into bits [7:6] of X86ISD::INSERTPS.
19104 // Bits [5:4] of the constant are the destination select. This is the
19105 // value of the incoming immediate.
19106 // Bits [3:0] of the constant are the zero mask. The DAG Combiner may
19107 // combine either bitwise AND or insert of float 0.0 to set these bits.
19108
19109 bool MinSize = DAG.getMachineFunction().getFunction().hasMinSize();
19110 if (IdxVal == 0 && (!MinSize || !X86::mayFoldLoad(N1, Subtarget))) {
19111 // If this is an insertion of 32-bits into the low 32-bits of
19112 // a vector, we prefer to generate a blend with immediate rather
19113 // than an insertps. Blends are simpler operations in hardware and so
19114 // will always have equal or better performance than insertps.
19115 // But if optimizing for size and there's a load folding opportunity,
19116 // generate insertps because blendps does not have a 32-bit memory
19117 // operand form.
19118 N1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4f32, N1);
19119 return DAG.getNode(X86ISD::BLENDI, dl, VT, N0, N1,
19120 DAG.getTargetConstant(1, dl, MVT::i8));
19121 }
19122 // Create this as a scalar to vector.
19123 N1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4f32, N1);
19124 return DAG.getNode(X86ISD::INSERTPS, dl, VT, N0, N1,
19125 DAG.getTargetConstant(IdxVal << 4, dl, MVT::i8));
19126 }
19127
19128 // PINSR* works with constant index.
19129 if (EltVT == MVT::i32 || EltVT == MVT::i64)
19130 return Op;
19131 }
19132
19133 return SDValue();
19134}
19135
19136 static SDValue LowerSCALAR_TO_VECTOR(SDValue Op, const X86Subtarget &Subtarget,
19137 SelectionDAG &DAG) {
19138 SDLoc dl(Op);
19139 MVT OpVT = Op.getSimpleValueType();
19140
19141 // It's always cheaper to replace an xor+movd with xorps, and it simplifies
19142 // further combines.
19143 if (X86::isZeroNode(Op.getOperand(0)))
19144 return getZeroVector(OpVT, Subtarget, DAG, dl);
19145
19146 // If this is a 256-bit vector result, first insert into a 128-bit
19147 // vector and then insert into the 256-bit vector.
19148 if (!OpVT.is128BitVector()) {
19149 // Insert into a 128-bit vector.
19150 unsigned SizeFactor = OpVT.getSizeInBits() / 128;
19151 MVT VT128 = MVT::getVectorVT(OpVT.getVectorElementType(),
19152 OpVT.getVectorNumElements() / SizeFactor);
19153
19154 Op = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT128, Op.getOperand(0));
19155
19156 // Insert the 128-bit vector.
19157 return insert128BitVector(DAG.getUNDEF(OpVT), Op, 0, DAG, dl);
19158 }
19159 assert(OpVT.is128BitVector() && OpVT.isInteger() && OpVT != MVT::v2i64 &&
19160 "Expected an SSE type!");
19161
19162 // Pass through a v4i32 or v8i16 SCALAR_TO_VECTOR as that's what we use in
19163 // tblgen.
19164 if (OpVT == MVT::v4i32 || (OpVT == MVT::v8i16 && Subtarget.hasFP16()))
19165 return Op;
19166
19167 SDValue AnyExt = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, Op.getOperand(0));
19168 return DAG.getBitcast(
19169 OpVT, DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, AnyExt));
19170}
19171
19172// Lower a node with an INSERT_SUBVECTOR opcode. This may result in a
19173// simple superregister reference or explicit instructions to insert
19174// the upper bits of a vector.
19175 static SDValue LowerINSERT_SUBVECTOR(SDValue Op, const X86Subtarget &Subtarget,
19176 SelectionDAG &DAG) {
19177 assert(Op.getSimpleValueType().getVectorElementType() == MVT::i1);
19178
19179 return insert1BitVector(Op, DAG, Subtarget);
19180}
19181
19182 static SDValue LowerEXTRACT_SUBVECTOR(SDValue Op, const X86Subtarget &Subtarget,
19183 SelectionDAG &DAG) {
19184 assert(Op.getSimpleValueType().getVectorElementType() == MVT::i1 &&
19185 "Only vXi1 extract_subvectors need custom lowering");
19186
19187 SDLoc dl(Op);
19188 SDValue Vec = Op.getOperand(0);
19189 uint64_t IdxVal = Op.getConstantOperandVal(1);
19190
19191 if (IdxVal == 0) // the operation is legal
19192 return Op;
19193
19194 // Extend to natively supported kshift.
19195 Vec = widenMaskVector(Vec, false, Subtarget, DAG, dl);
19196
19197 // Shift to the LSB.
19198 Vec = DAG.getNode(X86ISD::KSHIFTR, dl, Vec.getSimpleValueType(), Vec,
19199 DAG.getTargetConstant(IdxVal, dl, MVT::i8));
19200
19201 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, Op.getValueType(), Vec,
19202 DAG.getVectorIdxConstant(0, dl));
19203}
19204
19205// Returns the appropriate wrapper opcode for a global reference.
19206unsigned X86TargetLowering::getGlobalWrapperKind(
19207 const GlobalValue *GV, const unsigned char OpFlags) const {
19208 // References to absolute symbols are never PC-relative.
19209 if (GV && GV->isAbsoluteSymbolRef())
19210 return X86ISD::Wrapper;
19211
19212 // The following OpFlags under RIP-rel PIC use RIP.
19213 if (Subtarget.isPICStyleRIPRel() &&
19214 (OpFlags == X86II::MO_NO_FLAG || OpFlags == X86II::MO_COFFSTUB ||
19215 OpFlags == X86II::MO_DLLIMPORT))
19216 return X86ISD::WrapperRIP;
19217
19218 // GOTPCREL references must always use RIP.
19219 if (OpFlags == X86II::MO_GOTPCREL || OpFlags == X86II::MO_GOTPCREL_NORELAX)
19220 return X86ISD::WrapperRIP;
19221
19222 return X86ISD::Wrapper;
19223}
19224
19225 // ConstantPool, JumpTable, GlobalAddress, and ExternalSymbol are lowered as
19226 // their target counterparts wrapped in the X86ISD::Wrapper node. Suppose N is
19227 // one of the above-mentioned nodes. It has to be wrapped because otherwise
19228 // Select(N) returns N. So the raw TargetGlobalAddress nodes, etc. can only
19229 // be used to form addressing modes. These wrapped nodes will be selected
19230 // into MOV32ri.
19231SDValue
19232X86TargetLowering::LowerConstantPool(SDValue Op, SelectionDAG &DAG) const {
19233 ConstantPoolSDNode *CP = cast<ConstantPoolSDNode>(Op);
19234
19235 // In PIC mode (unless we're in RIPRel PIC mode) we add an offset to the
19236 // global base reg.
19237 unsigned char OpFlag = Subtarget.classifyLocalReference(nullptr);
19238
19239 auto PtrVT = getPointerTy(DAG.getDataLayout());
19240 SDValue Result = DAG.getTargetConstantPool(
19241 CP->getConstVal(), PtrVT, CP->getAlign(), CP->getOffset(), OpFlag);
19242 SDLoc DL(CP);
19243 Result =
19244 DAG.getNode(getGlobalWrapperKind(nullptr, OpFlag), DL, PtrVT, Result);
19245 // With PIC, the address is actually $g + Offset.
19246 if (OpFlag) {
19247 Result =
19248 DAG.getNode(ISD::ADD, DL, PtrVT,
19249 DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), PtrVT), Result);
19250 }
19251
19252 return Result;
19253}
19254
19255SDValue X86TargetLowering::LowerJumpTable(SDValue Op, SelectionDAG &DAG) const {
19256 JumpTableSDNode *JT = cast<JumpTableSDNode>(Op);
19257
19258 // In PIC mode (unless we're in RIPRel PIC mode) we add an offset to the
19259 // global base reg.
19260 unsigned char OpFlag = Subtarget.classifyLocalReference(nullptr);
19261
19262 EVT PtrVT = Op.getValueType();
19263 SDValue Result = DAG.getTargetJumpTable(JT->getIndex(), PtrVT, OpFlag);
19264 SDLoc DL(JT);
19265 Result =
19266 DAG.getNode(getGlobalWrapperKind(nullptr, OpFlag), DL, PtrVT, Result);
19267
19268 // With PIC, the address is actually $g + Offset.
19269 if (OpFlag)
19270 Result =
19271 DAG.getNode(ISD::ADD, DL, PtrVT,
19272 DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), PtrVT), Result);
19273
19274 return Result;
19275}
19276
19277SDValue X86TargetLowering::LowerExternalSymbol(SDValue Op,
19278 SelectionDAG &DAG) const {
19279 return LowerGlobalOrExternal(Op, DAG, /*ForCall=*/false, nullptr);
19280}
19281
19282SDValue
19283X86TargetLowering::LowerBlockAddress(SDValue Op, SelectionDAG &DAG) const {
19284 // Create the TargetBlockAddressAddress node.
19285 unsigned char OpFlags =
19286 Subtarget.classifyBlockAddressReference();
19287 const BlockAddress *BA = cast<BlockAddressSDNode>(Op)->getBlockAddress();
19288 int64_t Offset = cast<BlockAddressSDNode>(Op)->getOffset();
19289 SDLoc dl(Op);
19290 EVT PtrVT = Op.getValueType();
19291 SDValue Result = DAG.getTargetBlockAddress(BA, PtrVT, Offset, OpFlags);
19292 Result =
19293 DAG.getNode(getGlobalWrapperKind(nullptr, OpFlags), dl, PtrVT, Result);
19294
19295 // With PIC, the address is actually $g + Offset.
19296 if (isGlobalRelativeToPICBase(OpFlags)) {
19297 Result = DAG.getNode(ISD::ADD, dl, PtrVT,
19298 DAG.getNode(X86ISD::GlobalBaseReg, dl, PtrVT), Result);
19299 }
19300
19301 return Result;
19302}
19303
19304/// Creates target global address or external symbol nodes for calls or
19305/// other uses.
19306SDValue X86TargetLowering::LowerGlobalOrExternal(SDValue Op, SelectionDAG &DAG,
19307 bool ForCall,
19308 bool *IsImpCall) const {
19309 // Unpack the global address or external symbol.
19310 SDLoc dl(Op);
19311 const GlobalValue *GV = nullptr;
19312 int64_t Offset = 0;
19313 const char *ExternalSym = nullptr;
19314 if (const auto *G = dyn_cast<GlobalAddressSDNode>(Op)) {
19315 GV = G->getGlobal();
19316 Offset = G->getOffset();
19317 } else {
19318 const auto *ES = cast<ExternalSymbolSDNode>(Op);
19319 ExternalSym = ES->getSymbol();
19320 }
19321
19322 // Calculate some flags for address lowering.
19323 const Module &Mod = *DAG.getMachineFunction().getFunction().getParent();
19324 unsigned char OpFlags;
19325 if (ForCall)
19326 OpFlags = Subtarget.classifyGlobalFunctionReference(GV, Mod);
19327 else
19328 OpFlags = Subtarget.classifyGlobalReference(GV, Mod);
19329 bool HasPICReg = isGlobalRelativeToPICBase(OpFlags);
19330 bool NeedsLoad = isGlobalStubReference(OpFlags);
19331
19332 CodeModel::Model M = DAG.getTarget().getCodeModel();
19333 EVT PtrVT = Op.getValueType();
19334 SDValue Result;
19335
19336 if (GV) {
19337 // Create a target global address if this is a global. If possible, fold the
19338 // offset into the global address reference. Otherwise, ADD it on later.
19339 // Suppress the folding if Offset is negative: movl foo-1, %eax is not
19340 // allowed because if the address of foo is 0, the ELF R_X86_64_32
19341 // relocation will compute to a negative value, which is invalid.
19342 int64_t GlobalOffset = 0;
19343 if (OpFlags == X86II::MO_NO_FLAG && Offset >= 0 &&
19344 X86::isOffsetSuitableForCodeModel(Offset, M, true)) {
19345 std::swap(GlobalOffset, Offset);
19346 }
19347 Result = DAG.getTargetGlobalAddress(GV, dl, PtrVT, GlobalOffset, OpFlags);
19348 } else {
19349 // If this is not a global address, this must be an external symbol.
19350 Result = DAG.getTargetExternalSymbol(ExternalSym, PtrVT, OpFlags);
19351 }
19352
19353 // If this is a direct call, avoid the wrapper if we don't need to do any
19354 // loads or adds. This allows SDAG ISel to match direct calls.
19355 if (ForCall && !NeedsLoad && !HasPICReg && Offset == 0)
19356 return Result;
19357
19358 // If Import Call Optimization is enabled and this is an imported function
19359 // then make a note of it and return the global address without wrapping.
19360 if (IsImpCall && (OpFlags == X86II::MO_DLLIMPORT) &&
19361 Mod.getModuleFlag("import-call-optimization")) {
19362 assert(ForCall && "Should only enable import call optimization if we are "
19363 "lowering a call");
19364 *IsImpCall = true;
19365 return Result;
19366 }
19367
19368 Result = DAG.getNode(getGlobalWrapperKind(GV, OpFlags), dl, PtrVT, Result);
19369
19370 // With PIC, the address is actually $g + Offset.
19371 if (HasPICReg) {
19372 Result = DAG.getNode(ISD::ADD, dl, PtrVT,
19373 DAG.getNode(X86ISD::GlobalBaseReg, dl, PtrVT), Result);
19374 }
19375
19376 // For globals that require a load from a stub to get the address, emit the
19377 // load.
19378 if (NeedsLoad)
19379 Result = DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), Result,
19380 MachinePointerInfo::getGOT(DAG.getMachineFunction()));
19381
19382 // If there was a non-zero offset that we didn't fold, create an explicit
19383 // addition for it.
19384 if (Offset != 0)
19385 Result = DAG.getNode(ISD::ADD, dl, PtrVT, Result,
19386 DAG.getSignedConstant(Offset, dl, PtrVT));
19387
19388 return Result;
19389}
19390
19391SDValue
19392X86TargetLowering::LowerGlobalAddress(SDValue Op, SelectionDAG &DAG) const {
19393 return LowerGlobalOrExternal(Op, DAG, /*ForCall=*/false, nullptr);
19394}
19395
19396 static SDValue GetTLSADDR(SelectionDAG &DAG, GlobalAddressSDNode *GA,
19397 const EVT PtrVT, unsigned ReturnReg,
19398 unsigned char OperandFlags,
19399 bool LoadGlobalBaseReg = false,
19400 bool LocalDynamic = false) {
19401 MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
19402 SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
19403 SDLoc dl(GA);
19404 SDValue TGA;
19405 bool UseTLSDESC = DAG.getTarget().useTLSDESC();
19406 SDValue Chain = DAG.getEntryNode();
19407 SDValue Ret;
19408 if (LocalDynamic && UseTLSDESC) {
19409 TGA = DAG.getTargetExternalSymbol("_TLS_MODULE_BASE_", PtrVT, OperandFlags);
19410 // Reuse existing GetTLSADDR node if we can find it.
19411 if (TGA->hasOneUse()) {
19412 // TLSDESC uses TGA.
19413 SDNode *TLSDescOp = *TGA->user_begin();
19414 assert(TLSDescOp->getOpcode() == X86ISD::TLSDESC &&
19415 "Unexpected TLSDESC DAG");
19416 // CALLSEQ_END uses TGA via a chain and glue.
19417 auto *CallSeqEndOp = TLSDescOp->getGluedUser();
19418 assert(CallSeqEndOp && CallSeqEndOp->getOpcode() == ISD::CALLSEQ_END &&
19419 "Unexpected TLSDESC DAG");
19420 // CopyFromReg uses CALLSEQ_END via a chain and glue.
19421 auto *CopyFromRegOp = CallSeqEndOp->getGluedUser();
19422 assert(CopyFromRegOp && CopyFromRegOp->getOpcode() == ISD::CopyFromReg &&
19423 "Unexpected TLSDESC DAG");
19424 Ret = SDValue(CopyFromRegOp, 0);
19425 }
19426 } else {
19427 TGA = DAG.getTargetGlobalAddress(GA->getGlobal(), dl, GA->getValueType(0),
19428 GA->getOffset(), OperandFlags);
19429 }
19430
19431 if (!Ret) {
19432 X86ISD::NodeType CallType = UseTLSDESC ? X86ISD::TLSDESC
19433 : LocalDynamic ? X86ISD::TLSBASEADDR
19434 : X86ISD::TLSADDR;
19435
19436 Chain = DAG.getCALLSEQ_START(Chain, 0, 0, dl);
19437 if (LoadGlobalBaseReg) {
19438 SDValue InGlue;
19439 Chain = DAG.getCopyToReg(Chain, dl, X86::EBX,
19440 DAG.getNode(X86ISD::GlobalBaseReg, dl, PtrVT),
19441 InGlue);
19442 InGlue = Chain.getValue(1);
19443 Chain = DAG.getNode(CallType, dl, NodeTys, {Chain, TGA, InGlue});
19444 } else {
19445 Chain = DAG.getNode(CallType, dl, NodeTys, {Chain, TGA});
19446 }
19447 Chain = DAG.getCALLSEQ_END(Chain, 0, 0, Chain.getValue(1), dl);
19448
19449 // TLSADDR will be codegen'ed as call. Inform MFI that function has calls.
19450 MFI.setHasCalls(true);
19451
19452 SDValue Glue = Chain.getValue(1);
19453 Ret = DAG.getCopyFromReg(Chain, dl, ReturnReg, PtrVT, Glue);
19454 }
19455
19456 if (!UseTLSDESC)
19457 return Ret;
19458
19459 const X86Subtarget &Subtarget = DAG.getSubtarget<X86Subtarget>();
19460 unsigned Seg = Subtarget.is64Bit() ? X86AS::FS : X86AS::GS;
19461
19462 Value *Ptr = Constant::getNullValue(PointerType::get(*DAG.getContext(), Seg));
19463 SDValue Offset =
19464 DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), DAG.getIntPtrConstant(0, dl),
19465 MachinePointerInfo(Ptr));
19466 return DAG.getNode(ISD::ADD, dl, PtrVT, Ret, Offset);
19467}
19468
19469// Lower ISD::GlobalTLSAddress using the "general dynamic" model, 32 bit
19470static SDValue
19471 LowerToTLSGeneralDynamicModel32(GlobalAddressSDNode *GA, SelectionDAG &DAG,
19472 const EVT PtrVT) {
19473 return GetTLSADDR(DAG, GA, PtrVT, X86::EAX, X86II::MO_TLSGD,
19474 /*LoadGlobalBaseReg=*/true);
19475}
19476
19477// Lower ISD::GlobalTLSAddress using the "general dynamic" model, 64 bit LP64
19478static SDValue
19479 LowerToTLSGeneralDynamicModel64(GlobalAddressSDNode *GA, SelectionDAG &DAG,
19480 const EVT PtrVT) {
19481 return GetTLSADDR(DAG, GA, PtrVT, X86::RAX, X86II::MO_TLSGD);
19482}
19483
19484// Lower ISD::GlobalTLSAddress using the "general dynamic" model, 64 bit ILP32
19485static SDValue
19486 LowerToTLSGeneralDynamicModelX32(GlobalAddressSDNode *GA, SelectionDAG &DAG,
19487 const EVT PtrVT) {
19488 return GetTLSADDR(DAG, GA, PtrVT, X86::EAX, X86II::MO_TLSGD);
19489}
19490
19491 static SDValue LowerToTLSLocalDynamicModel(GlobalAddressSDNode *GA,
19492 SelectionDAG &DAG, const EVT PtrVT,
19493 bool Is64Bit, bool Is64BitLP64) {
19494 SDLoc dl(GA);
19495
19496 // Get the start address of the TLS block for this module.
19497 X86MachineFunctionInfo *MFI = DAG.getMachineFunction()
19498 .getInfo<X86MachineFunctionInfo>();
19499 MFI->incNumLocalDynamicTLSAccesses();
19500
19501 SDValue Base;
19502 if (Is64Bit) {
19503 unsigned ReturnReg = Is64BitLP64 ? X86::RAX : X86::EAX;
19504 Base = GetTLSADDR(DAG, GA, PtrVT, ReturnReg, X86II::MO_TLSLD,
19505 /*LoadGlobalBaseReg=*/false,
19506 /*LocalDynamic=*/true);
19507 } else {
19508 Base = GetTLSADDR(DAG, GA, PtrVT, X86::EAX, X86II::MO_TLSLDM,
19509 /*LoadGlobalBaseReg=*/true,
19510 /*LocalDynamic=*/true);
19511 }
19512
19513 // Note: the CleanupLocalDynamicTLSPass will remove redundant computations
19514 // of Base.
19515
19516 // Build x@dtpoff.
19517 unsigned char OperandFlags = X86II::MO_DTPOFF;
19518 unsigned WrapperKind = X86ISD::Wrapper;
19519 SDValue TGA = DAG.getTargetGlobalAddress(GA->getGlobal(), dl,
19520 GA->getValueType(0),
19521 GA->getOffset(), OperandFlags);
19522 SDValue Offset = DAG.getNode(WrapperKind, dl, PtrVT, TGA);
19523
19524 // Add x@dtpoff with the base.
19525 return DAG.getNode(ISD::ADD, dl, PtrVT, Offset, Base);
19526}
19527
19528// Lower ISD::GlobalTLSAddress using the "initial exec" or "local exec" model.
19529 static SDValue LowerToTLSExecModel(GlobalAddressSDNode *GA, SelectionDAG &DAG,
19530 const EVT PtrVT, TLSModel::Model model,
19531 bool is64Bit, bool isPIC) {
19532 SDLoc dl(GA);
19533
19534 // Get the Thread Pointer, which is %gs:0 (32-bit) or %fs:0 (64-bit).
19535 Value *Ptr = Constant::getNullValue(
19536 PointerType::get(*DAG.getContext(), is64Bit ? X86AS::FS : X86AS::GS));
19537
19538 SDValue ThreadPointer =
19539 DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), DAG.getIntPtrConstant(0, dl),
19540 MachinePointerInfo(Ptr));
19541
19542 unsigned char OperandFlags = 0;
19543 // Most TLS accesses are not RIP relative, even on x86-64. One exception is
19544 // initialexec.
19545 unsigned WrapperKind = X86ISD::Wrapper;
19546 if (model == TLSModel::LocalExec) {
19547 OperandFlags = is64Bit ? X86II::MO_TPOFF : X86II::MO_NTPOFF;
19548 } else if (model == TLSModel::InitialExec) {
19549 if (is64Bit) {
19550 OperandFlags = X86II::MO_GOTTPOFF;
19551 WrapperKind = X86ISD::WrapperRIP;
19552 } else {
19553 OperandFlags = isPIC ? X86II::MO_GOTNTPOFF : X86II::MO_INDNTPOFF;
19554 }
19555 } else {
19556 llvm_unreachable("Unexpected model");
19557 }
19558
19559 // emit "addl x@ntpoff,%eax" (local exec)
19560 // or "addl x@indntpoff,%eax" (initial exec)
19561 // or "addl x@gotntpoff(%ebx) ,%eax" (initial exec, 32-bit pic)
19562 SDValue TGA =
19563 DAG.getTargetGlobalAddress(GA->getGlobal(), dl, GA->getValueType(0),
19564 GA->getOffset(), OperandFlags);
19565 SDValue Offset = DAG.getNode(WrapperKind, dl, PtrVT, TGA);
19566
19567 if (model == TLSModel::InitialExec) {
19568 if (isPIC && !is64Bit) {
19569 Offset = DAG.getNode(ISD::ADD, dl, PtrVT,
19570 DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), PtrVT),
19571 Offset);
19572 }
19573
19574 Offset = DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), Offset,
19575 MachinePointerInfo::getGOT(DAG.getMachineFunction()));
19576 }
19577
19578 // The address of the thread local variable is the add of the thread
19579 // pointer with the offset of the variable.
19580 return DAG.getNode(ISD::ADD, dl, PtrVT, ThreadPointer, Offset);
19581}
19582
19583SDValue
19584X86TargetLowering::LowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const {
19585
19586 GlobalAddressSDNode *GA = cast<GlobalAddressSDNode>(Op);
19587
19588 if (DAG.getTarget().useEmulatedTLS())
19589 return LowerToTLSEmulatedModel(GA, DAG);
19590
19591 const GlobalValue *GV = GA->getGlobal();
19592 EVT PtrVT = Op.getValueType();
19593 bool PositionIndependent = isPositionIndependent();
19594
19595 if (Subtarget.isTargetELF()) {
19596 TLSModel::Model model = DAG.getTarget().getTLSModel(GV);
19597 switch (model) {
19598 case TLSModel::GeneralDynamic:
19599 if (Subtarget.is64Bit()) {
19600 if (Subtarget.isTarget64BitLP64())
19601 return LowerToTLSGeneralDynamicModel64(GA, DAG, PtrVT);
19602 return LowerToTLSGeneralDynamicModelX32(GA, DAG, PtrVT);
19603 }
19604 return LowerToTLSGeneralDynamicModel32(GA, DAG, PtrVT);
19605 case TLSModel::LocalDynamic:
19606 return LowerToTLSLocalDynamicModel(GA, DAG, PtrVT, Subtarget.is64Bit(),
19607 Subtarget.isTarget64BitLP64());
19608 case TLSModel::InitialExec:
19609 case TLSModel::LocalExec:
19610 return LowerToTLSExecModel(GA, DAG, PtrVT, model, Subtarget.is64Bit(),
19611 PositionIndependent);
19612 }
19613 llvm_unreachable("Unknown TLS model.");
19614 }
19615
19616 if (Subtarget.isTargetDarwin()) {
19617 // Darwin only has one model of TLS. Lower to that.
19618 unsigned char OpFlag = 0;
19619 unsigned WrapperKind = 0;
19620
19621 // In PIC mode (unless we're in RIPRel PIC mode) we add an offset to the
19622 // global base reg.
19623 bool PIC32 = PositionIndependent && !Subtarget.is64Bit();
19624 if (PIC32) {
19625 OpFlag = X86II::MO_TLVP_PIC_BASE;
19626 WrapperKind = X86ISD::Wrapper;
19627 } else {
19628 OpFlag = X86II::MO_TLVP;
19629 WrapperKind = X86ISD::WrapperRIP;
19630 }
19631 SDLoc DL(Op);
19632 SDValue Result = DAG.getTargetGlobalAddress(GA->getGlobal(), DL,
19633 GA->getValueType(0),
19634 GA->getOffset(), OpFlag);
19635 SDValue Offset = DAG.getNode(WrapperKind, DL, PtrVT, Result);
19636
19637 // With PIC32, the address is actually $g + Offset.
19638 if (PIC32)
19639 Offset = DAG.getNode(ISD::ADD, DL, PtrVT,
19640 DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), PtrVT),
19641 Offset);
19642
19643 // Lowering the machine isd will make sure everything is in the right
19644 // location.
19645 SDValue Chain = DAG.getEntryNode();
19646 SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
19647 Chain = DAG.getCALLSEQ_START(Chain, 0, 0, DL);
19648 SDValue Args[] = { Chain, Offset };
19649 Chain = DAG.getNode(X86ISD::TLSCALL, DL, NodeTys, Args);
19650 Chain = DAG.getCALLSEQ_END(Chain, 0, 0, Chain.getValue(1), DL);
19651
19652 // TLSCALL will be codegen'ed as call. Inform MFI that function has calls.
19653 MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
19654 MFI.setAdjustsStack(true);
19655
19656 // And our return value (tls address) is in the standard call return value
19657 // location.
19658 unsigned Reg = Subtarget.is64Bit() ? X86::RAX : X86::EAX;
19659 return DAG.getCopyFromReg(Chain, DL, Reg, PtrVT, Chain.getValue(1));
19660 }
19661
19662 if (Subtarget.isOSWindows()) {
19663 // Just use the implicit TLS architecture
19664 // Need to generate something similar to:
19665 // mov rdx, qword [gs:abs 58H]; Load pointer to ThreadLocalStorage
19666 // ; from TEB
19667 // mov ecx, dword [rel _tls_index]: Load index (from C runtime)
19668 // mov rcx, qword [rdx+rcx*8]
19669 // mov eax, .tls$:tlsvar
19670 // [rax+rcx] contains the address
19671 // Windows 64bit: gs:0x58
19672 // Windows 32bit: fs:__tls_array
19673
19674 SDLoc dl(GA);
19675 SDValue Chain = DAG.getEntryNode();
19676
19677 // Get the Thread Pointer, which is %fs:__tls_array (32-bit) or
19678 // %gs:0x58 (64-bit). On MinGW, __tls_array is not available, so directly
19679 // use its literal value of 0x2C.
19680 Value *Ptr = Constant::getNullValue(
19681 Subtarget.is64Bit() ? PointerType::get(*DAG.getContext(), X86AS::GS)
19682 : PointerType::get(*DAG.getContext(), X86AS::FS));
19683
19684 SDValue TlsArray = Subtarget.is64Bit()
19685 ? DAG.getIntPtrConstant(0x58, dl)
19686 : (Subtarget.isTargetWindowsGNU()
19687 ? DAG.getIntPtrConstant(0x2C, dl)
19688 : DAG.getExternalSymbol("_tls_array", PtrVT));
19689
19690 SDValue ThreadPointer =
19691 DAG.getLoad(PtrVT, dl, Chain, TlsArray, MachinePointerInfo(Ptr));
19692
19693 SDValue res;
19694 if (GV->getThreadLocalMode() == GlobalVariable::LocalExecTLSModel) {
19695 res = ThreadPointer;
19696 } else {
19697 // Load the _tls_index variable
19698 SDValue IDX = DAG.getExternalSymbol("_tls_index", PtrVT);
19699 if (Subtarget.is64Bit())
19700 IDX = DAG.getExtLoad(ISD::ZEXTLOAD, dl, PtrVT, Chain, IDX,
19701 MachinePointerInfo(), MVT::i32);
19702 else
19703 IDX = DAG.getLoad(PtrVT, dl, Chain, IDX, MachinePointerInfo());
19704
19705 const DataLayout &DL = DAG.getDataLayout();
19706 SDValue Scale =
19707 DAG.getConstant(Log2_64_Ceil(DL.getPointerSize()), dl, MVT::i8);
19708 IDX = DAG.getNode(ISD::SHL, dl, PtrVT, IDX, Scale);
19709
19710 res = DAG.getNode(ISD::ADD, dl, PtrVT, ThreadPointer, IDX);
19711 }
19712
19713 res = DAG.getLoad(PtrVT, dl, Chain, res, MachinePointerInfo());
19714
19715 // Get the offset of start of .tls section
19716 SDValue TGA = DAG.getTargetGlobalAddress(GA->getGlobal(), dl,
19717 GA->getValueType(0),
19718 GA->getOffset(), X86II::MO_SECREL);
19719 SDValue Offset = DAG.getNode(X86ISD::Wrapper, dl, PtrVT, TGA);
19720
19721 // The address of the thread local variable is the add of the thread
19722 // pointer with the offset of the variable.
19723 return DAG.getNode(ISD::ADD, dl, PtrVT, res, Offset);
19724 }
19725
19726 llvm_unreachable("TLS not implemented for this target.");
19727}
19728
19729 bool X86TargetLowering::addressingModeSupportsTLS(const GlobalValue &GV) const {
19730 if (Subtarget.is64Bit() && Subtarget.isTargetELF()) {
19731 const TargetMachine &TM = getTargetMachine();
19732 TLSModel::Model Model = TM.getTLSModel(&GV);
19733 switch (Model) {
19734 case TLSModel::LocalExec:
19735 case TLSModel::InitialExec:
19736 // We can include the %fs segment register in addressing modes.
19737 return true;
19738 case TLSModel::GeneralDynamic:
19739 case TLSModel::LocalDynamic:
19740 // These models do not result in %fs-relative addresses unless
19741 // TLS descriptors are used.
19742 //
19743 // Even in the case of TLS descriptors we currently have no way to model
19744 // the difference between the %fs access and the computation needed for the
19745 // offset, and returning `true` for TLS-desc currently duplicates both,
19746 // which is detrimental :-/
19747 return false;
19748 }
19749 }
19750 return false;
19751}
19752
19753/// Lower SRA_PARTS and friends, which return two i32 values
19754/// and take a 2 x i32 value to shift plus a shift amount.
19755/// TODO: Can this be moved to general expansion code?
19757 SDValue Lo, Hi;
19758 DAG.getTargetLoweringInfo().expandShiftParts(Op.getNode(), Lo, Hi, DAG);
19759 return DAG.getMergeValues({Lo, Hi}, SDLoc(Op));
19760}
19761
19762// Try to use a packed vector operation to handle i64 on 32-bit targets when
19763// AVX512DQ is enabled.
19764 static SDValue LowerI64IntToFP_AVX512DQ(SDValue Op, const SDLoc &dl,
19765 SelectionDAG &DAG,
19766 const X86Subtarget &Subtarget) {
19767 assert((Op.getOpcode() == ISD::SINT_TO_FP ||
19768 Op.getOpcode() == ISD::STRICT_SINT_TO_FP ||
19769 Op.getOpcode() == ISD::STRICT_UINT_TO_FP ||
19770 Op.getOpcode() == ISD::UINT_TO_FP) &&
19771 "Unexpected opcode!");
19772 bool IsStrict = Op->isStrictFPOpcode();
19773 unsigned OpNo = IsStrict ? 1 : 0;
19774 SDValue Src = Op.getOperand(OpNo);
19775 MVT SrcVT = Src.getSimpleValueType();
19776 MVT VT = Op.getSimpleValueType();
19777
19778 if (!Subtarget.hasDQI() || SrcVT != MVT::i64 || Subtarget.is64Bit() ||
19779 (VT != MVT::f32 && VT != MVT::f64))
19780 return SDValue();
19781
19782 // Pack the i64 into a vector, do the operation and extract.
19783
19784 // Using 256-bit to ensure result is 128-bits for f32 case.
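// For example (illustrative), with AVX512DQ+VLX on a 32-bit target:
// (f64 (sint_to_fp i64 X))
// --> (extractelt (v4f64 (sint_to_fp (v4i64 (scalar_to_vector X)))), 0)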
19785 unsigned NumElts = Subtarget.hasVLX() ? 4 : 8;
19786 MVT VecInVT = MVT::getVectorVT(MVT::i64, NumElts);
19787 MVT VecVT = MVT::getVectorVT(VT, NumElts);
19788
19789 SDValue InVec = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VecInVT, Src);
19790 if (IsStrict) {
19791 SDValue CvtVec = DAG.getNode(Op.getOpcode(), dl, {VecVT, MVT::Other},
19792 {Op.getOperand(0), InVec});
19793 SDValue Chain = CvtVec.getValue(1);
19794 SDValue Value = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, CvtVec,
19795 DAG.getVectorIdxConstant(0, dl));
19796 return DAG.getMergeValues({Value, Chain}, dl);
19797 }
19798
19799 SDValue CvtVec = DAG.getNode(Op.getOpcode(), dl, VecVT, InVec);
19800
19801 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, CvtVec,
19802 DAG.getVectorIdxConstant(0, dl));
19803}
19804
19805// Try to use a packed vector operation to handle i64 on 32-bit targets.
19806 static SDValue LowerI64IntToFP16(SDValue Op, const SDLoc &dl, SelectionDAG &DAG,
19807 const X86Subtarget &Subtarget) {
19808 assert((Op.getOpcode() == ISD::SINT_TO_FP ||
19809 Op.getOpcode() == ISD::STRICT_SINT_TO_FP ||
19810 Op.getOpcode() == ISD::STRICT_UINT_TO_FP ||
19811 Op.getOpcode() == ISD::UINT_TO_FP) &&
19812 "Unexpected opcode!");
19813 bool IsStrict = Op->isStrictFPOpcode();
19814 SDValue Src = Op.getOperand(IsStrict ? 1 : 0);
19815 MVT SrcVT = Src.getSimpleValueType();
19816 MVT VT = Op.getSimpleValueType();
19817
19818 if (SrcVT != MVT::i64 || Subtarget.is64Bit() || VT != MVT::f16)
19819 return SDValue();
19820
19821 // Pack the i64 into a vector, do the operation and extract.
19822
19823 assert(Subtarget.hasFP16() && "Expected FP16");
19824
19825 SDValue InVec = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64, Src);
19826 if (IsStrict) {
19827 SDValue CvtVec = DAG.getNode(Op.getOpcode(), dl, {MVT::v2f16, MVT::Other},
19828 {Op.getOperand(0), InVec});
19829 SDValue Chain = CvtVec.getValue(1);
19830 SDValue Value = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, CvtVec,
19831 DAG.getVectorIdxConstant(0, dl));
19832 return DAG.getMergeValues({Value, Chain}, dl);
19833 }
19834
19835 SDValue CvtVec = DAG.getNode(Op.getOpcode(), dl, MVT::v2f16, InVec);
19836
19837 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, CvtVec,
19838 DAG.getVectorIdxConstant(0, dl));
19839}
19840
19841static bool useVectorCast(unsigned Opcode, MVT FromVT, MVT ToVT,
19842 const X86Subtarget &Subtarget) {
19843 switch (Opcode) {
19844 case ISD::SINT_TO_FP:
19845 // TODO: Handle wider types with AVX/AVX512.
19846 if (!Subtarget.hasSSE2() || FromVT != MVT::v4i32)
19847 return false;
19848 // CVTDQ2PS or (V)CVTDQ2PD
19849 return ToVT == MVT::v4f32 || (Subtarget.hasAVX() && ToVT == MVT::v4f64);
19850
19851 case ISD::UINT_TO_FP:
19852 // TODO: Handle wider types and i64 elements.
19853 if (!Subtarget.hasAVX512() || FromVT != MVT::v4i32)
19854 return false;
19855 // VCVTUDQ2PS or VCVTUDQ2PD
19856 return ToVT == MVT::v4f32 || ToVT == MVT::v4f64;
19857
19858 default:
19859 return false;
19860 }
19861}
19862
19863/// Given a scalar cast operation that is extracted from a vector, try to
19864/// vectorize the cast op followed by extraction. This will avoid an expensive
19865/// round-trip between XMM and GPR.
19866 static SDValue vectorizeExtractedCast(SDValue Cast, const SDLoc &DL,
19867 SelectionDAG &DAG,
19868 const X86Subtarget &Subtarget) {
19869 // TODO: This could be enhanced to handle smaller integer types by peeking
19870 // through an extend.
19871 SDValue Extract = Cast.getOperand(0);
19872 MVT DestVT = Cast.getSimpleValueType();
19873 if (Extract.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
19874 !isa<ConstantSDNode>(Extract.getOperand(1)))
19875 return SDValue();
19876
19877 // See if we have a 128-bit vector cast op for this type of cast.
19878 SDValue VecOp = Extract.getOperand(0);
19879 MVT FromVT = VecOp.getSimpleValueType();
19880 unsigned NumEltsInXMM = 128 / FromVT.getScalarSizeInBits();
19881 MVT Vec128VT = MVT::getVectorVT(FromVT.getScalarType(), NumEltsInXMM);
19882 MVT ToVT = MVT::getVectorVT(DestVT, NumEltsInXMM);
19883 if (!useVectorCast(Cast.getOpcode(), Vec128VT, ToVT, Subtarget))
19884 return SDValue();
19885
19886 // If we are extracting from a non-zero element, first shuffle the source
19887 // vector to allow extracting from element zero.
19888 if (!isNullConstant(Extract.getOperand(1))) {
19889 SmallVector<int, 16> Mask(FromVT.getVectorNumElements(), -1);
19890 Mask[0] = Extract.getConstantOperandVal(1);
19891 VecOp = DAG.getVectorShuffle(FromVT, DL, VecOp, DAG.getUNDEF(FromVT), Mask);
19892 }
19893 // If the source vector is wider than 128-bits, extract the low part. Do not
19894 // create an unnecessarily wide vector cast op.
19895 if (FromVT != Vec128VT)
19896 VecOp = extract128BitVector(VecOp, 0, DAG, DL);
19897
19898 // cast (extelt V, 0) --> extelt (cast (extract_subv V)), 0
19899 // cast (extelt V, C) --> extelt (cast (extract_subv (shuffle V, [C...]))), 0
19900 SDValue VCast = DAG.getNode(Cast.getOpcode(), DL, ToVT, VecOp);
19901 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, DestVT, VCast,
19902 DAG.getVectorIdxConstant(0, DL));
19903}
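// Editorial example (not part of the original source), assuming SSE2 and a
// sitofp of an extracted element:
//   %e = extractelement <4 x i32> %v, i64 2
//   %f = sitofp i32 %e to float
// becomes extelt (v4f32 sint_to_fp (shuffle %v, <2,u,u,u>)), 0, i.e. one
// CVTDQ2PS plus a shuffle, avoiding the XMM -> GPR -> XMM round-trip of the
// scalar conversion.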
19904
19905/// Given a scalar cast to FP with a cast to integer operand (almost an ftrunc),
19906/// try to vectorize the cast ops. This will avoid an expensive round-trip
19907/// between XMM and GPR.
19908static SDValue lowerFPToIntToFP(SDValue CastToFP, const SDLoc &DL,
19909 SelectionDAG &DAG,
19910 const X86Subtarget &Subtarget) {
19911 // TODO: Allow FP_TO_UINT.
19912 SDValue CastToInt = CastToFP.getOperand(0);
19913 MVT VT = CastToFP.getSimpleValueType();
19914 if (CastToInt.getOpcode() != ISD::FP_TO_SINT || VT.isVector())
19915 return SDValue();
19916
19917 MVT IntVT = CastToInt.getSimpleValueType();
19918 SDValue X = CastToInt.getOperand(0);
19919 MVT SrcVT = X.getSimpleValueType();
19920 if (SrcVT != MVT::f32 && SrcVT != MVT::f64)
19921 return SDValue();
19922
19923 // See if we have 128-bit vector cast instructions for this type of cast.
19924 // We need cvttps2dq/cvttpd2dq and cvtdq2ps/cvtdq2pd.
19925 if (!Subtarget.hasSSE2() || (VT != MVT::f32 && VT != MVT::f64) ||
19926 IntVT != MVT::i32)
19927 return SDValue();
19928
19929 unsigned SrcSize = SrcVT.getSizeInBits();
19930 unsigned IntSize = IntVT.getSizeInBits();
19931 unsigned VTSize = VT.getSizeInBits();
19932 MVT VecSrcVT = MVT::getVectorVT(SrcVT, 128 / SrcSize);
19933 MVT VecIntVT = MVT::getVectorVT(IntVT, 128 / IntSize);
19934 MVT VecVT = MVT::getVectorVT(VT, 128 / VTSize);
19935
19936 // We need target-specific opcodes if this is v2f64 -> v4i32 -> v2f64.
19937 unsigned ToIntOpcode =
19938 SrcSize != IntSize ? X86ISD::CVTTP2SI : (unsigned)ISD::FP_TO_SINT;
19939 unsigned ToFPOpcode =
19940 IntSize != VTSize ? X86ISD::CVTSI2P : (unsigned)ISD::SINT_TO_FP;
19941
19942 // sint_to_fp (fp_to_sint X) --> extelt (sint_to_fp (fp_to_sint (s2v X))), 0
19943 //
19944 // We are not defining the high elements (for example, zeroing them) because
19945 // that could nullify any performance advantage that we hoped to gain from
19946 // this vector op hack. We do not expect any adverse effects (like denorm
19947 // penalties) with cast ops.
19948 SDValue ZeroIdx = DAG.getVectorIdxConstant(0, DL);
19949 SDValue VecX = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecSrcVT, X);
19950 SDValue VCastToInt = DAG.getNode(ToIntOpcode, DL, VecIntVT, VecX);
19951 SDValue VCastToFP = DAG.getNode(ToFPOpcode, DL, VecVT, VCastToInt);
19952 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, VCastToFP, ZeroIdx);
19953}
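// Editorial example (not part of the original source): for the scalar pattern
//   %i = fptosi double %x to i32
//   %f = sitofp i32 %i to double
// the code above emits CVTTP2SI (v2f64 -> v4i32) followed by CVTSI2P
// (v4i32 -> v2f64) and extracts lane 0, which typically selects to
//   cvttpd2dq %xmm0, %xmm0
//   cvtdq2pd  %xmm0, %xmm0
// with no GPR round-trip.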
19954
19955static SDValue lowerINT_TO_FP_vXi64(SDValue Op, const SDLoc &DL,
19956 SelectionDAG &DAG,
19957 const X86Subtarget &Subtarget) {
19958 bool IsStrict = Op->isStrictFPOpcode();
19959 MVT VT = Op->getSimpleValueType(0);
19960 SDValue Src = Op->getOperand(IsStrict ? 1 : 0);
19961
19962 if (Subtarget.hasDQI()) {
19963 assert(!Subtarget.hasVLX() && "Unexpected features");
19964
19965 assert((Src.getSimpleValueType() == MVT::v2i64 ||
19966 Src.getSimpleValueType() == MVT::v4i64) &&
19967 "Unsupported custom type");
19968
19969 // With AVX512DQ, but not VLX we need to widen to get a 512-bit result type.
19970 assert((VT == MVT::v4f32 || VT == MVT::v2f64 || VT == MVT::v4f64) &&
19971 "Unexpected VT!");
19972 MVT WideVT = VT == MVT::v4f32 ? MVT::v8f32 : MVT::v8f64;
19973
19974 // Need to concat with zero vector for strict fp to avoid spurious
19975 // exceptions.
19976 SDValue Tmp = IsStrict ? DAG.getConstant(0, DL, MVT::v8i64)
19977 : DAG.getUNDEF(MVT::v8i64);
19978 Src = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, MVT::v8i64, Tmp, Src,
19979 DAG.getVectorIdxConstant(0, DL));
19980 SDValue Res, Chain;
19981 if (IsStrict) {
19982 Res = DAG.getNode(Op.getOpcode(), DL, {WideVT, MVT::Other},
19983 {Op->getOperand(0), Src});
19984 Chain = Res.getValue(1);
19985 } else {
19986 Res = DAG.getNode(Op.getOpcode(), DL, WideVT, Src);
19987 }
19988
19989 Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Res,
19990 DAG.getVectorIdxConstant(0, DL));
19991
19992 if (IsStrict)
19993 return DAG.getMergeValues({Res, Chain}, DL);
19994 return Res;
19995 }
19996
19997 bool IsSigned = Op->getOpcode() == ISD::SINT_TO_FP ||
19998 Op->getOpcode() == ISD::STRICT_SINT_TO_FP;
19999 if (VT != MVT::v4f32 || IsSigned)
20000 return SDValue();
20001
20002 SDValue Zero = DAG.getConstant(0, DL, MVT::v4i64);
20003 SDValue One = DAG.getConstant(1, DL, MVT::v4i64);
20004 SDValue Sign = DAG.getNode(ISD::OR, DL, MVT::v4i64,
20005 DAG.getNode(ISD::SRL, DL, MVT::v4i64, Src, One),
20006 DAG.getNode(ISD::AND, DL, MVT::v4i64, Src, One));
20007 SDValue IsNeg = DAG.getSetCC(DL, MVT::v4i64, Src, Zero, ISD::SETLT);
20008 SDValue SignSrc = DAG.getSelect(DL, MVT::v4i64, IsNeg, Sign, Src);
20009 SmallVector<SDValue, 4> SignCvts(4);
20010 SmallVector<SDValue, 4> Chains(4);
20011 for (int i = 0; i != 4; ++i) {
20012 SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i64, SignSrc,
20013 DAG.getVectorIdxConstant(i, DL));
20014 if (IsStrict) {
20015 SignCvts[i] =
20016 DAG.getNode(ISD::STRICT_SINT_TO_FP, DL, {MVT::f32, MVT::Other},
20017 {Op.getOperand(0), Elt});
20018 Chains[i] = SignCvts[i].getValue(1);
20019 } else {
20020 SignCvts[i] = DAG.getNode(ISD::SINT_TO_FP, DL, MVT::f32, Elt);
20021 }
20022 }
20023 SDValue SignCvt = DAG.getBuildVector(VT, DL, SignCvts);
20024
20025 SDValue Slow, Chain;
20026 if (IsStrict) {
20027 Chain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Chains);
20028 Slow = DAG.getNode(ISD::STRICT_FADD, DL, {MVT::v4f32, MVT::Other},
20029 {Chain, SignCvt, SignCvt});
20030 Chain = Slow.getValue(1);
20031 } else {
20032 Slow = DAG.getNode(ISD::FADD, DL, MVT::v4f32, SignCvt, SignCvt);
20033 }
20034
20035 IsNeg = DAG.getNode(ISD::TRUNCATE, DL, MVT::v4i32, IsNeg);
20036 SDValue Cvt = DAG.getSelect(DL, MVT::v4f32, IsNeg, Slow, SignCvt);
20037
20038 if (IsStrict)
20039 return DAG.getMergeValues({Cvt, Chain}, DL);
20040
20041 return Cvt;
20042}
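// Editorial note (not part of the original source) on the unsigned v4i64 path
// above: elements with the top bit set are first halved with a sticky low
// bit, SignSrc = (Src >> 1) | (Src & 1), converted as *signed* i64 (now
// non-negative), and then doubled with FADD; the sticky bit preserves correct
// rounding to f32. Elements below 2^63 take the direct SINT_TO_FP through the
// final select.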
20043
20044static SDValue promoteXINT_TO_FP(SDValue Op, const SDLoc &dl,
20045 SelectionDAG &DAG) {
20046 bool IsStrict = Op->isStrictFPOpcode();
20047 SDValue Src = Op.getOperand(IsStrict ? 1 : 0);
20048 SDValue Chain = IsStrict ? Op->getOperand(0) : DAG.getEntryNode();
20049 MVT VT = Op.getSimpleValueType();
20050 MVT NVT = VT.isVector() ? VT.changeVectorElementType(MVT::f32) : MVT::f32;
20051
20052 SDValue Rnd = DAG.getIntPtrConstant(0, dl, /*isTarget=*/true);
20053 if (IsStrict)
20054 return DAG.getNode(
20055 ISD::STRICT_FP_ROUND, dl, {VT, MVT::Other},
20056 {Chain,
20057 DAG.getNode(Op.getOpcode(), dl, {NVT, MVT::Other}, {Chain, Src}),
20058 Rnd});
20059 return DAG.getNode(ISD::FP_ROUND, dl, VT,
20060 DAG.getNode(Op.getOpcode(), dl, NVT, Src), Rnd);
20061}
20062
20063static bool isLegalConversion(MVT VT, MVT FloatVT, bool IsSigned,
20064 const X86Subtarget &Subtarget) {
20065 if (FloatVT.getScalarType() != MVT::f16 || Subtarget.hasVLX()) {
20066 if (VT == MVT::v4i32 && Subtarget.hasSSE2() && IsSigned)
20067 return true;
20068 if (VT == MVT::v8i32 && Subtarget.hasAVX() && IsSigned)
20069 return true;
20070 }
20071 if (Subtarget.hasVLX() && (VT == MVT::v4i32 || VT == MVT::v8i32))
20072 return true;
20073 if (Subtarget.useAVX512Regs()) {
20074 if (VT == MVT::v16i32)
20075 return true;
20076 if (VT == MVT::v8i64 && FloatVT == MVT::v8f16 && Subtarget.hasFP16())
20077 return true;
20078 if (VT == MVT::v8i64 && Subtarget.hasDQI())
20079 return true;
20080 }
20081 if (Subtarget.hasDQI() && Subtarget.hasVLX() &&
20082 (VT == MVT::v2i64 || VT == MVT::v4i64))
20083 return true;
20084 return false;
20085}
20086
20087SDValue X86TargetLowering::LowerSINT_TO_FP(SDValue Op,
20088 SelectionDAG &DAG) const {
20089 bool IsStrict = Op->isStrictFPOpcode();
20090 unsigned OpNo = IsStrict ? 1 : 0;
20091 SDValue Src = Op.getOperand(OpNo);
20092 SDValue Chain = IsStrict ? Op->getOperand(0) : DAG.getEntryNode();
20093 MVT SrcVT = Src.getSimpleValueType();
20094 MVT VT = Op.getSimpleValueType();
20095 SDLoc dl(Op);
20096
20097 if (isSoftF16(VT, Subtarget))
20098 return promoteXINT_TO_FP(Op, dl, DAG);
20099 else if (isLegalConversion(SrcVT, VT, true, Subtarget))
20100 return Op;
20101
20102 if (Subtarget.isTargetWin64() && SrcVT == MVT::i128)
20103 return LowerWin64_INT128_TO_FP(Op, DAG);
20104
20105 if (SDValue Extract = vectorizeExtractedCast(Op, dl, DAG, Subtarget))
20106 return Extract;
20107
20108 if (SDValue R = lowerFPToIntToFP(Op, dl, DAG, Subtarget))
20109 return R;
20110
20111 if (SrcVT.isVector()) {
20112 if (SrcVT == MVT::v2i32 && VT == MVT::v2f64) {
20113 // Note: Since v2f64 is a legal type, we don't need to zero extend the
20114 // source for strict FP.
20115 if (IsStrict)
20116 return DAG.getNode(
20117 X86ISD::STRICT_CVTSI2P, dl, {VT, MVT::Other},
20118 {Chain, DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4i32, Src,
20119 DAG.getUNDEF(SrcVT))});
20120 return DAG.getNode(X86ISD::CVTSI2P, dl, VT,
20121 DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4i32, Src,
20122 DAG.getUNDEF(SrcVT)));
20123 }
20124 if (SrcVT == MVT::v2i64 || SrcVT == MVT::v4i64)
20125 return lowerINT_TO_FP_vXi64(Op, dl, DAG, Subtarget);
20126
20127 return SDValue();
20128 }
20129
20130 assert(SrcVT <= MVT::i64 && SrcVT >= MVT::i16 &&
20131 "Unknown SINT_TO_FP to lower!");
20132
20133 bool UseSSEReg = isScalarFPTypeInSSEReg(VT);
20134
20135 // These are really Legal; return the operand so the caller accepts it as
20136 // Legal.
20137 if (SrcVT == MVT::i32 && UseSSEReg)
20138 return Op;
20139 if (SrcVT == MVT::i64 && UseSSEReg && Subtarget.is64Bit())
20140 return Op;
20141
20142 if (SDValue V = LowerI64IntToFP_AVX512DQ(Op, dl, DAG, Subtarget))
20143 return V;
20144 if (SDValue V = LowerI64IntToFP16(Op, dl, DAG, Subtarget))
20145 return V;
20146
20147 // SSE doesn't have an i16 conversion so we need to promote.
20148 if (SrcVT == MVT::i16 && (UseSSEReg || VT == MVT::f128)) {
20149 SDValue Ext = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::i32, Src);
20150 if (IsStrict)
20151 return DAG.getNode(ISD::STRICT_SINT_TO_FP, dl, {VT, MVT::Other},
20152 {Chain, Ext});
20153
20154 return DAG.getNode(ISD::SINT_TO_FP, dl, VT, Ext);
20155 }
20156
20157 if (VT == MVT::f128 || !Subtarget.hasX87())
20158 return SDValue();
20159
20160 SDValue ValueToStore = Src;
20161 if (SrcVT == MVT::i64 && Subtarget.hasSSE2() && !Subtarget.is64Bit())
20162 // Bitcasting to f64 here allows us to do a single 64-bit store from
20163 // an SSE register, avoiding the store forwarding penalty that would come
20164 // with two 32-bit stores.
20165 ValueToStore = DAG.getBitcast(MVT::f64, ValueToStore);
20166
20167 unsigned Size = SrcVT.getStoreSize();
20168 Align Alignment(Size);
20169 MachineFunction &MF = DAG.getMachineFunction();
20170 auto PtrVT = getPointerTy(MF.getDataLayout());
20171 int SSFI = MF.getFrameInfo().CreateStackObject(Size, Alignment, false);
20172 MachinePointerInfo MPI =
20173 MachinePointerInfo::getFixedStack(MF, SSFI);
20174 SDValue StackSlot = DAG.getFrameIndex(SSFI, PtrVT);
20175 Chain = DAG.getStore(Chain, dl, ValueToStore, StackSlot, MPI, Alignment);
20176 std::pair<SDValue, SDValue> Tmp =
20177 BuildFILD(VT, SrcVT, dl, Chain, StackSlot, MPI, Alignment, DAG);
20178
20179 if (IsStrict)
20180 return DAG.getMergeValues({Tmp.first, Tmp.second}, dl);
20181
20182 return Tmp.first;
20183}
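// Editorial sketch (not part of the original source) of the x87 fallback at
// the end of LowerSINT_TO_FP, e.g. for i64 -> f64 on a 32-bit target with
// SSE2, roughly:
//   movq   %xmm0, (%esp)   ; one 64-bit store of the bitcast i64
//   fildll (%esp)          ; X86ISD::FILD from the stack slot
// BuildFILD then stores and reloads through another stack slot if the result
// needs to end up in an SSE register.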
20184
20185std::pair<SDValue, SDValue> X86TargetLowering::BuildFILD(
20186 EVT DstVT, EVT SrcVT, const SDLoc &DL, SDValue Chain, SDValue Pointer,
20187 MachinePointerInfo PtrInfo, Align Alignment, SelectionDAG &DAG) const {
20188 // Build the FILD
20189 SDVTList Tys;
20190 bool useSSE = isScalarFPTypeInSSEReg(DstVT);
20191 if (useSSE)
20192 Tys = DAG.getVTList(MVT::f80, MVT::Other);
20193 else
20194 Tys = DAG.getVTList(DstVT, MVT::Other);
20195
20196 SDValue FILDOps[] = {Chain, Pointer};
20197 SDValue Result =
20198 DAG.getMemIntrinsicNode(X86ISD::FILD, DL, Tys, FILDOps, SrcVT, PtrInfo,
20199 Alignment, MachineMemOperand::MOLoad);
20200 Chain = Result.getValue(1);
20201
20202 if (useSSE) {
20203 MachineFunction &MF = DAG.getMachineFunction();
20204 unsigned SSFISize = DstVT.getStoreSize();
20205 int SSFI =
20206 MF.getFrameInfo().CreateStackObject(SSFISize, Align(SSFISize), false);
20207 auto PtrVT = getPointerTy(MF.getDataLayout());
20208 SDValue StackSlot = DAG.getFrameIndex(SSFI, PtrVT);
20209 Tys = DAG.getVTList(MVT::Other);
20210 SDValue FSTOps[] = {Chain, Result, StackSlot};
20211 MachineMemOperand *StoreMMO = MF.getMachineMemOperand(
20212 MachinePointerInfo::getFixedStack(MF, SSFI),
20213 MachineMemOperand::MOStore, SSFISize, Align(SSFISize));
20214
20215 Chain =
20216 DAG.getMemIntrinsicNode(X86ISD::FST, DL, Tys, FSTOps, DstVT, StoreMMO);
20217 Result = DAG.getLoad(
20218 DstVT, DL, Chain, StackSlot,
20219 MachinePointerInfo::getFixedStack(MF, SSFI));
20220 Chain = Result.getValue(1);
20221 }
20222
20223 return { Result, Chain };
20224}
20225
20226/// Horizontal vector math instructions may be slower than normal math with
20227/// shuffles. Limit horizontal op codegen based on size/speed trade-offs, uarch
20228/// implementation, and likely shuffle complexity of the alternate sequence.
20229static bool shouldUseHorizontalOp(bool IsSingleSource, SelectionDAG &DAG,
20230 const X86Subtarget &Subtarget) {
20231 bool IsOptimizingSize = DAG.shouldOptForSize();
20232 bool HasFastHOps = Subtarget.hasFastHorizontalOps();
20233 return !IsSingleSource || IsOptimizingSize || HasFastHOps;
20234}
20235
20236/// 64-bit unsigned integer to double expansion.
20237static SDValue LowerUINT_TO_FP_i64(SDValue Op, const SDLoc &dl,
20238 SelectionDAG &DAG,
20239 const X86Subtarget &Subtarget) {
20240 // We can't use this algorithm for strict fp. It produces -0.0 instead of +0.0
20241 // when converting 0 when rounding toward negative infinity. Caller will
20242 // fall back to Expand when i64 is legal, or use FILD in 32-bit mode.
20243 assert(!Op->isStrictFPOpcode() && "Expected non-strict uint_to_fp!");
20244 // This algorithm is not obvious. Here it is what we're trying to output:
20245 /*
20246 movq %rax, %xmm0
20247 punpckldq (c0), %xmm0 // c0: (uint4){ 0x43300000U, 0x45300000U, 0U, 0U }
20248 subpd (c1), %xmm0 // c1: (double2){ 0x1.0p52, 0x1.0p52 * 0x1.0p32 }
20249 #ifdef __SSE3__
20250 haddpd %xmm0, %xmm0
20251 #else
20252 pshufd $0x4e, %xmm0, %xmm1
20253 addpd %xmm1, %xmm0
20254 #endif
20255 */
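// Editorial worked example (not part of the original source): for
// x = 2^33 + 7 the punpckldq above forms the two doubles
//   d0 = bits 0x43300000:lo32(x) = 2^52 + 7
//   d1 = bits 0x45300000:hi32(x) = 2^84 + 2^33
// subtracting c1 = { 2^52, 2^84 } leaves { 7, 2^33 }, and the horizontal add
// (haddpd, or pshufd + addpd) yields 2^33 + 7 == (double)x.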
20256
20257 LLVMContext *Context = DAG.getContext();
20258
20259 // Build some magic constants.
20260 static const uint32_t CV0[] = { 0x43300000, 0x45300000, 0, 0 };
20261 Constant *C0 = ConstantDataVector::get(*Context, CV0);
20262 auto PtrVT = DAG.getTargetLoweringInfo().getPointerTy(DAG.getDataLayout());
20263 SDValue CPIdx0 = DAG.getConstantPool(C0, PtrVT, Align(16));
20264
20265 SmallVector<Constant *, 2> CV1;
20266 CV1.push_back(
20267 ConstantFP::get(*Context, APFloat(APFloat::IEEEdouble(),
20268 APInt(64, 0x4330000000000000ULL))));
20269 CV1.push_back(
20270 ConstantFP::get(*Context, APFloat(APFloat::IEEEdouble(),
20271 APInt(64, 0x4530000000000000ULL))));
20272 Constant *C1 = ConstantVector::get(CV1);
20273 SDValue CPIdx1 = DAG.getConstantPool(C1, PtrVT, Align(16));
20274
20275 // Load the 64-bit value into an XMM register.
20276 SDValue XR1 =
20277 DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64, Op.getOperand(0));
20278 SDValue CLod0 = DAG.getLoad(
20279 MVT::v4i32, dl, DAG.getEntryNode(), CPIdx0,
20280 MachinePointerInfo::getConstantPool(DAG.getMachineFunction()), Align(16));
20281 SDValue Unpck1 =
20282 getUnpackl(DAG, dl, MVT::v4i32, DAG.getBitcast(MVT::v4i32, XR1), CLod0);
20283
20284 SDValue CLod1 = DAG.getLoad(
20285 MVT::v2f64, dl, CLod0.getValue(1), CPIdx1,
20286 MachinePointerInfo::getConstantPool(DAG.getMachineFunction()), Align(16));
20287 SDValue XR2F = DAG.getBitcast(MVT::v2f64, Unpck1);
20288 // TODO: Are there any fast-math-flags to propagate here?
20289 SDValue Sub = DAG.getNode(ISD::FSUB, dl, MVT::v2f64, XR2F, CLod1);
20290 SDValue Result;
20291
20292 if (Subtarget.hasSSE3() &&
20293 shouldUseHorizontalOp(true, DAG, Subtarget)) {
20294 Result = DAG.getNode(X86ISD::FHADD, dl, MVT::v2f64, Sub, Sub);
20295 } else {
20296 SDValue Shuffle = DAG.getVectorShuffle(MVT::v2f64, dl, Sub, Sub, {1,-1});
20297 Result = DAG.getNode(ISD::FADD, dl, MVT::v2f64, Shuffle, Sub);
20298 }
20299 Result = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64, Result,
20300 DAG.getVectorIdxConstant(0, dl));
20301 return Result;
20302}
20303
20304/// 32-bit unsigned integer to float expansion.
20305static SDValue LowerUINT_TO_FP_i32(SDValue Op, const SDLoc &dl,
20306 SelectionDAG &DAG,
20307 const X86Subtarget &Subtarget) {
20308 unsigned OpNo = Op.getNode()->isStrictFPOpcode() ? 1 : 0;
20309 // FP constant to bias correct the final result.
20310 SDValue Bias = DAG.getConstantFP(
20311 llvm::bit_cast<double>(0x4330000000000000ULL), dl, MVT::f64);
20312
20313 // Load the 32-bit value into an XMM register.
20314 SDValue Load =
20315 DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, Op.getOperand(OpNo));
20316
20317 // Zero out the upper parts of the register.
20318 Load = getShuffleVectorZeroOrUndef(Load, 0, true, Subtarget, DAG);
20319
20320 // Or the load with the bias.
20321 SDValue Or = DAG.getNode(
20322 ISD::OR, dl, MVT::v2i64,
20323 DAG.getBitcast(MVT::v2i64, Load),
20324 DAG.getBitcast(MVT::v2i64,
20325 DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2f64, Bias)));
20326 Or = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64,
20327 DAG.getBitcast(MVT::v2f64, Or),
20328 DAG.getVectorIdxConstant(0, dl));
20329
20330 if (Op.getNode()->isStrictFPOpcode()) {
20331 // Subtract the bias.
20332 // TODO: Are there any fast-math-flags to propagate here?
20333 SDValue Chain = Op.getOperand(0);
20334 SDValue Sub = DAG.getNode(ISD::STRICT_FSUB, dl, {MVT::f64, MVT::Other},
20335 {Chain, Or, Bias});
20336
20337 if (Op.getValueType() == Sub.getValueType())
20338 return Sub;
20339
20340 // Handle final rounding.
20341 std::pair<SDValue, SDValue> ResultPair = DAG.getStrictFPExtendOrRound(
20342 Sub, Sub.getValue(1), dl, Op.getSimpleValueType());
20343
20344 return DAG.getMergeValues({ResultPair.first, ResultPair.second}, dl);
20345 }
20346
20347 // Subtract the bias.
20348 // TODO: Are there any fast-math-flags to propagate here?
20349 SDValue Sub = DAG.getNode(ISD::FSUB, dl, MVT::f64, Or, Bias);
20350
20351 // Handle final rounding.
20352 return DAG.getFPExtendOrRound(Sub, dl, Op.getSimpleValueType());
20353}
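// Editorial note (not part of the original source): the OR above produces the
// double with bit pattern 0x43300000_xxxxxxxx, whose value is exactly
// 2^52 + (double)x for any 32-bit unsigned x, so subtracting the 2^52 bias
// recovers (double)x exactly; the trailing FPExtendOrRound then adjusts to the
// requested result type.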
20354
20355static SDValue lowerUINT_TO_FP_v2i32(SDValue Op, const SDLoc &DL,
20356 SelectionDAG &DAG,
20357 const X86Subtarget &Subtarget) {
20358 if (Op.getSimpleValueType() != MVT::v2f64)
20359 return SDValue();
20360
20361 bool IsStrict = Op->isStrictFPOpcode();
20362
20363 SDValue N0 = Op.getOperand(IsStrict ? 1 : 0);
20364 assert(N0.getSimpleValueType() == MVT::v2i32 && "Unexpected input type");
20365
20366 if (Subtarget.hasAVX512()) {
20367 if (!Subtarget.hasVLX()) {
20368 // Let generic type legalization widen this.
20369 if (!IsStrict)
20370 return SDValue();
20371 // Otherwise pad the integer input with 0s and widen the operation.
20372 N0 = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v4i32, N0,
20373 DAG.getConstant(0, DL, MVT::v2i32));
20374 SDValue Res = DAG.getNode(Op->getOpcode(), DL, {MVT::v4f64, MVT::Other},
20375 {Op.getOperand(0), N0});
20376 SDValue Chain = Res.getValue(1);
20377 Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v2f64, Res,
20378 DAG.getVectorIdxConstant(0, DL));
20379 return DAG.getMergeValues({Res, Chain}, DL);
20380 }
20381
20382 // Legalize to v4i32 type.
20383 N0 = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v4i32, N0,
20384 DAG.getUNDEF(MVT::v2i32));
20385 if (IsStrict)
20386 return DAG.getNode(X86ISD::STRICT_CVTUI2P, DL, {MVT::v2f64, MVT::Other},
20387 {Op.getOperand(0), N0});
20388 return DAG.getNode(X86ISD::CVTUI2P, DL, MVT::v2f64, N0);
20389 }
20390
20391 // Zero extend to 2i64, OR with the floating point representation of 2^52.
20392 // This gives us the floating point equivalent of 2^52 + the i32 integer
20393 // since double has 52-bits of mantissa. Then subtract 2^52 in floating
20394 // point leaving just our i32 integers in double format.
20395 SDValue ZExtIn = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::v2i64, N0);
20396 SDValue VBias = DAG.getConstantFP(
20397 llvm::bit_cast<double>(0x4330000000000000ULL), DL, MVT::v2f64);
20398 SDValue Or = DAG.getNode(ISD::OR, DL, MVT::v2i64, ZExtIn,
20399 DAG.getBitcast(MVT::v2i64, VBias));
20400 Or = DAG.getBitcast(MVT::v2f64, Or);
20401
20402 if (IsStrict)
20403 return DAG.getNode(ISD::STRICT_FSUB, DL, {MVT::v2f64, MVT::Other},
20404 {Op.getOperand(0), Or, VBias});
20405 return DAG.getNode(ISD::FSUB, DL, MVT::v2f64, Or, VBias);
20406}
20407
20408static SDValue lowerUINT_TO_FP_vXi32(SDValue Op, const SDLoc &DL,
20409 SelectionDAG &DAG,
20410 const X86Subtarget &Subtarget) {
20411 bool IsStrict = Op->isStrictFPOpcode();
20412 SDValue V = Op->getOperand(IsStrict ? 1 : 0);
20413 MVT VecIntVT = V.getSimpleValueType();
20414 assert((VecIntVT == MVT::v4i32 || VecIntVT == MVT::v8i32) &&
20415 "Unsupported custom type");
20416
20417 if (Subtarget.hasAVX512()) {
20418 // With AVX512, but not VLX we need to widen to get a 512-bit result type.
20419 assert(!Subtarget.hasVLX() && "Unexpected features");
20420 MVT VT = Op->getSimpleValueType(0);
20421
20422 // v8i32->v8f64 is legal with AVX512 so just return it.
20423 if (VT == MVT::v8f64)
20424 return Op;
20425
20426 assert((VT == MVT::v4f32 || VT == MVT::v8f32 || VT == MVT::v4f64 ||
20427 VT == MVT::v8f16) &&
20428 "Unexpected VT!");
20429 MVT WideVT = VT == MVT::v8f16 ? MVT::v16f16 : MVT::v16f32;
20430 MVT WideIntVT = MVT::v16i32;
20431 if (VT == MVT::v4f64) {
20432 WideVT = MVT::v8f64;
20433 WideIntVT = MVT::v8i32;
20434 }
20435
20436 // Need to concat with zero vector for strict fp to avoid spurious
20437 // exceptions.
20438 SDValue Tmp =
20439 IsStrict ? DAG.getConstant(0, DL, WideIntVT) : DAG.getUNDEF(WideIntVT);
20440 V = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, WideIntVT, Tmp, V,
20441 DAG.getVectorIdxConstant(0, DL));
20442 SDValue Res, Chain;
20443 if (IsStrict) {
20444 Res = DAG.getNode(ISD::STRICT_UINT_TO_FP, DL, {WideVT, MVT::Other},
20445 {Op->getOperand(0), V});
20446 Chain = Res.getValue(1);
20447 } else {
20448 Res = DAG.getNode(ISD::UINT_TO_FP, DL, WideVT, V);
20449 }
20450
20451 Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Res,
20452 DAG.getVectorIdxConstant(0, DL));
20453
20454 if (IsStrict)
20455 return DAG.getMergeValues({Res, Chain}, DL);
20456 return Res;
20457 }
20458
20459 if (Subtarget.hasAVX() && VecIntVT == MVT::v4i32 &&
20460 Op->getSimpleValueType(0) == MVT::v4f64) {
20461 SDValue ZExtIn = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::v4i64, V);
20462 Constant *Bias = ConstantFP::get(
20463 *DAG.getContext(),
20464 APFloat(APFloat::IEEEdouble(), APInt(64, 0x4330000000000000ULL)));
20465 auto PtrVT = DAG.getTargetLoweringInfo().getPointerTy(DAG.getDataLayout());
20466 SDValue CPIdx = DAG.getConstantPool(Bias, PtrVT, Align(8));
20467 SDVTList Tys = DAG.getVTList(MVT::v4f64, MVT::Other);
20468 SDValue Ops[] = {DAG.getEntryNode(), CPIdx};
20469 SDValue VBias = DAG.getMemIntrinsicNode(
20470 X86ISD::VBROADCAST_LOAD, DL, Tys, Ops, MVT::f64,
20471 MachinePointerInfo::getConstantPool(DAG.getMachineFunction()),
20472 /*Alignment*/ 8, MachineMemOperand::MOLoad);
20473
20474 SDValue Or = DAG.getNode(ISD::OR, DL, MVT::v4i64, ZExtIn,
20475 DAG.getBitcast(MVT::v4i64, VBias));
20476 Or = DAG.getBitcast(MVT::v4f64, Or);
20477
20478 if (IsStrict)
20479 return DAG.getNode(ISD::STRICT_FSUB, DL, {MVT::v4f64, MVT::Other},
20480 {Op.getOperand(0), Or, VBias});
20481 return DAG.getNode(ISD::FSUB, DL, MVT::v4f64, Or, VBias);
20482 }
20483
20484 // The algorithm is the following:
20485 // #ifdef __SSE4_1__
20486 // uint4 lo = _mm_blend_epi16( v, (uint4) 0x4b000000, 0xaa);
20487 // uint4 hi = _mm_blend_epi16( _mm_srli_epi32(v,16),
20488 // (uint4) 0x53000000, 0xaa);
20489 // #else
20490 // uint4 lo = (v & (uint4) 0xffff) | (uint4) 0x4b000000;
20491 // uint4 hi = (v >> 16) | (uint4) 0x53000000;
20492 // #endif
20493 // float4 fhi = (float4) hi - (0x1.0p39f + 0x1.0p23f);
20494 // return (float4) lo + fhi;
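// Editorial worked example (not part of the original source): for an element
// v = 0x12345678,
//   lo  = bits 0x4b005678 = 2^23 + 0x5678
//   hi  = bits 0x53001234 = 2^39 + 0x1234 * 2^16
//   fhi = hi - (2^39 + 2^23) = 0x1234 * 2^16 - 2^23        (exact)
//   lo + fhi = 0x5678 + 0x1234 * 2^16 = (float)v, with the final add being
//   the only inexact (rounding) step.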
20495
20496 bool Is128 = VecIntVT == MVT::v4i32;
20497 MVT VecFloatVT = Is128 ? MVT::v4f32 : MVT::v8f32;
20498 // If we convert to something other than the supported type, e.g., to v4f64,
20499 // abort early.
20500 if (VecFloatVT != Op->getSimpleValueType(0))
20501 return SDValue();
20502
20503 // In the #ifdef/#else code, we have in common:
20504 // - The vector of constants:
20505 // -- 0x4b000000
20506 // -- 0x53000000
20507 // - A shift:
20508 // -- v >> 16
20509
20510 // Create the splat vector for 0x4b000000.
20511 SDValue VecCstLow = DAG.getConstant(0x4b000000, DL, VecIntVT);
20512 // Create the splat vector for 0x53000000.
20513 SDValue VecCstHigh = DAG.getConstant(0x53000000, DL, VecIntVT);
20514
20515 // Create the right shift.
20516 SDValue VecCstShift = DAG.getConstant(16, DL, VecIntVT);
20517 SDValue HighShift = DAG.getNode(ISD::SRL, DL, VecIntVT, V, VecCstShift);
20518
20519 SDValue Low, High;
20520 if (Subtarget.hasSSE41()) {
20521 MVT VecI16VT = Is128 ? MVT::v8i16 : MVT::v16i16;
20522 // uint4 lo = _mm_blend_epi16( v, (uint4) 0x4b000000, 0xaa);
20523 SDValue VecCstLowBitcast = DAG.getBitcast(VecI16VT, VecCstLow);
20524 SDValue VecBitcast = DAG.getBitcast(VecI16VT, V);
20525 // Low will be bitcasted right away, so do not bother bitcasting back to its
20526 // original type.
20527 Low = DAG.getNode(X86ISD::BLENDI, DL, VecI16VT, VecBitcast,
20528 VecCstLowBitcast, DAG.getTargetConstant(0xaa, DL, MVT::i8));
20529 // uint4 hi = _mm_blend_epi16( _mm_srli_epi32(v,16),
20530 // (uint4) 0x53000000, 0xaa);
20531 SDValue VecCstHighBitcast = DAG.getBitcast(VecI16VT, VecCstHigh);
20532 SDValue VecShiftBitcast = DAG.getBitcast(VecI16VT, HighShift);
20533 // High will be bitcasted right away, so do not bother bitcasting back to
20534 // its original type.
20535 High = DAG.getNode(X86ISD::BLENDI, DL, VecI16VT, VecShiftBitcast,
20536 VecCstHighBitcast, DAG.getTargetConstant(0xaa, DL, MVT::i8));
20537 } else {
20538 SDValue VecCstMask = DAG.getConstant(0xffff, DL, VecIntVT);
20539 // uint4 lo = (v & (uint4) 0xffff) | (uint4) 0x4b000000;
20540 SDValue LowAnd = DAG.getNode(ISD::AND, DL, VecIntVT, V, VecCstMask);
20541 Low = DAG.getNode(ISD::OR, DL, VecIntVT, LowAnd, VecCstLow);
20542
20543 // uint4 hi = (v >> 16) | (uint4) 0x53000000;
20544 High = DAG.getNode(ISD::OR, DL, VecIntVT, HighShift, VecCstHigh);
20545 }
20546
20547 // Create the vector constant for (0x1.0p39f + 0x1.0p23f).
20548 SDValue VecCstFSub = DAG.getConstantFP(
20549 APFloat(APFloat::IEEEsingle(), APInt(32, 0x53000080)), DL, VecFloatVT);
20550
20551 // float4 fhi = (float4) hi - (0x1.0p39f + 0x1.0p23f);
20552 // NOTE: By using fsub of a positive constant instead of fadd of a negative
20553 // constant, we avoid reassociation in MachineCombiner when unsafe-fp-math is
20554 // enabled. See PR24512.
20555 SDValue HighBitcast = DAG.getBitcast(VecFloatVT, High);
20556 // TODO: Are there any fast-math-flags to propagate here?
20557 // (float4) lo;
20558 SDValue LowBitcast = DAG.getBitcast(VecFloatVT, Low);
20559 // return (float4) lo + fhi;
20560 if (IsStrict) {
20561 SDValue FHigh = DAG.getNode(ISD::STRICT_FSUB, DL, {VecFloatVT, MVT::Other},
20562 {Op.getOperand(0), HighBitcast, VecCstFSub});
20563 return DAG.getNode(ISD::STRICT_FADD, DL, {VecFloatVT, MVT::Other},
20564 {FHigh.getValue(1), LowBitcast, FHigh});
20565 }
20566
20567 SDValue FHigh =
20568 DAG.getNode(ISD::FSUB, DL, VecFloatVT, HighBitcast, VecCstFSub);
20569 return DAG.getNode(ISD::FADD, DL, VecFloatVT, LowBitcast, FHigh);
20570}
20571
20572static SDValue lowerUINT_TO_FP_vec(SDValue Op, const SDLoc &dl, SelectionDAG &DAG,
20573 const X86Subtarget &Subtarget) {
20574 unsigned OpNo = Op.getNode()->isStrictFPOpcode() ? 1 : 0;
20575 SDValue N0 = Op.getOperand(OpNo);
20576 MVT SrcVT = N0.getSimpleValueType();
20577
20578 switch (SrcVT.SimpleTy) {
20579 default:
20580 llvm_unreachable("Custom UINT_TO_FP is not supported!");
20581 case MVT::v2i32:
20582 return lowerUINT_TO_FP_v2i32(Op, dl, DAG, Subtarget);
20583 case MVT::v4i32:
20584 case MVT::v8i32:
20585 return lowerUINT_TO_FP_vXi32(Op, dl, DAG, Subtarget);
20586 case MVT::v2i64:
20587 case MVT::v4i64:
20588 return lowerINT_TO_FP_vXi64(Op, dl, DAG, Subtarget);
20589 }
20590}
20591
20592SDValue X86TargetLowering::LowerUINT_TO_FP(SDValue Op,
20593 SelectionDAG &DAG) const {
20594 bool IsStrict = Op->isStrictFPOpcode();
20595 unsigned OpNo = IsStrict ? 1 : 0;
20596 SDValue Src = Op.getOperand(OpNo);
20597 SDLoc dl(Op);
20598 auto PtrVT = getPointerTy(DAG.getDataLayout());
20599 MVT SrcVT = Src.getSimpleValueType();
20600 MVT DstVT = Op->getSimpleValueType(0);
20601 SDValue Chain = IsStrict ? Op.getOperand(0) : DAG.getEntryNode();
20602
20603 // Bail out when we don't have native conversion instructions.
20604 if (DstVT == MVT::f128)
20605 return SDValue();
20606
20607 if (isSoftF16(DstVT, Subtarget))
20608 return promoteXINT_TO_FP(Op, dl, DAG);
20609 else if (isLegalConversion(SrcVT, DstVT, false, Subtarget))
20610 return Op;
20611
20612 if (DstVT.isVector())
20613 return lowerUINT_TO_FP_vec(Op, dl, DAG, Subtarget);
20614
20615 if (Subtarget.isTargetWin64() && SrcVT == MVT::i128)
20616 return LowerWin64_INT128_TO_FP(Op, DAG);
20617
20618 if (SDValue Extract = vectorizeExtractedCast(Op, dl, DAG, Subtarget))
20619 return Extract;
20620
20621 if (Subtarget.hasAVX512() && isScalarFPTypeInSSEReg(DstVT) &&
20622 (SrcVT == MVT::i32 || (SrcVT == MVT::i64 && Subtarget.is64Bit()))) {
20623 // Conversions from unsigned i32 to f32/f64 are legal,
20624 // using VCVTUSI2SS/SD. Same for i64 in 64-bit mode.
20625 return Op;
20626 }
20627
20628 // Promote i32 to i64 and use a signed conversion on 64-bit targets.
20629 if (SrcVT == MVT::i32 && Subtarget.is64Bit()) {
20630 Src = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i64, Src);
20631 if (IsStrict)
20632 return DAG.getNode(ISD::STRICT_SINT_TO_FP, dl, {DstVT, MVT::Other},
20633 {Chain, Src});
20634 return DAG.getNode(ISD::SINT_TO_FP, dl, DstVT, Src);
20635 }
20636
20637 if (SDValue V = LowerI64IntToFP_AVX512DQ(Op, dl, DAG, Subtarget))
20638 return V;
20639 if (SDValue V = LowerI64IntToFP16(Op, dl, DAG, Subtarget))
20640 return V;
20641
20642 // The transform for i64->f64 isn't correct for 0 when rounding to negative
20643 // infinity. It produces -0.0, so disable under strictfp.
20644 if (SrcVT == MVT::i64 && DstVT == MVT::f64 && Subtarget.hasSSE2() &&
20645 !IsStrict)
20646 return LowerUINT_TO_FP_i64(Op, dl, DAG, Subtarget);
20647 // The transform for i32->f64/f32 isn't correct for 0 when rounding to
20648 // negative infinity, so disable under strictfp and use FILD instead.
20649 if (SrcVT == MVT::i32 && Subtarget.hasSSE2() && DstVT != MVT::f80 &&
20650 !IsStrict)
20651 return LowerUINT_TO_FP_i32(Op, dl, DAG, Subtarget);
20652 if (Subtarget.is64Bit() && SrcVT == MVT::i64 &&
20653 (DstVT == MVT::f32 || DstVT == MVT::f64))
20654 return SDValue();
20655
20656 // Make a 64-bit buffer, and use it to build an FILD.
20657 SDValue StackSlot = DAG.CreateStackTemporary(MVT::i64, 8);
20658 int SSFI = cast<FrameIndexSDNode>(StackSlot)->getIndex();
20659 Align SlotAlign(8);
20660 MachinePointerInfo MPI =
20661 MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SSFI);
20662 if (SrcVT == MVT::i32) {
20663 SDValue OffsetSlot =
20664 DAG.getMemBasePlusOffset(StackSlot, TypeSize::getFixed(4), dl);
20665 SDValue Store1 = DAG.getStore(Chain, dl, Src, StackSlot, MPI, SlotAlign);
20666 SDValue Store2 = DAG.getStore(Store1, dl, DAG.getConstant(0, dl, MVT::i32),
20667 OffsetSlot, MPI.getWithOffset(4), SlotAlign);
20668 std::pair<SDValue, SDValue> Tmp =
20669 BuildFILD(DstVT, MVT::i64, dl, Store2, StackSlot, MPI, SlotAlign, DAG);
20670 if (IsStrict)
20671 return DAG.getMergeValues({Tmp.first, Tmp.second}, dl);
20672
20673 return Tmp.first;
20674 }
20675
20676 assert(SrcVT == MVT::i64 && "Unexpected type in UINT_TO_FP");
20677 SDValue ValueToStore = Src;
20678 if (isScalarFPTypeInSSEReg(Op.getValueType()) && !Subtarget.is64Bit()) {
20679 // Bitcasting to f64 here allows us to do a single 64-bit store from
20680 // an SSE register, avoiding the store forwarding penalty that would come
20681 // with two 32-bit stores.
20682 ValueToStore = DAG.getBitcast(MVT::f64, ValueToStore);
20683 }
20684 SDValue Store =
20685 DAG.getStore(Chain, dl, ValueToStore, StackSlot, MPI, SlotAlign);
20686 // For i64 source, we need to add the appropriate power of 2 if the input
20687 // was negative. We must be careful to do the computation in x87 extended
20688 // precision, not in SSE.
20689 SDVTList Tys = DAG.getVTList(MVT::f80, MVT::Other);
20690 SDValue Ops[] = {Store, StackSlot};
20691 SDValue Fild =
20692 DAG.getMemIntrinsicNode(X86ISD::FILD, dl, Tys, Ops, MVT::i64, MPI,
20693 SlotAlign, MachineMemOperand::MOLoad);
20694 Chain = Fild.getValue(1);
20695
20696 // Check whether the sign bit is set.
20697 SDValue SignSet = DAG.getSetCC(
20698 dl, getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), MVT::i64),
20699 Op.getOperand(OpNo), DAG.getConstant(0, dl, MVT::i64), ISD::SETLT);
20700
20701 // Build a 64 bit pair (FF, 0) in the constant pool, with FF in the hi bits.
20702 APInt FF(64, 0x5F80000000000000ULL);
20703 SDValue FudgePtr =
20704 DAG.getConstantPool(ConstantInt::get(*DAG.getContext(), FF), PtrVT);
20705 Align CPAlignment = cast<ConstantPoolSDNode>(FudgePtr)->getAlign();
20706
20707 // Get a pointer to FF if the sign bit was set, or to 0 otherwise.
20708 SDValue Zero = DAG.getIntPtrConstant(0, dl);
20709 SDValue Four = DAG.getIntPtrConstant(4, dl);
20710 SDValue Offset = DAG.getSelect(dl, Zero.getValueType(), SignSet, Four, Zero);
20711 FudgePtr = DAG.getNode(ISD::ADD, dl, PtrVT, FudgePtr, Offset);
20712
20713 // Load the value out, extending it from f32 to f80.
20714 SDValue Fudge = DAG.getExtLoad(
20715 ISD::EXTLOAD, dl, MVT::f80, Chain, FudgePtr,
20716 MachinePointerInfo::getConstantPool(DAG.getMachineFunction()), MVT::f32,
20717 CPAlignment);
20718 Chain = Fudge.getValue(1);
20719 // Extend everything to 80 bits to force it to be done on x87.
20720 // TODO: Are there any fast-math-flags to propagate here?
20721 if (IsStrict) {
20722 unsigned Opc = ISD::STRICT_FADD;
20723 // Windows needs the precision control changed to 80bits around this add.
20724 if (Subtarget.isOSWindows() && DstVT == MVT::f32)
20725 Opc = X86ISD::STRICT_FP80_ADD;
20726
20727 SDValue Add =
20728 DAG.getNode(Opc, dl, {MVT::f80, MVT::Other}, {Chain, Fild, Fudge});
20729 // STRICT_FP_ROUND can't handle equal types.
20730 if (DstVT == MVT::f80)
20731 return Add;
20732 return DAG.getNode(ISD::STRICT_FP_ROUND, dl, {DstVT, MVT::Other},
20733 {Add.getValue(1), Add,
20734 DAG.getIntPtrConstant(0, dl, /*isTarget=*/true)});
20735 }
20736 unsigned Opc = ISD::FADD;
20737 // Windows needs the precision control changed to 80bits around this add.
20738 if (Subtarget.isOSWindows() && DstVT == MVT::f32)
20739 Opc = X86ISD::FP80_ADD;
20740
20741 SDValue Add = DAG.getNode(Opc, dl, MVT::f80, Fild, Fudge);
20742 return DAG.getNode(ISD::FP_ROUND, dl, DstVT, Add,
20743 DAG.getIntPtrConstant(0, dl, /*isTarget=*/true));
20744}
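// Editorial note (not part of the original source) on the fudge constant
// above: 0x5F800000 is the f32 encoding of 2^64. FILD interprets the stored
// i64 as signed, so when the original unsigned value had its top bit set the
// loaded result is low by exactly 2^64; selecting the constant-pool offset of
// the 2^64 word (versus +0.0) and adding it in x87 80-bit precision corrects
// the value before the final rounding to the destination type.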
20745
20746// If the given FP_TO_SINT (IsSigned) or FP_TO_UINT (!IsSigned) operation
20747// is legal, or has an fp128 or f16 source (which needs to be promoted to f32),
20748// just return an SDValue().
20749// Otherwise it is assumed to be a conversion from one of f32, f64 or f80
20750// to i16, i32 or i64, and we lower it to a legal sequence and return the
20751// result.
20752SDValue X86TargetLowering::FP_TO_INTHelper(SDValue Op, SelectionDAG &DAG,
20753 bool IsSigned,
20754 SDValue &Chain) const {
20755 bool IsStrict = Op->isStrictFPOpcode();
20756 SDLoc DL(Op);
20757
20758 EVT DstTy = Op.getValueType();
20759 SDValue Value = Op.getOperand(IsStrict ? 1 : 0);
20760 EVT TheVT = Value.getValueType();
20761 auto PtrVT = getPointerTy(DAG.getDataLayout());
20762
20763 if (TheVT != MVT::f32 && TheVT != MVT::f64 && TheVT != MVT::f80) {
20764 // f16 must be promoted before using the lowering in this routine.
20765 // fp128 does not use this lowering.
20766 return SDValue();
20767 }
20768
20769 // If using FIST to compute an unsigned i64, we'll need some fixup
20770 // to handle values above the maximum signed i64. A FIST is always
20771 // used for the 32-bit subtarget, but also for f80 on a 64-bit target.
20772 bool UnsignedFixup = !IsSigned && DstTy == MVT::i64;
20773
20774 // FIXME: This does not generate an invalid exception if the input does not
20775 // fit in i32. PR44019
20776 if (!IsSigned && DstTy != MVT::i64) {
20777 // Replace the fp-to-uint32 operation with an fp-to-sint64 FIST.
20778 // The low 32 bits of the fist result will have the correct uint32 result.
20779 assert(DstTy == MVT::i32 && "Unexpected FP_TO_UINT");
20780 DstTy = MVT::i64;
20781 }
20782
20783 assert(DstTy.getSimpleVT() <= MVT::i64 &&
20784 DstTy.getSimpleVT() >= MVT::i16 &&
20785 "Unknown FP_TO_INT to lower!");
20786
20787 // We lower FP->int64 into FISTP64 followed by a load from a temporary
20788 // stack slot.
20789 MachineFunction &MF = DAG.getMachineFunction();
20790 unsigned MemSize = DstTy.getStoreSize();
20791 int SSFI =
20792 MF.getFrameInfo().CreateStackObject(MemSize, Align(MemSize), false);
20793 SDValue StackSlot = DAG.getFrameIndex(SSFI, PtrVT);
20794
20795 Chain = IsStrict ? Op.getOperand(0) : DAG.getEntryNode();
20796
20797 SDValue Adjust; // 0x0 or 0x80000000, for result sign bit adjustment.
20798
20799 if (UnsignedFixup) {
20800 //
20801 // Conversion to unsigned i64 is implemented with a select,
20802 // depending on whether the source value fits in the range
20803 // of a signed i64. Let Thresh be the FP equivalent of
20804 // 0x8000000000000000ULL.
20805 //
20806 // Adjust = (Value >= Thresh) ? 0x80000000 : 0;
20807 // FltOfs = (Value >= Thresh) ? 0x80000000 : 0;
20808 // FistSrc = (Value - FltOfs);
20809 // Fist-to-mem64 FistSrc
20810 // Add 0 or 0x800...0ULL to the 64-bit result, which is equivalent
20811 // to XOR'ing the high 32 bits with Adjust.
20812 //
20813 // Being a power of 2, Thresh is exactly representable in all FP formats.
20814 // For X87 we'd like to use the smallest FP type for this constant, but
20815 // for DAG type consistency we have to match the FP operand type.
20816
20817 APFloat Thresh(APFloat::IEEEsingle(), APInt(32, 0x5f000000));
20818 APFloat::opStatus Status = APFloat::opOK;
20819 bool LosesInfo = false;
20820 if (TheVT == MVT::f64)
20821 // The rounding mode is irrelevant as the conversion should be exact.
20822 Status = Thresh.convert(APFloat::IEEEdouble(), APFloat::rmNearestTiesToEven,
20823 &LosesInfo);
20824 else if (TheVT == MVT::f80)
20825 Status = Thresh.convert(APFloat::x87DoubleExtended(),
20826 APFloat::rmNearestTiesToEven, &LosesInfo);
20827
20828 assert(Status == APFloat::opOK && !LosesInfo &&
20829 "FP conversion should have been exact");
20830
20831 SDValue ThreshVal = DAG.getConstantFP(Thresh, DL, TheVT);
20832
20833 EVT ResVT = getSetCCResultType(DAG.getDataLayout(),
20834 *DAG.getContext(), TheVT);
20835 SDValue Cmp;
20836 if (IsStrict) {
20837 Cmp = DAG.getSetCC(DL, ResVT, Value, ThreshVal, ISD::SETGE, Chain,
20838 /*IsSignaling*/ true);
20839 Chain = Cmp.getValue(1);
20840 } else {
20841 Cmp = DAG.getSetCC(DL, ResVT, Value, ThreshVal, ISD::SETGE);
20842 }
20843
20844 // Our preferred lowering of
20845 //
20846 // (Value >= Thresh) ? 0x8000000000000000ULL : 0
20847 //
20848 // is
20849 //
20850 // (Value >= Thresh) << 63
20851 //
20852 // but since we can get here after LegalOperations, DAGCombine might do the
20853 // wrong thing if we create a select. So, directly create the preferred
20854 // version.
20855 SDValue Zext = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i64, Cmp);
20856 SDValue Const63 = DAG.getConstant(63, DL, MVT::i8);
20857 Adjust = DAG.getNode(ISD::SHL, DL, MVT::i64, Zext, Const63);
20858
20859 SDValue FltOfs = DAG.getSelect(DL, TheVT, Cmp, ThreshVal,
20860 DAG.getConstantFP(0.0, DL, TheVT));
20861
20862 if (IsStrict) {
20863 Value = DAG.getNode(ISD::STRICT_FSUB, DL, { TheVT, MVT::Other},
20864 { Chain, Value, FltOfs });
20865 Chain = Value.getValue(1);
20866 } else
20867 Value = DAG.getNode(ISD::FSUB, DL, TheVT, Value, FltOfs);
20868 }
20869
20870 MachinePointerInfo MPI = MachinePointerInfo::getFixedStack(MF, SSFI);
20871
20872 // FIXME This causes a redundant load/store if the SSE-class value is already
20873 // in memory, such as if it is on the callstack.
20874 if (isScalarFPTypeInSSEReg(TheVT)) {
20875 assert(DstTy == MVT::i64 && "Invalid FP_TO_SINT to lower!");
20876 Chain = DAG.getStore(Chain, DL, Value, StackSlot, MPI);
20877 SDVTList Tys = DAG.getVTList(MVT::f80, MVT::Other);
20878 SDValue Ops[] = { Chain, StackSlot };
20879
20880 unsigned FLDSize = TheVT.getStoreSize();
20881 assert(FLDSize <= MemSize && "Stack slot not big enough");
20882 MachineMemOperand *MMO = MF.getMachineMemOperand(
20883 MPI, MachineMemOperand::MOLoad, FLDSize, Align(FLDSize));
20884 Value = DAG.getMemIntrinsicNode(X86ISD::FLD, DL, Tys, Ops, TheVT, MMO);
20885 Chain = Value.getValue(1);
20886 }
20887
20888 // Build the FP_TO_INT*_IN_MEM
20889 MachineMemOperand *MMO = MF.getMachineMemOperand(
20890 MPI, MachineMemOperand::MOStore, MemSize, Align(MemSize));
20891 SDValue Ops[] = { Chain, Value, StackSlot };
20892 SDValue FIST = DAG.getMemIntrinsicNode(X86ISD::FP_TO_INT_IN_MEM, DL,
20893 DAG.getVTList(MVT::Other),
20894 Ops, DstTy, MMO);
20895
20896 SDValue Res = DAG.getLoad(Op.getValueType(), DL, FIST, StackSlot, MPI);
20897 Chain = Res.getValue(1);
20898
20899 // If we need an unsigned fixup, XOR the result with adjust.
20900 if (UnsignedFixup)
20901 Res = DAG.getNode(ISD::XOR, DL, MVT::i64, Res, Adjust);
20902
20903 return Res;
20904}
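// Editorial example (not part of the original source) of the unsigned fixup
// above: for Value = 2^63 + 4096 (not representable as signed i64), Cmp is
// true, so FltOfs = 2^63 is subtracted before the FIST (which stores 4096),
// and Adjust = 1 << 63 is XORed into the reloaded integer, giving the final
// unsigned result 0x8000000000001000.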
20905
20906static SDValue LowerAVXExtend(SDValue Op, const SDLoc &dl, SelectionDAG &DAG,
20907 const X86Subtarget &Subtarget) {
20908 MVT VT = Op.getSimpleValueType();
20909 SDValue In = Op.getOperand(0);
20910 MVT InVT = In.getSimpleValueType();
20911 unsigned Opc = Op.getOpcode();
20912
20913 assert(VT.isVector() && InVT.isVector() && "Expected vector type");
20914 assert((Opc == ISD::ANY_EXTEND || Opc == ISD::ZERO_EXTEND) &&
20915 "Unexpected extension opcode");
20916 assert(VT.getVectorNumElements() == InVT.getVectorNumElements() &&
20917 "Expected same number of elements");
20918 assert((VT.getVectorElementType() == MVT::i16 ||
20919 VT.getVectorElementType() == MVT::i32 ||
20920 VT.getVectorElementType() == MVT::i64) &&
20921 "Unexpected element type");
20922 assert((InVT.getVectorElementType() == MVT::i8 ||
20923 InVT.getVectorElementType() == MVT::i16 ||
20924 InVT.getVectorElementType() == MVT::i32) &&
20925 "Unexpected element type");
20926
20927 unsigned ExtendInVecOpc = DAG.getOpcode_EXTEND_VECTOR_INREG(Opc);
20928
20929 if (VT == MVT::v32i16 && !Subtarget.hasBWI()) {
20930 assert(InVT == MVT::v32i8 && "Unexpected VT!");
20931 return splitVectorIntUnary(Op, DAG, dl);
20932 }
20933
20934 if (Subtarget.hasInt256())
20935 return Op;
20936
20937 // Optimize vectors in AVX mode:
20938 //
20939 // v8i16 -> v8i32
20940 // Use vpmovzwd for 4 lower elements v8i16 -> v4i32.
20941 // Use vpunpckhwd for 4 upper elements v8i16 -> v4i32.
20942 // Concat upper and lower parts.
20943 //
20944 // v4i32 -> v4i64
20945 // Use vpmovzdq for 4 lower elements v4i32 -> v2i64.
20946 // Use vpunpckhdq for 4 upper elements v4i32 -> v2i64.
20947 // Concat upper and lower parts.
20948 //
20949 MVT HalfVT = VT.getHalfNumVectorElementsVT();
20950 SDValue OpLo = DAG.getNode(ExtendInVecOpc, dl, HalfVT, In);
20951
20952 // Short-circuit if we can determine that each 128-bit half is the same value.
20953 // Otherwise, this is difficult to match and optimize.
20954 if (auto *Shuf = dyn_cast<ShuffleVectorSDNode>(In))
20955 if (hasIdenticalHalvesShuffleMask(Shuf->getMask()))
20956 return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, OpLo, OpLo);
20957
20958 SDValue ZeroVec = DAG.getConstant(0, dl, InVT);
20959 SDValue Undef = DAG.getUNDEF(InVT);
20960 bool NeedZero = Opc == ISD::ZERO_EXTEND;
20961 SDValue OpHi = getUnpackh(DAG, dl, InVT, In, NeedZero ? ZeroVec : Undef);
20962 OpHi = DAG.getBitcast(HalfVT, OpHi);
20963
20964 return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, OpLo, OpHi);
20965}
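// Editorial sketch (not part of the original source), assuming AVX1 and a
// zext v8i16 -> v8i32, of the expansion built above:
//   vpmovzxwd   %xmm0, %xmm1           ; low 4 elements
//   vpxor       %xmm2, %xmm2, %xmm2
//   vpunpckhwd  %xmm2, %xmm0, %xmm0    ; high 4 elements, zero-extended
//   vinsertf128 $1, %xmm0, %ymm1, %ymm0
// For ANY_EXTEND the unpack partner is undef instead of the zero vector.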
20966
20967// Helper to split and extend a v16i1 mask to v16i8 or v16i16.
20968static SDValue SplitAndExtendv16i1(unsigned ExtOpc, MVT VT, SDValue In,
20969 const SDLoc &dl, SelectionDAG &DAG) {
20970 assert((VT == MVT::v16i8 || VT == MVT::v16i16) && "Unexpected VT.");
20971 SDValue Lo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v8i1, In,
20972 DAG.getVectorIdxConstant(0, dl));
20973 SDValue Hi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v8i1, In,
20974 DAG.getVectorIdxConstant(8, dl));
20975 Lo = DAG.getNode(ExtOpc, dl, MVT::v8i16, Lo);
20976 Hi = DAG.getNode(ExtOpc, dl, MVT::v8i16, Hi);
20977 SDValue Res = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v16i16, Lo, Hi);
20978 return DAG.getNode(ISD::TRUNCATE, dl, VT, Res);
20979}
20980
20981static SDValue LowerZERO_EXTEND_Mask(SDValue Op, const SDLoc &DL,
20982 const X86Subtarget &Subtarget,
20983 SelectionDAG &DAG) {
20984 MVT VT = Op->getSimpleValueType(0);
20985 SDValue In = Op->getOperand(0);
20986 MVT InVT = In.getSimpleValueType();
20987 assert(InVT.getVectorElementType() == MVT::i1 && "Unexpected input type!");
20988 unsigned NumElts = VT.getVectorNumElements();
20989
20990 // For all vectors, but vXi8 we can just emit a sign_extend and a shift. This
20991 // avoids a constant pool load.
20992 if (VT.getVectorElementType() != MVT::i8) {
20993 SDValue Extend = DAG.getNode(ISD::SIGN_EXTEND, DL, VT, In);
20994 return DAG.getNode(ISD::SRL, DL, VT, Extend,
20995 DAG.getConstant(VT.getScalarSizeInBits() - 1, DL, VT));
20996 }
20997
20998 // Extend VT if BWI is not supported.
20999 MVT ExtVT = VT;
21000 if (!Subtarget.hasBWI()) {
21001 // If v16i32 is to be avoided, we'll need to split and concatenate.
21002 if (NumElts == 16 && !Subtarget.canExtendTo512DQ())
21003 return SplitAndExtendv16i1(ISD::ZERO_EXTEND, VT, In, DL, DAG);
21004
21005 ExtVT = MVT::getVectorVT(MVT::i32, NumElts);
21006 }
21007
21008 // Widen to 512-bits if VLX is not supported.
21009 MVT WideVT = ExtVT;
21010 if (!ExtVT.is512BitVector() && !Subtarget.hasVLX()) {
21011 NumElts *= 512 / ExtVT.getSizeInBits();
21012 InVT = MVT::getVectorVT(MVT::i1, NumElts);
21013 In = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, InVT, DAG.getUNDEF(InVT), In,
21014 DAG.getVectorIdxConstant(0, DL));
21015 WideVT = MVT::getVectorVT(ExtVT.getVectorElementType(), NumElts);
21016 }
21017
21018 SDValue One = DAG.getConstant(1, DL, WideVT);
21019 SDValue Zero = DAG.getConstant(0, DL, WideVT);
21020
21021 SDValue SelectedVal = DAG.getSelect(DL, WideVT, In, One, Zero);
21022
21023 // Truncate if we had to extend above.
21024 if (VT != ExtVT) {
21025 WideVT = MVT::getVectorVT(MVT::i8, NumElts);
21026 SelectedVal = DAG.getNode(ISD::TRUNCATE, DL, WideVT, SelectedVal);
21027 }
21028
21029 // Extract back to 128/256-bit if we widened.
21030 if (WideVT != VT)
21031 SelectedVal = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, SelectedVal,
21032 DAG.getVectorIdxConstant(0, DL));
21033
21034 return SelectedVal;
21035}
21036
21037static SDValue LowerZERO_EXTEND(SDValue Op, const X86Subtarget &Subtarget,
21038 SelectionDAG &DAG) {
21039 SDValue In = Op.getOperand(0);
21040 MVT SVT = In.getSimpleValueType();
21041 SDLoc DL(Op);
21042
21043 if (SVT.getVectorElementType() == MVT::i1)
21044 return LowerZERO_EXTEND_Mask(Op, DL, Subtarget, DAG);
21045
21046 assert(Subtarget.hasAVX() && "Expected AVX support");
21047 return LowerAVXExtend(Op, DL, DAG, Subtarget);
21048}
21049
21050/// Helper to recursively truncate vector elements in half with PACKSS/PACKUS.
21051/// It makes use of the fact that vectors with enough leading sign/zero bits
21052/// prevent the PACKSS/PACKUS from saturating the results.
21053/// AVX2 (Int256) sub-targets require extra shuffling as the PACK*S operates
21054/// within each 128-bit lane.
21055static SDValue truncateVectorWithPACK(unsigned Opcode, EVT DstVT, SDValue In,
21056 const SDLoc &DL, SelectionDAG &DAG,
21057 const X86Subtarget &Subtarget) {
21058 assert((Opcode == X86ISD::PACKSS || Opcode == X86ISD::PACKUS) &&
21059 "Unexpected PACK opcode");
21060 assert(DstVT.isVector() && "VT not a vector?");
21061
21062 // Requires SSE2 for PACKSS (SSE41 PACKUSDW is handled below).
21063 if (!Subtarget.hasSSE2())
21064 return SDValue();
21065
21066 EVT SrcVT = In.getValueType();
21067
21068 // No truncation required, we might get here due to recursive calls.
21069 if (SrcVT == DstVT)
21070 return In;
21071
21072 unsigned NumElems = SrcVT.getVectorNumElements();
21073 if (NumElems < 2 || !isPowerOf2_32(NumElems) )
21074 return SDValue();
21075
21076 unsigned DstSizeInBits = DstVT.getSizeInBits();
21077 unsigned SrcSizeInBits = SrcVT.getSizeInBits();
21078 assert(DstVT.getVectorNumElements() == NumElems && "Illegal truncation");
21079 assert(SrcSizeInBits > DstSizeInBits && "Illegal truncation");
21080
21081 LLVMContext &Ctx = *DAG.getContext();
21082 EVT PackedSVT = EVT::getIntegerVT(Ctx, SrcVT.getScalarSizeInBits() / 2);
21083 EVT PackedVT = EVT::getVectorVT(Ctx, PackedSVT, NumElems);
21084
21085 // Pack to the largest type possible:
21086 // vXi64/vXi32 -> PACK*SDW and vXi16 -> PACK*SWB.
21087 EVT InVT = MVT::i16, OutVT = MVT::i8;
21088 if (SrcVT.getScalarSizeInBits() > 16 &&
21089 (Opcode == X86ISD::PACKSS || Subtarget.hasSSE41())) {
21090 InVT = MVT::i32;
21091 OutVT = MVT::i16;
21092 }
21093
21094 // Sub-128-bit truncation - widen to 128-bit src and pack in the lower half.
21095 // On pre-AVX512, pack the src in both halves to help value tracking.
21096 if (SrcSizeInBits <= 128) {
21097 InVT = EVT::getVectorVT(Ctx, InVT, 128 / InVT.getSizeInBits());
21098 OutVT = EVT::getVectorVT(Ctx, OutVT, 128 / OutVT.getSizeInBits());
21099 In = widenSubVector(In, false, Subtarget, DAG, DL, 128);
21100 SDValue LHS = DAG.getBitcast(InVT, In);
21101 SDValue RHS = Subtarget.hasAVX512() ? DAG.getUNDEF(InVT) : LHS;
21102 SDValue Res = DAG.getNode(Opcode, DL, OutVT, LHS, RHS);
21103 Res = extractSubVector(Res, 0, DAG, DL, SrcSizeInBits / 2);
21104 Res = DAG.getBitcast(PackedVT, Res);
21105 return truncateVectorWithPACK(Opcode, DstVT, Res, DL, DAG, Subtarget);
21106 }
21107
21108 // Split lower/upper subvectors.
21109 SDValue Lo, Hi;
21110 std::tie(Lo, Hi) = splitVector(In, DAG, DL);
21111
21112 // If Hi is undef, then don't bother packing it and widen the result instead.
21113 if (Hi.isUndef()) {
21114 EVT DstHalfVT = DstVT.getHalfNumVectorElementsVT(Ctx);
21115 if (SDValue Res =
21116 truncateVectorWithPACK(Opcode, DstHalfVT, Lo, DL, DAG, Subtarget))
21117 return widenSubVector(Res, false, Subtarget, DAG, DL, DstSizeInBits);
21118 }
21119
21120 unsigned SubSizeInBits = SrcSizeInBits / 2;
21121 InVT = EVT::getVectorVT(Ctx, InVT, SubSizeInBits / InVT.getSizeInBits());
21122 OutVT = EVT::getVectorVT(Ctx, OutVT, SubSizeInBits / OutVT.getSizeInBits());
21123
21124 // 256bit -> 128bit truncate - PACK lower/upper 128-bit subvectors.
21125 if (SrcVT.is256BitVector() && DstVT.is128BitVector()) {
21126 Lo = DAG.getBitcast(InVT, Lo);
21127 Hi = DAG.getBitcast(InVT, Hi);
21128 SDValue Res = DAG.getNode(Opcode, DL, OutVT, Lo, Hi);
21129 return DAG.getBitcast(DstVT, Res);
21130 }
21131
21132 // AVX2: 512bit -> 256bit truncate - PACK lower/upper 256-bit subvectors.
21133 // AVX2: 512bit -> 128bit truncate - PACK(PACK, PACK).
21134 if (SrcVT.is512BitVector() && Subtarget.hasInt256()) {
21135 Lo = DAG.getBitcast(InVT, Lo);
21136 Hi = DAG.getBitcast(InVT, Hi);
21137 SDValue Res = DAG.getNode(Opcode, DL, OutVT, Lo, Hi);
21138
21139 // 256-bit PACK(ARG0, ARG1) leaves us with ((LO0,LO1),(HI0,HI1)),
21140 // so we need to shuffle to get ((LO0,HI0),(LO1,HI1)).
21141 // Scale shuffle mask to avoid bitcasts and help ComputeNumSignBits.
21142 SmallVector<int, 64> Mask;
21143 int Scale = 64 / OutVT.getScalarSizeInBits();
21144 narrowShuffleMaskElts(Scale, { 0, 2, 1, 3 }, Mask);
21145 Res = DAG.getVectorShuffle(OutVT, DL, Res, Res, Mask);
21146
21147 if (DstVT.is256BitVector())
21148 return DAG.getBitcast(DstVT, Res);
21149
21150 // If 512bit -> 128bit truncate another stage.
21151 Res = DAG.getBitcast(PackedVT, Res);
21152 return truncateVectorWithPACK(Opcode, DstVT, Res, DL, DAG, Subtarget);
21153 }
21154
21155 // Recursively pack lower/upper subvectors, concat result and pack again.
21156 assert(SrcSizeInBits >= 256 && "Expected 256-bit vector or greater");
21157
21158 if (PackedVT.is128BitVector()) {
21159 // Avoid CONCAT_VECTORS on sub-128bit nodes as these can fail after
21160 // type legalization.
21161 SDValue Res =
21162 truncateVectorWithPACK(Opcode, PackedVT, In, DL, DAG, Subtarget);
21163 return truncateVectorWithPACK(Opcode, DstVT, Res, DL, DAG, Subtarget);
21164 }
21165
21166 EVT HalfPackedVT = EVT::getVectorVT(Ctx, PackedSVT, NumElems / 2);
21167 Lo = truncateVectorWithPACK(Opcode, HalfPackedVT, Lo, DL, DAG, Subtarget);
21168 Hi = truncateVectorWithPACK(Opcode, HalfPackedVT, Hi, DL, DAG, Subtarget);
21169 SDValue Res = DAG.getNode(ISD::CONCAT_VECTORS, DL, PackedVT, Lo, Hi);
21170 return truncateVectorWithPACK(Opcode, DstVT, Res, DL, DAG, Subtarget);
21171}
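// Editorial example (not part of the original source): truncating
// v8i32 -> v8i16 with this helper splits the source into two v4i32 halves and
// emits a single PACKSSDW/PACKUSDW of (Lo, Hi); the caller must already have
// guaranteed enough sign bits or zeroed high bits that the pack cannot
// saturate, which is what the PACKSS/PACKUS wrappers below arrange.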
21172
21173/// Truncate using inreg zero extension (AND mask) and X86ISD::PACKUS.
21174/// e.g. trunc <8 x i32> X to <8 x i16> -->
21175/// MaskX = X & 0xffff (clear high bits to prevent saturation)
21176/// packus (extract_subv MaskX, 0), (extract_subv MaskX, 1)
21177static SDValue truncateVectorWithPACKUS(EVT DstVT, SDValue In, const SDLoc &DL,
21178 const X86Subtarget &Subtarget,
21179 SelectionDAG &DAG) {
21180 In = DAG.getZeroExtendInReg(In, DL, DstVT);
21181 return truncateVectorWithPACK(X86ISD::PACKUS, DstVT, In, DL, DAG, Subtarget);
21182}
21183
21184/// Truncate using inreg sign extension and X86ISD::PACKSS.
21185static SDValue truncateVectorWithPACKSS(EVT DstVT, SDValue In, const SDLoc &DL,
21186 const X86Subtarget &Subtarget,
21187 SelectionDAG &DAG) {
21188 EVT SrcVT = In.getValueType();
21189 In = DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, SrcVT, In,
21190 DAG.getValueType(DstVT));
21191 return truncateVectorWithPACK(X86ISD::PACKSS, DstVT, In, DL, DAG, Subtarget);
21192}
21193
21194/// Helper to determine if \p In truncated to \p DstVT has the necessary
21195/// signbits / leading zero bits to be truncated with PACKSS / PACKUS,
21196/// possibly by converting a SRL node to SRA for sign extension.
21197static SDValue matchTruncateWithPACK(unsigned &PackOpcode, EVT DstVT,
21198 SDValue In, const SDLoc &DL,
21199 SelectionDAG &DAG,
21200 const X86Subtarget &Subtarget,
21201 const SDNodeFlags Flags = SDNodeFlags()) {
21202 // Requires SSE2.
21203 if (!Subtarget.hasSSE2())
21204 return SDValue();
21205
21206 EVT SrcVT = In.getValueType();
21207 EVT DstSVT = DstVT.getVectorElementType();
21208 EVT SrcSVT = SrcVT.getVectorElementType();
21209 unsigned NumDstEltBits = DstSVT.getSizeInBits();
21210 unsigned NumSrcEltBits = SrcSVT.getSizeInBits();
21211
21212 // Check we have a truncation suited for PACKSS/PACKUS.
21213 if (!((SrcSVT == MVT::i16 || SrcSVT == MVT::i32 || SrcSVT == MVT::i64) &&
21214 (DstSVT == MVT::i8 || DstSVT == MVT::i16 || DstSVT == MVT::i32)))
21215 return SDValue();
21216
21217 assert(NumSrcEltBits > NumDstEltBits && "Bad truncation");
21218 unsigned NumStages = Log2_32(NumSrcEltBits / NumDstEltBits);
21219
21220 // Truncation from 128-bit to vXi32 can be better handled with PSHUFD.
21221 // Truncation to sub-64-bit vXi16 can be better handled with PSHUFD/PSHUFLW.
21222 // Truncation from v2i64 to v2i8 can be better handled with PSHUFB.
21223 if ((DstSVT == MVT::i32 && SrcVT.getSizeInBits() <= 128) ||
21224 (DstSVT == MVT::i16 && SrcVT.getSizeInBits() <= (64 * NumStages)) ||
21225 (DstVT == MVT::v2i8 && SrcVT == MVT::v2i64 && Subtarget.hasSSSE3()))
21226 return SDValue();
21227
21228 // Prefer to lower v4i64 -> v4i32 as a shuffle unless we can cheaply
21229 // split this for packing.
21230 if (SrcVT == MVT::v4i64 && DstVT == MVT::v4i32 &&
21231 !isFreeToSplitVector(In, DAG) &&
21232 (!Subtarget.hasAVX() || DAG.ComputeNumSignBits(In) != 64))
21233 return SDValue();
21234
21235 // Don't truncate AVX512 targets as multiple PACK nodes stages.
21236 if (Subtarget.hasAVX512() && NumStages > 1)
21237 return SDValue();
21238
21239 unsigned NumPackedSignBits = std::min<unsigned>(NumDstEltBits, 16);
21240 unsigned NumPackedZeroBits = Subtarget.hasSSE41() ? NumPackedSignBits : 8;
21241
21242 // Truncate with PACKUS if we are truncating a vector with leading zero
21243 // bits that extend all the way to the packed/truncated value.
21244 // e.g. Masks, zext_in_reg, etc.
21245 // Pre-SSE41 we can only use PACKUSWB.
21246 KnownBits Known = DAG.computeKnownBits(In);
21247 if ((Flags.hasNoUnsignedWrap() && NumDstEltBits <= NumPackedZeroBits) ||
21248 (NumSrcEltBits - NumPackedZeroBits) <= Known.countMinLeadingZeros()) {
21249 PackOpcode = X86ISD::PACKUS;
21250 return In;
21251 }
21252
21253 // Truncate with PACKSS if we are truncating a vector with sign-bits
21254 // that extend all the way to the packed/truncated value.
21255 // e.g. Comparison result, sext_in_reg, etc.
21256 unsigned NumSignBits = DAG.ComputeNumSignBits(In);
21257
21258 // Don't use PACKSS for vXi64 -> vXi32 truncations unless we're dealing with
21259 // a sign splat (or AVX512 VPSRAQ support). ComputeNumSignBits struggles to
21260 // see through BITCASTs later on and combines/simplifications can't then use
21261 // it.
21262 if (DstSVT == MVT::i32 && NumSignBits != NumSrcEltBits &&
21263 !Subtarget.hasAVX512())
21264 return SDValue();
21265
21266 unsigned MinSignBits = NumSrcEltBits - NumPackedSignBits;
21267 if ((Flags.hasNoSignedWrap() && DstSVT != MVT::i32) ||
21268 MinSignBits < NumSignBits) {
21269 PackOpcode = X86ISD::PACKSS;
21270 return In;
21271 }
21272
21273 // If we have an srl that only generates sign bits that we will discard in
21274 // the truncation, then we can use PACKSS by converting the srl to an sra.
21275 // SimplifyDemandedBits often relaxes sra to srl, so we need to reverse it.
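 // e.g. for a vXi32 -> vXi16 pack, MinSignBits is 16: (srl x, 16) and
 // (sra x, 16) agree on the 16 bits that survive the pack, and the sra form
 // guarantees the sign bits PACKSS needs to avoid saturating.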
21276 if (In.getOpcode() == ISD::SRL && In->hasOneUse())
21277 if (std::optional<unsigned> ShAmt = DAG.getValidShiftAmount(In)) {
21278 if (*ShAmt == MinSignBits) {
21279 PackOpcode = X86ISD::PACKSS;
21280 return DAG.getNode(ISD::SRA, DL, SrcVT, In->ops());
21281 }
21282 }
21283
21284 return SDValue();
21285}
21286
21287/// This function lowers a vector truncation of 'extended sign-bits' or
21288/// 'extended zero-bits' values from vXi16/vXi32/vXi64 to vXi8/vXi16/vXi32
21289/// into X86ISD::PACKSS/PACKUS operations.
21290static SDValue LowerTruncateVecPackWithSignBits(
21291 MVT DstVT, SDValue In, const SDLoc &DL, const X86Subtarget &Subtarget,
21292 SelectionDAG &DAG, const SDNodeFlags Flags = SDNodeFlags()) {
21293 MVT SrcVT = In.getSimpleValueType();
21294 MVT DstSVT = DstVT.getVectorElementType();
21295 MVT SrcSVT = SrcVT.getVectorElementType();
21296 if (!((SrcSVT == MVT::i16 || SrcSVT == MVT::i32 || SrcSVT == MVT::i64) &&
21297 (DstSVT == MVT::i8 || DstSVT == MVT::i16 || DstSVT == MVT::i32)))
21298 return SDValue();
21299
21300 // If the upper half of the source is undef, then attempt to split and
21301 // only truncate the lower half.
21302 if (DstVT.getSizeInBits() >= 128) {
21303 if (SDValue Lo = isUpperSubvectorUndef(In, DL, DAG)) {
21304 MVT DstHalfVT = DstVT.getHalfNumVectorElementsVT();
21305 if (SDValue Res = LowerTruncateVecPackWithSignBits(DstHalfVT, Lo, DL,
21306 Subtarget, DAG))
21307 return widenSubVector(Res, false, Subtarget, DAG, DL,
21308 DstVT.getSizeInBits());
21309 }
21310 }
21311
21312 unsigned PackOpcode;
21313 if (SDValue Src = matchTruncateWithPACK(PackOpcode, DstVT, In, DL, DAG,
21314 Subtarget, Flags))
21315 return truncateVectorWithPACK(PackOpcode, DstVT, Src, DL, DAG, Subtarget);
21316
21317 return SDValue();
21318}
21319
21320/// This function lowers a vector truncation from vXi32/vXi64 to vXi8/vXi16 into
21321/// X86ISD::PACKUS/X86ISD::PACKSS operations.
21322static SDValue LowerTruncateVecPack(MVT DstVT, SDValue In, const SDLoc &DL,
21323 const X86Subtarget &Subtarget,
21324 SelectionDAG &DAG) {
21325 MVT SrcVT = In.getSimpleValueType();
21326 MVT DstSVT = DstVT.getVectorElementType();
21327 MVT SrcSVT = SrcVT.getVectorElementType();
21328 unsigned NumElems = DstVT.getVectorNumElements();
21329 if (!((SrcSVT == MVT::i16 || SrcSVT == MVT::i32 || SrcSVT == MVT::i64) &&
21330 (DstSVT == MVT::i8 || DstSVT == MVT::i16) && isPowerOf2_32(NumElems) &&
21331 NumElems >= 8))
21332 return SDValue();
21333
21334 // SSSE3's pshufb results in fewer instructions in the cases below.
21335 if (Subtarget.hasSSSE3() && NumElems == 8) {
21336 if (SrcSVT == MVT::i16)
21337 return SDValue();
21338 if (SrcSVT == MVT::i32 && (DstSVT == MVT::i8 || !Subtarget.hasSSE41()))
21339 return SDValue();
21340 }
21341
21342 // If the upper half of the source is undef, then attempt to split and
21343 // only truncate the lower half.
21344 if (DstVT.getSizeInBits() >= 128) {
21345 if (SDValue Lo = isUpperSubvectorUndef(In, DL, DAG)) {
21346 MVT DstHalfVT = DstVT.getHalfNumVectorElementsVT();
21347 if (SDValue Res = LowerTruncateVecPack(DstHalfVT, Lo, DL, Subtarget, DAG))
21348 return widenSubVector(Res, false, Subtarget, DAG, DL,
21349 DstVT.getSizeInBits());
21350 }
21351 }
21352
21353 // SSE2 provides PACKUS for only 2 x v8i16 -> v16i8 and SSE4.1 provides PACKUS
21354 // for 2 x v4i32 -> v8i16. For SSSE3 and below, we need to use PACKSS to
21355 // truncate 2 x v4i32 to v8i16.
21356 if (Subtarget.hasSSE41() || DstSVT == MVT::i8)
21357 return truncateVectorWithPACKUS(DstVT, In, DL, Subtarget, DAG);
21358
21359 if (SrcSVT == MVT::i16 || SrcSVT == MVT::i32)
21360 return truncateVectorWithPACKSS(DstVT, In, DL, Subtarget, DAG);
21361
21362 // Special case vXi64 -> vXi16, shuffle to vXi32 and then use PACKSS.
21363 if (DstSVT == MVT::i16 && SrcSVT == MVT::i64) {
21364 MVT TruncVT = MVT::getVectorVT(MVT::i32, NumElems);
21365 SDValue Trunc = DAG.getNode(ISD::TRUNCATE, DL, TruncVT, In);
21366 return truncateVectorWithPACKSS(DstVT, Trunc, DL, Subtarget, DAG);
21367 }
21368
21369 return SDValue();
21370}
21371
21372static SDValue LowerTruncateVecI1(SDValue Op, const SDLoc &DL,
21373 SelectionDAG &DAG,
21374 const X86Subtarget &Subtarget) {
21375 MVT VT = Op.getSimpleValueType();
21376 SDValue In = Op.getOperand(0);
21377 MVT InVT = In.getSimpleValueType();
21378 assert(VT.getVectorElementType() == MVT::i1 && "Unexpected vector type.");
21379
21380 // Shift LSB to MSB and use VPMOVB/W2M or TESTD/Q.
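 // e.g. for a v16i8 -> v16i1 truncate with BWI: shifting left by 7 moves each
 // byte's LSB into its sign bit (via i16 shifts, as there is no packed byte
 // shift), and the setgt(0, x) below is then matched to VPMOVB2M.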
21381 unsigned ShiftInx = InVT.getScalarSizeInBits() - 1;
21382 if (InVT.getScalarSizeInBits() <= 16) {
21383 if (Subtarget.hasBWI()) {
21384 // legal, will go to VPMOVB2M, VPMOVW2M
21385 if (DAG.ComputeNumSignBits(In) < InVT.getScalarSizeInBits()) {
21386 // We need to shift to get the lsb into sign position.
21387 // Shift packed bytes not supported natively, bitcast to word
21388 MVT ExtVT = MVT::getVectorVT(MVT::i16, InVT.getSizeInBits()/16);
21389 In = DAG.getNode(ISD::SHL, DL, ExtVT,
21390 DAG.getBitcast(ExtVT, In),
21391 DAG.getConstant(ShiftInx, DL, ExtVT));
21392 In = DAG.getBitcast(InVT, In);
21393 }
21394 return DAG.getSetCC(DL, VT, DAG.getConstant(0, DL, InVT),
21395 In, ISD::SETGT);
21396 }
21397 // Use TESTD/Q by extending the vector to packed dword/qword.
21398 assert((InVT.is256BitVector() || InVT.is128BitVector()) &&
21399 "Unexpected vector type.");
21400 unsigned NumElts = InVT.getVectorNumElements();
21401 assert((NumElts == 8 || NumElts == 16) && "Unexpected number of elements");
21402 // We need to change to a wider element type that we have support for.
21403 // For 8 element vectors this is easy, we either extend to v8i32 or v8i64.
21404 // For 16 element vectors we extend to v16i32 unless we are explicitly
21405 // trying to avoid 512-bit vectors. If we are avoiding 512-bit vectors
21406 // we need to split into two 8 element vectors which we can extend to v8i32,
21407 // truncate and concat the results. There's an additional complication if
21408 // the original type is v16i8. In that case we can't split the v16i8
21409 // directly, so we need to shuffle high elements to low and use
21410 // sign_extend_vector_inreg.
21411 if (NumElts == 16 && !Subtarget.canExtendTo512DQ()) {
21412 SDValue Lo, Hi;
21413 if (InVT == MVT::v16i8) {
21414 Lo = DAG.getNode(ISD::SIGN_EXTEND_VECTOR_INREG, DL, MVT::v8i32, In);
21415 Hi = DAG.getVectorShuffle(
21416 InVT, DL, In, In,
21417 {8, 9, 10, 11, 12, 13, 14, 15, -1, -1, -1, -1, -1, -1, -1, -1});
21418 Hi = DAG.getNode(ISD::SIGN_EXTEND_VECTOR_INREG, DL, MVT::v8i32, Hi);
21419 } else {
21420 assert(InVT == MVT::v16i16 && "Unexpected VT!");
21421 Lo = extract128BitVector(In, 0, DAG, DL);
21422 Hi = extract128BitVector(In, 8, DAG, DL);
21423 }
21424 // We're split now, just emit two truncates and a concat. The two
21425 // truncates will trigger legalization to come back to this function.
21426 Lo = DAG.getNode(ISD::TRUNCATE, DL, MVT::v8i1, Lo);
21427 Hi = DAG.getNode(ISD::TRUNCATE, DL, MVT::v8i1, Hi);
21428 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Lo, Hi);
21429 }
21430 // We either have 8 elements or we're allowed to use 512-bit vectors.
21431 // If we have VLX, we want to use the narrowest vector that can get the
21432 // job done so we use vXi32.
21433 MVT EltVT = Subtarget.hasVLX() ? MVT::i32 : MVT::getIntegerVT(512/NumElts);
21434 MVT ExtVT = MVT::getVectorVT(EltVT, NumElts);
21435 In = DAG.getNode(ISD::SIGN_EXTEND, DL, ExtVT, In);
21436 InVT = ExtVT;
21437 ShiftInx = InVT.getScalarSizeInBits() - 1;
21438 }
21439
21440 if (DAG.ComputeNumSignBits(In) < InVT.getScalarSizeInBits()) {
21441 // We need to shift to get the lsb into sign position.
21442 In = DAG.getNode(ISD::SHL, DL, InVT, In,
21443 DAG.getConstant(ShiftInx, DL, InVT));
21444 }
21445 // If we have DQI, emit a pattern that will be iseled as vpmovq2m/vpmovd2m.
21446 if (Subtarget.hasDQI())
21447 return DAG.getSetCC(DL, VT, DAG.getConstant(0, DL, InVT), In, ISD::SETGT);
21448 return DAG.getSetCC(DL, VT, In, DAG.getConstant(0, DL, InVT), ISD::SETNE);
21449}
21450
21451SDValue X86TargetLowering::LowerTRUNCATE(SDValue Op, SelectionDAG &DAG) const {
21452 SDLoc DL(Op);
21453 MVT VT = Op.getSimpleValueType();
21454 SDValue In = Op.getOperand(0);
21455 MVT InVT = In.getSimpleValueType();
21457 "Invalid TRUNCATE operation");
21458
21459 // If we're called by the type legalizer, handle a few cases.
21460 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
21461 if (!TLI.isTypeLegal(VT) || !TLI.isTypeLegal(InVT)) {
21462 if ((InVT == MVT::v8i64 || InVT == MVT::v16i32 || InVT == MVT::v16i64) &&
21463 VT.is128BitVector() && Subtarget.hasAVX512()) {
21464 assert((InVT == MVT::v16i64 || Subtarget.hasVLX()) &&
21465 "Unexpected subtarget!");
21466 // The default behavior is to truncate one step, concatenate, and then
21467 // truncate the remainder. We'd rather produce two 64-bit results and
21468 // concatenate those.
21469 SDValue Lo, Hi;
21470 std::tie(Lo, Hi) = DAG.SplitVector(In, DL);
21471
21472 EVT LoVT, HiVT;
21473 std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(VT);
21474
21475 Lo = DAG.getNode(ISD::TRUNCATE, DL, LoVT, Lo);
21476 Hi = DAG.getNode(ISD::TRUNCATE, DL, HiVT, Hi);
21477 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Lo, Hi);
21478 }
21479
21480 // Pre-AVX512 (or prefer-256bit) see if we can make use of PACKSS/PACKUS.
21481 if (!Subtarget.hasAVX512() ||
21482 (InVT.is512BitVector() && VT.is256BitVector()))
21483 if (SDValue SignPack = LowerTruncateVecPackWithSignBits(
21484 VT, In, DL, Subtarget, DAG, Op->getFlags()))
21485 return SignPack;
21486
21487 // Pre-AVX512 see if we can make use of PACKSS/PACKUS.
21488 if (!Subtarget.hasAVX512())
21489 return LowerTruncateVecPack(VT, In, DL, Subtarget, DAG);
21490
21491 // Otherwise let default legalization handle it.
21492 return SDValue();
21493 }
21494
21495 if (VT.getVectorElementType() == MVT::i1)
21496 return LowerTruncateVecI1(Op, DL, DAG, Subtarget);
21497
21498 // Attempt to truncate with PACKUS/PACKSS even on AVX512 if we'd have to
21499 // concat from subvectors to use VPTRUNC etc.
21500 if (!Subtarget.hasAVX512() || isFreeToSplitVector(In, DAG))
21501 if (SDValue SignPack = LowerTruncateVecPackWithSignBits(
21502 VT, In, DL, Subtarget, DAG, Op->getFlags()))
21503 return SignPack;
21504
21505 // vpmovqb/w/d, vpmovdb/w, vpmovwb
21506 if (Subtarget.hasAVX512()) {
21507 if (InVT == MVT::v32i16 && !Subtarget.hasBWI()) {
21508 assert(VT == MVT::v32i8 && "Unexpected VT!");
21509 return splitVectorIntUnary(Op, DAG, DL);
21510 }
21511
21512 // word to byte only under BWI. Otherwise we have to promote to v16i32
21513 // and then truncate that. But we should only do that if we haven't been
21514 // asked to avoid 512-bit vectors. The actual promotion to v16i32 will be
21515 // handled by isel patterns.
21516 if (InVT != MVT::v16i16 || Subtarget.hasBWI() ||
21517 Subtarget.canExtendTo512DQ())
21518 return Op;
21519 }
21520
21521 // Handle truncation of V256 to V128 using shuffles.
21522 assert(VT.is128BitVector() && InVT.is256BitVector() && "Unexpected types!");
21523
21524 if ((VT == MVT::v4i32) && (InVT == MVT::v4i64)) {
21525 // On AVX2, v4i64 -> v4i32 becomes VPERMD.
21526 if (Subtarget.hasInt256()) {
21527 static const int ShufMask[] = {0, 2, 4, 6, -1, -1, -1, -1};
21528 In = DAG.getBitcast(MVT::v8i32, In);
21529 In = DAG.getVectorShuffle(MVT::v8i32, DL, In, In, ShufMask);
21530 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, In,
21531 DAG.getVectorIdxConstant(0, DL));
21532 }
21533
21534 SDValue OpLo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v2i64, In,
21535 DAG.getVectorIdxConstant(0, DL));
21536 SDValue OpHi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v2i64, In,
21537 DAG.getVectorIdxConstant(2, DL));
21538 static const int ShufMask[] = {0, 2, 4, 6};
21539 return DAG.getVectorShuffle(VT, DL, DAG.getBitcast(MVT::v4i32, OpLo),
21540 DAG.getBitcast(MVT::v4i32, OpHi), ShufMask);
21541 }
21542
21543 if ((VT == MVT::v8i16) && (InVT == MVT::v8i32)) {
21544 // On AVX2, v8i32 -> v8i16 becomes PSHUFB.
21545 if (Subtarget.hasInt256()) {
21546 // The PSHUFB mask:
21547 static const int ShufMask1[] = { 0, 1, 4, 5, 8, 9, 12, 13,
21548 -1, -1, -1, -1, -1, -1, -1, -1,
21549 16, 17, 20, 21, 24, 25, 28, 29,
21550 -1, -1, -1, -1, -1, -1, -1, -1 };
21551 In = DAG.getBitcast(MVT::v32i8, In);
21552 In = DAG.getVectorShuffle(MVT::v32i8, DL, In, In, ShufMask1);
21553 In = DAG.getBitcast(MVT::v4i64, In);
21554
21555 static const int ShufMask2[] = {0, 2, -1, -1};
21556 In = DAG.getVectorShuffle(MVT::v4i64, DL, In, In, ShufMask2);
21557 In = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v2i64, In,
21558 DAG.getVectorIdxConstant(0, DL));
21559 return DAG.getBitcast(MVT::v8i16, In);
21560 }
21561
21562 return Subtarget.hasSSE41()
21563 ? truncateVectorWithPACKUS(VT, In, DL, Subtarget, DAG)
21564 : truncateVectorWithPACKSS(VT, In, DL, Subtarget, DAG);
21565 }
21566
21567 if (VT == MVT::v16i8 && InVT == MVT::v16i16)
21568 return truncateVectorWithPACKUS(VT, In, DL, Subtarget, DAG);
21569
21570 llvm_unreachable("All 256->128 cases should have been handled above!");
21571}
21572
21573// We can leverage the specific way the "cvttps2dq/cvttpd2dq" instruction
21574// behaves on out of range inputs to generate optimized conversions.
21575static SDValue expandFP_TO_UINT_SSE(MVT VT, SDValue Src, const SDLoc &dl,
21576 SelectionDAG &DAG,
21577 const X86Subtarget &Subtarget) {
21578 MVT SrcVT = Src.getSimpleValueType();
21579 unsigned DstBits = VT.getScalarSizeInBits();
21580 assert(DstBits == 32 && "expandFP_TO_UINT_SSE - only vXi32 supported");
21581
21582 // Calculate the converted result for values in the range 0 to
21583 // 2^31-1 ("Small") and from 2^31 to 2^32-1 ("Big").
21584 SDValue Small = DAG.getNode(X86ISD::CVTTP2SI, dl, VT, Src);
21585 SDValue Big =
21586 DAG.getNode(X86ISD::CVTTP2SI, dl, VT,
21587 DAG.getNode(ISD::FSUB, dl, SrcVT, Src,
21588 DAG.getConstantFP(2147483648.0f, dl, SrcVT)));
21589
21590 // The "CVTTP2SI" instruction conveniently sets the sign bit if
21591 // and only if the value was out of range. So we can use that
21592 // as our indicator that we should use "Big" instead of "Small".
21593 //
21594 // Use "Small" if "IsOverflown" has all bits cleared
21595 // and "0x80000000 | Big" if all bits in "IsOverflown" are set.
21596
21597 // AVX1 can't use the signsplat masking for 256-bit vectors - we have to
21598 // use the slightly slower blendv select instead.
21599 if (VT == MVT::v8i32 && !Subtarget.hasAVX2()) {
21600 SDValue Overflow = DAG.getNode(ISD::OR, dl, VT, Small, Big);
21601 return DAG.getNode(X86ISD::BLENDV, dl, VT, Small, Overflow, Small);
21602 }
21603
21604 SDValue IsOverflown =
21605 DAG.getNode(X86ISD::VSRAI, dl, VT, Small,
21606 DAG.getTargetConstant(DstBits - 1, dl, MVT::i8));
21607 return DAG.getNode(ISD::OR, dl, VT, Small,
21608 DAG.getNode(ISD::AND, dl, VT, Big, IsOverflown));
21609}
21610
21611SDValue X86TargetLowering::LowerFP_TO_INT(SDValue Op, SelectionDAG &DAG) const {
21612 bool IsStrict = Op->isStrictFPOpcode();
21613 bool IsSigned = Op.getOpcode() == ISD::FP_TO_SINT ||
21614 Op.getOpcode() == ISD::STRICT_FP_TO_SINT;
21615 bool HasVLX = Subtarget.hasVLX();
21616 MVT VT = Op->getSimpleValueType(0);
21617 SDValue Src = Op.getOperand(IsStrict ? 1 : 0);
21618 SDValue Chain = IsStrict ? Op->getOperand(0) : SDValue();
21619 MVT SrcVT = Src.getSimpleValueType();
21620 SDLoc dl(Op);
21621
21622 SDValue Res;
21623 if (isSoftF16(SrcVT, Subtarget)) {
21624 MVT NVT = VT.isVector() ? VT.changeVectorElementType(MVT::f32) : MVT::f32;
21625 if (IsStrict)
21626 return DAG.getNode(Op.getOpcode(), dl, {VT, MVT::Other},
21627 {Chain, DAG.getNode(ISD::STRICT_FP_EXTEND, dl,
21628 {NVT, MVT::Other}, {Chain, Src})});
21629 return DAG.getNode(Op.getOpcode(), dl, VT,
21630 DAG.getNode(ISD::FP_EXTEND, dl, NVT, Src));
21631 } else if (isTypeLegal(SrcVT) &&
21632 isLegalConversion(VT, SrcVT, IsSigned, Subtarget)) {
21633 return Op;
21634 }
21635
21636 if (VT.isVector()) {
21637 if (VT == MVT::v2i1 && SrcVT == MVT::v2f64) {
21638 MVT ResVT = MVT::v4i32;
21639 MVT TruncVT = MVT::v4i1;
21640 unsigned Opc;
21641 if (IsStrict)
21642 Opc = IsSigned ? X86ISD::STRICT_CVTTP2SI : X86ISD::STRICT_CVTTP2UI;
21643 else
21644 Opc = IsSigned ? X86ISD::CVTTP2SI : X86ISD::CVTTP2UI;
21645
21646 if (!IsSigned && !HasVLX) {
21647 assert(Subtarget.useAVX512Regs() && "Unexpected features!");
21648 // Widen to 512-bits.
21649 ResVT = MVT::v8i32;
21650 TruncVT = MVT::v8i1;
21651 Opc = Op.getOpcode();
21652 // Need to concat with zero vector for strict fp to avoid spurious
21653 // exceptions.
21654 // TODO: Should we just do this for non-strict as well?
21655 SDValue Tmp = IsStrict ? DAG.getConstantFP(0.0, dl, MVT::v8f64)
21656 : DAG.getUNDEF(MVT::v8f64);
21657 Src = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, MVT::v8f64, Tmp, Src,
21658 DAG.getVectorIdxConstant(0, dl));
21659 }
21660 if (IsStrict) {
21661 Res = DAG.getNode(Opc, dl, {ResVT, MVT::Other}, {Chain, Src});
21662 Chain = Res.getValue(1);
21663 } else {
21664 Res = DAG.getNode(Opc, dl, ResVT, Src);
21665 }
21666
21667 Res = DAG.getNode(ISD::TRUNCATE, dl, TruncVT, Res);
21668 Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v2i1, Res,
21669 DAG.getVectorIdxConstant(0, dl));
21670 if (IsStrict)
21671 return DAG.getMergeValues({Res, Chain}, dl);
21672 return Res;
21673 }
21674
21675 if (Subtarget.hasFP16() && SrcVT.getVectorElementType() == MVT::f16) {
21676 if ((HasVLX && (VT == MVT::v8i16 || VT == MVT::v16i16)) ||
21677 VT == MVT::v32i16)
21678 return Op;
21679
21680 MVT ResVT = VT;
21681 MVT EleVT = VT.getVectorElementType();
21682 if (EleVT != MVT::i64)
21683 ResVT = EleVT == MVT::i32 ? MVT::v4i32 : MVT::v8i16;
21684
21685 if (SrcVT == MVT::v2f16 || SrcVT == MVT::v4f16) {
21686 SDValue Tmp =
21687 IsStrict ? DAG.getConstantFP(0.0, dl, SrcVT) : DAG.getUNDEF(SrcVT);
21688 SmallVector<SDValue, 4> Ops(SrcVT == MVT::v2f16 ? 4 : 2, Tmp);
21689 Ops[0] = Src;
21690 Src = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v8f16, Ops);
21691 }
21692
21693 if (!HasVLX) {
21694 assert(Subtarget.useAVX512Regs() && "Unexpected features!");
21695 // Widen to 512-bits.
21696 unsigned IntSize = EleVT.getSizeInBits();
21697 unsigned Num = IntSize > 16 ? 512 / IntSize : 32;
21698 ResVT = MVT::getVectorVT(EleVT, Num);
21699 Src = widenSubVector(MVT::getVectorVT(MVT::f16, Num), Src, IsStrict,
21700 Subtarget, DAG, dl);
21701 }
21702
21703 if (IsStrict) {
21704 Res = DAG.getNode(IsSigned ? X86ISD::STRICT_CVTTP2SI
21705 : X86ISD::STRICT_CVTTP2UI,
21706 dl, {ResVT, MVT::Other}, {Chain, Src});
21707 Chain = Res.getValue(1);
21708 } else {
21709 Res = DAG.getNode(IsSigned ? X86ISD::CVTTP2SI : X86ISD::CVTTP2UI, dl,
21710 ResVT, Src);
21711 }
21712
21713 // TODO: Need to add exception check code for strict FP.
21714 if (EleVT.getSizeInBits() < 16) {
21715 if (HasVLX)
21716 ResVT = MVT::getVectorVT(EleVT, 8);
21717 Res = DAG.getNode(ISD::TRUNCATE, dl, ResVT, Res);
21718 }
21719
21720 if (ResVT != VT)
21721 Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, Res,
21722 DAG.getVectorIdxConstant(0, dl));
21723
21724 if (IsStrict)
21725 return DAG.getMergeValues({Res, Chain}, dl);
21726 return Res;
21727 }
21728
21729 // v8f32/v16f32/v8f64->v8i16/v16i16 need to widen first.
21730 if (VT.getVectorElementType() == MVT::i16) {
21731 assert((SrcVT.getVectorElementType() == MVT::f32 ||
21732 SrcVT.getVectorElementType() == MVT::f64) &&
21733 "Expected f32/f64 vector!");
21734 MVT NVT = VT.changeVectorElementType(MVT::i32);
21735 if (IsStrict) {
21736 Res = DAG.getNode(IsSigned ? ISD::STRICT_FP_TO_SINT
21737 : ISD::STRICT_FP_TO_UINT,
21738 dl, {NVT, MVT::Other}, {Chain, Src});
21739 Chain = Res.getValue(1);
21740 } else {
21741 Res = DAG.getNode(IsSigned ? ISD::FP_TO_SINT : ISD::FP_TO_UINT, dl,
21742 NVT, Src);
21743 }
21744
21745 // TODO: Need to add exception check code for strict FP.
21746 Res = DAG.getNode(ISD::TRUNCATE, dl, VT, Res);
21747
21748 if (IsStrict)
21749 return DAG.getMergeValues({Res, Chain}, dl);
21750 return Res;
21751 }
21752
21753 // v8f64->v8i32 is legal, but we need v8i32 to be custom for v8f32.
21754 if (VT == MVT::v8i32 && SrcVT == MVT::v8f64) {
21755 assert(!IsSigned && "Expected unsigned conversion!");
21756 assert(Subtarget.useAVX512Regs() && "Requires avx512f");
21757 return Op;
21758 }
21759
21760 // Widen vXi32 fp_to_uint with avx512f to 512-bit source.
21761 if ((VT == MVT::v4i32 || VT == MVT::v8i32) &&
21762 (SrcVT == MVT::v4f64 || SrcVT == MVT::v4f32 || SrcVT == MVT::v8f32) &&
21763 Subtarget.useAVX512Regs()) {
21764 assert(!IsSigned && "Expected unsigned conversion!");
21765 assert(!Subtarget.hasVLX() && "Unexpected features!");
21766 MVT WideVT = SrcVT == MVT::v4f64 ? MVT::v8f64 : MVT::v16f32;
21767 MVT ResVT = SrcVT == MVT::v4f64 ? MVT::v8i32 : MVT::v16i32;
21768 // Need to concat with zero vector for strict fp to avoid spurious
21769 // exceptions.
21770 // TODO: Should we just do this for non-strict as well?
21771 SDValue Tmp =
21772 IsStrict ? DAG.getConstantFP(0.0, dl, WideVT) : DAG.getUNDEF(WideVT);
21773 Src = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideVT, Tmp, Src,
21774 DAG.getVectorIdxConstant(0, dl));
21775
21776 if (IsStrict) {
21777 Res = DAG.getNode(ISD::STRICT_FP_TO_UINT, dl, {ResVT, MVT::Other},
21778 {Chain, Src});
21779 Chain = Res.getValue(1);
21780 } else {
21781 Res = DAG.getNode(ISD::FP_TO_UINT, dl, ResVT, Src);
21782 }
21783
21784 Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, Res,
21785 DAG.getVectorIdxConstant(0, dl));
21786
21787 if (IsStrict)
21788 return DAG.getMergeValues({Res, Chain}, dl);
21789 return Res;
21790 }
21791
21792 // Widen vXi64 fp_to_uint/fp_to_sint with avx512dq to 512-bit source.
21793 if ((VT == MVT::v2i64 || VT == MVT::v4i64) &&
21794 (SrcVT == MVT::v2f64 || SrcVT == MVT::v4f64 || SrcVT == MVT::v4f32) &&
21795 Subtarget.useAVX512Regs() && Subtarget.hasDQI()) {
21796 assert(!Subtarget.hasVLX() && "Unexpected features!");
21797 MVT WideVT = SrcVT == MVT::v4f32 ? MVT::v8f32 : MVT::v8f64;
21798 // Need to concat with zero vector for strict fp to avoid spurious
21799 // exceptions.
21800 // TODO: Should we just do this for non-strict as well?
21801 SDValue Tmp =
21802 IsStrict ? DAG.getConstantFP(0.0, dl, WideVT) : DAG.getUNDEF(WideVT);
21803 Src = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideVT, Tmp, Src,
21804 DAG.getVectorIdxConstant(0, dl));
21805
21806 if (IsStrict) {
21807 Res = DAG.getNode(Op.getOpcode(), dl, {MVT::v8i64, MVT::Other},
21808 {Chain, Src});
21809 Chain = Res.getValue(1);
21810 } else {
21811 Res = DAG.getNode(Op.getOpcode(), dl, MVT::v8i64, Src);
21812 }
21813
21814 Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, Res,
21815 DAG.getVectorIdxConstant(0, dl));
21816
21817 if (IsStrict)
21818 return DAG.getMergeValues({Res, Chain}, dl);
21819 return Res;
21820 }
21821
21822 if (VT == MVT::v2i64 && SrcVT == MVT::v2f32) {
21823 if (!Subtarget.hasVLX()) {
21824 // Non-strict nodes without VLX can be widened to v4f32->v4i64 by the type
21825 // legalizer and then widened again by vector op legalization.
21826 if (!IsStrict)
21827 return SDValue();
21828
21829 SDValue Zero = DAG.getConstantFP(0.0, dl, MVT::v2f32);
21830 SDValue Tmp = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v8f32,
21831 {Src, Zero, Zero, Zero});
21832 Tmp = DAG.getNode(Op.getOpcode(), dl, {MVT::v8i64, MVT::Other},
21833 {Chain, Tmp});
21834 SDValue Chain = Tmp.getValue(1);
21835 Tmp = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v2i64, Tmp,
21836 DAG.getVectorIdxConstant(0, dl));
21837 return DAG.getMergeValues({Tmp, Chain}, dl);
21838 }
21839
21840 assert(Subtarget.hasDQI() && Subtarget.hasVLX() && "Requires AVX512DQVL");
21841 SDValue Tmp = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f32, Src,
21842 DAG.getUNDEF(MVT::v2f32));
21843 if (IsStrict) {
21844 unsigned Opc = IsSigned ? X86ISD::STRICT_CVTTP2SI
21845 : X86ISD::STRICT_CVTTP2UI;
21846 return DAG.getNode(Opc, dl, {VT, MVT::Other}, {Op->getOperand(0), Tmp});
21847 }
21848 unsigned Opc = IsSigned ? X86ISD::CVTTP2SI : X86ISD::CVTTP2UI;
21849 return DAG.getNode(Opc, dl, VT, Tmp);
21850 }
21851
21852 // Generate optimized instructions for pre-AVX512 unsigned conversions from
21853 // vXf32/vXf64 to vXi32.
21854 if ((VT == MVT::v4i32 && SrcVT == MVT::v4f32) ||
21855 (VT == MVT::v4i32 && SrcVT == MVT::v4f64) ||
21856 (VT == MVT::v8i32 && SrcVT == MVT::v8f32)) {
21857 assert(!IsSigned && "Expected unsigned conversion!");
21858 return expandFP_TO_UINT_SSE(VT, Src, dl, DAG, Subtarget);
21859 }
21860
21861 return SDValue();
21862 }
21863
21864 assert(!VT.isVector());
21865
21866 bool UseSSEReg = isScalarFPTypeInSSEReg(SrcVT);
21867
21868 if (!IsSigned && UseSSEReg) {
21869 // Conversions from f32/f64 with AVX512 should be legal.
21870 if (Subtarget.hasAVX512())
21871 return Op;
21872
21873 // We can leverage the specific way the "cvttss2si/cvttsd2si" instruction
21874 // behaves on out of range inputs to generate optimized conversions.
21875 if (!IsStrict && ((VT == MVT::i32 && !Subtarget.is64Bit()) ||
21876 (VT == MVT::i64 && Subtarget.is64Bit()))) {
21877 unsigned DstBits = VT.getScalarSizeInBits();
21878 APInt UIntLimit = APInt::getSignMask(DstBits);
21879 SDValue FloatOffset = DAG.getNode(ISD::UINT_TO_FP, dl, SrcVT,
21880 DAG.getConstant(UIntLimit, dl, VT));
21881 MVT SrcVecVT = MVT::getVectorVT(SrcVT, 128 / SrcVT.getScalarSizeInBits());
21882
21883 // Calculate the converted result for values in the range:
21884 // (i32) 0 to 2^31-1 ("Small") and from 2^31 to 2^32-1 ("Big").
21885 // (i64) 0 to 2^63-1 ("Small") and from 2^63 to 2^64-1 ("Big").
21886 SDValue Small =
21887 DAG.getNode(X86ISD::CVTTS2SI, dl, VT,
21888 DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, SrcVecVT, Src));
21889 SDValue Big = DAG.getNode(
21890 X86ISD::CVTTS2SI, dl, VT,
21891 DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, SrcVecVT,
21892 DAG.getNode(ISD::FSUB, dl, SrcVT, Src, FloatOffset)));
21893
21894 // The "CVTTS2SI" instruction conveniently sets the sign bit if
21895 // and only if the value was out of range. So we can use that
21896 // as our indicator that we should use "Big" instead of "Small".
21897 //
21898 // Use "Small" if "IsOverflown" has all bits cleared
21899 // and "0x80000000 | Big" if all bits in "IsOverflown" are set.
21900 SDValue IsOverflown = DAG.getNode(
21901 ISD::SRA, dl, VT, Small, DAG.getConstant(DstBits - 1, dl, MVT::i8));
21902 return DAG.getNode(ISD::OR, dl, VT, Small,
21903 DAG.getNode(ISD::AND, dl, VT, Big, IsOverflown));
21904 }
21905
21906 // Use default expansion for i64.
21907 if (VT == MVT::i64)
21908 return SDValue();
21909
21910 assert(VT == MVT::i32 && "Unexpected VT!");
21911
21912 // Promote i32 to i64 and use a signed operation on 64-bit targets.
21913 // FIXME: This does not generate an invalid exception if the input does not
21914 // fit in i32. PR44019
21915 if (Subtarget.is64Bit()) {
21916 if (IsStrict) {
21917 Res = DAG.getNode(ISD::STRICT_FP_TO_SINT, dl, {MVT::i64, MVT::Other},
21918 {Chain, Src});
21919 Chain = Res.getValue(1);
21920 } else
21921 Res = DAG.getNode(ISD::FP_TO_SINT, dl, MVT::i64, Src);
21922
21923 Res = DAG.getNode(ISD::TRUNCATE, dl, VT, Res);
21924 if (IsStrict)
21925 return DAG.getMergeValues({Res, Chain}, dl);
21926 return Res;
21927 }
21928
21929 // Use default expansion for SSE1/2 targets without SSE3. With SSE3 we can
21930 // use fisttp which will be handled later.
21931 if (!Subtarget.hasSSE3())
21932 return SDValue();
21933 }
21934
21935 // Promote i16 to i32 if we can use a SSE operation or the type is f128.
21936 // FIXME: This does not generate an invalid exception if the input does not
21937 // fit in i16. PR44019
21938 if (VT == MVT::i16 && (UseSSEReg || SrcVT == MVT::f128)) {
21939 assert(IsSigned && "Expected i16 FP_TO_UINT to have been promoted!");
21940 if (IsStrict) {
21941 Res = DAG.getNode(ISD::STRICT_FP_TO_SINT, dl, {MVT::i32, MVT::Other},
21942 {Chain, Src});
21943 Chain = Res.getValue(1);
21944 } else
21945 Res = DAG.getNode(ISD::FP_TO_SINT, dl, MVT::i32, Src);
21946
21947 Res = DAG.getNode(ISD::TRUNCATE, dl, VT, Res);
21948 if (IsStrict)
21949 return DAG.getMergeValues({Res, Chain}, dl);
21950 return Res;
21951 }
21952
21953 // If this is a FP_TO_SINT using SSEReg we're done.
21954 if (UseSSEReg && IsSigned)
21955 return Op;
21956
21957 // fp128 needs to use a libcall.
21958 if (SrcVT == MVT::f128) {
21959 RTLIB::Libcall LC;
21960 if (IsSigned)
21961 LC = RTLIB::getFPTOSINT(SrcVT, VT);
21962 else
21963 LC = RTLIB::getFPTOUINT(SrcVT, VT);
21964
21965 MakeLibCallOptions CallOptions;
21966 std::pair<SDValue, SDValue> Tmp =
21967 makeLibCall(DAG, LC, VT, Src, CallOptions, dl, Chain);
21968
21969 if (IsStrict)
21970 return DAG.getMergeValues({ Tmp.first, Tmp.second }, dl);
21971
21972 return Tmp.first;
21973 }
21974
21975 // Fall back to X87.
21976 if (SDValue V = FP_TO_INTHelper(Op, DAG, IsSigned, Chain)) {
21977 if (IsStrict)
21978 return DAG.getMergeValues({V, Chain}, dl);
21979 return V;
21980 }
21981
21982 llvm_unreachable("Expected FP_TO_INTHelper to handle all remaining cases.");
21983}
21984
21985SDValue X86TargetLowering::LowerLRINT_LLRINT(SDValue Op,
21986 SelectionDAG &DAG) const {
21987 SDValue Src = Op.getOperand(0);
21988 EVT DstVT = Op.getSimpleValueType();
21989 MVT SrcVT = Src.getSimpleValueType();
21990
21991 if (SrcVT.isVector())
21992 return DstVT.getScalarType() == MVT::i32 ? Op : SDValue();
21993
21994 if (SrcVT == MVT::f16)
21995 return SDValue();
21996
21997 // If the source is in an SSE register, the node is Legal.
21998 if (isScalarFPTypeInSSEReg(SrcVT))
21999 return Op;
22000
22001 return LRINT_LLRINTHelper(Op.getNode(), DAG);
22002}
22003
22004SDValue X86TargetLowering::LRINT_LLRINTHelper(SDNode *N,
22005 SelectionDAG &DAG) const {
22006 EVT DstVT = N->getValueType(0);
22007 SDValue Src = N->getOperand(0);
22008 EVT SrcVT = Src.getValueType();
22009
22010 if (SrcVT != MVT::f32 && SrcVT != MVT::f64 && SrcVT != MVT::f80) {
22011 // f16 must be promoted before using the lowering in this routine.
22012 // fp128 does not use this lowering.
22013 return SDValue();
22014 }
22015
22016 SDLoc DL(N);
22017 SDValue Chain = DAG.getEntryNode();
22018
22019 bool UseSSE = isScalarFPTypeInSSEReg(SrcVT);
22020
22021 // If we're converting from SSE, the stack slot needs to hold both types.
22022 // Otherwise it only needs to hold the DstVT.
22023 EVT OtherVT = UseSSE ? SrcVT : DstVT;
22024 SDValue StackPtr = DAG.CreateStackTemporary(DstVT, OtherVT);
22025 int SPFI = cast<FrameIndexSDNode>(StackPtr.getNode())->getIndex();
22026 MachinePointerInfo MPI =
22027 MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SPFI);
22028
22029 if (UseSSE) {
22030 assert(DstVT == MVT::i64 && "Invalid LRINT/LLRINT to lower!");
22031 Chain = DAG.getStore(Chain, DL, Src, StackPtr, MPI);
22032 SDVTList Tys = DAG.getVTList(MVT::f80, MVT::Other);
22033 SDValue Ops[] = { Chain, StackPtr };
22034
22035 Src = DAG.getMemIntrinsicNode(X86ISD::FLD, DL, Tys, Ops, SrcVT, MPI,
22036 /*Align*/ std::nullopt,
22037 MachineMemOperand::MOLoad);
22038 Chain = Src.getValue(1);
22039 }
22040
22041 SDValue StoreOps[] = { Chain, Src, StackPtr };
22042 Chain = DAG.getMemIntrinsicNode(X86ISD::FIST, DL, DAG.getVTList(MVT::Other),
22043 StoreOps, DstVT, MPI, /*Align*/ std::nullopt,
22044 MachineMemOperand::MOStore);
22045
22046 return DAG.getLoad(DstVT, DL, Chain, StackPtr, MPI);
22047}
22048
22049SDValue
22050X86TargetLowering::LowerFP_TO_INT_SAT(SDValue Op, SelectionDAG &DAG) const {
22051 // This is based on the TargetLowering::expandFP_TO_INT_SAT implementation,
22052 // but making use of X86 specifics to produce better instruction sequences.
22053 SDNode *Node = Op.getNode();
22054 bool IsSigned = Node->getOpcode() == ISD::FP_TO_SINT_SAT;
22055 unsigned FpToIntOpcode = IsSigned ? ISD::FP_TO_SINT : ISD::FP_TO_UINT;
22056 SDLoc dl(SDValue(Node, 0));
22057 SDValue Src = Node->getOperand(0);
22058
22059 // There are three types involved here: SrcVT is the source floating point
22060 // type, DstVT is the type of the result, and TmpVT is the result of the
22061 // intermediate FP_TO_*INT operation we'll use (which may be a promotion of
22062 // DstVT).
22063 EVT SrcVT = Src.getValueType();
22064 EVT DstVT = Node->getValueType(0);
22065 EVT TmpVT = DstVT;
22066
22067 // This code is only for floats and doubles. Fall back to generic code for
22068 // anything else.
22069 if (!isScalarFPTypeInSSEReg(SrcVT) || isSoftF16(SrcVT, Subtarget))
22070 return SDValue();
22071
22072 EVT SatVT = cast<VTSDNode>(Node->getOperand(1))->getVT();
22073 unsigned SatWidth = SatVT.getScalarSizeInBits();
22074 unsigned DstWidth = DstVT.getScalarSizeInBits();
22075 unsigned TmpWidth = TmpVT.getScalarSizeInBits();
22076 assert(SatWidth <= DstWidth && SatWidth <= TmpWidth &&
22077 "Expected saturation width smaller than result width");
22078
22079 // Promote result of FP_TO_*INT to at least 32 bits.
22080 if (TmpWidth < 32) {
22081 TmpVT = MVT::i32;
22082 TmpWidth = 32;
22083 }
22084
22085 // Promote conversions to unsigned 32-bit to 64-bit, because it will allow
22086 // us to use a native signed conversion instead.
22087 if (SatWidth == 32 && !IsSigned && Subtarget.is64Bit()) {
22088 TmpVT = MVT::i64;
22089 TmpWidth = 64;
22090 }
22091
22092 // If the saturation width is smaller than the size of the temporary result,
22093 // we can always use signed conversion, which is native.
22094 if (SatWidth < TmpWidth)
22095 FpToIntOpcode = ISD::FP_TO_SINT;
22096
22097 // Determine minimum and maximum integer values and their corresponding
22098 // floating-point values.
22099 APInt MinInt, MaxInt;
22100 if (IsSigned) {
22101 MinInt = APInt::getSignedMinValue(SatWidth).sext(DstWidth);
22102 MaxInt = APInt::getSignedMaxValue(SatWidth).sext(DstWidth);
22103 } else {
22104 MinInt = APInt::getMinValue(SatWidth).zext(DstWidth);
22105 MaxInt = APInt::getMaxValue(SatWidth).zext(DstWidth);
22106 }
22107
22108 const fltSemantics &Sem = SrcVT.getFltSemantics();
22109 APFloat MinFloat(Sem);
22110 APFloat MaxFloat(Sem);
22111
22112 APFloat::opStatus MinStatus = MinFloat.convertFromAPInt(
22113 MinInt, IsSigned, APFloat::rmTowardZero);
22114 APFloat::opStatus MaxStatus = MaxFloat.convertFromAPInt(
22115 MaxInt, IsSigned, APFloat::rmTowardZero);
22116 bool AreExactFloatBounds = !(MinStatus & APFloat::opStatus::opInexact)
22117 && !(MaxStatus & APFloat::opStatus::opInexact);
22118
22119 SDValue MinFloatNode = DAG.getConstantFP(MinFloat, dl, SrcVT);
22120 SDValue MaxFloatNode = DAG.getConstantFP(MaxFloat, dl, SrcVT);
22121
22122 // If the integer bounds are exactly representable as floats, emit a
22123 // min+max+fptoi sequence. Otherwise use comparisons and selects.
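 // For example, with an f32 source and signed i32 saturation, -2^31 is exactly
 // representable in f32 but 2^31-1 is not, so the compare+select path below is
 // used; with an f64 source both bounds are exact and the clamp+convert path
 // is taken.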
22124 if (AreExactFloatBounds) {
22125 if (DstVT != TmpVT) {
22126 // Clamp by MinFloat from below. If Src is NaN, propagate NaN.
22127 SDValue MinClamped = DAG.getNode(
22128 X86ISD::FMAX, dl, SrcVT, MinFloatNode, Src);
22129 // Clamp by MaxFloat from above. If Src is NaN, propagate NaN.
22130 SDValue BothClamped = DAG.getNode(
22131 X86ISD::FMIN, dl, SrcVT, MaxFloatNode, MinClamped);
22132 // Convert clamped value to integer.
22133 SDValue FpToInt = DAG.getNode(FpToIntOpcode, dl, TmpVT, BothClamped);
22134
22135 // NaN will become INDVAL, with the top bit set and the rest zero.
22136 // Truncation will discard the top bit, resulting in zero.
22137 return DAG.getNode(ISD::TRUNCATE, dl, DstVT, FpToInt);
22138 }
22139
22140 // Clamp by MinFloat from below. If Src is NaN, the result is MinFloat.
22141 SDValue MinClamped = DAG.getNode(
22142 X86ISD::FMAX, dl, SrcVT, Src, MinFloatNode);
22143 // Clamp by MaxFloat from above. NaN cannot occur.
22144 SDValue BothClamped = DAG.getNode(
22145 X86ISD::FMINC, dl, SrcVT, MinClamped, MaxFloatNode);
22146 // Convert clamped value to integer.
22147 SDValue FpToInt = DAG.getNode(FpToIntOpcode, dl, DstVT, BothClamped);
22148
22149 if (!IsSigned) {
22150 // In the unsigned case we're done, because we mapped NaN to MinFloat,
22151 // which is zero.
22152 return FpToInt;
22153 }
22154
22155 // Otherwise, select zero if Src is NaN.
22156 SDValue ZeroInt = DAG.getConstant(0, dl, DstVT);
22157 return DAG.getSelectCC(
22158 dl, Src, Src, ZeroInt, FpToInt, ISD::CondCode::SETUO);
22159 }
22160
22161 SDValue MinIntNode = DAG.getConstant(MinInt, dl, DstVT);
22162 SDValue MaxIntNode = DAG.getConstant(MaxInt, dl, DstVT);
22163
22164 // Result of direct conversion, which may be selected away.
22165 SDValue FpToInt = DAG.getNode(FpToIntOpcode, dl, TmpVT, Src);
22166
22167 if (DstVT != TmpVT) {
22168 // NaN will become INDVAL, with the top bit set and the rest zero.
22169 // Truncation will discard the top bit, resulting in zero.
22170 FpToInt = DAG.getNode(ISD::TRUNCATE, dl, DstVT, FpToInt);
22171 }
22172
22173 SDValue Select = FpToInt;
22174 // For signed conversions where we saturate to the same size as the
22175 // result type of the fptoi instructions, INDVAL coincides with integer
22176 // minimum, so we don't need to explicitly check it.
22177 if (!IsSigned || SatWidth != TmpVT.getScalarSizeInBits()) {
22178 // If Src ULT MinFloat, select MinInt. In particular, this also selects
22179 // MinInt if Src is NaN.
22180 Select = DAG.getSelectCC(
22181 dl, Src, MinFloatNode, MinIntNode, Select, ISD::CondCode::SETULT);
22182 }
22183
22184 // If Src OGT MaxFloat, select MaxInt.
22185 Select = DAG.getSelectCC(
22186 dl, Src, MaxFloatNode, MaxIntNode, Select, ISD::CondCode::SETOGT);
22187
22188 // In the unsigned case we are done, because we mapped NaN to MinInt, which
22189 // is already zero. The promoted case was already handled above.
22190 if (!IsSigned || DstVT != TmpVT) {
22191 return Select;
22192 }
22193
22194 // Otherwise, select 0 if Src is NaN.
22195 SDValue ZeroInt = DAG.getConstant(0, dl, DstVT);
22196 return DAG.getSelectCC(
22197 dl, Src, Src, ZeroInt, Select, ISD::CondCode::SETUO);
22198}
22199
22200SDValue X86TargetLowering::LowerFP_EXTEND(SDValue Op, SelectionDAG &DAG) const {
22201 bool IsStrict = Op->isStrictFPOpcode();
22202
22203 SDLoc DL(Op);
22204 MVT VT = Op.getSimpleValueType();
22205 SDValue Chain = IsStrict ? Op.getOperand(0) : SDValue();
22206 SDValue In = Op.getOperand(IsStrict ? 1 : 0);
22207 MVT SVT = In.getSimpleValueType();
22208
22209 // Let f16->f80 get lowered to a libcall, except for darwin, where we should
22210 // lower it to an fp_extend via f32 (as only f16<>f32 libcalls are available)
22211 if (VT == MVT::f128 || (SVT == MVT::f16 && VT == MVT::f80 &&
22212 !Subtarget.getTargetTriple().isOSDarwin()))
22213 return SDValue();
22214
22215 if ((SVT == MVT::v8f16 && Subtarget.hasF16C()) ||
22216 (SVT == MVT::v16f16 && Subtarget.useAVX512Regs()))
22217 return Op;
22218
22219 if (SVT == MVT::f16) {
22220 if (Subtarget.hasFP16())
22221 return Op;
22222
22223 if (VT != MVT::f32) {
22224 if (IsStrict)
22225 return DAG.getNode(
22226 ISD::STRICT_FP_EXTEND, DL, {VT, MVT::Other},
22227 {Chain, DAG.getNode(ISD::STRICT_FP_EXTEND, DL,
22228 {MVT::f32, MVT::Other}, {Chain, In})});
22229
22230 return DAG.getNode(ISD::FP_EXTEND, DL, VT,
22231 DAG.getNode(ISD::FP_EXTEND, DL, MVT::f32, In));
22232 }
22233
22234 if (!Subtarget.hasF16C()) {
22235 if (!Subtarget.getTargetTriple().isOSDarwin())
22236 return SDValue();
22237
22238 assert(VT == MVT::f32 && SVT == MVT::f16 && "unexpected extend libcall");
22239
22240 // Need a libcall, but ABI for f16 is soft-float on MacOS.
22241 TargetLowering::CallLoweringInfo CLI(DAG);
22242 Chain = IsStrict ? Op.getOperand(0) : DAG.getEntryNode();
22243
22244 In = DAG.getBitcast(MVT::i16, In);
22245 TargetLowering::ArgListTy Args;
22246 TargetLowering::ArgListEntry Entry(
22247 In, EVT(MVT::i16).getTypeForEVT(*DAG.getContext()));
22248 Entry.IsSExt = false;
22249 Entry.IsZExt = true;
22250 Args.push_back(Entry);
22251
22252 SDValue Callee = DAG.getExternalSymbol(
22253 getLibcallName(RTLIB::FPEXT_F16_F32),
22254 getPointerTy(DAG.getDataLayout()));
22255 CLI.setDebugLoc(DL).setChain(Chain).setLibCallee(
22256 CallingConv::C, EVT(VT).getTypeForEVT(*DAG.getContext()), Callee,
22257 std::move(Args));
22258
22259 SDValue Res;
22260 std::tie(Res,Chain) = LowerCallTo(CLI);
22261 if (IsStrict)
22262 Res = DAG.getMergeValues({Res, Chain}, DL);
22263
22264 return Res;
22265 }
22266
22267 In = DAG.getBitcast(MVT::i16, In);
22268 SDValue Res;
22269 if (IsStrict) {
22270 In = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, MVT::v8i16,
22271 getZeroVector(MVT::v8i16, Subtarget, DAG, DL), In,
22272 DAG.getVectorIdxConstant(0, DL));
22273 Res = DAG.getNode(X86ISD::STRICT_CVTPH2PS, DL, {MVT::v4f32, MVT::Other},
22274 {Chain, In});
22275 Chain = Res.getValue(1);
22276 } else {
22277 In = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, In);
22278 In = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, MVT::v4i32,
22279 DAG.getUNDEF(MVT::v4i32), In,
22280 DAG.getVectorIdxConstant(0, DL));
22281 In = DAG.getBitcast(MVT::v8i16, In);
22282 Res = DAG.getNode(X86ISD::CVTPH2PS, DL, MVT::v4f32, In,
22283 DAG.getTargetConstant(4, DL, MVT::i32));
22284 }
22285 Res = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, Res,
22286 DAG.getVectorIdxConstant(0, DL));
22287 if (IsStrict)
22288 return DAG.getMergeValues({Res, Chain}, DL);
22289 return Res;
22290 }
22291
22292 if (!SVT.isVector() || SVT.getVectorElementType() == MVT::bf16)
22293 return Op;
22294
22295 if (SVT.getVectorElementType() == MVT::f16) {
22296 if (Subtarget.hasFP16() && isTypeLegal(SVT))
22297 return Op;
22298 assert(Subtarget.hasF16C() && "Unexpected features!");
22299 if (SVT == MVT::v2f16)
22300 In = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v4f16, In,
22301 DAG.getUNDEF(MVT::v2f16));
22302 SDValue Res = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v8f16, In,
22303 DAG.getUNDEF(MVT::v4f16));
22304 if (IsStrict)
22305 return DAG.getNode(X86ISD::STRICT_VFPEXT, DL, {VT, MVT::Other},
22306 {Op->getOperand(0), Res});
22307 return DAG.getNode(X86ISD::VFPEXT, DL, VT, Res);
22308 } else if (VT == MVT::v4f64 || VT == MVT::v8f64) {
22309 return Op;
22310 }
22311
22312 assert(SVT == MVT::v2f32 && "Only customize MVT::v2f32 type legalization!");
22313
22314 SDValue Res =
22315 DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v4f32, In, DAG.getUNDEF(SVT));
22316 if (IsStrict)
22317 return DAG.getNode(X86ISD::STRICT_VFPEXT, DL, {VT, MVT::Other},
22318 {Op->getOperand(0), Res});
22319 return DAG.getNode(X86ISD::VFPEXT, DL, VT, Res);
22320}
22321
22322SDValue X86TargetLowering::LowerFP_ROUND(SDValue Op, SelectionDAG &DAG) const {
22323 bool IsStrict = Op->isStrictFPOpcode();
22324
22325 SDLoc DL(Op);
22326 SDValue Chain = IsStrict ? Op.getOperand(0) : SDValue();
22327 SDValue In = Op.getOperand(IsStrict ? 1 : 0);
22328 MVT VT = Op.getSimpleValueType();
22329 MVT SVT = In.getSimpleValueType();
22330
22331 if (SVT == MVT::f128 || (VT == MVT::f16 && SVT == MVT::f80))
22332 return SDValue();
22333
22334 if (VT == MVT::f16 && (SVT == MVT::f64 || SVT == MVT::f32) &&
22335 !Subtarget.hasFP16() && (SVT == MVT::f64 || !Subtarget.hasF16C())) {
22336 if (!Subtarget.getTargetTriple().isOSDarwin())
22337 return SDValue();
22338
22339 // We need a libcall but the ABI for f16 libcalls on MacOS is soft.
22340 TargetLowering::CallLoweringInfo CLI(DAG);
22341 Chain = IsStrict ? Op.getOperand(0) : DAG.getEntryNode();
22342
22343 TargetLowering::ArgListTy Args;
22344 TargetLowering::ArgListEntry Entry(
22345 In, EVT(SVT).getTypeForEVT(*DAG.getContext()));
22346 Entry.IsSExt = false;
22347 Entry.IsZExt = true;
22348 Args.push_back(Entry);
22349
22350 SDValue Callee = DAG.getExternalSymbol(
22351 getLibcallName(SVT == MVT::f64 ? RTLIB::FPROUND_F64_F16
22352 : RTLIB::FPROUND_F32_F16),
22353 getPointerTy(DAG.getDataLayout()));
22354 CLI.setDebugLoc(DL).setChain(Chain).setLibCallee(
22355 CallingConv::C, EVT(MVT::i16).getTypeForEVT(*DAG.getContext()), Callee,
22356 std::move(Args));
22357
22358 SDValue Res;
22359 std::tie(Res, Chain) = LowerCallTo(CLI);
22360
22361 Res = DAG.getBitcast(MVT::f16, Res);
22362
22363 if (IsStrict)
22364 Res = DAG.getMergeValues({Res, Chain}, DL);
22365
22366 return Res;
22367 }
22368
22369 if (VT.getScalarType() == MVT::bf16) {
22370 if (SVT.getScalarType() == MVT::f32 &&
22371 ((Subtarget.hasBF16() && Subtarget.hasVLX()) ||
22372 Subtarget.hasAVXNECONVERT()))
22373 return Op;
22374 return SDValue();
22375 }
22376
22377 if (VT.getScalarType() == MVT::f16 && !Subtarget.hasFP16()) {
22378 if (!Subtarget.hasF16C() || SVT.getScalarType() != MVT::f32)
22379 return SDValue();
22380
22381 if (VT.isVector())
22382 return Op;
22383
22384 SDValue Res;
22385 SDValue Rnd = DAG.getTargetConstant(X86::STATIC_ROUNDING::CUR_DIRECTION, DL,
22386 MVT::i32);
22387 if (IsStrict) {
22388 Res = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, MVT::v4f32,
22389 DAG.getConstantFP(0, DL, MVT::v4f32), In,
22390 DAG.getVectorIdxConstant(0, DL));
22391 Res = DAG.getNode(X86ISD::STRICT_CVTPS2PH, DL, {MVT::v8i16, MVT::Other},
22392 {Chain, Res, Rnd});
22393 Chain = Res.getValue(1);
22394 } else {
22395 // FIXME: Should we use zeros for upper elements for non-strict?
22396 Res = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v4f32, In);
22397 Res = DAG.getNode(X86ISD::CVTPS2PH, DL, MVT::v8i16, Res, Rnd);
22398 }
22399
22400 Res = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i16, Res,
22401 DAG.getVectorIdxConstant(0, DL));
22402 Res = DAG.getBitcast(MVT::f16, Res);
22403
22404 if (IsStrict)
22405 return DAG.getMergeValues({Res, Chain}, DL);
22406
22407 return Res;
22408 }
22409
22410 return Op;
22411}
22412
22413static SDValue LowerFP16_TO_FP(SDValue Op, SelectionDAG &DAG) {
22414 bool IsStrict = Op->isStrictFPOpcode();
22415 SDValue Src = Op.getOperand(IsStrict ? 1 : 0);
22416 assert(Src.getValueType() == MVT::i16 && Op.getValueType() == MVT::f32 &&
22417 "Unexpected VT!");
22418
22419 SDLoc dl(Op);
22420 SDValue Res = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v8i16,
22421 DAG.getConstant(0, dl, MVT::v8i16), Src,
22422 DAG.getVectorIdxConstant(0, dl));
22423
22424 SDValue Chain;
22425 if (IsStrict) {
22426 Res = DAG.getNode(X86ISD::STRICT_CVTPH2PS, dl, {MVT::v4f32, MVT::Other},
22427 {Op.getOperand(0), Res});
22428 Chain = Res.getValue(1);
22429 } else {
22430 Res = DAG.getNode(X86ISD::CVTPH2PS, dl, MVT::v4f32, Res);
22431 }
22432
22433 Res = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f32, Res,
22434 DAG.getVectorIdxConstant(0, dl));
22435
22436 if (IsStrict)
22437 return DAG.getMergeValues({Res, Chain}, dl);
22438
22439 return Res;
22440}
22441
22442static SDValue LowerFP_TO_FP16(SDValue Op, SelectionDAG &DAG) {
22443 bool IsStrict = Op->isStrictFPOpcode();
22444 SDValue Src = Op.getOperand(IsStrict ? 1 : 0);
22445 assert(Src.getValueType() == MVT::f32 && Op.getValueType() == MVT::i16 &&
22446 "Unexpected VT!");
22447
22448 SDLoc dl(Op);
22449 SDValue Res, Chain;
22450 if (IsStrict) {
22451 Res = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v4f32,
22452 DAG.getConstantFP(0, dl, MVT::v4f32), Src,
22453 DAG.getVectorIdxConstant(0, dl));
22454 Res = DAG.getNode(
22455 X86ISD::STRICT_CVTPS2PH, dl, {MVT::v8i16, MVT::Other},
22456 {Op.getOperand(0), Res, DAG.getTargetConstant(4, dl, MVT::i32)});
22457 Chain = Res.getValue(1);
22458 } else {
22459 // FIXME: Should we use zeros for upper elements for non-strict?
22460 Res = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4f32, Src);
22461 Res = DAG.getNode(X86ISD::CVTPS2PH, dl, MVT::v8i16, Res,
22462 DAG.getTargetConstant(4, dl, MVT::i32));
22463 }
22464
22465 Res = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i16, Res,
22466 DAG.getVectorIdxConstant(0, dl));
22467
22468 if (IsStrict)
22469 return DAG.getMergeValues({Res, Chain}, dl);
22470
22471 return Res;
22472}
22473
22474SDValue X86TargetLowering::LowerFP_TO_BF16(SDValue Op,
22475 SelectionDAG &DAG) const {
22476 SDLoc DL(Op);
22477
22478 MVT SVT = Op.getOperand(0).getSimpleValueType();
22479 if (SVT == MVT::f32 && ((Subtarget.hasBF16() && Subtarget.hasVLX()) ||
22480 Subtarget.hasAVXNECONVERT())) {
22481 SDValue Res;
22482 Res = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v4f32, Op.getOperand(0));
22483 Res = DAG.getNode(X86ISD::CVTNEPS2BF16, DL, MVT::v8bf16, Res);
22484 Res = DAG.getBitcast(MVT::v8i16, Res);
22485 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i16, Res,
22486 DAG.getVectorIdxConstant(0, DL));
22487 }
22488
22489 MakeLibCallOptions CallOptions;
22490 RTLIB::Libcall LC = RTLIB::getFPROUND(SVT, MVT::bf16);
22491 SDValue Res =
22492 makeLibCall(DAG, LC, MVT::f16, Op.getOperand(0), CallOptions, DL).first;
22493 return DAG.getBitcast(MVT::i16, Res);
22494}
22495
22496/// Depending on uarch and/or optimizing for size, we might prefer to use a
22497/// vector operation in place of the typical scalar operation.
22498static SDValue lowerAddSubToHorizontalOp(SDValue Op, const SDLoc &DL,
22499 SelectionDAG &DAG,
22500 const X86Subtarget &Subtarget) {
22501 // If both operands have other uses, this is probably not profitable.
22502 SDValue LHS = Op.getOperand(0);
22503 SDValue RHS = Op.getOperand(1);
22504 if (!LHS.hasOneUse() && !RHS.hasOneUse())
22505 return Op;
22506
22507 // FP horizontal add/sub were added with SSE3. Integer with SSSE3.
22508 bool IsFP = Op.getSimpleValueType().isFloatingPoint();
22509 if (IsFP && !Subtarget.hasSSE3())
22510 return Op;
22511 if (!IsFP && !Subtarget.hasSSSE3())
22512 return Op;
22513
22514 // Extract from a common vector.
22515 if (LHS.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
22516 RHS.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
22517 LHS.getOperand(0) != RHS.getOperand(0) ||
22518 !isa<ConstantSDNode>(LHS.getOperand(1)) ||
22519 !isa<ConstantSDNode>(RHS.getOperand(1)) ||
22520 !shouldUseHorizontalOp(true, DAG, Subtarget))
22521 return Op;
22522
22523 // Allow commuted 'hadd' ops.
22524 // TODO: Allow commuted (f)sub by negating the result of (F)HSUB?
22525 unsigned HOpcode;
22526 switch (Op.getOpcode()) {
22527 // clang-format off
22528 case ISD::ADD: HOpcode = X86ISD::HADD; break;
22529 case ISD::SUB: HOpcode = X86ISD::HSUB; break;
22530 case ISD::FADD: HOpcode = X86ISD::FHADD; break;
22531 case ISD::FSUB: HOpcode = X86ISD::FHSUB; break;
22532 default:
22533 llvm_unreachable("Trying to lower unsupported opcode to horizontal op");
22534 // clang-format on
22535 }
22536 unsigned LExtIndex = LHS.getConstantOperandVal(1);
22537 unsigned RExtIndex = RHS.getConstantOperandVal(1);
22538 if ((LExtIndex & 1) == 1 && (RExtIndex & 1) == 0 &&
22539 (HOpcode == X86ISD::HADD || HOpcode == X86ISD::FHADD))
22540 std::swap(LExtIndex, RExtIndex);
22541
22542 if ((LExtIndex & 1) != 0 || RExtIndex != (LExtIndex + 1))
22543 return Op;
22544
22545 SDValue X = LHS.getOperand(0);
22546 EVT VecVT = X.getValueType();
22547 unsigned BitWidth = VecVT.getSizeInBits();
22548 unsigned NumLanes = BitWidth / 128;
22549 unsigned NumEltsPerLane = VecVT.getVectorNumElements() / NumLanes;
22550 assert((BitWidth == 128 || BitWidth == 256 || BitWidth == 512) &&
22551 "Not expecting illegal vector widths here");
22552
22553 // Creating a 256-bit horizontal op would be wasteful, and there is no 512-bit
22554 // equivalent, so extract the 256/512-bit source op to 128-bit if we can.
22555 if (BitWidth == 256 || BitWidth == 512) {
22556 unsigned LaneIdx = LExtIndex / NumEltsPerLane;
22557 X = extract128BitVector(X, LaneIdx * NumEltsPerLane, DAG, DL);
22558 LExtIndex %= NumEltsPerLane;
22559 }
22560
22561 // add (extractelt (X, 0), extractelt (X, 1)) --> extractelt (hadd X, X), 0
22562 // add (extractelt (X, 1), extractelt (X, 0)) --> extractelt (hadd X, X), 0
22563 // add (extractelt (X, 2), extractelt (X, 3)) --> extractelt (hadd X, X), 1
22564 // sub (extractelt (X, 0), extractelt (X, 1)) --> extractelt (hsub X, X), 0
22565 SDValue HOp = DAG.getNode(HOpcode, DL, X.getValueType(), X, X);
22566 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, Op.getSimpleValueType(), HOp,
22567 DAG.getVectorIdxConstant(LExtIndex / 2, DL));
22568}
22569
22570/// Depending on uarch and/or optimizing for size, we might prefer to use a
22571/// vector operation in place of the typical scalar operation.
22572SDValue X86TargetLowering::lowerFaddFsub(SDValue Op, SelectionDAG &DAG) const {
22573 assert((Op.getValueType() == MVT::f32 || Op.getValueType() == MVT::f64) &&
22574 "Only expecting float/double");
22575 return lowerAddSubToHorizontalOp(Op, SDLoc(Op), DAG, Subtarget);
22576}
22577
22578/// ISD::FROUND is defined to round to nearest with ties rounding away from 0.
22579/// This mode isn't supported in hardware on X86. But as long as we aren't
22580/// compiling with trapping math, we can emulate this with
22581/// trunc(X + copysign(nextafter(0.5, 0.0), X)).
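/// For f32, for example, nextafter(0.5, 0.0) is 0.5 - 2^-25. With X equal to
/// that same value (the largest float below 0.5), X + (0.5 - 2^-25) stays
/// below 1.0 and truncates to 0.0 as required, whereas adding a plain 0.5
/// would round the sum up to 1.0 and give the wrong result.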
22582static SDValue LowerFROUND(SDValue Op, SelectionDAG &DAG) {
22583 SDValue N0 = Op.getOperand(0);
22584 SDLoc dl(Op);
22585 MVT VT = Op.getSimpleValueType();
22586
22587 // N0 += copysign(nextafter(0.5, 0.0), N0)
22588 const fltSemantics &Sem = VT.getFltSemantics();
22589 bool Ignored;
22590 APFloat Point5Pred = APFloat(0.5f);
22591 Point5Pred.convert(Sem, APFloat::rmNearestTiesToEven, &Ignored);
22592 Point5Pred.next(/*nextDown*/true);
22593
22594 SDValue Adder = DAG.getNode(ISD::FCOPYSIGN, dl, VT,
22595 DAG.getConstantFP(Point5Pred, dl, VT), N0);
22596 N0 = DAG.getNode(ISD::FADD, dl, VT, N0, Adder);
22597
22598 // Truncate the result to remove fraction.
22599 return DAG.getNode(ISD::FTRUNC, dl, VT, N0);
22600}
22601
22602/// The only differences between FABS and FNEG are the mask and the logic op.
22603/// FNEG also has a folding opportunity for FNEG(FABS(x)).
22604static SDValue LowerFABSorFNEG(SDValue Op, SelectionDAG &DAG) {
22605 assert((Op.getOpcode() == ISD::FABS || Op.getOpcode() == ISD::FNEG) &&
22606 "Wrong opcode for lowering FABS or FNEG.");
22607
22608 bool IsFABS = (Op.getOpcode() == ISD::FABS);
22609
22610 // If this is a FABS and it has an FNEG user, bail out to fold the combination
22611 // into an FNABS. We'll lower the FABS after that if it is still in use.
22612 if (IsFABS)
22613 for (SDNode *User : Op->users())
22614 if (User->getOpcode() == ISD::FNEG)
22615 return Op;
22616
22617 SDLoc dl(Op);
22618 MVT VT = Op.getSimpleValueType();
22619
22620 bool IsF128 = (VT == MVT::f128);
22621 assert(VT.isFloatingPoint() && VT != MVT::f80 &&
22623 "Unexpected type in LowerFABSorFNEG");
22624
22625 // FIXME: Use function attribute "OptimizeForSize" and/or CodeGenOptLevel to
22626 // decide if we should generate a 16-byte constant mask when we only need 4 or
22627 // 8 bytes for the scalar case.
22628
22629 // There are no scalar bitwise logical SSE/AVX instructions, so we
22630 // generate a 16-byte vector constant and logic op even for the scalar case.
22631 // Using a 16-byte mask allows folding the load of the mask with
22632 // the logic op, so it can save (~4 bytes) on code size.
22633 bool IsFakeVector = !VT.isVector() && !IsF128;
22634 MVT LogicVT = VT;
22635 if (IsFakeVector)
22636 LogicVT = (VT == MVT::f64) ? MVT::v2f64
22637 : (VT == MVT::f32) ? MVT::v4f32
22638 : MVT::v8f16;
22639
22640 unsigned EltBits = VT.getScalarSizeInBits();
22641 // For FABS, mask is 0x7f...; for FNEG, mask is 0x80...
22642 APInt MaskElt = IsFABS ? APInt::getSignedMaxValue(EltBits) :
22643 APInt::getSignMask(EltBits);
22644 const fltSemantics &Sem = VT.getFltSemantics();
22645 SDValue Mask = DAG.getConstantFP(APFloat(Sem, MaskElt), dl, LogicVT);
22646
22647 SDValue Op0 = Op.getOperand(0);
22648 bool IsFNABS = !IsFABS && (Op0.getOpcode() == ISD::FABS);
22649 unsigned LogicOp = IsFABS ? X86ISD::FAND :
22650 IsFNABS ? X86ISD::FOR :
22651 X86ISD::FXOR;
22652 SDValue Operand = IsFNABS ? Op0.getOperand(0) : Op0;
22653
22654 if (VT.isVector() || IsF128)
22655 return DAG.getNode(LogicOp, dl, LogicVT, Operand, Mask);
22656
22657 // For the scalar case extend to a 128-bit vector, perform the logic op,
22658 // and extract the scalar result back out.
22659 Operand = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, LogicVT, Operand);
22660 SDValue LogicNode = DAG.getNode(LogicOp, dl, LogicVT, Operand, Mask);
22661 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, LogicNode,
22662 DAG.getVectorIdxConstant(0, dl));
22663}
22664
22665static SDValue LowerFCOPYSIGN(SDValue Op, SelectionDAG &DAG) {
22666 SDValue Mag = Op.getOperand(0);
22667 SDValue Sign = Op.getOperand(1);
22668 SDLoc dl(Op);
22669
22670 // If the sign operand is smaller, extend it first.
22671 MVT VT = Op.getSimpleValueType();
22672 if (Sign.getSimpleValueType().bitsLT(VT))
22673 Sign = DAG.getNode(ISD::FP_EXTEND, dl, VT, Sign);
22674
22675 // And if it is bigger, shrink it first.
22676 if (Sign.getSimpleValueType().bitsGT(VT))
22677 Sign = DAG.getNode(ISD::FP_ROUND, dl, VT, Sign,
22678 DAG.getIntPtrConstant(0, dl, /*isTarget=*/true));
22679
22680 // At this point the operands and the result should have the same
22681 // type, and that won't be f80 since that is not custom lowered.
22682 bool IsF128 = (VT == MVT::f128);
22683 assert(VT.isFloatingPoint() && VT != MVT::f80 &&
22685 "Unexpected type in LowerFCOPYSIGN");
22686
22687 const fltSemantics &Sem = VT.getFltSemantics();
22688
22689 // Perform all scalar logic operations as 16-byte vectors because there are no
22690 // scalar FP logic instructions in SSE.
22691 // TODO: This isn't necessary. If we used scalar types, we might avoid some
22692 // unnecessary splats, but we might miss load folding opportunities. Should
22693 // this decision be based on OptimizeForSize?
22694 bool IsFakeVector = !VT.isVector() && !IsF128;
22695 MVT LogicVT = VT;
22696 if (IsFakeVector)
22697 LogicVT = (VT == MVT::f64) ? MVT::v2f64
22698 : (VT == MVT::f32) ? MVT::v4f32
22699 : MVT::v8f16;
22700
22701 // The mask constants are automatically splatted for vector types.
22702 unsigned EltSizeInBits = VT.getScalarSizeInBits();
22703 SDValue SignMask = DAG.getConstantFP(
22704 APFloat(Sem, APInt::getSignMask(EltSizeInBits)), dl, LogicVT);
22705 SDValue MagMask = DAG.getConstantFP(
22706 APFloat(Sem, APInt::getSignedMaxValue(EltSizeInBits)), dl, LogicVT);
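 // Worked example for scalar f32 (shown as raw bits): copysign(-1.25f, -2.0f)
 // gives MagBits = 0xBFA00000 & 0x7FFFFFFF = 0x3FA00000 and
 // SignBit = 0xC0000000 & 0x80000000 = 0x80000000; OR'ing them yields
 // 0xBFA00000 = -1.25f, i.e. the magnitude of the first operand with the
 // sign of the second.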
22707
22708 // First, clear all bits but the sign bit from the second operand (sign).
22709 if (IsFakeVector)
22710 Sign = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, LogicVT, Sign);
22711 SDValue SignBit = DAG.getNode(X86ISD::FAND, dl, LogicVT, Sign, SignMask);
22712
22713 // Next, clear the sign bit from the first operand (magnitude).
22714 // TODO: If we had general constant folding for FP logic ops, this check
22715 // wouldn't be necessary.
22716 SDValue MagBits;
22717 if (ConstantFPSDNode *Op0CN = isConstOrConstSplatFP(Mag)) {
22718 APFloat APF = Op0CN->getValueAPF();
22719 APF.clearSign();
22720 MagBits = DAG.getConstantFP(APF, dl, LogicVT);
22721 } else {
22722 // If the magnitude operand wasn't a constant, we need to AND out the sign.
22723 if (IsFakeVector)
22724 Mag = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, LogicVT, Mag);
22725 MagBits = DAG.getNode(X86ISD::FAND, dl, LogicVT, Mag, MagMask);
22726 }
22727
22728 // OR the magnitude value with the sign bit.
22729 SDValue Or = DAG.getNode(X86ISD::FOR, dl, LogicVT, MagBits, SignBit);
22730 return !IsFakeVector ? Or
22731 : DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, Or,
22732 DAG.getVectorIdxConstant(0, dl));
22733}
22734
22735static SDValue LowerFGETSIGN(SDValue Op, SelectionDAG &DAG) {
22736 SDValue N0 = Op.getOperand(0);
22737 SDLoc dl(Op);
22738 MVT VT = Op.getSimpleValueType();
22739
22740 MVT OpVT = N0.getSimpleValueType();
22741 assert((OpVT == MVT::f32 || OpVT == MVT::f64) &&
22742 "Unexpected type for FGETSIGN");
22743
22744 // Lower ISD::FGETSIGN to (AND (X86ISD::MOVMSK ...) 1).
22745 MVT VecVT = (OpVT == MVT::f32 ? MVT::v4f32 : MVT::v2f64);
22746 SDValue Res = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VecVT, N0);
22747 Res = DAG.getNode(X86ISD::MOVMSK, dl, MVT::i32, Res);
22748 Res = DAG.getZExtOrTrunc(Res, dl, VT);
22749 Res = DAG.getNode(ISD::AND, dl, VT, Res, DAG.getConstant(1, dl, VT));
22750 return Res;
22751}
22752
22753/// Helper for attempting to create an X86ISD::BT node.
22754static SDValue getBT(SDValue Src, SDValue BitNo, const SDLoc &DL, SelectionDAG &DAG) {
22755 // If Src is i8, promote it to i32 with any_extend. There is no i8 BT
22756 // instruction. Since the shift amount is in-range-or-undefined, we know
22757 // that doing a bittest on the i32 value is ok. We extend to i32 because
22758 // the encoding for the i16 version is larger than the i32 version.
22759 // Also promote i16 to i32 for performance / code size reasons.
22760 if (Src.getValueType().getScalarSizeInBits() < 32)
22761 Src = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, Src);
22762
22763 // No legal type found, give up.
22764 if (!DAG.getTargetLoweringInfo().isTypeLegal(Src.getValueType()))
22765 return SDValue();
22766
22767 // See if we can use the 32-bit instruction instead of the 64-bit one for a
22768 // shorter encoding. Since the former takes the modulo 32 of BitNo and the
22769 // latter takes the modulo 64, this is only valid if the 5th bit of BitNo is
22770 // known to be zero.
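 // e.g. for (bt i64 %x, (and i64 %n, 31)) bit 5 of the index is known zero,
 // so the modulo-32 and modulo-64 semantics agree and the shorter 32-bit
 // encoding can be used.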
22771 if (Src.getValueType() == MVT::i64 &&
22772 DAG.MaskedValueIsZero(BitNo, APInt(BitNo.getValueSizeInBits(), 32)))
22773 Src = DAG.getNode(ISD::TRUNCATE, DL, MVT::i32, Src);
22774
22775 // If the operand types disagree, extend the shift amount to match. Since
22776 // BT ignores high bits (like shifts) we can use anyextend.
22777 if (Src.getValueType() != BitNo.getValueType()) {
22778 // Peek through a mask/modulo operation.
22779 // TODO: DAGCombine fails to do this as it just checks isTruncateFree, but
22780 // we probably need a better IsDesirableToPromoteOp to handle this as well.
22781 if (BitNo.getOpcode() == ISD::AND && BitNo->hasOneUse())
22782 BitNo = DAG.getNode(ISD::AND, DL, Src.getValueType(),
22783 DAG.getNode(ISD::ANY_EXTEND, DL, Src.getValueType(),
22784 BitNo.getOperand(0)),
22785 DAG.getNode(ISD::ANY_EXTEND, DL, Src.getValueType(),
22786 BitNo.getOperand(1)));
22787 else
22788 BitNo = DAG.getNode(ISD::ANY_EXTEND, DL, Src.getValueType(), BitNo);
22789 }
22790
22791 return DAG.getNode(X86ISD::BT, DL, MVT::i32, Src, BitNo);
22792}
22793
22794/// Helper for creating an X86ISD::SETCC node.
22795static SDValue getSETCC(X86::CondCode Cond, SDValue EFLAGS, const SDLoc &dl,
22796 SelectionDAG &DAG) {
22797 return DAG.getNode(X86ISD::SETCC, dl, MVT::i8,
22798 DAG.getTargetConstant(Cond, dl, MVT::i8), EFLAGS);
22799}
22800
22801/// Recursive helper for combineVectorSizedSetCCEquality() to see if we have a
22802/// recognizable memcmp expansion.
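/// e.g. (or (xor a, b), (or (xor c, d), (xor e, f))) is recognized; a lone
/// xor at the root is not, since the root node must be an OR.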
22803static bool isOrXorXorTree(SDValue X, bool Root = true) {
22804 if (X.getOpcode() == ISD::OR)
22805 return isOrXorXorTree(X.getOperand(0), false) &&
22806 isOrXorXorTree(X.getOperand(1), false);
22807 if (Root)
22808 return false;
22809 return X.getOpcode() == ISD::XOR;
22810}
22811
22812/// Recursive helper for combineVectorSizedSetCCEquality() to emit the memcmp
22813/// expansion.
22814template <typename F>
22815static SDValue emitOrXorXorTree(SDValue X, const SDLoc &DL, SelectionDAG &DAG,
22816 EVT VecVT, EVT CmpVT, bool HasPT, F SToV) {
22817 SDValue Op0 = X.getOperand(0);
22818 SDValue Op1 = X.getOperand(1);
22819 if (X.getOpcode() == ISD::OR) {
22820 SDValue A = emitOrXorXorTree(Op0, DL, DAG, VecVT, CmpVT, HasPT, SToV);
22821 SDValue B = emitOrXorXorTree(Op1, DL, DAG, VecVT, CmpVT, HasPT, SToV);
22822 if (VecVT != CmpVT)
22823 return DAG.getNode(ISD::OR, DL, CmpVT, A, B);
22824 if (HasPT)
22825 return DAG.getNode(ISD::OR, DL, VecVT, A, B);
22826 return DAG.getNode(ISD::AND, DL, CmpVT, A, B);
22827 }
22828 if (X.getOpcode() == ISD::XOR) {
22829 SDValue A = SToV(Op0);
22830 SDValue B = SToV(Op1);
22831 if (VecVT != CmpVT)
22832 return DAG.getSetCC(DL, CmpVT, A, B, ISD::SETNE);
22833 if (HasPT)
22834 return DAG.getNode(ISD::XOR, DL, VecVT, A, B);
22835 return DAG.getSetCC(DL, CmpVT, A, B, ISD::SETEQ);
22836 }
22837 llvm_unreachable("Impossible");
22838}
22839
22840/// Try to map a 128-bit or larger integer comparison to vector instructions
22841/// before type legalization splits it up into chunks.
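/// e.g. with AVX, (seteq i256 %x, %y) on loaded values can become a single
/// 256-bit VPXOR followed by a VPTEST instead of four 64-bit compares
/// chained through flags.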
22842static SDValue combineVectorSizedSetCCEquality(EVT VT, SDValue X, SDValue Y,
22843 ISD::CondCode CC,
22844 const SDLoc &DL,
22845 SelectionDAG &DAG,
22846 const X86Subtarget &Subtarget) {
22847 assert((CC == ISD::SETNE || CC == ISD::SETEQ) && "Bad comparison predicate");
22848
22849 // We're looking for an oversized integer equality comparison.
22850 EVT OpVT = X.getValueType();
22851 unsigned OpSize = OpVT.getSizeInBits();
22852 if (!OpVT.isScalarInteger() || OpSize < 128)
22853 return SDValue();
22854
22855 // Ignore a comparison with zero because that gets special treatment in
22856 // EmitTest(). But make an exception for the special case of a pair of
22857 // logically-combined vector-sized operands compared to zero. This pattern may
22858 // be generated by the memcmp expansion pass with oversized integer compares
22859 // (see PR33325).
22860 bool IsOrXorXorTreeCCZero = isNullConstant(Y) && isOrXorXorTree(X);
22861 if (isNullConstant(Y) && !IsOrXorXorTreeCCZero)
22862 return SDValue();
22863
22864 // Don't perform this combine if constructing the vector will be expensive.
22865 auto IsVectorBitCastCheap = [](SDValue X) {
22866 X = peekThroughBitcasts(X);
22867 return isa<ConstantSDNode>(X) || X.getValueType().isVector() ||
22868 X.getOpcode() == ISD::LOAD;
22869 };
22870 if ((!IsVectorBitCastCheap(X) || !IsVectorBitCastCheap(Y)) &&
22871 !IsOrXorXorTreeCCZero)
22872 return SDValue();
22873
22874 // Use XOR (plus OR) and PTEST after SSE4.1 for 128/256-bit operands.
22875 // Use PCMPNEQ (plus OR) and KORTEST for 512-bit operands.
22876 // Otherwise use PCMPEQ (plus AND) and mask testing.
22877 bool NoImplicitFloatOps =
22878 DAG.getMachineFunction().getFunction().hasFnAttribute(
22879 Attribute::NoImplicitFloat);
22880 if (!Subtarget.useSoftFloat() && !NoImplicitFloatOps &&
22881 ((OpSize == 128 && Subtarget.hasSSE2()) ||
22882 (OpSize == 256 && Subtarget.hasAVX()) ||
22883 (OpSize == 512 && Subtarget.useAVX512Regs()))) {
22884 bool HasPT = Subtarget.hasSSE41();
22885
22886 // PTEST and MOVMSK are slow on Knights Landing and Knights Mill and widened
22887 // vector registers are essentially free. (Technically, widening registers
22888 // prevents load folding, but the tradeoff is worth it.)
22889 bool PreferKOT = Subtarget.preferMaskRegisters();
22890 bool NeedZExt = PreferKOT && !Subtarget.hasVLX() && OpSize != 512;
22891
22892 EVT VecVT = MVT::v16i8;
22893 EVT CmpVT = PreferKOT ? MVT::v16i1 : VecVT;
22894 if (OpSize == 256) {
22895 VecVT = MVT::v32i8;
22896 CmpVT = PreferKOT ? MVT::v32i1 : VecVT;
22897 }
22898 EVT CastVT = VecVT;
22899 bool NeedsAVX512FCast = false;
22900 if (OpSize == 512 || NeedZExt) {
22901 if (Subtarget.hasBWI()) {
22902 VecVT = MVT::v64i8;
22903 CmpVT = MVT::v64i1;
22904 if (OpSize == 512)
22905 CastVT = VecVT;
22906 } else {
22907 VecVT = MVT::v16i32;
22908 CmpVT = MVT::v16i1;
22909 CastVT = OpSize == 512 ? VecVT
22910 : OpSize == 256 ? MVT::v8i32
22911 : MVT::v4i32;
22912 NeedsAVX512FCast = true;
22913 }
22914 }
22915
22916 auto ScalarToVector = [&](SDValue X) -> SDValue {
22917 bool TmpZext = false;
22918 EVT TmpCastVT = CastVT;
22919 if (X.getOpcode() == ISD::ZERO_EXTEND) {
22920 SDValue OrigX = X.getOperand(0);
22921 unsigned OrigSize = OrigX.getScalarValueSizeInBits();
22922 if (OrigSize < OpSize) {
22923 if (OrigSize == 128) {
22924 TmpCastVT = NeedsAVX512FCast ? MVT::v4i32 : MVT::v16i8;
22925 X = OrigX;
22926 TmpZext = true;
22927 } else if (OrigSize == 256) {
22928 TmpCastVT = NeedsAVX512FCast ? MVT::v8i32 : MVT::v32i8;
22929 X = OrigX;
22930 TmpZext = true;
22931 }
22932 }
22933 }
22934 X = DAG.getBitcast(TmpCastVT, X);
22935 if (!NeedZExt && !TmpZext)
22936 return X;
22937 return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VecVT,
22938 DAG.getConstant(0, DL, VecVT), X,
22939 DAG.getVectorIdxConstant(0, DL));
22940 };
22941
22942 SDValue Cmp;
22943 if (IsOrXorXorTreeCCZero) {
22944 // This is a bitwise-combined equality comparison of 2 pairs of vectors:
22945 // setcc i128 (or (xor A, B), (xor C, D)), 0, eq|ne
22946 // Use 2 vector equality compares and 'and' the results before doing a
22947 // MOVMSK.
22948 Cmp = emitOrXorXorTree(X, DL, DAG, VecVT, CmpVT, HasPT, ScalarToVector);
22949 } else {
22950 SDValue VecX = ScalarToVector(X);
22951 SDValue VecY = ScalarToVector(Y);
22952 if (VecVT != CmpVT) {
22953 Cmp = DAG.getSetCC(DL, CmpVT, VecX, VecY, ISD::SETNE);
22954 } else if (HasPT) {
22955 Cmp = DAG.getNode(ISD::XOR, DL, VecVT, VecX, VecY);
22956 } else {
22957 Cmp = DAG.getSetCC(DL, CmpVT, VecX, VecY, ISD::SETEQ);
22958 }
22959 }
22960 // AVX512 should emit a setcc that will lower to kortest.
22961 if (VecVT != CmpVT) {
22962 EVT KRegVT = CmpVT == MVT::v64i1 ? MVT::i64
22963 : CmpVT == MVT::v32i1 ? MVT::i32
22964 : MVT::i16;
22965 return DAG.getSetCC(DL, VT, DAG.getBitcast(KRegVT, Cmp),
22966 DAG.getConstant(0, DL, KRegVT), CC);
22967 }
22968 if (HasPT) {
22969 SDValue BCCmp =
22970 DAG.getBitcast(OpSize == 256 ? MVT::v4i64 : MVT::v2i64, Cmp);
22971 SDValue PT = DAG.getNode(X86ISD::PTEST, DL, MVT::i32, BCCmp, BCCmp);
22972 X86::CondCode X86CC = CC == ISD::SETEQ ? X86::COND_E : X86::COND_NE;
22973 SDValue X86SetCC = getSETCC(X86CC, PT, DL, DAG);
22974 return DAG.getNode(ISD::TRUNCATE, DL, VT, X86SetCC.getValue(0));
22975 }
22976 // If all bytes match (bitmask is 0x(FFFF)FFFF), that's equality.
22977 // setcc i128 X, Y, eq --> setcc (pmovmskb (pcmpeqb X, Y)), 0xFFFF, eq
22978 // setcc i128 X, Y, ne --> setcc (pmovmskb (pcmpeqb X, Y)), 0xFFFF, ne
22979 assert(Cmp.getValueType() == MVT::v16i8 &&
22980 "Non 128-bit vector on pre-SSE41 target");
22981 SDValue MovMsk = DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, Cmp);
22982 SDValue FFFFs = DAG.getConstant(0xFFFF, DL, MVT::i32);
22983 return DAG.getSetCC(DL, VT, MovMsk, FFFFs, CC);
22984 }
22985
22986 return SDValue();
22987}
22988
22989/// Helper for matching BINOP(EXTRACTELT(X,0),BINOP(EXTRACTELT(X,1),...))
22990/// style scalarized (associative) reduction patterns. Partial reductions
22991/// are supported when the pointer SrcMask is non-null.
22992/// TODO - move this to SelectionDAG?
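/// e.g. for a 4-element source vector %v,
///   or (or (extractelt %v, 0), (extractelt %v, 1)),
///      (or (extractelt %v, 2), (extractelt %v, 3))
/// matches with %v as the single source and every element used.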
22993static bool matchScalarReduction(SDValue Op, ISD::NodeType BinOp,
22994 SmallVectorImpl<SDValue> &SrcOps,
22995 SmallVectorImpl<APInt> *SrcMask = nullptr) {
22996 SmallVector<SDValue, 8> Opnds;
22997 DenseMap<SDValue, APInt> SrcOpMap;
22998 EVT VT = MVT::Other;
22999
23000 // Recognize a special case where a vector is cast into a wide integer to
23001 // test all 0s.
23002 assert(Op.getOpcode() == unsigned(BinOp) &&
23003 "Unexpected bit reduction opcode");
23004 Opnds.push_back(Op.getOperand(0));
23005 Opnds.push_back(Op.getOperand(1));
23006
23007 for (unsigned Slot = 0, e = Opnds.size(); Slot < e; ++Slot) {
23008 SDValue I = Opnds[Slot];
23009 // BFS traverse all BinOp operands.
23010 if (I->getOpcode() == unsigned(BinOp)) {
23011 Opnds.push_back(I->getOperand(0));
23012 Opnds.push_back(I->getOperand(1));
23013 // Re-evaluate the number of nodes to be traversed.
23014 e += 2; // 2 more nodes (LHS and RHS) are pushed.
23015 continue;
23016 }
23017
23018 // Quit if the operand is not an EXTRACT_VECTOR_ELT.
23019 if (I->getOpcode() != ISD::EXTRACT_VECTOR_ELT)
23020 return false;
23021
23022 // Quit if the index is not a constant.
23023 auto *Idx = dyn_cast<ConstantSDNode>(I->getOperand(1));
23024 if (!Idx)
23025 return false;
23026
23027 SDValue Src = I->getOperand(0);
23028 DenseMap<SDValue, APInt>::iterator M = SrcOpMap.find(Src);
23029 if (M == SrcOpMap.end()) {
23030 VT = Src.getValueType();
23031 // Quit if not the same type.
23032 if (!SrcOpMap.empty() && VT != SrcOpMap.begin()->first.getValueType())
23033 return false;
23034 unsigned NumElts = VT.getVectorNumElements();
23035 APInt EltCount = APInt::getZero(NumElts);
23036 M = SrcOpMap.insert(std::make_pair(Src, EltCount)).first;
23037 SrcOps.push_back(Src);
23038 }
23039
23040 // Quit if element already used.
23041 unsigned CIdx = Idx->getZExtValue();
23042 if (M->second[CIdx])
23043 return false;
23044 M->second.setBit(CIdx);
23045 }
23046
23047 if (SrcMask) {
23048 // Collect the source partial masks.
23049 for (SDValue &SrcOp : SrcOps)
23050 SrcMask->push_back(SrcOpMap[SrcOp]);
23051 } else {
23052 // Quit if not all elements are used.
23053 for (const auto &I : SrcOpMap)
23054 if (!I.second.isAllOnes())
23055 return false;
23056 }
23057
23058 return true;
23059}
23060
23061// Helper function for comparing all bits of two vectors.
23062static SDValue LowerVectorAllEqual(const SDLoc &DL, SDValue LHS, SDValue RHS,
23063 ISD::CondCode CC, const APInt &OriginalMask,
23064 const X86Subtarget &Subtarget,
23065 SelectionDAG &DAG, X86::CondCode &X86CC) {
23066 EVT VT = LHS.getValueType();
23067 unsigned ScalarSize = VT.getScalarSizeInBits();
23068 if (OriginalMask.getBitWidth() != ScalarSize) {
23069 assert(ScalarSize == 1 && "Element Mask vs Vector bitwidth mismatch");
23070 return SDValue();
23071 }
23072
23073 // Quit if not convertible to a legal scalar or 128/256-bit vector.
23074 if (!llvm::has_single_bit<uint32_t>(VT.getSizeInBits()))
23075 return SDValue();
23076
23077 // FCMP may use ISD::SETNE when nnan - early out if we manage to get here.
23078 if (VT.isFloatingPoint())
23079 return SDValue();
23080
23081 assert((CC == ISD::SETEQ || CC == ISD::SETNE) && "Unsupported ISD::CondCode");
23082 X86CC = (CC == ISD::SETEQ ? X86::COND_E : X86::COND_NE);
23083
23084 APInt Mask = OriginalMask;
23085
23086 auto MaskBits = [&](SDValue Src) {
23087 if (Mask.isAllOnes())
23088 return Src;
23089 EVT SrcVT = Src.getValueType();
23090 SDValue MaskValue = DAG.getConstant(Mask, DL, SrcVT);
23091 return DAG.getNode(ISD::AND, DL, SrcVT, Src, MaskValue);
23092 };
23093
23094 // For sub-128-bit vector, cast to (legal) integer and compare with zero.
23095 if (VT.getSizeInBits() < 128) {
23096 EVT IntVT = EVT::getIntegerVT(*DAG.getContext(), VT.getSizeInBits());
23097 if (!DAG.getTargetLoweringInfo().isTypeLegal(IntVT)) {
23098 if (IntVT != MVT::i64)
23099 return SDValue();
23100 auto SplitLHS = DAG.SplitScalar(DAG.getBitcast(IntVT, MaskBits(LHS)), DL,
23101 MVT::i32, MVT::i32);
23102 auto SplitRHS = DAG.SplitScalar(DAG.getBitcast(IntVT, MaskBits(RHS)), DL,
23103 MVT::i32, MVT::i32);
23104 SDValue Lo =
23105 DAG.getNode(ISD::XOR, DL, MVT::i32, SplitLHS.first, SplitRHS.first);
23106 SDValue Hi =
23107 DAG.getNode(ISD::XOR, DL, MVT::i32, SplitLHS.second, SplitRHS.second);
23108 return DAG.getNode(X86ISD::CMP, DL, MVT::i32,
23109 DAG.getNode(ISD::OR, DL, MVT::i32, Lo, Hi),
23110 DAG.getConstant(0, DL, MVT::i32));
23111 }
23112 return DAG.getNode(X86ISD::CMP, DL, MVT::i32,
23113 DAG.getBitcast(IntVT, MaskBits(LHS)),
23114 DAG.getBitcast(IntVT, MaskBits(RHS)));
23115 }
23116
23117 // Without PTEST, a masked v2i64 or-reduction is not faster than
23118 // scalarization.
23119 bool UseKORTEST = Subtarget.useAVX512Regs();
23120 bool UsePTEST = Subtarget.hasSSE41();
23121 if (!UsePTEST && !Mask.isAllOnes() && ScalarSize > 32)
23122 return SDValue();
23123
23124 // Split down to 128/256/512-bit vector.
23125 unsigned TestSize = UseKORTEST ? 512 : (Subtarget.hasAVX() ? 256 : 128);
23126
23127 // If the input vector has vector elements wider than the target test size,
23128 // then cast to <X x i64> so it will safely split.
23129 if (ScalarSize > TestSize) {
23130 if (!Mask.isAllOnes())
23131 return SDValue();
23132 VT = EVT::getVectorVT(*DAG.getContext(), MVT::i64, VT.getSizeInBits() / 64);
23133 LHS = DAG.getBitcast(VT, LHS);
23134 RHS = DAG.getBitcast(VT, RHS);
23135 Mask = APInt::getAllOnes(64);
23136 }
23137
23138 if (VT.getSizeInBits() > TestSize) {
23139 KnownBits KnownRHS = DAG.computeKnownBits(RHS);
23140 if (KnownRHS.isConstant() && KnownRHS.getConstant() == Mask) {
23141 // If ICMP(AND(LHS,MASK),MASK) - reduce using AND splits.
23142 while (VT.getSizeInBits() > TestSize) {
23143 auto Split = DAG.SplitVector(LHS, DL);
23144 VT = Split.first.getValueType();
23145 LHS = DAG.getNode(ISD::AND, DL, VT, Split.first, Split.second);
23146 }
23147 RHS = DAG.getAllOnesConstant(DL, VT);
23148 } else if (!UsePTEST && !KnownRHS.isZero()) {
23149 // MOVMSK Special Case:
23150 // ALLOF(CMPEQ(X,Y)) -> AND(CMPEQ(X[0],Y[0]),CMPEQ(X[1],Y[1]),....)
23151 MVT SVT = ScalarSize >= 32 ? MVT::i32 : MVT::i8;
23152 VT = MVT::getVectorVT(SVT, VT.getSizeInBits() / SVT.getSizeInBits());
23153 LHS = DAG.getBitcast(VT, MaskBits(LHS));
23154 RHS = DAG.getBitcast(VT, MaskBits(RHS));
23155 EVT BoolVT = VT.changeVectorElementType(MVT::i1);
23156 SDValue V = DAG.getSetCC(DL, BoolVT, LHS, RHS, ISD::SETEQ);
23157 V = DAG.getSExtOrTrunc(V, DL, VT);
23158 while (VT.getSizeInBits() > TestSize) {
23159 auto Split = DAG.SplitVector(V, DL);
23160 VT = Split.first.getValueType();
23161 V = DAG.getNode(ISD::AND, DL, VT, Split.first, Split.second);
23162 }
23163 V = DAG.getNOT(DL, V, VT);
23164 V = DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, V);
23165 return DAG.getNode(X86ISD::CMP, DL, MVT::i32, V,
23166 DAG.getConstant(0, DL, MVT::i32));
23167 } else {
23168 // Convert to a ICMP_EQ(XOR(LHS,RHS),0) pattern.
23169 SDValue V = DAG.getNode(ISD::XOR, DL, VT, LHS, RHS);
23170 while (VT.getSizeInBits() > TestSize) {
23171 auto Split = DAG.SplitVector(V, DL);
23172 VT = Split.first.getValueType();
23173 V = DAG.getNode(ISD::OR, DL, VT, Split.first, Split.second);
23174 }
23175 LHS = V;
23176 RHS = DAG.getConstant(0, DL, VT);
23177 }
23178 }
23179
23180 if (UseKORTEST && VT.is512BitVector()) {
23181 MVT TestVT = MVT::getVectorVT(MVT::i32, VT.getSizeInBits() / 32);
23182 MVT BoolVT = TestVT.changeVectorElementType(MVT::i1);
23183 LHS = DAG.getBitcast(TestVT, MaskBits(LHS));
23184 RHS = DAG.getBitcast(TestVT, MaskBits(RHS));
23185 SDValue V = DAG.getSetCC(DL, BoolVT, LHS, RHS, ISD::SETNE);
23186 return DAG.getNode(X86ISD::KORTEST, DL, MVT::i32, V, V);
23187 }
23188
23189 if (UsePTEST) {
23190 MVT TestVT = MVT::getVectorVT(MVT::i64, VT.getSizeInBits() / 64);
23191 LHS = DAG.getBitcast(TestVT, MaskBits(LHS));
23192 RHS = DAG.getBitcast(TestVT, MaskBits(RHS));
23193 SDValue V = DAG.getNode(ISD::XOR, DL, TestVT, LHS, RHS);
23194 return DAG.getNode(X86ISD::PTEST, DL, MVT::i32, V, V);
23195 }
23196
23197 assert(VT.getSizeInBits() == 128 && "Failure to split to 128-bits");
23198 MVT MaskVT = ScalarSize >= 32 ? MVT::v4i32 : MVT::v16i8;
23199 LHS = DAG.getBitcast(MaskVT, MaskBits(LHS));
23200 RHS = DAG.getBitcast(MaskVT, MaskBits(RHS));
23201 SDValue V = DAG.getNode(X86ISD::PCMPEQ, DL, MaskVT, LHS, RHS);
23202 V = DAG.getNOT(DL, V, MaskVT);
23203 V = DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, V);
23204 return DAG.getNode(X86ISD::CMP, DL, MVT::i32, V,
23205 DAG.getConstant(0, DL, MVT::i32));
23206}
23207
23208// Check whether an AND/OR'd reduction tree is PTEST-able, or if we can fall back
23209// to CMP(MOVMSK(PCMPEQB(X,Y))).
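// e.g. (seteq (or (extractelt %v, 0), (extractelt %v, 1)), 0) for a v2i64 %v
// can be lowered with SSE4.1 to PTEST %v, %v, which sets ZF iff %v is all
// zeros.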
23210static SDValue MatchVectorAllEqualTest(SDValue OrigLHS, SDValue OrigRHS,
23211 ISD::CondCode CC, const SDLoc &DL,
23212 const X86Subtarget &Subtarget,
23213 SelectionDAG &DAG,
23214 X86::CondCode &X86CC) {
23215 SDValue Op = OrigLHS;
23216
23217 bool CmpNull;
23218 APInt Mask;
23219 if (CC == ISD::SETEQ || CC == ISD::SETNE) {
23220 CmpNull = isNullConstant(OrigRHS);
23221 if (!CmpNull && !isAllOnesConstant(OrigRHS))
23222 return SDValue();
23223
23224 if (!Subtarget.hasSSE2() || !Op->hasOneUse())
23225 return SDValue();
23226
23227 // Check whether we're masking/truncating an OR-reduction result, in which
23228 // case track the masked bits.
23229 // TODO: Add CmpAllOnes support.
23230 Mask = APInt::getAllOnes(Op.getScalarValueSizeInBits());
23231 if (CmpNull) {
23232 switch (Op.getOpcode()) {
23233 case ISD::TRUNCATE: {
23234 SDValue Src = Op.getOperand(0);
23235 Mask = APInt::getLowBitsSet(Src.getScalarValueSizeInBits(),
23236 Op.getScalarValueSizeInBits());
23237 Op = Src;
23238 break;
23239 }
23240 case ISD::AND: {
23241 if (auto *Cst = dyn_cast<ConstantSDNode>(Op.getOperand(1))) {
23242 Mask = Cst->getAPIntValue();
23243 Op = Op.getOperand(0);
23244 }
23245 break;
23246 }
23247 }
23248 }
23249 } else if (CC == ISD::SETGT && isAllOnesConstant(OrigRHS)) {
23250 CC = ISD::SETEQ;
23251 CmpNull = true;
23252 Mask = APInt::getSignMask(Op.getScalarValueSizeInBits());
23253 } else {
23254 return SDValue();
23255 }
23256
23257 ISD::NodeType LogicOp = CmpNull ? ISD::OR : ISD::AND;
23258
23259 // Match icmp(or(extract(X,0),extract(X,1)),0) anyof reduction patterns.
23260 // Match icmp(and(extract(X,0),extract(X,1)),-1) allof reduction patterns.
23261 SmallVector<SDValue, 8> VecIns;
23262 if (Op.getOpcode() == LogicOp && matchScalarReduction(Op, LogicOp, VecIns)) {
23263 EVT VT = VecIns[0].getValueType();
23264 assert(llvm::all_of(VecIns,
23265 [VT](SDValue V) { return VT == V.getValueType(); }) &&
23266 "Reduction source vector mismatch");
23267
23268 // Quit if not splittable to scalar/128/256/512-bit vector.
23269 if (!llvm::has_single_bit<uint32_t>(VT.getSizeInBits()))
23270 return SDValue();
23271
23272 // If more than one full vector is evaluated, AND/OR them first before
23273 // PTEST.
23274 for (unsigned Slot = 0, e = VecIns.size(); e - Slot > 1;
23275 Slot += 2, e += 1) {
23276 // Each iteration will AND/OR 2 nodes and append the result until there is
23277 // only 1 node left, i.e. the final value of all vectors.
23278 SDValue LHS = VecIns[Slot];
23279 SDValue RHS = VecIns[Slot + 1];
23280 VecIns.push_back(DAG.getNode(LogicOp, DL, VT, LHS, RHS));
23281 }
23282
23283 return LowerVectorAllEqual(DL, VecIns.back(),
23284 CmpNull ? DAG.getConstant(0, DL, VT)
23285 : DAG.getAllOnesConstant(DL, VT),
23286 CC, Mask, Subtarget, DAG, X86CC);
23287 }
23288
23289 // Match icmp(reduce_or(X),0) anyof reduction patterns.
23290 // Match icmp(reduce_and(X),-1) allof reduction patterns.
23291 if (Op.getOpcode() == ISD::EXTRACT_VECTOR_ELT) {
23292 ISD::NodeType BinOp;
23293 if (SDValue Match =
23294 DAG.matchBinOpReduction(Op.getNode(), BinOp, {LogicOp})) {
23295 EVT MatchVT = Match.getValueType();
23296 return LowerVectorAllEqual(DL, Match,
23297 CmpNull ? DAG.getConstant(0, DL, MatchVT)
23298 : DAG.getAllOnesConstant(DL, MatchVT),
23299 CC, Mask, Subtarget, DAG, X86CC);
23300 }
23301 }
23302
23303 if (Mask.isAllOnes()) {
23304 assert(!Op.getValueType().isVector() &&
23305 "Illegal vector type for reduction pattern");
23306 SDValue Src = peekThroughBitcasts(Op);
23307 if (Src.getValueType().isFixedLengthVector() &&
23308 Src.getValueType().getScalarType() == MVT::i1) {
23309 // Match icmp(bitcast(icmp_ne(X,Y)),0) reduction patterns.
23310 // Match icmp(bitcast(icmp_eq(X,Y)),-1) reduction patterns.
23311 if (Src.getOpcode() == ISD::SETCC) {
23312 SDValue LHS = Src.getOperand(0);
23313 SDValue RHS = Src.getOperand(1);
23314 EVT LHSVT = LHS.getValueType();
23315 ISD::CondCode SrcCC = cast<CondCodeSDNode>(Src.getOperand(2))->get();
23316 if (SrcCC == (CmpNull ? ISD::SETNE : ISD::SETEQ) &&
23317 llvm::has_single_bit<uint32_t>(LHSVT.getSizeInBits())) {
23318 APInt SrcMask = APInt::getAllOnes(LHSVT.getScalarSizeInBits());
23319 return LowerVectorAllEqual(DL, LHS, RHS, CC, SrcMask, Subtarget, DAG,
23320 X86CC);
23321 }
23322 }
23323 // Match icmp(bitcast(vXi1 trunc(Y)),0) reduction patterns.
23324 // Match icmp(bitcast(vXi1 trunc(Y)),-1) reduction patterns.
23325 // Peek through truncation, mask the LSB and compare against zero/LSB.
23326 if (Src.getOpcode() == ISD::TRUNCATE) {
23327 SDValue Inner = Src.getOperand(0);
23328 EVT InnerVT = Inner.getValueType();
23329 if (DAG.getTargetLoweringInfo().isTypeLegal(InnerVT)) {
23330 unsigned BW = InnerVT.getScalarSizeInBits();
23331 APInt SrcMask = APInt(BW, 1);
23332 APInt Cmp = CmpNull ? APInt::getZero(BW) : SrcMask;
23333 return LowerVectorAllEqual(DL, Inner,
23334 DAG.getConstant(Cmp, DL, InnerVT), CC,
23335 SrcMask, Subtarget, DAG, X86CC);
23336 }
23337 }
23338 }
23339 }
23340
23341 return SDValue();
23342}
23343
23344/// Return true if \c Op has a use that doesn't just read flags.
23345static bool hasNonFlagsUse(SDValue Op) {
23346 for (SDUse &Use : Op->uses()) {
23347 SDNode *User = Use.getUser();
23348 unsigned UOpNo = Use.getOperandNo();
23349 if (User->getOpcode() == ISD::TRUNCATE && User->hasOneUse()) {
23350 // Look past truncate.
23351 UOpNo = User->use_begin()->getOperandNo();
23352 User = User->use_begin()->getUser();
23353 }
23354
23355 if (User->getOpcode() != ISD::BRCOND && User->getOpcode() != ISD::SETCC &&
23356 !(User->getOpcode() == ISD::SELECT && UOpNo == 0))
23357 return true;
23358 }
23359 return false;
23360}
23361
23362// Transform to an x86-specific ALU node with flags if there is a chance of
23363// using an RMW op or only the flags are used. Otherwise, leave
23364// the node alone and emit a 'cmp' or 'test' instruction.
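// e.g. an (add %x, %y) whose only other uses are a store of the result and a
// SETCC on it can become an X86ISD::ADD, so the EFLAGS it produces are reused
// instead of emitting a separate TEST.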
23365static bool isProfitableToUseFlagOp(SDValue Op) {
23366 for (SDNode *U : Op->users())
23367 if (U->getOpcode() != ISD::CopyToReg &&
23368 U->getOpcode() != ISD::SETCC &&
23369 U->getOpcode() != ISD::STORE)
23370 return false;
23371
23372 return true;
23373}
23374
23375/// Emit nodes that will be selected as "test Op0,Op0", or something
23376/// equivalent.
23377static SDValue EmitTest(SDValue Op, unsigned X86CC, const SDLoc &dl,
23378 SelectionDAG &DAG, const X86Subtarget &Subtarget) {
23379 // CF and OF aren't always set the way we want. Determine which
23380 // of these we need.
23381 bool NeedCF = false;
23382 bool NeedOF = false;
23383 switch (X86CC) {
23384 default: break;
23385 case X86::COND_A: case X86::COND_AE:
23386 case X86::COND_B: case X86::COND_BE:
23387 NeedCF = true;
23388 break;
23389 case X86::COND_G: case X86::COND_GE:
23390 case X86::COND_L: case X86::COND_LE:
23391 case X86::COND_O: case X86::COND_NO: {
23392 // Check if we really need to set the
23393 // Overflow flag. If NoSignedWrap is present
23394 // that is not actually needed.
23395 switch (Op->getOpcode()) {
23396 case ISD::ADD:
23397 case ISD::SUB:
23398 case ISD::MUL:
23399 case ISD::SHL:
23400 if (Op.getNode()->getFlags().hasNoSignedWrap())
23401 break;
23402 [[fallthrough]];
23403 default:
23404 NeedOF = true;
23405 break;
23406 }
23407 break;
23408 }
23409 }
23410 // See if we can use the EFLAGS value from the operand instead of
23411 // doing a separate TEST. TEST always sets OF and CF to 0, so unless
23412 // we prove that the arithmetic won't overflow, we can't use OF or CF.
23413 if (Op.getResNo() != 0 || NeedOF || NeedCF) {
23414 // Emit a CMP with 0, which is the TEST pattern.
23415 return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op,
23416 DAG.getConstant(0, dl, Op.getValueType()));
23417 }
23418 unsigned Opcode = 0;
23419 unsigned NumOperands = 0;
23420
23421 SDValue ArithOp = Op;
23422
23423 // NOTICE: In the code below we use ArithOp to hold the arithmetic operation
23424 // which may be the result of a CAST. We use the variable 'Op', which is the
23425 // non-casted variable when we check for possible users.
23426 switch (ArithOp.getOpcode()) {
23427 case ISD::AND:
23428 // If the primary 'and' result isn't used, don't bother using X86ISD::AND,
23429 // because a TEST instruction will be better.
23430 if (!hasNonFlagsUse(Op))
23431 break;
23432
23433 [[fallthrough]];
23434 case ISD::ADD:
23435 case ISD::SUB:
23436 case ISD::OR:
23437 case ISD::XOR:
23438 if (!isProfitableToUseFlagOp(Op))
23439 break;
23440
23441 // Otherwise use a regular EFLAGS-setting instruction.
23442 switch (ArithOp.getOpcode()) {
23443 // clang-format off
23444 default: llvm_unreachable("unexpected operator!");
23445 case ISD::ADD: Opcode = X86ISD::ADD; break;
23446 case ISD::SUB: Opcode = X86ISD::SUB; break;
23447 case ISD::XOR: Opcode = X86ISD::XOR; break;
23448 case ISD::AND: Opcode = X86ISD::AND; break;
23449 case ISD::OR: Opcode = X86ISD::OR; break;
23450 // clang-format on
23451 }
23452
23453 NumOperands = 2;
23454 break;
23455 case X86ISD::ADD:
23456 case X86ISD::SUB:
23457 case X86ISD::OR:
23458 case X86ISD::XOR:
23459 case X86ISD::AND:
23460 return SDValue(Op.getNode(), 1);
23461 case ISD::SSUBO:
23462 case ISD::USUBO: {
23463 // USUBO/SSUBO will become an X86ISD::SUB and we can use its Z flag.
23464 SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::i32);
23465 return DAG.getNode(X86ISD::SUB, dl, VTs, Op->getOperand(0),
23466 Op->getOperand(1)).getValue(1);
23467 }
23468 default:
23469 break;
23470 }
23471
23472 if (Opcode == 0) {
23473 // Emit a CMP with 0, which is the TEST pattern.
23474 return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op,
23475 DAG.getConstant(0, dl, Op.getValueType()));
23476 }
23477 SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::i32);
23478 SmallVector<SDValue, 4> Ops(Op->ops().take_front(NumOperands));
23479
23480 SDValue New = DAG.getNode(Opcode, dl, VTs, Ops);
23481 DAG.ReplaceAllUsesOfValueWith(SDValue(Op.getNode(), 0), New);
23482 return SDValue(New.getNode(), 1);
23483}
23484
23485/// Emit nodes that will be selected as "cmp Op0,Op1", or something
23486/// equivalent.
23487static SDValue EmitCmp(SDValue Op0, SDValue Op1, unsigned X86CC,
23488 const SDLoc &dl, SelectionDAG &DAG,
23489 const X86Subtarget &Subtarget) {
23490 if (isNullConstant(Op1))
23491 return EmitTest(Op0, X86CC, dl, DAG, Subtarget);
23492
23493 EVT CmpVT = Op0.getValueType();
23494
23495 assert((CmpVT == MVT::i8 || CmpVT == MVT::i16 ||
23496 CmpVT == MVT::i32 || CmpVT == MVT::i64) && "Unexpected VT!");
23497
23498 // Only promote the compare up to I32 if it is a 16-bit operation
23499 // with an immediate. 16-bit immediates are to be avoided unless the target
23500 // isn't slowed down by length-changing prefixes, we're optimizing for
23501 // code size, or the comparison is with a folded load.
23502 if (CmpVT == MVT::i16 && !Subtarget.hasFastImm16() &&
23503 !X86::mayFoldLoad(Op0, Subtarget) && !X86::mayFoldLoad(Op1, Subtarget) &&
23504 !DAG.getMachineFunction().getFunction().hasMinSize()) {
23505 auto *COp0 = dyn_cast<ConstantSDNode>(Op0);
23506 auto *COp1 = dyn_cast<ConstantSDNode>(Op1);
23507 // Don't do this if the immediate can fit in 8-bits.
23508 if ((COp0 && !COp0->getAPIntValue().isSignedIntN(8)) ||
23509 (COp1 && !COp1->getAPIntValue().isSignedIntN(8))) {
23510 unsigned ExtendOp =
23511 isX86CCSigned(X86CC) ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
23512 if (X86CC == X86::COND_E || X86CC == X86::COND_NE) {
23513 // For equality comparisons try to use SIGN_EXTEND if the input was
23514 // truncate from something with enough sign bits.
23515 if (Op0.getOpcode() == ISD::TRUNCATE) {
23516 if (DAG.ComputeMaxSignificantBits(Op0.getOperand(0)) <= 16)
23517 ExtendOp = ISD::SIGN_EXTEND;
23518 } else if (Op1.getOpcode() == ISD::TRUNCATE) {
23519 if (DAG.ComputeMaxSignificantBits(Op1.getOperand(0)) <= 16)
23520 ExtendOp = ISD::SIGN_EXTEND;
23521 }
23522 }
23523
23524 CmpVT = MVT::i32;
23525 Op0 = DAG.getNode(ExtendOp, dl, CmpVT, Op0);
23526 Op1 = DAG.getNode(ExtendOp, dl, CmpVT, Op1);
23527 }
23528 }
23529
23530 // Try to shrink i64 compares if the input has enough zero bits.
23531 if (CmpVT == MVT::i64 && !isX86CCSigned(X86CC) &&
23532 Op0.hasOneUse() && // Hacky way to not break CSE opportunities with sub.
23533 DAG.MaskedValueIsZero(Op1, APInt::getHighBitsSet(64, 32)) &&
23534 DAG.MaskedValueIsZero(Op0, APInt::getHighBitsSet(64, 32))) {
23535 CmpVT = MVT::i32;
23536 Op0 = DAG.getNode(ISD::TRUNCATE, dl, CmpVT, Op0);
23537 Op1 = DAG.getNode(ISD::TRUNCATE, dl, CmpVT, Op1);
23538 }
23539
23540 // Try to shrink all i64 compares if the inputs are representable as signed
23541 // i32.
23542 if (CmpVT == MVT::i64 &&
23543 Op0.hasOneUse() && // Hacky way to not break CSE opportunities with sub.
23544 DAG.ComputeNumSignBits(Op1) > 32 && DAG.ComputeNumSignBits(Op0) > 32) {
23545 CmpVT = MVT::i32;
23546 Op0 = DAG.getNode(ISD::TRUNCATE, dl, CmpVT, Op0);
23547 Op1 = DAG.getNode(ISD::TRUNCATE, dl, CmpVT, Op1);
23548 }
23549
23550 // 0-x == y --> x+y == 0
23551 // 0-x != y --> x+y != 0
23552 if (Op0.getOpcode() == ISD::SUB && isNullConstant(Op0.getOperand(0)) &&
23553 Op0.hasOneUse() && (X86CC == X86::COND_E || X86CC == X86::COND_NE)) {
23554 SDVTList VTs = DAG.getVTList(CmpVT, MVT::i32);
23555 SDValue Add = DAG.getNode(X86ISD::ADD, dl, VTs, Op0.getOperand(1), Op1);
23556 return Add.getValue(1);
23557 }
23558
23559 // x == 0-y --> x+y == 0
23560 // x != 0-y --> x+y != 0
23561 if (Op1.getOpcode() == ISD::SUB && isNullConstant(Op1.getOperand(0)) &&
23562 Op1.hasOneUse() && (X86CC == X86::COND_E || X86CC == X86::COND_NE)) {
23563 SDVTList VTs = DAG.getVTList(CmpVT, MVT::i32);
23564 SDValue Add = DAG.getNode(X86ISD::ADD, dl, VTs, Op0, Op1.getOperand(1));
23565 return Add.getValue(1);
23566 }
23567
23568 // If we already have an XOR of the ops, use that to check for equality.
23569 // Else use SUB instead of CMP to enable CSE between SUB and CMP.
23570 unsigned X86Opc = X86ISD::SUB;
23571 if ((X86CC == X86::COND_E || X86CC == X86::COND_NE) &&
23572 (DAG.doesNodeExist(ISD::XOR, DAG.getVTList({CmpVT}), {Op0, Op1}) ||
23573 DAG.doesNodeExist(ISD::XOR, DAG.getVTList({CmpVT}), {Op1, Op0})))
23574 X86Opc = X86ISD::XOR;
23575
23576 SDVTList VTs = DAG.getVTList(CmpVT, MVT::i32);
23577 SDValue CmpOp = DAG.getNode(X86Opc, dl, VTs, Op0, Op1);
23578 return CmpOp.getValue(1);
23579}
23580
23585
23586bool X86TargetLowering::optimizeFMulOrFDivAsShiftAddBitcast(
23587 SDNode *N, SDValue, SDValue IntPow2) const {
23588 if (N->getOpcode() == ISD::FDIV)
23589 return true;
23590
23591 EVT FPVT = N->getValueType(0);
23592 EVT IntVT = IntPow2.getValueType();
23593
23594 // This indicates a non-free bitcast.
23595 // TODO: This is probably overly conservative as we will need to scale the
23596 // integer vector anyways for the int->fp cast.
23597 if (FPVT.isVector() &&
23598 FPVT.getScalarSizeInBits() != IntVT.getScalarSizeInBits())
23599 return false;
23600
23601 return true;
23602}
23603
23604/// Check if replacement of SQRT with RSQRT should be disabled.
23605bool X86TargetLowering::isFsqrtCheap(SDValue Op, SelectionDAG &DAG) const {
23606 EVT VT = Op.getValueType();
23607
23608 // We don't need to replace SQRT with RSQRT for half type.
23609 if (VT.getScalarType() == MVT::f16)
23610 return true;
23611
23612 // We never want to use both SQRT and RSQRT instructions for the same input.
23613 if (DAG.doesNodeExist(X86ISD::FRSQRT, DAG.getVTList(VT), Op))
23614 return false;
23615
23616 if (VT.isVector())
23617 return Subtarget.hasFastVectorFSQRT();
23618 return Subtarget.hasFastScalarFSQRT();
23619}
23620
23621/// The minimum architected relative accuracy is 2^-12. We need one
23622/// Newton-Raphson step to have a good float result (24 bits of precision).
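/// The refinement uses the standard reciprocal-square-root iteration
///   Est' = Est * (1.5 - 0.5 * X * Est * Est)
/// which roughly doubles the number of correct bits per step.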
23623SDValue X86TargetLowering::getSqrtEstimate(SDValue Op,
23624 SelectionDAG &DAG, int Enabled,
23625 int &RefinementSteps,
23626 bool &UseOneConstNR,
23627 bool Reciprocal) const {
23628 SDLoc DL(Op);
23629 EVT VT = Op.getValueType();
23630
23631 // SSE1 has rsqrtss and rsqrtps. AVX adds a 256-bit variant for rsqrtps.
23632 // It is likely not profitable to do this for f64 because a double-precision
23633 // rsqrt estimate with refinement on x86 prior to FMA requires at least 16
23634 // instructions: convert to single, rsqrtss, convert back to double, refine
23635 // (3 steps = at least 13 insts). If an 'rsqrtsd' variant was added to the ISA
23636 // along with FMA, this could be a throughput win.
23637 // TODO: SQRT requires SSE2 to prevent the introduction of an illegal v4i32
23638 // after legalize types.
23639 if ((VT == MVT::f32 && Subtarget.hasSSE1()) ||
23640 (VT == MVT::v4f32 && Subtarget.hasSSE1() && Reciprocal) ||
23641 (VT == MVT::v4f32 && Subtarget.hasSSE2() && !Reciprocal) ||
23642 (VT == MVT::v8f32 && Subtarget.hasAVX()) ||
23643 (VT == MVT::v16f32 && Subtarget.useAVX512Regs())) {
23644 if (RefinementSteps == ReciprocalEstimate::Unspecified)
23645 RefinementSteps = 1;
23646
23647 UseOneConstNR = false;
23648 // There is no FSQRT for 512-bits, but there is RSQRT14.
23649 unsigned Opcode = VT == MVT::v16f32 ? X86ISD::RSQRT14 : X86ISD::FRSQRT;
23650 SDValue Estimate = DAG.getNode(Opcode, DL, VT, Op);
23651 if (RefinementSteps == 0 && !Reciprocal)
23652 Estimate = DAG.getNode(ISD::FMUL, DL, VT, Op, Estimate);
23653 return Estimate;
23654 }
23655
23656 if (VT.getScalarType() == MVT::f16 && isTypeLegal(VT) &&
23657 Subtarget.hasFP16()) {
23658 assert(Reciprocal && "Don't replace SQRT with RSQRT for half type");
23659 if (RefinementSteps == ReciprocalEstimate::Unspecified)
23660 RefinementSteps = 0;
23661
23662 if (VT == MVT::f16) {
23663 SDValue Zero = DAG.getIntPtrConstant(0, DL);
23664 SDValue Undef = DAG.getUNDEF(MVT::v8f16);
23665 Op = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v8f16, Op);
23666 Op = DAG.getNode(X86ISD::RSQRT14S, DL, MVT::v8f16, Undef, Op);
23667 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f16, Op, Zero);
23668 }
23669
23670 return DAG.getNode(X86ISD::RSQRT14, DL, VT, Op);
23671 }
23672 return SDValue();
23673}
23674
23675/// The minimum architected relative accuracy is 2^-12. We need one
23676/// Newton-Raphson step to have a good float result (24 bits of precision).
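/// The refinement uses the standard reciprocal iteration
///   Est' = Est * (2.0 - X * Est)
/// which roughly doubles the number of correct bits per step.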
23677SDValue X86TargetLowering::getRecipEstimate(SDValue Op, SelectionDAG &DAG,
23678 int Enabled,
23679 int &RefinementSteps) const {
23680 SDLoc DL(Op);
23681 EVT VT = Op.getValueType();
23682
23683 // SSE1 has rcpss and rcpps. AVX adds a 256-bit variant for rcpps.
23684 // It is likely not profitable to do this for f64 because a double-precision
23685 // reciprocal estimate with refinement on x86 prior to FMA requires
23686 // 15 instructions: convert to single, rcpss, convert back to double, refine
23687 // (3 steps = 12 insts). If an 'rcpsd' variant was added to the ISA
23688 // along with FMA, this could be a throughput win.
23689
23690 if ((VT == MVT::f32 && Subtarget.hasSSE1()) ||
23691 (VT == MVT::v4f32 && Subtarget.hasSSE1()) ||
23692 (VT == MVT::v8f32 && Subtarget.hasAVX()) ||
23693 (VT == MVT::v16f32 && Subtarget.useAVX512Regs())) {
23694 // Enable estimate codegen with 1 refinement step for vector division.
23695 // Scalar division estimates are disabled because they break too much
23696 // real-world code. These defaults are intended to match GCC behavior.
23697 if (VT == MVT::f32 && Enabled == ReciprocalEstimate::Unspecified)
23698 return SDValue();
23699
23700 if (RefinementSteps == ReciprocalEstimate::Unspecified)
23701 RefinementSteps = 1;
23702
23703 // There is no FSQRT for 512-bits, but there is RCP14.
23704 unsigned Opcode = VT == MVT::v16f32 ? X86ISD::RCP14 : X86ISD::FRCP;
23705 return DAG.getNode(Opcode, DL, VT, Op);
23706 }
23707
23708 if (VT.getScalarType() == MVT::f16 && isTypeLegal(VT) &&
23709 Subtarget.hasFP16()) {
23710 if (RefinementSteps == ReciprocalEstimate::Unspecified)
23711 RefinementSteps = 0;
23712
23713 if (VT == MVT::f16) {
23714 SDValue Zero = DAG.getIntPtrConstant(0, DL);
23715 SDValue Undef = DAG.getUNDEF(MVT::v8f16);
23716 Op = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v8f16, Op);
23717 Op = DAG.getNode(X86ISD::RCP14S, DL, MVT::v8f16, Undef, Op);
23718 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f16, Op, Zero);
23719 }
23720
23721 return DAG.getNode(X86ISD::RCP14, DL, VT, Op);
23722 }
23723 return SDValue();
23724}
23725
23726/// If we have at least two divisions that use the same divisor, convert to
23727/// multiplication by a reciprocal. This may need to be adjusted for a given
23728/// CPU if a division's cost is not at least twice the cost of a multiplication.
23729/// This is because we still need one division to calculate the reciprocal and
23730/// then we need two multiplies by that reciprocal as replacements for the
23731/// original divisions.
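/// e.g. a/b + c/b --> t = 1.0/b; a*t + c*t replaces two divisions with one
/// division plus two multiplications, which only pays off if a division costs
/// at least about twice as much as a multiplication.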
23732unsigned X86TargetLowering::combineRepeatedFPDivisors() const {
23733 return 2;
23734}
23735
23736SDValue
23737X86TargetLowering::BuildSDIVPow2(SDNode *N, const APInt &Divisor,
23738 SelectionDAG &DAG,
23739 SmallVectorImpl<SDNode *> &Created) const {
23740 AttributeList Attr = DAG.getMachineFunction().getFunction().getAttributes();
23741 if (isIntDivCheap(N->getValueType(0), Attr))
23742 return SDValue(N,0); // Lower SDIV as SDIV
23743
23744 assert((Divisor.isPowerOf2() || Divisor.isNegatedPowerOf2()) &&
23745 "Unexpected divisor!");
23746
23747 // Only perform this transform if CMOV is supported otherwise the select
23748 // below will become a branch.
23749 if (!Subtarget.canUseCMOV())
23750 return SDValue();
23751
23752 // fold (sdiv X, pow2)
23753 EVT VT = N->getValueType(0);
23754 // FIXME: Support i8.
23755 if (VT != MVT::i16 && VT != MVT::i32 &&
23756 !(Subtarget.is64Bit() && VT == MVT::i64))
23757 return SDValue();
23758
23759 // If the divisor is 2 or -2, the default expansion is better.
23760 if (Divisor == 2 ||
23761 Divisor == APInt(Divisor.getBitWidth(), -2, /*isSigned*/ true))
23762 return SDValue();
23763
23764 return TargetLowering::buildSDIVPow2WithCMov(N, Divisor, DAG, Created);
23765}
23766
23767/// Result of 'and' is compared against zero. Change to a BT node if possible.
23768/// Returns the BT node and the condition code needed to use it.
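/// e.g. both ((x >> n) & 1) != 0 and (x & (1 << n)) != 0 become BT x, n; the
/// tested bit lands in the carry flag, so SETNE maps to COND_B and SETEQ to
/// COND_AE.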
23769static SDValue LowerAndToBT(SDValue And, ISD::CondCode CC, const SDLoc &dl,
23770 SelectionDAG &DAG, X86::CondCode &X86CC) {
23771 assert(And.getOpcode() == ISD::AND && "Expected AND node!");
23772 SDValue Op0 = And.getOperand(0);
23773 SDValue Op1 = And.getOperand(1);
23774 if (Op0.getOpcode() == ISD::TRUNCATE)
23775 Op0 = Op0.getOperand(0);
23776 if (Op1.getOpcode() == ISD::TRUNCATE)
23777 Op1 = Op1.getOperand(0);
23778
23779 SDValue Src, BitNo;
23780 if (Op1.getOpcode() == ISD::SHL)
23781 std::swap(Op0, Op1);
23782 if (Op0.getOpcode() == ISD::SHL) {
23783 if (isOneConstant(Op0.getOperand(0))) {
23784 // If we looked past a truncate, check that it's only truncating away
23785 // known zeros.
23786 unsigned BitWidth = Op0.getValueSizeInBits();
23787 unsigned AndBitWidth = And.getValueSizeInBits();
23788 if (BitWidth > AndBitWidth) {
23789 KnownBits Known = DAG.computeKnownBits(Op0);
23790 if (Known.countMinLeadingZeros() < BitWidth - AndBitWidth)
23791 return SDValue();
23792 }
23793 Src = Op1;
23794 BitNo = Op0.getOperand(1);
23795 }
23796 } else if (Op1.getOpcode() == ISD::Constant) {
23797 ConstantSDNode *AndRHS = cast<ConstantSDNode>(Op1);
23798 uint64_t AndRHSVal = AndRHS->getZExtValue();
23799 SDValue AndLHS = Op0;
23800
23801 if (AndRHSVal == 1 && AndLHS.getOpcode() == ISD::SRL) {
23802 Src = AndLHS.getOperand(0);
23803 BitNo = AndLHS.getOperand(1);
23804 } else {
23805 // Use BT if the immediate can't be encoded in a TEST instruction or we
23806 // are optimizing for size and the immediate won't fit in a byte.
23807 bool OptForSize = DAG.shouldOptForSize();
23808 if ((!isUInt<32>(AndRHSVal) || (OptForSize && !isUInt<8>(AndRHSVal))) &&
23809 isPowerOf2_64(AndRHSVal)) {
23810 Src = AndLHS;
23811 BitNo = DAG.getConstant(Log2_64_Ceil(AndRHSVal), dl,
23812 Src.getValueType());
23813 }
23814 }
23815 }
23816
23817 // No patterns found, give up.
23818 if (!Src.getNode())
23819 return SDValue();
23820
23821 // Remove any bit flip.
23822 if (isBitwiseNot(Src)) {
23823 Src = Src.getOperand(0);
23824 CC = CC == ISD::SETEQ ? ISD::SETNE : ISD::SETEQ;
23825 }
23826
23827 // Attempt to create the X86ISD::BT node.
23828 if (SDValue BT = getBT(Src, BitNo, dl, DAG)) {
23829 X86CC = CC == ISD::SETEQ ? X86::COND_AE : X86::COND_B;
23830 return BT;
23831 }
23832
23833 return SDValue();
23834}
23835
23836// Check if pre-AVX condcode can be performed by a single FCMP op.
23837static bool cheapX86FSETCC_SSE(ISD::CondCode SetCCOpcode) {
23838 return (SetCCOpcode != ISD::SETONE) && (SetCCOpcode != ISD::SETUEQ);
23839}
23840
23841/// Turns an ISD::CondCode into a value suitable for SSE floating-point mask
23842/// CMPs.
23843static unsigned translateX86FSETCC(ISD::CondCode SetCCOpcode, SDValue &Op0,
23844 SDValue &Op1, bool &IsAlwaysSignaling) {
23845 unsigned SSECC;
23846 bool Swap = false;
23847
23848 // SSE Condition code mapping:
23849 // 0 - EQ
23850 // 1 - LT
23851 // 2 - LE
23852 // 3 - UNORD
23853 // 4 - NEQ
23854 // 5 - NLT
23855 // 6 - NLE
23856 // 7 - ORD
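 // AVX adds further predicates; the two used below for conditions SSE cannot
 // encode directly are:
 // 8 - EQ_UQ (SETUEQ)
 // 12 - NEQ_OQ (SETONE)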
23857 switch (SetCCOpcode) {
23858 // clang-format off
23859 default: llvm_unreachable("Unexpected SETCC condition");
23860 case ISD::SETOEQ:
23861 case ISD::SETEQ: SSECC = 0; break;
23862 case ISD::SETOGT:
23863 case ISD::SETGT: Swap = true; [[fallthrough]];
23864 case ISD::SETLT:
23865 case ISD::SETOLT: SSECC = 1; break;
23866 case ISD::SETOGE:
23867 case ISD::SETGE: Swap = true; [[fallthrough]];
23868 case ISD::SETLE:
23869 case ISD::SETOLE: SSECC = 2; break;
23870 case ISD::SETUO: SSECC = 3; break;
23871 case ISD::SETUNE:
23872 case ISD::SETNE: SSECC = 4; break;
23873 case ISD::SETULE: Swap = true; [[fallthrough]];
23874 case ISD::SETUGE: SSECC = 5; break;
23875 case ISD::SETULT: Swap = true; [[fallthrough]];
23876 case ISD::SETUGT: SSECC = 6; break;
23877 case ISD::SETO: SSECC = 7; break;
23878 case ISD::SETUEQ: SSECC = 8; break;
23879 case ISD::SETONE: SSECC = 12; break;
23880 // clang-format on
23881 }
23882 if (Swap)
23883 std::swap(Op0, Op1);
23884
23885 switch (SetCCOpcode) {
23886 default:
23887 IsAlwaysSignaling = true;
23888 break;
23889 case ISD::SETEQ:
23890 case ISD::SETOEQ:
23891 case ISD::SETUEQ:
23892 case ISD::SETNE:
23893 case ISD::SETONE:
23894 case ISD::SETUNE:
23895 case ISD::SETO:
23896 case ISD::SETUO:
23897 IsAlwaysSignaling = false;
23898 break;
23899 }
23900
23901 return SSECC;
23902}
23903
23904/// Break a VSETCC 256/512-bit vector into two new 128/256 ones and then
23905/// concatenate the result back.
23906static SDValue splitVSETCC(EVT VT, SDValue LHS, SDValue RHS, ISD::CondCode Cond,
23907 SelectionDAG &DAG, const SDLoc &dl) {
23908 assert(VT.isInteger() && LHS.getValueType() == RHS.getValueType() &&
23909 "Unsupported VTs!");
23910 SDValue CC = DAG.getCondCode(Cond);
23911
23912 // Extract the LHS Lo/Hi vectors
23913 SDValue LHS1, LHS2;
23914 std::tie(LHS1, LHS2) = splitVector(LHS, DAG, dl);
23915
23916 // Extract the RHS Lo/Hi vectors
23917 SDValue RHS1, RHS2;
23918 std::tie(RHS1, RHS2) = splitVector(RHS, DAG, dl);
23919
23920 // Issue the operation on the smaller types and concatenate the result back
23921 EVT LoVT, HiVT;
23922 std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(VT);
23923 return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT,
23924 DAG.getNode(ISD::SETCC, dl, LoVT, LHS1, RHS1, CC),
23925 DAG.getNode(ISD::SETCC, dl, HiVT, LHS2, RHS2, CC));
23926}
23927
23928static SDValue LowerIntVSETCC_AVX512(SDValue Op, const SDLoc &dl,
23929 SelectionDAG &DAG) {
23930 SDValue Op0 = Op.getOperand(0);
23931 SDValue Op1 = Op.getOperand(1);
23932 SDValue CC = Op.getOperand(2);
23933 MVT VT = Op.getSimpleValueType();
23934 assert(VT.getVectorElementType() == MVT::i1 &&
23935 "Cannot set masked compare for this operation");
23936
23937 ISD::CondCode SetCCOpcode = cast<CondCodeSDNode>(CC)->get();
23938
23939 // Prefer SETGT over SETLT.
23940 if (SetCCOpcode == ISD::SETLT) {
23941 SetCCOpcode = ISD::getSetCCSwappedOperands(SetCCOpcode);
23942 std::swap(Op0, Op1);
23943 }
23944
23945 return DAG.getSetCC(dl, VT, Op0, Op1, SetCCOpcode);
23946}
23947
23948/// Given a buildvector constant, return a new vector constant with each element
23949/// incremented or decremented. If incrementing or decrementing would result in
23950/// unsigned overflow or underflow or this is not a simple vector constant,
23951/// return an empty value.
23952static SDValue incDecVectorConstant(SDValue V, SelectionDAG &DAG, bool IsInc,
23953 bool NSW) {
23954 auto *BV = dyn_cast<BuildVectorSDNode>(V.getNode());
23955 if (!BV || !V.getValueType().isSimple())
23956 return SDValue();
23957
23958 MVT VT = V.getSimpleValueType();
23959 MVT EltVT = VT.getVectorElementType();
23960 unsigned NumElts = VT.getVectorNumElements();
23961 SmallVector<SDValue, 8> NewVecC;
23962 SDLoc DL(V);
23963 for (unsigned i = 0; i < NumElts; ++i) {
23964 auto *Elt = dyn_cast<ConstantSDNode>(BV->getOperand(i));
23965 if (!Elt || Elt->isOpaque() || Elt->getSimpleValueType(0) != EltVT)
23966 return SDValue();
23967
23968 // Avoid overflow/underflow.
23969 const APInt &EltC = Elt->getAPIntValue();
23970 if ((IsInc && EltC.isMaxValue()) || (!IsInc && EltC.isZero()))
23971 return SDValue();
23972 if (NSW && ((IsInc && EltC.isMaxSignedValue()) ||
23973 (!IsInc && EltC.isMinSignedValue())))
23974 return SDValue();
23975
23976 NewVecC.push_back(DAG.getConstant(EltC + (IsInc ? 1 : -1), DL, EltVT));
23977 }
23978
23979 return DAG.getBuildVector(VT, DL, NewVecC);
23980}
23981
23982/// As another special case, use PSUBUS[BW] when it's profitable. E.g. for
23983/// Op0 u<= Op1:
23984/// t = psubus Op0, Op1
23985/// pcmpeq t, <0..0>
23986static SDValue LowerVSETCCWithSUBUS(SDValue Op0, SDValue Op1, MVT VT,
23987 ISD::CondCode Cond, const SDLoc &dl,
23988 const X86Subtarget &Subtarget,
23989 SelectionDAG &DAG) {
23990 if (!Subtarget.hasSSE2())
23991 return SDValue();
23992
23993 MVT VET = VT.getVectorElementType();
23994 if (VET != MVT::i8 && VET != MVT::i16)
23995 return SDValue();
23996
23997 switch (Cond) {
23998 default:
23999 return SDValue();
24000 case ISD::SETULT: {
24001 // If the comparison is against a constant we can turn this into a
24002 // setule. With psubus, setule does not require a swap. This is
24003 // beneficial because the constant in the register is no longer
24004 // clobbered as the destination, so it can be hoisted out of a loop.
24005 // Only do this pre-AVX since vpcmp* is no longer destructive.
24006 if (Subtarget.hasAVX())
24007 return SDValue();
24008 SDValue ULEOp1 =
24009 incDecVectorConstant(Op1, DAG, /*IsInc*/ false, /*NSW*/ false);
24010 if (!ULEOp1)
24011 return SDValue();
24012 Op1 = ULEOp1;
24013 break;
24014 }
24015 case ISD::SETUGT: {
24016 // If the comparison is against a constant, we can turn this into a setuge.
24017 // This is beneficial because materializing a constant 0 for the PCMPEQ is
24018 // probably cheaper than XOR+PCMPGT using 2 different vector constants:
24019 // cmpgt (xor X, SignMaskC) CmpC --> cmpeq (usubsat (CmpC+1), X), 0
24020 SDValue UGEOp1 =
24021 incDecVectorConstant(Op1, DAG, /*IsInc*/ true, /*NSW*/ false);
24022 if (!UGEOp1)
24023 return SDValue();
24024 Op1 = Op0;
24025 Op0 = UGEOp1;
24026 break;
24027 }
24028 // Psubus is better than flip-sign because it requires no inversion.
24029 case ISD::SETUGE:
24030 std::swap(Op0, Op1);
24031 break;
24032 case ISD::SETULE:
24033 break;
24034 }
24035
24036 SDValue Result = DAG.getNode(ISD::USUBSAT, dl, VT, Op0, Op1);
24037 return DAG.getNode(X86ISD::PCMPEQ, dl, VT, Result,
24038 DAG.getConstant(0, dl, VT));
24039}
24040
24041static SDValue LowerVSETCC(SDValue Op, const X86Subtarget &Subtarget,
24042 SelectionDAG &DAG) {
24043 bool IsStrict = Op.getOpcode() == ISD::STRICT_FSETCC ||
24044 Op.getOpcode() == ISD::STRICT_FSETCCS;
24045 SDValue Op0 = Op.getOperand(IsStrict ? 1 : 0);
24046 SDValue Op1 = Op.getOperand(IsStrict ? 2 : 1);
24047 SDValue CC = Op.getOperand(IsStrict ? 3 : 2);
24048 MVT VT = Op->getSimpleValueType(0);
24049 ISD::CondCode Cond = cast<CondCodeSDNode>(CC)->get();
24050 MVT OpVT = Op0.getSimpleValueType();
24051 SDLoc dl(Op);
24052
24053 if (OpVT.isFloatingPoint()) {
24054 MVT EltVT = OpVT.getVectorElementType();
24055 assert(EltVT == MVT::bf16 || EltVT == MVT::f16 || EltVT == MVT::f32 ||
24056 EltVT == MVT::f64);
24057
24058 SDValue Chain = IsStrict ? Op.getOperand(0) : SDValue();
24059 if (isSoftF16(EltVT, Subtarget)) {
24060 if (Subtarget.hasAVX512() && !Subtarget.hasVLX())
24061 return SDValue();
24062
24063 // Break 256-bit FP vector compare into smaller ones.
24064 if (OpVT.is256BitVector() && !Subtarget.useAVX512Regs())
24065 return splitVSETCC(VT, Op0, Op1, Cond, DAG, dl);
24066
24067 // Break 512-bit FP vector compare into smaller ones.
24068 if (OpVT.is512BitVector())
24069 return splitVSETCC(VT, Op0, Op1, Cond, DAG, dl);
24070
24071 MVT NVT = OpVT.changeVectorElementType(MVT::f32);
24072 if (IsStrict) {
24073 Op0 = DAG.getNode(ISD::STRICT_FP_EXTEND, dl, {NVT, MVT::Other},
24074 {Chain, Op0});
24075 Op1 = DAG.getNode(ISD::STRICT_FP_EXTEND, dl, {NVT, MVT::Other},
24076 {Chain, Op1});
24077 return DAG.getNode(Op.getOpcode(), dl, {VT, MVT::Other},
24078 {Chain, Op0, Op1, CC});
24079 }
24080 MVT DVT = VT.getVectorElementType() == MVT::i16
24081 ? VT.changeVectorElementType(MVT::i32)
24082 : VT;
24083 SDValue Cmp = DAG.getNode(Op.getOpcode(), dl, DVT,
24084 DAG.getNode(ISD::FP_EXTEND, dl, NVT, Op0),
24085 DAG.getNode(ISD::FP_EXTEND, dl, NVT, Op1), CC);
24086 return DVT == VT ? Cmp : DAG.getNode(ISD::TRUNCATE, dl, VT, Cmp);
24087 }
24088
24089 bool IsSignaling = Op.getOpcode() == ISD::STRICT_FSETCCS;
24090
24091 // If we have a strict compare with a vXi1 result and the input is 128/256
24092 // bits we can't use a masked compare unless we have VLX. If we use a wider
24093 // compare like we do for non-strict, we might trigger spurious exceptions
24094 // from the upper elements. Instead emit an AVX compare and convert to mask.
24095 unsigned Opc;
24096 if (Subtarget.hasAVX512() && VT.getVectorElementType() == MVT::i1 &&
24097 (!IsStrict || Subtarget.hasVLX() ||
24098 Op0.getSimpleValueType().is512BitVector())) {
24099#ifndef NDEBUG
24100 unsigned Num = VT.getVectorNumElements();
24101 assert(Num <= 16 ||
24102 (Num == 32 && (EltVT == MVT::f16 || EltVT == MVT::bf16)));
24103#endif
24104 Opc = IsStrict ? X86ISD::STRICT_CMPM : X86ISD::CMPM;
24105 } else {
24106 Opc = IsStrict ? X86ISD::STRICT_CMPP : X86ISD::CMPP;
24107 // The SSE/AVX packed FP comparison nodes are defined with a
24108 // floating-point vector result that matches the operand type. This allows
24109 // them to work with an SSE1 target (integer vector types are not legal).
24110 VT = Op0.getSimpleValueType();
24111 }
24112
24113 SDValue Cmp;
24114 bool IsAlwaysSignaling;
24115 unsigned SSECC = translateX86FSETCC(Cond, Op0, Op1, IsAlwaysSignaling);
24116 if (!Subtarget.hasAVX()) {
24117 // TODO: We could use following steps to handle a quiet compare with
24118 // signaling encodings.
24119 // 1. Get ordered masks from a quiet ISD::SETO
24120 // 2. Use the masks to mask potential unordered elements in operand A, B
24121 // 3. Get the compare results of masked A, B
24122 // 4. Calculating final result using the mask and result from 3
24123 // But currently, we just fall back to scalar operations.
24124 if (IsStrict && IsAlwaysSignaling && !IsSignaling)
24125 return SDValue();
24126
24127 // Insert an extra signaling instruction to raise exception.
24128 if (IsStrict && !IsAlwaysSignaling && IsSignaling) {
24129 SDValue SignalCmp = DAG.getNode(
24130 Opc, dl, {VT, MVT::Other},
24131 {Chain, Op0, Op1, DAG.getTargetConstant(1, dl, MVT::i8)}); // LT_OS
24132 // FIXME: It seems we need to update the flags of all new strict nodes.
24133 // Otherwise, mayRaiseFPException in MI will return false due to
24134 // NoFPExcept = false by default. However, I didn't find it in other
24135 // patches.
24136 SignalCmp->setFlags(Op->getFlags());
24137 Chain = SignalCmp.getValue(1);
24138 }
24139
24140 // In the two cases not handled by SSE compare predicates (SETUEQ/SETONE),
24141 // emit two comparisons and a logic op to tie them together.
24142 if (!cheapX86FSETCC_SSE(Cond)) {
24143 // LLVM predicate is SETUEQ or SETONE.
24144 unsigned CC0, CC1;
24145 unsigned CombineOpc;
24146 if (Cond == ISD::SETUEQ) {
24147 CC0 = 3; // UNORD
24148 CC1 = 0; // EQ
24149 CombineOpc = X86ISD::FOR;
24150 } else {
24151 assert(Cond == ISD::SETONE);
24152 CC0 = 7; // ORD
24153 CC1 = 4; // NEQ
24154 CombineOpc = X86ISD::FAND;
24155 }
24156
24157 SDValue Cmp0, Cmp1;
24158 if (IsStrict) {
24159 Cmp0 = DAG.getNode(
24160 Opc, dl, {VT, MVT::Other},
24161 {Chain, Op0, Op1, DAG.getTargetConstant(CC0, dl, MVT::i8)});
24162 Cmp1 = DAG.getNode(
24163 Opc, dl, {VT, MVT::Other},
24164 {Chain, Op0, Op1, DAG.getTargetConstant(CC1, dl, MVT::i8)});
24165 Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Cmp0.getValue(1),
24166 Cmp1.getValue(1));
24167 } else {
24168 Cmp0 = DAG.getNode(
24169 Opc, dl, VT, Op0, Op1, DAG.getTargetConstant(CC0, dl, MVT::i8));
24170 Cmp1 = DAG.getNode(
24171 Opc, dl, VT, Op0, Op1, DAG.getTargetConstant(CC1, dl, MVT::i8));
24172 }
24173 Cmp = DAG.getNode(CombineOpc, dl, VT, Cmp0, Cmp1);
24174 } else {
24175 if (IsStrict) {
24176 Cmp = DAG.getNode(
24177 Opc, dl, {VT, MVT::Other},
24178 {Chain, Op0, Op1, DAG.getTargetConstant(SSECC, dl, MVT::i8)});
24179 Chain = Cmp.getValue(1);
24180 } else
24181 Cmp = DAG.getNode(
24182 Opc, dl, VT, Op0, Op1, DAG.getTargetConstant(SSECC, dl, MVT::i8));
24183 }
24184 } else {
24185 // Handle all other FP comparisons here.
24186 if (IsStrict) {
24187 // Make a flip on already signaling CCs before setting bit 4 of AVX CC.
24188 SSECC |= (IsAlwaysSignaling ^ IsSignaling) << 4;
24189 Cmp = DAG.getNode(
24190 Opc, dl, {VT, MVT::Other},
24191 {Chain, Op0, Op1, DAG.getTargetConstant(SSECC, dl, MVT::i8)});
24192 Chain = Cmp.getValue(1);
24193 } else
24194 Cmp = DAG.getNode(
24195 Opc, dl, VT, Op0, Op1, DAG.getTargetConstant(SSECC, dl, MVT::i8));
24196 }
24197
24198 if (VT.getFixedSizeInBits() >
24199 Op.getSimpleValueType().getFixedSizeInBits()) {
24200 // We emitted a compare with an XMM/YMM result. Finish converting to a
24201 // mask register using a vptestm.
24202 EVT CastVT = VT.changeVectorElementTypeToInteger();
24203 Cmp = DAG.getBitcast(CastVT, Cmp);
24204 Cmp = DAG.getSetCC(dl, Op.getSimpleValueType(), Cmp,
24205 DAG.getConstant(0, dl, CastVT), ISD::SETNE);
24206 } else {
24207 // If this is SSE/AVX CMPP, bitcast the result back to integer to match
24208 // the result type of SETCC. The bitcast is expected to be optimized
24209 // away during combining/isel.
24210 Cmp = DAG.getBitcast(Op.getSimpleValueType(), Cmp);
24211 }
24212
24213 if (IsStrict)
24214 return DAG.getMergeValues({Cmp, Chain}, dl);
24215
24216 return Cmp;
24217 }
24218
24219 assert(!IsStrict && "Strict SETCC only handles FP operands.");
24220
24221 [[maybe_unused]] MVT VTOp0 = Op0.getSimpleValueType();
24222 assert(VTOp0 == Op1.getSimpleValueType() &&
24223 "Expected operands with same type!");
24225 "Invalid number of packed elements for source and destination!");
24226
24227 // The non-AVX512 code below works under the assumption that source and
24228 // destination types are the same.
24229 assert((Subtarget.hasAVX512() || (VT == VTOp0)) &&
24230 "Value types for source and destination must be the same!");
24231
24232 // The result is boolean, but operands are int/float
24233 if (VT.getVectorElementType() == MVT::i1) {
24234 // In the AVX-512 architecture, setcc returns a mask with i1 elements,
24235 // but there is no compare instruction for i8 and i16 elements in KNL.
24236 assert((VTOp0.getScalarSizeInBits() >= 32 || Subtarget.hasBWI()) &&
24237 "Unexpected operand type");
24238 return LowerIntVSETCC_AVX512(Op, dl, DAG);
24239 }
24240
24241 // Lower using XOP integer comparisons.
24242 if (VT.is128BitVector() && Subtarget.hasXOP()) {
24243 // Translate compare code to XOP PCOM compare mode.
24244 unsigned CmpMode = 0;
24245 switch (Cond) {
24246 // clang-format off
24247 default: llvm_unreachable("Unexpected SETCC condition");
24248 case ISD::SETULT:
24249 case ISD::SETLT: CmpMode = 0x00; break;
24250 case ISD::SETULE:
24251 case ISD::SETLE: CmpMode = 0x01; break;
24252 case ISD::SETUGT:
24253 case ISD::SETGT: CmpMode = 0x02; break;
24254 case ISD::SETUGE:
24255 case ISD::SETGE: CmpMode = 0x03; break;
24256 case ISD::SETEQ: CmpMode = 0x04; break;
24257 case ISD::SETNE: CmpMode = 0x05; break;
24258 // clang-format on
24259 }
24260
24261 // Are we comparing unsigned or signed integers?
24262 unsigned Opc =
24263 ISD::isUnsignedIntSetCC(Cond) ? X86ISD::VPCOMU : X86ISD::VPCOM;
24264
24265 return DAG.getNode(Opc, dl, VT, Op0, Op1,
24266 DAG.getTargetConstant(CmpMode, dl, MVT::i8));
24267 }
24268
24269 // (X & Y) != 0 --> (X & Y) == Y iff Y is power-of-2.
24270 // Revert part of the simplifySetCCWithAnd combine, to avoid an invert.
24271 if (Cond == ISD::SETNE && ISD::isBuildVectorAllZeros(Op1.getNode())) {
24272 SDValue BC0 = peekThroughBitcasts(Op0);
24273 if (BC0.getOpcode() == ISD::AND &&
24274 isConstantPowerOf2(BC0.getOperand(1), VT.getScalarSizeInBits(),
24275 /*AllowUndefs=*/false)) {
24276 Cond = ISD::SETEQ;
24277 Op1 = DAG.getBitcast(VT, BC0.getOperand(1));
24278 }
24279 }
24280
24281 // ICMP_EQ(AND(X,C),C) -> SRA(SHL(X,LOG2(C)),BW-1) iff C is power-of-2.
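// e.g. C == 0x8 with i32 elements: SHL by 28 moves bit 3 into the sign bit
// and SRA by 31 broadcasts it, yielding all-ones exactly when the bit is set.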
24282 if (Cond == ISD::SETEQ && Op0.getOpcode() == ISD::AND &&
24283 Op0.getOperand(1) == Op1 && Op0.hasOneUse()) {
24284 ConstantSDNode *C1 = isConstOrConstSplat(Op1);
24285 if (C1 && C1->getAPIntValue().isPowerOf2()) {
24286 unsigned BitWidth = VT.getScalarSizeInBits();
24287 unsigned ShiftAmt = BitWidth - C1->getAPIntValue().logBase2() - 1;
24288
24289 SDValue Result = Op0.getOperand(0);
24290 Result = DAG.getNode(ISD::SHL, dl, VT, Result,
24291 DAG.getConstant(ShiftAmt, dl, VT));
24292 Result = DAG.getNode(ISD::SRA, dl, VT, Result,
24293 DAG.getConstant(BitWidth - 1, dl, VT));
24294 return Result;
24295 }
24296 }
24297
24298 // Break 256-bit integer vector compare into smaller ones.
24299 if (VT.is256BitVector() && !Subtarget.hasInt256())
24300 return splitVSETCC(VT, Op0, Op1, Cond, DAG, dl);
24301
24302 // Break 512-bit integer vector compare into smaller ones.
24303 // TODO: Try harder to use VPCMPx + VPMOV2x?
24304 if (VT.is512BitVector())
24305 return splitVSETCC(VT, Op0, Op1, Cond, DAG, dl);
24306
24307 // If we have a limit constant, try to form PCMPGT (signed cmp) to avoid
24308 // not-of-PCMPEQ:
24309 // X != INT_MIN --> X >s INT_MIN
24310 // X != INT_MAX --> X <s INT_MAX --> INT_MAX >s X
24311 // +X != 0 --> +X >s 0
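// SSE only has EQ and signed GT vector compares, so turning SETNE into a
// signed GT avoids a PCMPEQ followed by an invert.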
24312 APInt ConstValue;
24313 if (Cond == ISD::SETNE &&
24314 ISD::isConstantSplatVector(Op1.getNode(), ConstValue)) {
24315 if (ConstValue.isMinSignedValue())
24316 Cond = ISD::SETGT;
24317 else if (ConstValue.isMaxSignedValue())
24318 Cond = ISD::SETLT;
24319 else if (ConstValue.isZero() && DAG.SignBitIsZero(Op0))
24320 Cond = ISD::SETGT;
24321 }
24322
24323 // If both operands are known non-negative, then an unsigned compare is the
24324 // same as a signed compare and there's no need to flip signbits.
24325 // TODO: We could check for more general simplifications here since we're
24326 // computing known bits.
24327 bool FlipSigns = ISD::isUnsignedIntSetCC(Cond) &&
24328 !(DAG.SignBitIsZero(Op0) && DAG.SignBitIsZero(Op1));
24329
24330 // Special case: Use min/max operations for unsigned compares.
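// e.g. X <=u Y iff umin(X, Y) == X, so a UMIN plus a PCMPEQ replaces the
// sign-bit flipping otherwise needed for an unsigned PCMPGT.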
24331 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
24332 if (ISD::isUnsignedIntSetCC(Cond) &&
24333 (FlipSigns || ISD::isTrueWhenEqual(Cond)) &&
24334 TLI.isOperationLegal(ISD::UMIN, VT)) {
24335 // If we have a constant operand, increment/decrement it and change the
24336 // condition to avoid an invert.
24337 if (Cond == ISD::SETUGT) {
24338 // X > C --> X >= (C+1) --> X == umax(X, C+1)
24339 if (SDValue UGTOp1 =
24340 incDecVectorConstant(Op1, DAG, /*IsInc*/ true, /*NSW*/ false)) {
24341 Op1 = UGTOp1;
24342 Cond = ISD::SETUGE;
24343 }
24344 }
24345 if (Cond == ISD::SETULT) {
24346 // X < C --> X <= (C-1) --> X == umin(X, C-1)
24347 if (SDValue ULTOp1 =
24348 incDecVectorConstant(Op1, DAG, /*IsInc*/ false, /*NSW*/ false)) {
24349 Op1 = ULTOp1;
24350 Cond = ISD::SETULE;
24351 }
24352 }
24353 bool Invert = false;
24354 unsigned Opc;
24355 switch (Cond) {
24356 // clang-format off
24357 default: llvm_unreachable("Unexpected condition code");
24358 case ISD::SETUGT: Invert = true; [[fallthrough]];
24359 case ISD::SETULE: Opc = ISD::UMIN; break;
24360 case ISD::SETULT: Invert = true; [[fallthrough]];
24361 case ISD::SETUGE: Opc = ISD::UMAX; break;
24362 // clang-format on
24363 }
24364
24365 SDValue Result = DAG.getNode(Opc, dl, VT, Op0, Op1);
24366 Result = DAG.getNode(X86ISD::PCMPEQ, dl, VT, Op0, Result);
24367
24368 // If the logical-not of the result is required, perform that now.
24369 if (Invert)
24370 Result = DAG.getNOT(dl, Result, VT);
24371
24372 return Result;
24373 }
24374
24375 // Try to use SUBUS and PCMPEQ.
24376 if (FlipSigns)
24377 if (SDValue V =
24378 LowerVSETCCWithSUBUS(Op0, Op1, VT, Cond, dl, Subtarget, DAG))
24379 return V;
24380
24381 // We are handling one of the integer comparisons here. Since SSE only has
24382 // GT and EQ comparisons for integer, swapping operands and multiple
24383 // operations may be required for some comparisons.
24384 unsigned Opc = (Cond == ISD::SETEQ || Cond == ISD::SETNE) ? X86ISD::PCMPEQ
24385 : X86ISD::PCMPGT;
24386 bool Swap = Cond == ISD::SETLT || Cond == ISD::SETULT ||
24387 Cond == ISD::SETGE || Cond == ISD::SETUGE;
24388 bool Invert = Cond == ISD::SETNE ||
24389 (Cond != ISD::SETEQ && ISD::isTrueWhenEqual(Cond));
24390
24391 if (Swap)
24392 std::swap(Op0, Op1);
24393
24394 // Check that the operation in question is available (most are plain SSE2,
24395 // but PCMPGTQ and PCMPEQQ have different requirements).
24396 if (VT == MVT::v2i64) {
24397 if (Opc == X86ISD::PCMPGT && !Subtarget.hasSSE42()) {
24398 assert(Subtarget.hasSSE2() && "Don't know how to lower!");
24399
24400 // Special case for sign bit test. We can use a v4i32 PCMPGT and shuffle
24401 // the odd elements over the even elements.
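// The sign of a v2i64 element lives in its high (odd) i32 half, so a v4i32
// PCMPGT against zero answers the test and the odd results are broadcast
// over each 64-bit pair.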
24402 if (!FlipSigns && !Invert && ISD::isBuildVectorAllZeros(Op0.getNode())) {
24403 Op0 = DAG.getConstant(0, dl, MVT::v4i32);
24404 Op1 = DAG.getBitcast(MVT::v4i32, Op1);
24405
24406 SDValue GT = DAG.getNode(X86ISD::PCMPGT, dl, MVT::v4i32, Op0, Op1);
24407 static const int MaskHi[] = { 1, 1, 3, 3 };
24408 SDValue Result = DAG.getVectorShuffle(MVT::v4i32, dl, GT, GT, MaskHi);
24409
24410 return DAG.getBitcast(VT, Result);
24411 }
24412
24413 if (!FlipSigns && !Invert && ISD::isBuildVectorAllOnes(Op1.getNode())) {
24414 Op0 = DAG.getBitcast(MVT::v4i32, Op0);
24415 Op1 = DAG.getAllOnesConstant(dl, MVT::v4i32);
24416
24417 SDValue GT = DAG.getNode(X86ISD::PCMPGT, dl, MVT::v4i32, Op0, Op1);
24418 static const int MaskHi[] = { 1, 1, 3, 3 };
24419 SDValue Result = DAG.getVectorShuffle(MVT::v4i32, dl, GT, GT, MaskHi);
24420
24421 return DAG.getBitcast(VT, Result);
24422 }
24423
24424 // If the i64 elements are sign-extended enough to be representable as i32
24425 // then we can compare the lower i32 bits and splat.
24426 if (!FlipSigns && !Invert && DAG.ComputeNumSignBits(Op0) > 32 &&
24427 DAG.ComputeNumSignBits(Op1) > 32) {
24428 Op0 = DAG.getBitcast(MVT::v4i32, Op0);
24429 Op1 = DAG.getBitcast(MVT::v4i32, Op1);
24430
24431 SDValue GT = DAG.getNode(X86ISD::PCMPGT, dl, MVT::v4i32, Op0, Op1);
24432 static const int MaskLo[] = {0, 0, 2, 2};
24433 SDValue Result = DAG.getVectorShuffle(MVT::v4i32, dl, GT, GT, MaskLo);
24434
24435 return DAG.getBitcast(VT, Result);
24436 }
24437
24438 // Since SSE has no unsigned integer comparisons, we need to flip the sign
24439 // bits of the inputs before performing those operations. The lower
24440 // compare is always unsigned.
24441 SDValue SB = DAG.getConstant(FlipSigns ? 0x8000000080000000ULL
24442 : 0x0000000080000000ULL,
24443 dl, MVT::v2i64);
24444
24445 Op0 = DAG.getNode(ISD::XOR, dl, MVT::v2i64, Op0, SB);
24446 Op1 = DAG.getNode(ISD::XOR, dl, MVT::v2i64, Op1, SB);
24447
24448 // Cast everything to the right type.
24449 Op0 = DAG.getBitcast(MVT::v4i32, Op0);
24450 Op1 = DAG.getBitcast(MVT::v4i32, Op1);
24451
24452 // Emulate PCMPGTQ with (hi1 > hi2) | ((hi1 == hi2) & (lo1 > lo2))
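// The high halves decide the result unless they are equal, in which case the
// low halves decide; the sign-bit XOR above makes the low-half 32-bit signed
// compare act as an unsigned one.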
24453 SDValue GT = DAG.getNode(X86ISD::PCMPGT, dl, MVT::v4i32, Op0, Op1);
24454 SDValue EQ = DAG.getNode(X86ISD::PCMPEQ, dl, MVT::v4i32, Op0, Op1);
24455
24456 // Create masks for only the low parts/high parts of the 64 bit integers.
24457 static const int MaskHi[] = { 1, 1, 3, 3 };
24458 static const int MaskLo[] = { 0, 0, 2, 2 };
24459 SDValue EQHi = DAG.getVectorShuffle(MVT::v4i32, dl, EQ, EQ, MaskHi);
24460 SDValue GTLo = DAG.getVectorShuffle(MVT::v4i32, dl, GT, GT, MaskLo);
24461 SDValue GTHi = DAG.getVectorShuffle(MVT::v4i32, dl, GT, GT, MaskHi);
24462
24463 SDValue Result = DAG.getNode(ISD::AND, dl, MVT::v4i32, EQHi, GTLo);
24464 Result = DAG.getNode(ISD::OR, dl, MVT::v4i32, Result, GTHi);
24465
24466 if (Invert)
24467 Result = DAG.getNOT(dl, Result, MVT::v4i32);
24468
24469 return DAG.getBitcast(VT, Result);
24470 }
24471
24472 if (Opc == X86ISD::PCMPEQ && !Subtarget.hasSSE41()) {
24473 // If pcmpeqq is missing but pcmpeqd is available synthesize pcmpeqq with
24474 // pcmpeqd + pshufd + pand.
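// A 64-bit lane is equal iff both of its 32-bit halves compare equal, so AND
// the v4i32 result with a copy whose halves are swapped.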
24475 assert(Subtarget.hasSSE2() && !FlipSigns && "Don't know how to lower!");
24476
24477 // First cast everything to the right type.
24478 Op0 = DAG.getBitcast(MVT::v4i32, Op0);
24479 Op1 = DAG.getBitcast(MVT::v4i32, Op1);
24480
24481 // Do the compare.
24482 SDValue Result = DAG.getNode(Opc, dl, MVT::v4i32, Op0, Op1);
24483
24484 // Make sure the lower and upper halves are both all-ones.
24485 static const int Mask[] = { 1, 0, 3, 2 };
24486 SDValue Shuf = DAG.getVectorShuffle(MVT::v4i32, dl, Result, Result, Mask);
24487 Result = DAG.getNode(ISD::AND, dl, MVT::v4i32, Result, Shuf);
24488
24489 if (Invert)
24490 Result = DAG.getNOT(dl, Result, MVT::v4i32);
24491
24492 return DAG.getBitcast(VT, Result);
24493 }
24494 }
24495
24496 // Since SSE has no unsigned integer comparisons, we need to flip the sign
24497 // bits of the inputs before performing those operations.
24498 if (FlipSigns) {
24499 MVT EltVT = VT.getVectorElementType();
24500 SDValue SM = DAG.getConstant(APInt::getSignMask(EltVT.getSizeInBits()), dl,
24501 VT);
24502 Op0 = DAG.getNode(ISD::XOR, dl, VT, Op0, SM);
24503 Op1 = DAG.getNode(ISD::XOR, dl, VT, Op1, SM);
24504 }
24505
24506 SDValue Result = DAG.getNode(Opc, dl, VT, Op0, Op1);
24507
24508 // If the logical-not of the result is required, perform that now.
24509 if (Invert)
24510 Result = DAG.getNOT(dl, Result, VT);
24511
24512 return Result;
24513}
24514
24515// Try to select this as a KORTEST+SETCC or KTEST+SETCC if possible.
24516 static SDValue EmitAVX512Test(SDValue Op0, SDValue Op1, ISD::CondCode CC,
24517 const SDLoc &dl, SelectionDAG &DAG,
24518 const X86Subtarget &Subtarget,
24519 SDValue &X86CC) {
24520 assert((CC == ISD::SETEQ || CC == ISD::SETNE) && "Unsupported ISD::CondCode");
24521
24522 // Must be a bitcast from vXi1.
24523 if (Op0.getOpcode() != ISD::BITCAST)
24524 return SDValue();
24525
24526 Op0 = Op0.getOperand(0);
24527 MVT VT = Op0.getSimpleValueType();
24528 if (!(Subtarget.hasAVX512() && VT == MVT::v16i1) &&
24529 !(Subtarget.hasDQI() && VT == MVT::v8i1) &&
24530 !(Subtarget.hasBWI() && (VT == MVT::v32i1 || VT == MVT::v64i1)))
24531 return SDValue();
24532
24533 X86::CondCode X86Cond;
24534 if (isNullConstant(Op1)) {
24535 X86Cond = CC == ISD::SETEQ ? X86::COND_E : X86::COND_NE;
24536 } else if (isAllOnesConstant(Op1)) {
24537 // C flag is set for all ones.
24538 X86Cond = CC == ISD::SETEQ ? X86::COND_B : X86::COND_AE;
24539 } else
24540 return SDValue();
24541
24542 // If the input is an AND, we can combine its operands into the KTEST.
24543 bool KTestable = false;
24544 if (Subtarget.hasDQI() && (VT == MVT::v8i1 || VT == MVT::v16i1))
24545 KTestable = true;
24546 if (Subtarget.hasBWI() && (VT == MVT::v32i1 || VT == MVT::v64i1))
24547 KTestable = true;
24548 if (!isNullConstant(Op1))
24549 KTestable = false;
24550 if (KTestable && Op0.getOpcode() == ISD::AND && Op0.hasOneUse()) {
24551 SDValue LHS = Op0.getOperand(0);
24552 SDValue RHS = Op0.getOperand(1);
24553 X86CC = DAG.getTargetConstant(X86Cond, dl, MVT::i8);
24554 return DAG.getNode(X86ISD::KTEST, dl, MVT::i32, LHS, RHS);
24555 }
24556
24557 // If the input is an OR, we can combine its operands into the KORTEST.
24558 SDValue LHS = Op0;
24559 SDValue RHS = Op0;
24560 if (Op0.getOpcode() == ISD::OR && Op0.hasOneUse()) {
24561 LHS = Op0.getOperand(0);
24562 RHS = Op0.getOperand(1);
24563 }
24564
24565 X86CC = DAG.getTargetConstant(X86Cond, dl, MVT::i8);
24566 return DAG.getNode(X86ISD::KORTEST, dl, MVT::i32, LHS, RHS);
24567}
24568
24569/// Emit flags for the given setcc condition and operands. Also returns the
24570/// corresponding X86 condition code constant in X86CC.
24571SDValue X86TargetLowering::emitFlagsForSetcc(SDValue Op0, SDValue Op1,
24572 ISD::CondCode CC, const SDLoc &dl,
24573 SelectionDAG &DAG,
24574 SDValue &X86CC) const {
24575 // Equality Combines.
24576 if (CC == ISD::SETEQ || CC == ISD::SETNE) {
24577 X86::CondCode X86CondCode;
24578
24579 // Optimize to BT if possible.
24580 // Lower (X & (1 << N)) == 0 to BT(X, N).
24581 // Lower ((X >>u N) & 1) != 0 to BT(X, N).
24582 // Lower ((X >>s N) & 1) != 0 to BT(X, N).
24583 if (Op0.getOpcode() == ISD::AND && Op0.hasOneUse() && isNullConstant(Op1)) {
24584 if (SDValue BT = LowerAndToBT(Op0, CC, dl, DAG, X86CondCode)) {
24585 X86CC = DAG.getTargetConstant(X86CondCode, dl, MVT::i8);
24586 return BT;
24587 }
24588 }
24589
24590 // Try to use PTEST/PMOVMSKB for a tree AND/ORs equality compared with -1/0.
24591 if (SDValue CmpZ = MatchVectorAllEqualTest(Op0, Op1, CC, dl, Subtarget, DAG,
24592 X86CondCode)) {
24593 X86CC = DAG.getTargetConstant(X86CondCode, dl, MVT::i8);
24594 return CmpZ;
24595 }
24596
24597 // Try to lower using KORTEST or KTEST.
24598 if (SDValue Test = EmitAVX512Test(Op0, Op1, CC, dl, DAG, Subtarget, X86CC))
24599 return Test;
24600
24601 // Look for X == 0, X == 1, X != 0, or X != 1. We can simplify some forms
24602 // of these.
24603 if (isOneConstant(Op1) || isNullConstant(Op1)) {
24604 // If the input is a setcc, then reuse the input setcc or use a new one
24605 // with the inverted condition.
24606 if (Op0.getOpcode() == X86ISD::SETCC) {
24607 bool Invert = (CC == ISD::SETNE) ^ isNullConstant(Op1);
24608
24609 X86CC = Op0.getOperand(0);
24610 if (Invert) {
24611 X86CondCode = (X86::CondCode)Op0.getConstantOperandVal(0);
24612 X86CondCode = X86::GetOppositeBranchCondition(X86CondCode);
24613 X86CC = DAG.getTargetConstant(X86CondCode, dl, MVT::i8);
24614 }
24615
24616 return Op0.getOperand(1);
24617 }
24618 }
24619
24620 // Look for X == INT_MIN or X != INT_MIN. We can use NEG and test for
24621 // overflow.
24622 if (isMinSignedConstant(Op1)) {
24623 EVT VT = Op0.getValueType();
24624 if (VT == MVT::i32 || VT == MVT::i64 || Op0->hasOneUse()) {
24625 SDVTList CmpVTs = DAG.getVTList(VT, MVT::i32);
24626 X86::CondCode CondCode = CC == ISD::SETEQ ? X86::COND_O : X86::COND_NO;
24627 X86CC = DAG.getTargetConstant(CondCode, dl, MVT::i8);
24628 SDValue Neg = DAG.getNode(X86ISD::SUB, dl, CmpVTs,
24629 DAG.getConstant(0, dl, VT), Op0);
24630 return SDValue(Neg.getNode(), 1);
24631 }
24632 }
24633
24634 // Try to use the carry flag from the add in place of a separate CMP for:
24635 // (seteq (add X, -1), -1). Similar for setne.
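// Adding -1 sets CF exactly when X != 0, so COND_AE (CF clear) proves the
// equality and COND_B (CF set) the inequality.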
24636 if (isAllOnesConstant(Op1) && Op0.getOpcode() == ISD::ADD &&
24637 Op0.getOperand(1) == Op1) {
24638 if (isProfitableToUseFlagOp(Op0)) {
24639 SDVTList VTs = DAG.getVTList(Op0.getValueType(), MVT::i32);
24640
24641 SDValue New = DAG.getNode(X86ISD::ADD, dl, VTs, Op0.getOperand(0),
24642 Op0.getOperand(1));
24643 DAG.ReplaceAllUsesOfValueWith(SDValue(Op0.getNode(), 0), New);
24644 X86CondCode = CC == ISD::SETEQ ? X86::COND_AE : X86::COND_B;
24645 X86CC = DAG.getTargetConstant(X86CondCode, dl, MVT::i8);
24646 return SDValue(New.getNode(), 1);
24647 }
24648 }
24649 }
24650
24651 X86::CondCode CondCode =
24652 TranslateX86CC(CC, dl, /*IsFP*/ false, Op0, Op1, DAG);
24653 assert(CondCode != X86::COND_INVALID && "Unexpected condition code!");
24654
24655 SDValue EFLAGS = EmitCmp(Op0, Op1, CondCode, dl, DAG, Subtarget);
24656 X86CC = DAG.getTargetConstant(CondCode, dl, MVT::i8);
24657 return EFLAGS;
24658}
24659
24660SDValue X86TargetLowering::LowerSETCC(SDValue Op, SelectionDAG &DAG) const {
24661
24662 bool IsStrict = Op.getOpcode() == ISD::STRICT_FSETCC ||
24663 Op.getOpcode() == ISD::STRICT_FSETCCS;
24664 MVT VT = Op->getSimpleValueType(0);
24665
24666 if (VT.isVector()) return LowerVSETCC(Op, Subtarget, DAG);
24667
24668 assert(VT == MVT::i8 && "SetCC type must be 8-bit integer");
24669 SDValue Chain = IsStrict ? Op.getOperand(0) : SDValue();
24670 SDValue Op0 = Op.getOperand(IsStrict ? 1 : 0);
24671 SDValue Op1 = Op.getOperand(IsStrict ? 2 : 1);
24672 SDLoc dl(Op);
24673 ISD::CondCode CC =
24674 cast<CondCodeSDNode>(Op.getOperand(IsStrict ? 3 : 2))->get();
24675
24676 if (isSoftF16(Op0.getValueType(), Subtarget))
24677 return SDValue();
24678
24679 // Handle f128 first, since one possible outcome is a normal integer
24680 // comparison which gets handled by emitFlagsForSetcc.
24681 if (Op0.getValueType() == MVT::f128) {
24682 softenSetCCOperands(DAG, MVT::f128, Op0, Op1, CC, dl, Op0, Op1, Chain,
24683 Op.getOpcode() == ISD::STRICT_FSETCCS);
24684
24685 // If softenSetCCOperands returned a scalar, use it.
24686 if (!Op1.getNode()) {
24687 assert(Op0.getValueType() == Op.getValueType() &&
24688 "Unexpected setcc expansion!");
24689 if (IsStrict)
24690 return DAG.getMergeValues({Op0, Chain}, dl);
24691 return Op0;
24692 }
24693 }
24694
24695 if (Op0.getSimpleValueType().isInteger()) {
24696 // Attempt to canonicalize SGT/UGT -> SGE/UGE compares with constant which
24697 // reduces the number of EFLAGs bit reads (the GE conditions don't read ZF),
24698 // this may translate to less uops depending on uarch implementation. The
24699 // equivalent for SLE/ULE -> SLT/ULT isn't likely to happen as we already
24700 // canonicalize to that CondCode.
24701 // NOTE: Only do this if incrementing the constant doesn't increase the bit
24702 // encoding size - so it must either already be a i8 or i32 immediate, or it
24703 // shrinks down to that. We don't do this for any i64's to avoid additional
24704 // constant materializations.
24705 // TODO: Can we move this to TranslateX86CC to handle jumps/branches too?
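// e.g. (x >u 7) becomes (x >=u 8): COND_AE only reads CF, while COND_A must
// read both CF and ZF.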
24706 if (auto *Op1C = dyn_cast<ConstantSDNode>(Op1)) {
24707 const APInt &Op1Val = Op1C->getAPIntValue();
24708 if (!Op1Val.isZero()) {
24709 // Ensure the constant+1 doesn't overflow.
24710 if ((CC == ISD::CondCode::SETGT && !Op1Val.isMaxSignedValue()) ||
24711 (CC == ISD::CondCode::SETUGT && !Op1Val.isMaxValue())) {
24712 APInt Op1ValPlusOne = Op1Val + 1;
24713 if (Op1ValPlusOne.isSignedIntN(32) &&
24714 (!Op1Val.isSignedIntN(8) || Op1ValPlusOne.isSignedIntN(8))) {
24715 Op1 = DAG.getConstant(Op1ValPlusOne, dl, Op0.getValueType());
24716 CC = CC == ISD::CondCode::SETGT ? ISD::CondCode::SETGE
24717 : ISD::CondCode::SETUGE;
24718 }
24719 }
24720 }
24721 }
24722
24723 SDValue X86CC;
24724 SDValue EFLAGS = emitFlagsForSetcc(Op0, Op1, CC, dl, DAG, X86CC);
24725 SDValue Res = DAG.getNode(X86ISD::SETCC, dl, MVT::i8, X86CC, EFLAGS);
24726 return IsStrict ? DAG.getMergeValues({Res, Chain}, dl) : Res;
24727 }
24728
24729 if (Subtarget.hasAVX10_2()) {
24730 if (CC == ISD::SETOEQ || CC == ISD::SETUNE) {
24731 auto NewCC = (CC == ISD::SETOEQ) ? X86::COND_E : (X86::COND_NE);
24732 assert(Op0.getSimpleValueType() != MVT::bf16 && "Unsupported Type");
24733 if (Op0.getSimpleValueType() != MVT::f80) {
24734 SDValue Res = getSETCC(
24735 NewCC, DAG.getNode(X86ISD::UCOMX, dl, MVT::i32, Op0, Op1), dl, DAG);
24736 return IsStrict ? DAG.getMergeValues({Res, Chain}, dl) : Res;
24737 }
24738 }
24739 }
24740 // Handle floating point.
24741 X86::CondCode CondCode = TranslateX86CC(CC, dl, /*IsFP*/ true, Op0, Op1, DAG);
24742 if (CondCode == X86::COND_INVALID)
24743 return SDValue();
24744
24745 SDValue EFLAGS;
24746 if (IsStrict) {
24747 bool IsSignaling = Op.getOpcode() == ISD::STRICT_FSETCCS;
24748 EFLAGS =
24749 DAG.getNode(IsSignaling ? X86ISD::STRICT_FCMPS : X86ISD::STRICT_FCMP,
24750 dl, {MVT::i32, MVT::Other}, {Chain, Op0, Op1});
24751 Chain = EFLAGS.getValue(1);
24752 } else {
24753 EFLAGS = DAG.getNode(X86ISD::FCMP, dl, MVT::i32, Op0, Op1);
24754 }
24755
24756 SDValue X86CC = DAG.getTargetConstant(CondCode, dl, MVT::i8);
24757 SDValue Res = DAG.getNode(X86ISD::SETCC, dl, MVT::i8, X86CC, EFLAGS);
24758 return IsStrict ? DAG.getMergeValues({Res, Chain}, dl) : Res;
24759}
24760
24761SDValue X86TargetLowering::LowerSETCCCARRY(SDValue Op, SelectionDAG &DAG) const {
24762 SDValue LHS = Op.getOperand(0);
24763 SDValue RHS = Op.getOperand(1);
24764 SDValue Carry = Op.getOperand(2);
24765 SDValue Cond = Op.getOperand(3);
24766 SDLoc DL(Op);
24767
24768 assert(LHS.getSimpleValueType().isInteger() && "SETCCCARRY is integer only.");
24769 X86::CondCode CC = TranslateIntegerX86CC(cast<CondCodeSDNode>(Cond)->get());
24770
24771 // Recreate the carry if needed.
24772 EVT CarryVT = Carry.getValueType();
24773 Carry = DAG.getNode(X86ISD::ADD, DL, DAG.getVTList(CarryVT, MVT::i32),
24774 Carry, DAG.getAllOnesConstant(DL, CarryVT));
24775
24776 SDVTList VTs = DAG.getVTList(LHS.getValueType(), MVT::i32);
24777 SDValue Cmp = DAG.getNode(X86ISD::SBB, DL, VTs, LHS, RHS, Carry.getValue(1));
24778 return getSETCC(CC, Cmp.getValue(1), DL, DAG);
24779}
24780
24781// This function returns three things: the arithmetic computation itself
24782// (Value), an EFLAGS result (Overflow), and a condition code (Cond). The
24783// flag and the condition code define the case in which the arithmetic
24784// computation overflows.
24785static std::pair<SDValue, SDValue>
24786 getX86XALUOOp(X86::CondCode &Cond, SDValue Op, SelectionDAG &DAG) {
24787 assert(Op.getResNo() == 0 && "Unexpected result number!");
24788 SDValue Value, Overflow;
24789 SDValue LHS = Op.getOperand(0);
24790 SDValue RHS = Op.getOperand(1);
24791 unsigned BaseOp = 0;
24792 SDLoc DL(Op);
24793 switch (Op.getOpcode()) {
24794 default: llvm_unreachable("Unknown ovf instruction!");
24795 case ISD::SADDO:
24796 BaseOp = X86ISD::ADD;
24797 Cond = X86::COND_O;
24798 break;
24799 case ISD::UADDO:
24800 BaseOp = X86ISD::ADD;
24801 Cond = isOneConstant(RHS) ? X86::COND_E : X86::COND_B;
24802 break;
24803 case ISD::SSUBO:
24804 BaseOp = X86ISD::SUB;
24805 Cond = X86::COND_O;
24806 break;
24807 case ISD::USUBO:
24808 BaseOp = X86ISD::SUB;
24809 Cond = X86::COND_B;
24810 break;
24811 case ISD::SMULO:
24812 BaseOp = X86ISD::SMUL;
24813 Cond = X86::COND_O;
24814 break;
24815 case ISD::UMULO:
24816 BaseOp = X86ISD::UMUL;
24817 Cond = X86::COND_O;
24818 break;
24819 }
24820
24821 if (BaseOp) {
24822 // Also sets EFLAGS.
24823 SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::i32);
24824 Value = DAG.getNode(BaseOp, DL, VTs, LHS, RHS);
24825 Overflow = Value.getValue(1);
24826 }
24827
24828 return std::make_pair(Value, Overflow);
24829}
24830
24831 static SDValue LowerXALUO(SDValue Op, SelectionDAG &DAG) {
24832 // Lower the "add/sub/mul with overflow" instruction into a regular ins plus
24833 // a "setcc" instruction that checks the overflow flag. The "brcond" lowering
24834 // looks for this combo and may remove the "setcc" instruction if the "setcc"
24835 // has only one use.
24836 SDLoc DL(Op);
24837 X86::CondCode Cond;
24838 SDValue Value, Overflow;
24839 std::tie(Value, Overflow) = getX86XALUOOp(Cond, Op, DAG);
24840
24841 SDValue SetCC = getSETCC(Cond, Overflow, DL, DAG);
24842 assert(Op->getValueType(1) == MVT::i8 && "Unexpected VT!");
24843 return DAG.getNode(ISD::MERGE_VALUES, DL, Op->getVTList(), Value, SetCC);
24844}
24845
24846 /// Return true if opcode is an X86 logical comparison.
24847 static bool isX86LogicalCmp(SDValue Op) {
24848 unsigned Opc = Op.getOpcode();
24849 if (Opc == X86ISD::CMP || Opc == X86ISD::COMI || Opc == X86ISD::UCOMI ||
24850 Opc == X86ISD::FCMP)
24851 return true;
24852 if (Op.getResNo() == 1 &&
24853 (Opc == X86ISD::ADD || Opc == X86ISD::SUB || Opc == X86ISD::ADC ||
24854 Opc == X86ISD::SBB || Opc == X86ISD::SMUL || Opc == X86ISD::UMUL ||
24855 Opc == X86ISD::OR || Opc == X86ISD::XOR || Opc == X86ISD::AND))
24856 return true;
24857
24858 return false;
24859}
24860
24861 static bool isTruncWithZeroHighBitsInput(SDValue V, SelectionDAG &DAG) {
24862 if (V.getOpcode() != ISD::TRUNCATE)
24863 return false;
24864
24865 SDValue VOp0 = V.getOperand(0);
24866 unsigned InBits = VOp0.getValueSizeInBits();
24867 unsigned Bits = V.getValueSizeInBits();
24868 return DAG.MaskedValueIsZero(VOp0, APInt::getHighBitsSet(InBits,InBits-Bits));
24869}
24870
24871// Lower various (select (icmp CmpVal, 0), LHS, RHS) custom patterns.
24872 static SDValue LowerSELECTWithCmpZero(SDValue CmpVal, SDValue LHS, SDValue RHS,
24873 unsigned X86CC, const SDLoc &DL,
24874 SelectionDAG &DAG,
24875 const X86Subtarget &Subtarget) {
24876 EVT CmpVT = CmpVal.getValueType();
24877 EVT VT = LHS.getValueType();
24878 if (!CmpVT.isScalarInteger() || !VT.isScalarInteger())
24879 return SDValue();
24880
24881 if (X86CC == X86::COND_E && CmpVal.getOpcode() == ISD::AND &&
24882 isOneConstant(CmpVal.getOperand(1))) {
24883 auto SplatLSB = [&](EVT SplatVT) {
24884 // we need mask of all zeros or ones with same size of the other
24885 // operands.
24886 SDValue Neg = CmpVal;
24887 if (CmpVT.bitsGT(SplatVT))
24888 Neg = DAG.getNode(ISD::TRUNCATE, DL, SplatVT, CmpVal);
24889 else if (CmpVT.bitsLT(SplatVT))
24890 Neg = DAG.getNode(
24891 ISD::AND, DL, SplatVT,
24892 DAG.getNode(ISD::ANY_EXTEND, DL, SplatVT, CmpVal.getOperand(0)),
24893 DAG.getConstant(1, DL, SplatVT));
24894 return DAG.getNegative(Neg, DL, SplatVT); // -(and (x, 0x1))
24895 };
24896
24897 // SELECT (AND(X,1) == 0), 0, -1 -> NEG(AND(X,1))
24898 if (isNullConstant(LHS) && isAllOnesConstant(RHS))
24899 return SplatLSB(VT);
24900
24901 // SELECT (AND(X,1) == 0), C1, C2 -> XOR(C1,AND(NEG(AND(X,1)),XOR(C1,C2))
24902 if (!Subtarget.canUseCMOV() && isa<ConstantSDNode>(LHS) &&
24903 isa<ConstantSDNode>(RHS)) {
24904 SDValue Mask = SplatLSB(VT);
24905 SDValue Diff = DAG.getNode(ISD::XOR, DL, VT, LHS, RHS);
24906 SDValue Flip = DAG.getNode(ISD::AND, DL, VT, Mask, Diff);
24907 return DAG.getNode(ISD::XOR, DL, VT, LHS, Flip);
24908 }
24909
24910 SDValue Src1, Src2;
24911 auto isIdentityPatternZero = [&]() {
24912 switch (RHS.getOpcode()) {
24913 default:
24914 break;
24915 case ISD::OR:
24916 case ISD::XOR:
24917 case ISD::ADD:
24918 if (RHS.getOperand(0) == LHS || RHS.getOperand(1) == LHS) {
24919 Src1 = RHS.getOperand(RHS.getOperand(0) == LHS ? 1 : 0);
24920 Src2 = LHS;
24921 return true;
24922 }
24923 break;
24924 case ISD::SHL:
24925 case ISD::SRA:
24926 case ISD::SRL:
24927 case ISD::SUB:
24928 if (RHS.getOperand(0) == LHS) {
24929 Src1 = RHS.getOperand(1);
24930 Src2 = LHS;
24931 return true;
24932 }
24933 break;
24934 }
24935 return false;
24936 };
24937
24938 auto isIdentityPatternOnes = [&]() {
24939 switch (LHS.getOpcode()) {
24940 default:
24941 break;
24942 case ISD::AND:
24943 if (LHS.getOperand(0) == RHS || LHS.getOperand(1) == RHS) {
24944 Src1 = LHS.getOperand(LHS.getOperand(0) == RHS ? 1 : 0);
24945 Src2 = RHS;
24946 return true;
24947 }
24948 break;
24949 }
24950 return false;
24951 };
24952
24953 // Convert 'identity' patterns (iff X is 0 or 1):
24954 // SELECT (AND(X,1) == 0), Y, (OR Y, Z) -> (OR Y, (AND NEG(AND(X,1)), Z))
24955 // SELECT (AND(X,1) == 0), Y, (XOR Y, Z) -> (XOR Y, (AND NEG(AND(X,1)), Z))
24956 // SELECT (AND(X,1) == 0), Y, (ADD Y, Z) -> (ADD Y, (AND NEG(AND(X,1)), Z))
24957 // SELECT (AND(X,1) == 0), Y, (SUB Y, Z) -> (SUB Y, (AND NEG(AND(X,1)), Z))
24958 // SELECT (AND(X,1) == 0), Y, (SHL Y, Z) -> (SHL Y, (AND NEG(AND(X,1)), Z))
24959 // SELECT (AND(X,1) == 0), Y, (SRA Y, Z) -> (SRA Y, (AND NEG(AND(X,1)), Z))
24960 // SELECT (AND(X,1) == 0), Y, (SRL Y, Z) -> (SRL Y, (AND NEG(AND(X,1)), Z))
24961 if (!Subtarget.canUseCMOV() && isIdentityPatternZero()) {
24962 SDValue Mask = SplatLSB(Src1.getValueType());
24963 SDValue And = DAG.getNode(ISD::AND, DL, Src1.getValueType(), Mask,
24964 Src1); // Mask & z
24965 return DAG.getNode(RHS.getOpcode(), DL, VT, Src2, And); // y Op And
24966 }
24967 // SELECT (AND(X,1) == 0), (AND Y, Z), Y -> (AND Y, (OR NEG(AND(X, 1)), Z))
24968 if (!Subtarget.canUseCMOV() && isIdentityPatternOnes()) {
24969 SDValue Mask = SplatLSB(VT);
24970 SDValue Or = DAG.getNode(ISD::OR, DL, VT, Mask, Src1); // Mask | z
24971 return DAG.getNode(LHS.getOpcode(), DL, VT, Src2, Or); // y Op Or
24972 }
24973 }
24974
24975 if ((X86CC == X86::COND_E || X86CC == X86::COND_NE) &&
24976 (isAllOnesConstant(LHS) || isAllOnesConstant(RHS))) {
24977 SDValue Y = isAllOnesConstant(RHS) ? LHS : RHS;
24978 SDVTList CmpVTs = DAG.getVTList(CmpVT, MVT::i32);
24979
24980 // 'X - 1' sets the carry flag if X == 0.
24981 // '0 - X' sets the carry flag if X != 0.
24982 // Convert the carry flag to a -1/0 mask with sbb:
24983 // select (X != 0), -1, Y --> 0 - X; or (sbb), Y
24984 // select (X == 0), Y, -1 --> 0 - X; or (sbb), Y
24985 // select (X != 0), Y, -1 --> X - 1; or (sbb), Y
24986 // select (X == 0), -1, Y --> X - 1; or (sbb), Y
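// SETCC_CARRY then materialises CF as an all-ones/zero value of the result
// type, which the OR below merges with Y.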
24987 SDValue Sub;
24988 if (isAllOnesConstant(LHS) == (X86CC == X86::COND_NE)) {
24989 SDValue Zero = DAG.getConstant(0, DL, CmpVT);
24990 Sub = DAG.getNode(X86ISD::SUB, DL, CmpVTs, Zero, CmpVal);
24991 } else {
24992 SDValue One = DAG.getConstant(1, DL, CmpVT);
24993 Sub = DAG.getNode(X86ISD::SUB, DL, CmpVTs, CmpVal, One);
24994 }
24995 SDValue SBB = DAG.getNode(X86ISD::SETCC_CARRY, DL, VT,
24996 DAG.getTargetConstant(X86::COND_B, DL, MVT::i8),
24997 Sub.getValue(1));
24998 return DAG.getNode(ISD::OR, DL, VT, SBB, Y);
24999 }
25000
25001 return SDValue();
25002}
25003
25004SDValue X86TargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const {
25005 bool AddTest = true;
25006 SDValue Cond = Op.getOperand(0);
25007 SDValue Op1 = Op.getOperand(1);
25008 SDValue Op2 = Op.getOperand(2);
25009 SDLoc DL(Op);
25010 MVT VT = Op1.getSimpleValueType();
25011 SDValue CC;
25012
25013 if (isSoftF16(VT, Subtarget)) {
25014 MVT NVT = VT.changeTypeToInteger();
25015 return DAG.getBitcast(VT, DAG.getNode(ISD::SELECT, DL, NVT, Cond,
25016 DAG.getBitcast(NVT, Op1),
25017 DAG.getBitcast(NVT, Op2)));
25018 }
25019
25020 // Lower FP selects into a CMP/AND/ANDN/OR sequence when the necessary SSE ops
25021 // are available or VBLENDV if AVX is available.
25022 // Otherwise FP cmovs get lowered into a less efficient branch sequence later.
25023 if (Cond.getOpcode() == ISD::SETCC && isScalarFPTypeInSSEReg(VT) &&
25024 VT == Cond.getOperand(0).getSimpleValueType() && Cond->hasOneUse()) {
25025 SDValue CondOp0 = Cond.getOperand(0), CondOp1 = Cond.getOperand(1);
25026 bool IsAlwaysSignaling;
25027 unsigned SSECC =
25028 translateX86FSETCC(cast<CondCodeSDNode>(Cond.getOperand(2))->get(),
25029 CondOp0, CondOp1, IsAlwaysSignaling);
25030
25031 if (Subtarget.hasAVX512()) {
25032 SDValue Cmp =
25033 DAG.getNode(X86ISD::FSETCCM, DL, MVT::v1i1, CondOp0, CondOp1,
25034 DAG.getTargetConstant(SSECC, DL, MVT::i8));
25035 assert(!VT.isVector() && "Not a scalar type?");
25036 return DAG.getNode(X86ISD::SELECTS, DL, VT, Cmp, Op1, Op2);
25037 }
25038
25039 if (SSECC < 8 || Subtarget.hasAVX()) {
25040 SDValue Cmp = DAG.getNode(X86ISD::FSETCC, DL, VT, CondOp0, CondOp1,
25041 DAG.getTargetConstant(SSECC, DL, MVT::i8));
25042
25043 // If we have SSE41/AVX, we can use a variable vector select (VBLENDV)
25044 // instead of 3 logic instructions for size savings and potentially speed.
25045 // Unfortunately, there is no scalar form of VBLENDV.
25046 //
25047 // If either operand is a +0.0 constant, don't try this. We can expect to
25048 // optimize away at least one of the logic instructions later in that
25049 // case, so that sequence would be faster than a variable blend.
25050 if (Subtarget.hasSSE41() && !isNullFPConstant(Op1) &&
25051 !isNullFPConstant(Op2)) {
25052 // Convert to vectors, do a VSELECT, and convert back to scalar.
25053 // All of the conversions should be optimized away.
25054 MVT VecVT = VT == MVT::f32 ? MVT::v4f32 : MVT::v2f64;
25055 SDValue VOp1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecVT, Op1);
25056 SDValue VOp2 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecVT, Op2);
25057 SDValue VCmp = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecVT, Cmp);
25058
25059 MVT VCmpVT = VT == MVT::f32 ? MVT::v4i32 : MVT::v2i64;
25060 VCmp = DAG.getBitcast(VCmpVT, VCmp);
25061
25062 SDValue VSel = DAG.getSelect(DL, VecVT, VCmp, VOp1, VOp2);
25063
25064 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, VSel,
25065 DAG.getVectorIdxConstant(0, DL));
25066 }
25067 SDValue AndN = DAG.getNode(X86ISD::FANDN, DL, VT, Cmp, Op2);
25068 SDValue And = DAG.getNode(X86ISD::FAND, DL, VT, Cmp, Op1);
25069 return DAG.getNode(X86ISD::FOR, DL, VT, AndN, And);
25070 }
25071 }
25072
25073 // AVX512 fallback is to lower selects of scalar floats to masked moves.
25074 if (isScalarFPTypeInSSEReg(VT) && Subtarget.hasAVX512()) {
25075 SDValue Cmp = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v1i1, Cond);
25076 return DAG.getNode(X86ISD::SELECTS, DL, VT, Cmp, Op1, Op2);
25077 }
25078
25079 if (Cond.getOpcode() == ISD::SETCC &&
25080 !isSoftF16(Cond.getOperand(0).getSimpleValueType(), Subtarget)) {
25081 if (SDValue NewCond = LowerSETCC(Cond, DAG)) {
25082 Cond = NewCond;
25083 // If the condition was updated, it's possible that the operands of the
25084 // select were also updated (for example, EmitTest has a RAUW). Refresh
25085 // the local references to the select operands in case they got stale.
25086 Op1 = Op.getOperand(1);
25087 Op2 = Op.getOperand(2);
25088 }
25089 }
25090
25091 // (select (x == 0), -1, y) -> (sign_bit (x - 1)) | y
25092 // (select (x == 0), y, -1) -> ~(sign_bit (x - 1)) | y
25093 // (select (x != 0), y, -1) -> (sign_bit (x - 1)) | y
25094 // (select (x != 0), -1, y) -> ~(sign_bit (x - 1)) | y
25095 // (select (and (x , 0x1) == 0), y, (z ^ y) ) -> (-(and (x , 0x1)) & z ) ^ y
25096 // (select (and (x , 0x1) == 0), y, (z | y) ) -> (-(and (x , 0x1)) & z ) | y
25097 // (select (x > 0), x, 0) -> (~(x >> (size_in_bits(x)-1))) & x
25098 // (select (x < 0), x, 0) -> ((x >> (size_in_bits(x)-1))) & x
25099 if (Cond.getOpcode() == X86ISD::SETCC &&
25100 Cond.getOperand(1).getOpcode() == X86ISD::CMP &&
25101 isNullConstant(Cond.getOperand(1).getOperand(1))) {
25102 SDValue Cmp = Cond.getOperand(1);
25103 SDValue CmpOp0 = Cmp.getOperand(0);
25104 unsigned CondCode = Cond.getConstantOperandVal(0);
25105
25106 // Special handling for __builtin_ffs(X) - 1 pattern which looks like
25107 // (select (seteq X, 0), -1, (cttz_zero_undef X)). Disable the special
25108 // handling to keep the CMP with 0. This should be removed by
25109 // optimizeCompareInst by using the flags from the BSR/TZCNT used for the
25110 // cttz_zero_undef.
25111 auto MatchFFSMinus1 = [&](SDValue Op1, SDValue Op2) {
25112 return (Op1.getOpcode() == ISD::CTTZ_ZERO_UNDEF && Op1.hasOneUse() &&
25113 Op1.getOperand(0) == CmpOp0 && isAllOnesConstant(Op2));
25114 };
25115 if (Subtarget.canUseCMOV() && (VT == MVT::i32 || VT == MVT::i64) &&
25116 ((CondCode == X86::COND_NE && MatchFFSMinus1(Op1, Op2)) ||
25117 (CondCode == X86::COND_E && MatchFFSMinus1(Op2, Op1)))) {
25118 // Keep Cmp.
25119 } else if (SDValue R = LowerSELECTWithCmpZero(CmpOp0, Op1, Op2, CondCode,
25120 DL, DAG, Subtarget)) {
25121 return R;
25122 } else if (VT.isScalarInteger() && isNullConstant(Op2) &&
25123 Cmp.getNode()->hasOneUse() && (CmpOp0 == Op1) &&
25124 ((CondCode == X86::COND_S) || // smin(x, 0)
25125 (CondCode == X86::COND_G && hasAndNot(Op1)))) { // smax(x, 0)
25126 // (select (x < 0), x, 0) -> ((x >> (size_in_bits(x)-1))) & x
25127 //
25128 // If the comparison is testing for a positive value, we have to invert
25129 // the sign bit mask, so only do that transform if the target has a
25130 // bitwise 'and not' instruction (the invert is free).
25131 // (select (x > 0), x, 0) -> (~(x >> (size_in_bits(x)-1))) & x
25132 unsigned ShCt = VT.getSizeInBits() - 1;
25133 SDValue ShiftAmt = DAG.getConstant(ShCt, DL, VT);
25134 SDValue Shift = DAG.getNode(ISD::SRA, DL, VT, Op1, ShiftAmt);
25135 if (CondCode == X86::COND_G)
25136 Shift = DAG.getNOT(DL, Shift, VT);
25137 return DAG.getNode(ISD::AND, DL, VT, Shift, Op1);
25138 }
25139 }
25140
25141 // Look past (and (setcc_carry (cmp ...)), 1).
25142 if (Cond.getOpcode() == ISD::AND &&
25143 Cond.getOperand(0).getOpcode() == X86ISD::SETCC_CARRY &&
25144 isOneConstant(Cond.getOperand(1)))
25145 Cond = Cond.getOperand(0);
25146
25147 // Attempt to fold "raw cond" cases by treating them as:
25148 // (select (and X, 1), Op1, Op2) --> (select (icmpeq (and X, 1), 0), Op2, Op1)
25149 if (Cond.getOpcode() == ISD::AND && isOneConstant(Cond.getOperand(1)))
25150 if (SDValue R = LowerSELECTWithCmpZero(Cond, Op2, Op1, X86::COND_E, DL, DAG,
25151 Subtarget))
25152 return R;
25153
25154 // If condition flag is set by a X86ISD::CMP, then use it as the condition
25155 // setting operand in place of the X86ISD::SETCC.
25156 unsigned CondOpcode = Cond.getOpcode();
25157 if (CondOpcode == X86ISD::SETCC ||
25158 CondOpcode == X86ISD::SETCC_CARRY) {
25159 CC = Cond.getOperand(0);
25160
25161 SDValue Cmp = Cond.getOperand(1);
25162 bool IllegalFPCMov = false;
25163 if (VT.isFloatingPoint() && !VT.isVector() &&
25164 !isScalarFPTypeInSSEReg(VT) && Subtarget.canUseCMOV()) // FPStack?
25165 IllegalFPCMov = !hasFPCMov(cast<ConstantSDNode>(CC)->getSExtValue());
25166
25167 if ((isX86LogicalCmp(Cmp) && !IllegalFPCMov) ||
25168 Cmp.getOpcode() == X86ISD::BT) { // FIXME
25169 Cond = Cmp;
25170 AddTest = false;
25171 }
25172 } else if (CondOpcode == ISD::USUBO || CondOpcode == ISD::SSUBO ||
25173 CondOpcode == ISD::UADDO || CondOpcode == ISD::SADDO ||
25174 CondOpcode == ISD::UMULO || CondOpcode == ISD::SMULO) {
25175 SDValue Value;
25176 X86::CondCode X86Cond;
25177 std::tie(Value, Cond) = getX86XALUOOp(X86Cond, Cond.getValue(0), DAG);
25178
25179 CC = DAG.getTargetConstant(X86Cond, DL, MVT::i8);
25180 AddTest = false;
25181 }
25182
25183 if (AddTest) {
25184 // Look past the truncate if the high bits are known zero.
25185 if (isTruncWithZeroHighBitsInput(Cond, DAG))
25186 Cond = Cond.getOperand(0);
25187
25188 // We know the result of AND is compared against zero. Try to match
25189 // it to BT.
25190 if (Cond.getOpcode() == ISD::AND && Cond.hasOneUse()) {
25191 X86::CondCode X86CondCode;
25192 if (SDValue BT = LowerAndToBT(Cond, ISD::SETNE, DL, DAG, X86CondCode)) {
25193 CC = DAG.getTargetConstant(X86CondCode, DL, MVT::i8);
25194 Cond = BT;
25195 AddTest = false;
25196 }
25197 }
25198 }
25199
25200 if (AddTest) {
25201 CC = DAG.getTargetConstant(X86::COND_NE, DL, MVT::i8);
25202 Cond = EmitTest(Cond, X86::COND_NE, DL, DAG, Subtarget);
25203 }
25204
25205 // a < b ? -1 : 0 -> RES = ~setcc_carry
25206 // a < b ? 0 : -1 -> RES = setcc_carry
25207 // a >= b ? -1 : 0 -> RES = setcc_carry
25208 // a >= b ? 0 : -1 -> RES = ~setcc_carry
25209 if (Cond.getOpcode() == X86ISD::SUB) {
25210 unsigned CondCode = CC->getAsZExtVal();
25211
25212 if ((CondCode == X86::COND_AE || CondCode == X86::COND_B) &&
25213 (isAllOnesConstant(Op1) || isAllOnesConstant(Op2)) &&
25214 (isNullConstant(Op1) || isNullConstant(Op2))) {
25215 SDValue Res =
25216 DAG.getNode(X86ISD::SETCC_CARRY, DL, Op.getValueType(),
25217 DAG.getTargetConstant(X86::COND_B, DL, MVT::i8), Cond);
25218 if (isAllOnesConstant(Op1) != (CondCode == X86::COND_B))
25219 return DAG.getNOT(DL, Res, Res.getValueType());
25220 return Res;
25221 }
25222 }
25223
25224 // X86 doesn't have an i8 cmov. If both operands are the result of a truncate,
25225 // widen the cmov and push the truncate through. This avoids introducing a new
25226 // branch during isel and doesn't add any extensions.
25227 if (Op.getValueType() == MVT::i8 &&
25228 Op1.getOpcode() == ISD::TRUNCATE && Op2.getOpcode() == ISD::TRUNCATE) {
25229 SDValue T1 = Op1.getOperand(0), T2 = Op2.getOperand(0);
25230 if (T1.getValueType() == T2.getValueType() &&
25231 // Exclude CopyFromReg to avoid partial register stalls.
25232 T1.getOpcode() != ISD::CopyFromReg && T2.getOpcode()!=ISD::CopyFromReg){
25233 SDValue Cmov = DAG.getNode(X86ISD::CMOV, DL, T1.getValueType(), T2, T1,
25234 CC, Cond);
25235 return DAG.getNode(ISD::TRUNCATE, DL, Op.getValueType(), Cmov);
25236 }
25237 }
25238
25239 // Or finally, promote i8 cmovs if we have CMOV,
25240 // or i16 cmovs if it won't prevent folding a load.
25241 // FIXME: we should not limit promotion of i8 case to only when the CMOV is
25242 // legal, but EmitLoweredSelect() can not deal with these extensions
25243 // being inserted between two CMOV's. (in i16 case too TBN)
25244 // https://bugs.llvm.org/show_bug.cgi?id=40974
25245 if ((Op.getValueType() == MVT::i8 && Subtarget.canUseCMOV()) ||
25246 (Op.getValueType() == MVT::i16 && !X86::mayFoldLoad(Op1, Subtarget) &&
25247 !X86::mayFoldLoad(Op2, Subtarget))) {
25248 Op1 = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, Op1);
25249 Op2 = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, Op2);
25250 SDValue Ops[] = { Op2, Op1, CC, Cond };
25251 SDValue Cmov = DAG.getNode(X86ISD::CMOV, DL, MVT::i32, Ops);
25252 return DAG.getNode(ISD::TRUNCATE, DL, Op.getValueType(), Cmov);
25253 }
25254
25255 // X86ISD::CMOV means set the result (which is operand 1) to the RHS if
25256 // condition is true.
25257 SDValue Ops[] = { Op2, Op1, CC, Cond };
25258 return DAG.getNode(X86ISD::CMOV, DL, Op.getValueType(), Ops, Op->getFlags());
25259}
25260
25261 static SDValue LowerSIGN_EXTEND_Mask(SDValue Op, const SDLoc &dl,
25262 const X86Subtarget &Subtarget,
25263 SelectionDAG &DAG) {
25264 MVT VT = Op->getSimpleValueType(0);
25265 SDValue In = Op->getOperand(0);
25266 MVT InVT = In.getSimpleValueType();
25267 assert(InVT.getVectorElementType() == MVT::i1 && "Unexpected input type!");
25268 MVT VTElt = VT.getVectorElementType();
25269 unsigned NumElts = VT.getVectorNumElements();
25270
25271 // Extend VT if the scalar type is i8/i16 and BWI is not supported.
25272 MVT ExtVT = VT;
25273 if (!Subtarget.hasBWI() && VTElt.getSizeInBits() <= 16) {
25274 // If v16i32 is to be avoided, we'll need to split and concatenate.
25275 if (NumElts == 16 && !Subtarget.canExtendTo512DQ())
25276 return SplitAndExtendv16i1(Op.getOpcode(), VT, In, dl, DAG);
25277
25278 ExtVT = MVT::getVectorVT(MVT::i32, NumElts);
25279 }
25280
25281 // Widen to 512-bits if VLX is not supported.
25282 MVT WideVT = ExtVT;
25283 if (!ExtVT.is512BitVector() && !Subtarget.hasVLX()) {
25284 NumElts *= 512 / ExtVT.getSizeInBits();
25285 InVT = MVT::getVectorVT(MVT::i1, NumElts);
25286 In = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, InVT, DAG.getUNDEF(InVT), In,
25287 DAG.getVectorIdxConstant(0, dl));
25288 WideVT = MVT::getVectorVT(ExtVT.getVectorElementType(), NumElts);
25289 }
25290
25291 SDValue V;
25292 MVT WideEltVT = WideVT.getVectorElementType();
25293 if ((Subtarget.hasDQI() && WideEltVT.getSizeInBits() >= 32) ||
25294 (Subtarget.hasBWI() && WideEltVT.getSizeInBits() <= 16)) {
25295 V = DAG.getNode(Op.getOpcode(), dl, WideVT, In);
25296 } else {
25297 SDValue NegOne = DAG.getAllOnesConstant(dl, WideVT);
25298 SDValue Zero = DAG.getConstant(0, dl, WideVT);
25299 V = DAG.getSelect(dl, WideVT, In, NegOne, Zero);
25300 }
25301
25302 // Truncate if we had to extend i16/i8 above.
25303 if (VT != ExtVT) {
25304 WideVT = MVT::getVectorVT(VTElt, NumElts);
25305 V = DAG.getNode(ISD::TRUNCATE, dl, WideVT, V);
25306 }
25307
25308 // Extract back to 128/256-bit if we widened.
25309 if (WideVT != VT)
25310 V = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, V,
25311 DAG.getVectorIdxConstant(0, dl));
25312
25313 return V;
25314}
25315
25316 static SDValue LowerANY_EXTEND(SDValue Op, const X86Subtarget &Subtarget,
25317 SelectionDAG &DAG) {
25318 SDValue In = Op->getOperand(0);
25319 MVT InVT = In.getSimpleValueType();
25320 SDLoc DL(Op);
25321
25322 if (InVT.getVectorElementType() == MVT::i1)
25323 return LowerSIGN_EXTEND_Mask(Op, DL, Subtarget, DAG);
25324
25325 assert(Subtarget.hasAVX() && "Expected AVX support");
25326 return LowerAVXExtend(Op, DL, DAG, Subtarget);
25327}
25328
25329// Lowering for SIGN_EXTEND_VECTOR_INREG and ZERO_EXTEND_VECTOR_INREG.
25330// For sign extend this needs to handle all vector sizes and SSE4.1 and
25331// non-SSE4.1 targets. For zero extend this should only handle inputs of
25332// MVT::v64i8 when BWI is not supported, but AVX512 is.
25333 static SDValue LowerEXTEND_VECTOR_INREG(SDValue Op,
25334 const X86Subtarget &Subtarget,
25335 SelectionDAG &DAG) {
25336 SDValue In = Op->getOperand(0);
25337 MVT VT = Op->getSimpleValueType(0);
25338 MVT InVT = In.getSimpleValueType();
25339
25340 MVT SVT = VT.getVectorElementType();
25341 MVT InSVT = InVT.getVectorElementType();
25342 assert(SVT.getFixedSizeInBits() > InSVT.getFixedSizeInBits());
25343
25344 if (SVT != MVT::i64 && SVT != MVT::i32 && SVT != MVT::i16)
25345 return SDValue();
25346 if (InSVT != MVT::i32 && InSVT != MVT::i16 && InSVT != MVT::i8)
25347 return SDValue();
25348 if (!(VT.is128BitVector() && Subtarget.hasSSE2()) &&
25349 !(VT.is256BitVector() && Subtarget.hasAVX()) &&
25350 !(VT.is512BitVector() && Subtarget.hasAVX512()))
25351 return SDValue();
25352
25353 SDLoc dl(Op);
25354 unsigned Opc = Op.getOpcode();
25355 unsigned NumElts = VT.getVectorNumElements();
25356
25357 // For 256-bit vectors, we only need the lower (128-bit) half of the input.
25358 // For 512-bit vectors, we need 128-bits or 256-bits.
25359 if (InVT.getSizeInBits() > 128) {
25360 // Input needs to be at least the same number of elements as output, and
25361 // at least 128-bits.
25362 int InSize = InSVT.getSizeInBits() * NumElts;
25363 In = extractSubVector(In, 0, DAG, dl, std::max(InSize, 128));
25364 InVT = In.getSimpleValueType();
25365 }
25366
25367 // SSE41 targets can use the pmov[sz]x* instructions directly for 128-bit results,
25368 // so they are legal and shouldn't occur here. AVX2/AVX512 pmovsx* instructions still
25369 // need to be handled here for 256/512-bit results.
25370 if (Subtarget.hasInt256()) {
25371 assert(VT.getSizeInBits() > 128 && "Unexpected 128-bit vector extension");
25372
25373 if (InVT.getVectorNumElements() != NumElts)
25374 return DAG.getNode(Op.getOpcode(), dl, VT, In);
25375
25376 // FIXME: Apparently we create inreg operations that could be regular
25377 // extends.
25378 unsigned ExtOpc =
25379 Opc == ISD::SIGN_EXTEND_VECTOR_INREG ? ISD::SIGN_EXTEND
25380 : ISD::ZERO_EXTEND;
25381 return DAG.getNode(ExtOpc, dl, VT, In);
25382 }
25383
25384 // pre-AVX2 256-bit extensions need to be split into 128-bit instructions.
25385 if (Subtarget.hasAVX()) {
25386 assert(VT.is256BitVector() && "256-bit vector expected");
25387 MVT HalfVT = VT.getHalfNumVectorElementsVT();
25388 int HalfNumElts = HalfVT.getVectorNumElements();
25389
25390 unsigned NumSrcElts = InVT.getVectorNumElements();
25391 SmallVector<int, 16> HiMask(NumSrcElts, SM_SentinelUndef);
25392 for (int i = 0; i != HalfNumElts; ++i)
25393 HiMask[i] = HalfNumElts + i;
25394
25395 SDValue Lo = DAG.getNode(Opc, dl, HalfVT, In);
25396 SDValue Hi = DAG.getVectorShuffle(InVT, dl, In, DAG.getUNDEF(InVT), HiMask);
25397 Hi = DAG.getNode(Opc, dl, HalfVT, Hi);
25398 return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, Lo, Hi);
25399 }
25400
25401 // We should only get here for sign extend.
25402 assert(Opc == ISD::SIGN_EXTEND_VECTOR_INREG && "Unexpected opcode!");
25403 assert(VT.is128BitVector() && InVT.is128BitVector() && "Unexpected VTs");
25404 unsigned InNumElts = InVT.getVectorNumElements();
25405
25406 // If the source elements are already all-signbits, we don't need to extend,
25407 // just splat the elements.
25408 APInt DemandedElts = APInt::getLowBitsSet(InNumElts, NumElts);
25409 if (DAG.ComputeNumSignBits(In, DemandedElts) == InVT.getScalarSizeInBits()) {
25410 unsigned Scale = InNumElts / NumElts;
25411 SmallVector<int, 16> ShuffleMask;
25412 for (unsigned I = 0; I != NumElts; ++I)
25413 ShuffleMask.append(Scale, I);
25414 return DAG.getBitcast(VT,
25415 DAG.getVectorShuffle(InVT, dl, In, In, ShuffleMask));
25416 }
25417
25418 // pre-SSE41 targets unpack lower lanes and then sign-extend using SRAI.
25419 SDValue Curr = In;
25420 SDValue SignExt = Curr;
25421
25422 // As SRAI is only available on i16/i32 types, we expand only up to i32
25423 // and handle i64 separately.
25424 if (InVT != MVT::v4i32) {
25425 MVT DestVT = VT == MVT::v2i64 ? MVT::v4i32 : VT;
25426
25427 unsigned DestWidth = DestVT.getScalarSizeInBits();
25428 unsigned Scale = DestWidth / InSVT.getSizeInBits();
25429 unsigned DestElts = DestVT.getVectorNumElements();
25430
25431 // Build a shuffle mask that takes each input element and places it in the
25432 // MSBs of the new element size.
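// e.g. v16i8 -> v4i32: byte i is placed in the top byte of 32-bit lane i,
// and the VSRAI below shifts it back down with sign fill.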
25433 SmallVector<int, 16> Mask(InNumElts, SM_SentinelUndef);
25434 for (unsigned i = 0; i != DestElts; ++i)
25435 Mask[i * Scale + (Scale - 1)] = i;
25436
25437 Curr = DAG.getVectorShuffle(InVT, dl, In, In, Mask);
25438 Curr = DAG.getBitcast(DestVT, Curr);
25439
25440 unsigned SignExtShift = DestWidth - InSVT.getSizeInBits();
25441 SignExt = DAG.getNode(X86ISD::VSRAI, dl, DestVT, Curr,
25442 DAG.getTargetConstant(SignExtShift, dl, MVT::i8));
25443 }
25444
25445 if (VT == MVT::v2i64) {
25446 assert(Curr.getValueType() == MVT::v4i32 && "Unexpected input VT");
25447 SDValue Zero = DAG.getConstant(0, dl, MVT::v4i32);
25448 SDValue Sign = DAG.getSetCC(dl, MVT::v4i32, Zero, Curr, ISD::SETGT);
25449 SignExt = DAG.getVectorShuffle(MVT::v4i32, dl, SignExt, Sign, {0, 4, 1, 5});
25450 SignExt = DAG.getBitcast(VT, SignExt);
25451 }
25452
25453 return SignExt;
25454}
25455
25456 static SDValue LowerSIGN_EXTEND(SDValue Op, const X86Subtarget &Subtarget,
25457 SelectionDAG &DAG) {
25458 MVT VT = Op->getSimpleValueType(0);
25459 SDValue In = Op->getOperand(0);
25460 MVT InVT = In.getSimpleValueType();
25461 SDLoc dl(Op);
25462
25463 if (InVT.getVectorElementType() == MVT::i1)
25464 return LowerSIGN_EXTEND_Mask(Op, dl, Subtarget, DAG);
25465
25466 assert(VT.isVector() && InVT.isVector() && "Expected vector type");
25468 "Expected same number of elements");
25469 assert((VT.getVectorElementType() == MVT::i16 ||
25470 VT.getVectorElementType() == MVT::i32 ||
25471 VT.getVectorElementType() == MVT::i64) &&
25472 "Unexpected element type");
25473 assert((InVT.getVectorElementType() == MVT::i8 ||
25474 InVT.getVectorElementType() == MVT::i16 ||
25475 InVT.getVectorElementType() == MVT::i32) &&
25476 "Unexpected element type");
25477
25478 if (VT == MVT::v32i16 && !Subtarget.hasBWI()) {
25479 assert(InVT == MVT::v32i8 && "Unexpected VT!");
25480 return splitVectorIntUnary(Op, DAG, dl);
25481 }
25482
25483 if (Subtarget.hasInt256())
25484 return Op;
25485
25486 // Optimize vectors in AVX mode
25487 // Sign extend v8i16 to v8i32 and
25488 // v4i32 to v4i64
25489 //
25490 // Divide input vector into two parts
25491 // for v4i32 the high shuffle mask will be {2, 3, -1, -1}
25492 // use vpmovsx instruction to extend v4i32 -> v2i64; v8i16 -> v4i32
25493 // concat the vectors to original VT
25494 MVT HalfVT = VT.getHalfNumVectorElementsVT();
25495 SDValue OpLo = DAG.getNode(ISD::SIGN_EXTEND_VECTOR_INREG, dl, HalfVT, In);
25496
25497 unsigned NumElems = InVT.getVectorNumElements();
25498 SmallVector<int,8> ShufMask(NumElems, -1);
25499 for (unsigned i = 0; i != NumElems/2; ++i)
25500 ShufMask[i] = i + NumElems/2;
25501
25502 SDValue OpHi = DAG.getVectorShuffle(InVT, dl, In, In, ShufMask);
25503 OpHi = DAG.getNode(ISD::SIGN_EXTEND_VECTOR_INREG, dl, HalfVT, OpHi);
25504
25505 return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, OpLo, OpHi);
25506}
25507
25508/// Change a vector store into a pair of half-size vector stores.
25509 static SDValue splitVectorStore(StoreSDNode *Store, SelectionDAG &DAG) {
25510 SDValue StoredVal = Store->getValue();
25511 assert((StoredVal.getValueType().is256BitVector() ||
25512 StoredVal.getValueType().is512BitVector()) &&
25513 "Expecting 256/512-bit op");
25514
25515 // Splitting volatile memory ops is not allowed unless the operation was not
25516 // legal to begin with. Assume the input store is legal (this transform is
25517 // only used for targets with AVX). Note: It is possible that we have an
25518 // illegal type like v2i128, and so we could allow splitting a volatile store
25519 // in that case if that is important.
25520 if (!Store->isSimple())
25521 return SDValue();
25522
25523 SDLoc DL(Store);
25524 SDValue Value0, Value1;
25525 std::tie(Value0, Value1) = splitVector(StoredVal, DAG, DL);
25526 unsigned HalfOffset = Value0.getValueType().getStoreSize();
25527 SDValue Ptr0 = Store->getBasePtr();
25528 SDValue Ptr1 =
25529 DAG.getMemBasePlusOffset(Ptr0, TypeSize::getFixed(HalfOffset), DL);
25530 SDValue Ch0 =
25531 DAG.getStore(Store->getChain(), DL, Value0, Ptr0, Store->getPointerInfo(),
25532 Store->getBaseAlign(), Store->getMemOperand()->getFlags());
25533 SDValue Ch1 =
25534 DAG.getStore(Store->getChain(), DL, Value1, Ptr1,
25535 Store->getPointerInfo().getWithOffset(HalfOffset),
25536 Store->getBaseAlign(), Store->getMemOperand()->getFlags());
25537 return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Ch0, Ch1);
25538}
25539
25540/// Scalarize a vector store, bitcasting to TargetVT to determine the scalar
25541/// type.
25542 static SDValue scalarizeVectorStore(StoreSDNode *Store, MVT StoreVT,
25543 SelectionDAG &DAG) {
25544 SDValue StoredVal = Store->getValue();
25545 assert(StoreVT.is128BitVector() &&
25546 StoredVal.getValueType().is128BitVector() && "Expecting 128-bit op");
25547 StoredVal = DAG.getBitcast(StoreVT, StoredVal);
25548
25549 // Splitting volatile memory ops is not allowed unless the operation was not
25550 // legal to begin with. We are assuming the input op is legal (this transform
25551 // is only used for targets with AVX).
25552 if (!Store->isSimple())
25553 return SDValue();
25554
25555 MVT StoreSVT = StoreVT.getScalarType();
25556 unsigned NumElems = StoreVT.getVectorNumElements();
25557 unsigned ScalarSize = StoreSVT.getStoreSize();
25558
25559 SDLoc DL(Store);
25560 SmallVector<SDValue, 4> Stores;
25561 for (unsigned i = 0; i != NumElems; ++i) {
25562 unsigned Offset = i * ScalarSize;
25563 SDValue Ptr = DAG.getMemBasePlusOffset(Store->getBasePtr(),
25564 TypeSize::getFixed(Offset), DL);
25565 SDValue Scl = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, StoreSVT, StoredVal,
25566 DAG.getVectorIdxConstant(i, DL));
25567 SDValue Ch =
25568 DAG.getStore(Store->getChain(), DL, Scl, Ptr,
25569 Store->getPointerInfo().getWithOffset(Offset),
25570 Store->getBaseAlign(), Store->getMemOperand()->getFlags());
25571 Stores.push_back(Ch);
25572 }
25573 return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Stores);
25574}
25575
25576static SDValue LowerStore(SDValue Op, const X86Subtarget &Subtarget,
25577 SelectionDAG &DAG) {
25578 StoreSDNode *St = cast<StoreSDNode>(Op.getNode());
25579 SDLoc dl(St);
25580 SDValue StoredVal = St->getValue();
25581
25582 // Without AVX512DQ, we need to use a scalar type for v2i1/v4i1/v8i1 stores.
25583 if (StoredVal.getValueType().isVector() &&
25584 StoredVal.getValueType().getVectorElementType() == MVT::i1) {
25585 unsigned NumElts = StoredVal.getValueType().getVectorNumElements();
25586 assert(NumElts <= 8 && "Unexpected VT");
25587 assert(!St->isTruncatingStore() && "Expected non-truncating store");
25588 assert(Subtarget.hasAVX512() && !Subtarget.hasDQI() &&
25589 "Expected AVX512F without AVX512DQI");
25590
25591 // We must pad with zeros to ensure we store zeroes to any unused bits.
25592 StoredVal = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, MVT::v16i1,
25593 DAG.getUNDEF(MVT::v16i1), StoredVal,
25594 DAG.getVectorIdxConstant(0, dl));
25595 StoredVal = DAG.getBitcast(MVT::i16, StoredVal);
25596 StoredVal = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, StoredVal);
25597 // Make sure we store zeros in the extra bits.
25598 if (NumElts < 8)
25599 StoredVal = DAG.getZeroExtendInReg(
25600 StoredVal, dl, EVT::getIntegerVT(*DAG.getContext(), NumElts));
25601
25602 return DAG.getStore(St->getChain(), dl, StoredVal, St->getBasePtr(),
25603 St->getPointerInfo(), St->getBaseAlign(),
25604 St->getMemOperand()->getFlags());
25605 }
25606
25607 if (St->isTruncatingStore())
25608 return SDValue();
25609
25610 // If this is a 256/512-bit store of concatenated ops, we are better off
25611 // splitting that store into two half-size stores. This avoids spurious use of
25612 // concatenated ops and each half can execute independently. Some cores would
25613 // split the op into halves anyway, so the concat is purely an extra op.
25614 MVT StoreVT = StoredVal.getSimpleValueType();
25615 if (StoreVT.is256BitVector() || StoreVT.is512BitVector()) {
25616 if (StoredVal.hasOneUse() && isFreeToSplitVector(StoredVal, DAG))
25617 return splitVectorStore(St, DAG);
25618 return SDValue();
25619 }
25620
25621 if (StoreVT.is32BitVector())
25622 return SDValue();
25623
25624 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
25625 assert(StoreVT.is64BitVector() && "Unexpected VT");
25626 assert(TLI.getTypeAction(*DAG.getContext(), StoreVT) ==
25628 "Unexpected type action!");
25629
25630 EVT WideVT = TLI.getTypeToTransformTo(*DAG.getContext(), StoreVT);
25631 StoredVal = DAG.getNode(ISD::CONCAT_VECTORS, dl, WideVT, StoredVal,
25632 DAG.getUNDEF(StoreVT));
25633
25634 if (Subtarget.hasSSE2()) {
25635 // Widen the vector, cast to a v2x64 type, extract the single 64-bit element
25636 // and store it.
25637 MVT StVT = Subtarget.is64Bit() && StoreVT.isInteger() ? MVT::i64 : MVT::f64;
25638 MVT CastVT = MVT::getVectorVT(StVT, 2);
25639 StoredVal = DAG.getBitcast(CastVT, StoredVal);
25640 StoredVal = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, StVT, StoredVal,
25641 DAG.getVectorIdxConstant(0, dl));
25642
25643 return DAG.getStore(St->getChain(), dl, StoredVal, St->getBasePtr(),
25644 St->getPointerInfo(), St->getBaseAlign(),
25645 St->getMemOperand()->getFlags());
25646 }
25647 assert(Subtarget.hasSSE1() && "Expected SSE");
25648 SDVTList Tys = DAG.getVTList(MVT::Other);
25649 SDValue Ops[] = {St->getChain(), StoredVal, St->getBasePtr()};
25650 return DAG.getMemIntrinsicNode(X86ISD::VEXTRACT_STORE, dl, Tys, Ops, MVT::i64,
25651 St->getMemOperand());
25652}
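// Illustrative sketch (added commentary, not part of the upstream source): for
// a v4i1 store on an AVX512F target without AVX512DQ, the path above builds
// roughly the following sequence, where the final mask keeps the unused bits
// of the stored byte zero:
//
//   t0: v16i1 = insert_subvector undef:v16i1, x:v4i1, 0
//   t1: i16   = bitcast t0
//   t2: i8    = truncate t1
//   t3: i8    = and t2, 15            // getZeroExtendInReg from i4
//   store<(store (s8))> t3, ptr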
25653
25654// Lower vector extended loads using a shuffle. If SSSE3 is not available we
25655// may emit an illegal shuffle but the expansion is still better than scalar
25656// code. We generate sext/sext_invec for SEXTLOADs if it's available, otherwise
25657// we'll emit a shuffle and an arithmetic shift.
25658// FIXME: Is the expansion actually better than scalar code? It doesn't seem so.
25659// TODO: It is possible to support ZExt by zeroing the undef values during
25660// the shuffle phase or after the shuffle.
25661static SDValue LowerLoad(SDValue Op, const X86Subtarget &Subtarget,
25662 SelectionDAG &DAG) {
25663 MVT RegVT = Op.getSimpleValueType();
25664 assert(RegVT.isVector() && "We only custom lower vector loads.");
25665 assert(RegVT.isInteger() &&
25666 "We only custom lower integer vector loads.");
25667
25668 LoadSDNode *Ld = cast<LoadSDNode>(Op.getNode());
25669 SDLoc dl(Ld);
25670
25671 // Without AVX512DQ, we need to use a scalar type for v2i1/v4i1/v8i1 loads.
25672 if (RegVT.getVectorElementType() == MVT::i1) {
25673 assert(EVT(RegVT) == Ld->getMemoryVT() && "Expected non-extending load");
25674 assert(RegVT.getVectorNumElements() <= 8 && "Unexpected VT");
25675 assert(Subtarget.hasAVX512() && !Subtarget.hasDQI() &&
25676 "Expected AVX512F without AVX512DQI");
25677
25678 SDValue NewLd = DAG.getLoad(MVT::i8, dl, Ld->getChain(), Ld->getBasePtr(),
25679 Ld->getPointerInfo(), Ld->getBaseAlign(),
25680 Ld->getMemOperand()->getFlags());
25681
25682 // Replace chain users with the new chain.
25683 assert(NewLd->getNumValues() == 2 && "Loads must carry a chain!");
25684
25685 SDValue Val = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i16, NewLd);
25686 Val = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, RegVT,
25687 DAG.getBitcast(MVT::v16i1, Val),
25688 DAG.getVectorIdxConstant(0, dl));
25689 return DAG.getMergeValues({Val, NewLd.getValue(1)}, dl);
25690 }
25691
25692 return SDValue();
25693}
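// Illustrative sketch (added commentary, not part of the upstream source): a
// v4i1 load on an AVX512F target without AVX512DQ becomes a plain i8 load
// whose bits are moved back into a mask register:
//
//   t0: i8,ch = load<(load (s8))> ptr
//   t1: i16   = any_extend t0
//   t2: v16i1 = bitcast t1
//   t3: v4i1  = extract_subvector t2, 0
//
// The chain users of the original load are redirected to t0's chain result.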
25694
25695/// Return true if the node is an ISD::AND or ISD::OR of two X86ISD::SETCC
25696/// nodes, each of which has no other use apart from the AND / OR.
25697static bool isAndOrOfSetCCs(SDValue Op, unsigned &Opc) {
25698 Opc = Op.getOpcode();
25699 if (Opc != ISD::OR && Opc != ISD::AND)
25700 return false;
25701 return (Op.getOperand(0).getOpcode() == X86ISD::SETCC &&
25702 Op.getOperand(0).hasOneUse() &&
25703 Op.getOperand(1).getOpcode() == X86ISD::SETCC &&
25704 Op.getOperand(1).hasOneUse());
25705}
25706
25707SDValue X86TargetLowering::LowerBRCOND(SDValue Op, SelectionDAG &DAG) const {
25708 SDValue Chain = Op.getOperand(0);
25709 SDValue Cond = Op.getOperand(1);
25710 SDValue Dest = Op.getOperand(2);
25711 SDLoc dl(Op);
25712
25713 // Bail out when we don't have native compare instructions.
25714 if (Cond.getOpcode() == ISD::SETCC &&
25715 Cond.getOperand(0).getValueType() != MVT::f128 &&
25716 !isSoftF16(Cond.getOperand(0).getValueType(), Subtarget)) {
25717 SDValue LHS = Cond.getOperand(0);
25718 SDValue RHS = Cond.getOperand(1);
25719 ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get();
25720
25721 // Special case for
25722 // setcc([su]{add,sub,mul}o == 0)
25723 // setcc([su]{add,sub,mul}o != 1)
25725 (CC == ISD::SETEQ || CC == ISD::SETNE) &&
25727 SDValue Value, Overflow;
25728 X86::CondCode X86Cond;
25729 std::tie(Value, Overflow) = getX86XALUOOp(X86Cond, LHS.getValue(0), DAG);
25730
25731 if ((CC == ISD::SETEQ) == isNullConstant(RHS))
25732 X86Cond = X86::GetOppositeBranchCondition(X86Cond);
25733
25734 SDValue CCVal = DAG.getTargetConstant(X86Cond, dl, MVT::i8);
25735 return DAG.getNode(X86ISD::BRCOND, dl, MVT::Other, Chain, Dest, CCVal,
25736 Overflow, Op->getFlags());
25737 }
25738
25739 if (LHS.getSimpleValueType().isInteger()) {
25740 SDValue CCVal;
25741 SDValue EFLAGS = emitFlagsForSetcc(LHS, RHS, CC, SDLoc(Cond), DAG, CCVal);
25742 return DAG.getNode(X86ISD::BRCOND, dl, MVT::Other, Chain, Dest, CCVal,
25743 EFLAGS, Op->getFlags());
25744 }
25745
25746 if (CC == ISD::SETOEQ) {
25747 // For FCMP_OEQ, we can emit
25748 // two branches instead of an explicit AND instruction with a
25749 // separate test. However, we only do this if this block doesn't
25750 // have a fall-through edge, because this requires an explicit
25751 // jmp when the condition is false.
25752 if (Op.getNode()->hasOneUse()) {
25753 SDNode *User = *Op.getNode()->user_begin();
25754 // Look for an unconditional branch following this conditional branch.
25755 // We need this because we need to reverse the successors in order
25756 // to implement FCMP_OEQ.
25757 if (User->getOpcode() == ISD::BR) {
25758 SDValue FalseBB = User->getOperand(1);
25759 SDNode *NewBR =
25760 DAG.UpdateNodeOperands(User, User->getOperand(0), Dest);
25761 assert(NewBR == User);
25762 (void)NewBR;
25763 Dest = FalseBB;
25764
25765 SDValue Cmp =
25766 DAG.getNode(X86ISD::FCMP, SDLoc(Cond), MVT::i32, LHS, RHS);
25767 SDValue CCVal = DAG.getTargetConstant(X86::COND_NE, dl, MVT::i8);
25768 Chain = DAG.getNode(X86ISD::BRCOND, dl, MVT::Other, Chain, Dest,
25769 CCVal, Cmp, Op->getFlags());
25770 CCVal = DAG.getTargetConstant(X86::COND_P, dl, MVT::i8);
25771 return DAG.getNode(X86ISD::BRCOND, dl, MVT::Other, Chain, Dest, CCVal,
25772 Cmp, Op->getFlags());
25773 }
25774 }
25775 } else if (CC == ISD::SETUNE) {
25776 // For FCMP_UNE, we can emit
25777 // two branches instead of an explicit OR instruction with a
25778 // separate test.
25779 SDValue Cmp = DAG.getNode(X86ISD::FCMP, SDLoc(Cond), MVT::i32, LHS, RHS);
25780 SDValue CCVal = DAG.getTargetConstant(X86::COND_NE, dl, MVT::i8);
25781 Chain = DAG.getNode(X86ISD::BRCOND, dl, MVT::Other, Chain, Dest, CCVal,
25782 Cmp, Op->getFlags());
25783 CCVal = DAG.getTargetConstant(X86::COND_P, dl, MVT::i8);
25784 return DAG.getNode(X86ISD::BRCOND, dl, MVT::Other, Chain, Dest, CCVal,
25785 Cmp, Op->getFlags());
25786 } else {
25787 X86::CondCode X86Cond =
25788 TranslateX86CC(CC, dl, /*IsFP*/ true, LHS, RHS, DAG);
25789 SDValue Cmp = DAG.getNode(X86ISD::FCMP, SDLoc(Cond), MVT::i32, LHS, RHS);
25790 SDValue CCVal = DAG.getTargetConstant(X86Cond, dl, MVT::i8);
25791 return DAG.getNode(X86ISD::BRCOND, dl, MVT::Other, Chain, Dest, CCVal,
25792 Cmp, Op->getFlags());
25793 }
25794 }
25795
25797 SDValue Value, Overflow;
25798 X86::CondCode X86Cond;
25799 std::tie(Value, Overflow) = getX86XALUOOp(X86Cond, Cond.getValue(0), DAG);
25800
25801 SDValue CCVal = DAG.getTargetConstant(X86Cond, dl, MVT::i8);
25802 return DAG.getNode(X86ISD::BRCOND, dl, MVT::Other, Chain, Dest, CCVal,
25803 Overflow, Op->getFlags());
25804 }
25805
25806 // Look past the truncate if the high bits are known zero.
25807 if (isTruncWithZeroHighBitsInput(Cond, DAG))
25808 Cond = Cond.getOperand(0);
25809
25810 EVT CondVT = Cond.getValueType();
25811
25812 // Add an AND with 1 if we don't already have one.
25813 if (!(Cond.getOpcode() == ISD::AND && isOneConstant(Cond.getOperand(1))))
25814 Cond =
25815 DAG.getNode(ISD::AND, dl, CondVT, Cond, DAG.getConstant(1, dl, CondVT));
25816
25817 SDValue LHS = Cond;
25818 SDValue RHS = DAG.getConstant(0, dl, CondVT);
25819
25820 SDValue CCVal;
25821 SDValue EFLAGS = emitFlagsForSetcc(LHS, RHS, ISD::SETNE, dl, DAG, CCVal);
25822 return DAG.getNode(X86ISD::BRCOND, dl, MVT::Other, Chain, Dest, CCVal, EFLAGS,
25823 Op->getFlags());
25824}
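// Worked example (added commentary, a hedged sketch of typical output): for a
// branch on 'a == b' with float operands (FCMP_OEQ), UCOMISS sets ZF=1 when
// the operands are ordered and equal, but an unordered compare also sets ZF=1
// together with PF=1. The two-branch lowering above therefore sends both the
// "not equal" and the "unordered" cases to the false block:
//
//   ucomiss %xmm1, %xmm0
//   jne     .LBB_false        // ZF == 0: not equal
//   jp      .LBB_false        // PF == 1: unordered (NaN operand)
//   ...                       // fall through: ordered and equal
//
// FCMP_UNE is the dual case: both branches jump to the true destination.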
25825
25826// Lower dynamic stack allocation to _alloca call for Cygwin/Mingw targets.
25827// Calls to _alloca are needed to probe the stack when allocating more than 4k
25828// bytes in one go. Touching the stack at 4K increments is necessary to ensure
25829// that the guard pages used by the OS virtual memory manager are allocated in
25830// correct sequence.
25831SDValue
25832X86TargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op,
25833 SelectionDAG &DAG) const {
25834 MachineFunction &MF = DAG.getMachineFunction();
25835 bool SplitStack = MF.shouldSplitStack();
25836 bool EmitStackProbeCall = hasStackProbeSymbol(MF);
25837 bool Lower = (Subtarget.isOSWindows() && !Subtarget.isTargetMachO()) ||
25838 SplitStack || EmitStackProbeCall;
25839 SDLoc dl(Op);
25840
25841 // Get the inputs.
25842 SDNode *Node = Op.getNode();
25843 SDValue Chain = Op.getOperand(0);
25844 SDValue Size = Op.getOperand(1);
25845 MaybeAlign Alignment(Op.getConstantOperandVal(2));
25846 EVT VT = Node->getValueType(0);
25847
25848 // Chain the dynamic stack allocation so that it doesn't modify the stack
25849 // pointer when other instructions are using the stack.
25850 Chain = DAG.getCALLSEQ_START(Chain, 0, 0, dl);
25851
25852 bool Is64Bit = Subtarget.is64Bit();
25853 MVT SPTy = Op.getValueType().getSimpleVT();
25854
25855 SDValue Result;
25856 if (!Lower) {
25857 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
25858 Register SPReg = TLI.getStackPointerRegisterToSaveRestore();
25859 assert(SPReg && "Target cannot require DYNAMIC_STACKALLOC expansion and"
25860 " not tell us which reg is the stack pointer!");
25861
25862 const TargetFrameLowering &TFI = *Subtarget.getFrameLowering();
25863 const Align StackAlign = TFI.getStackAlign();
25864 if (hasInlineStackProbe(MF)) {
25865 Result = DAG.getNode(X86ISD::PROBED_ALLOCA, dl, {SPTy, MVT::Other},
25866 {Chain, Size});
25867 Chain = Result.getValue(1);
25868 } else {
25869 SDValue SP = DAG.getCopyFromReg(Chain, dl, SPReg, VT);
25870 Chain = SP.getValue(1);
25871 Result = DAG.getNode(ISD::SUB, dl, VT, SP, Size); // Value
25872 }
25873 if (Alignment && *Alignment > StackAlign)
25874 Result = DAG.getNode(
25875 ISD::AND, dl, VT, Result,
25876 DAG.getSignedConstant(~(Alignment->value() - 1ULL), dl, VT));
25877 Chain = DAG.getCopyToReg(Chain, dl, SPReg, Result); // Output chain
25878 } else if (SplitStack) {
25879 if (Is64Bit) {
25880 // The 64-bit implementation of segmented stacks needs to clobber both r10
25881 // and r11. This makes it impossible to use it along with nested parameters.
25882 const Function &F = MF.getFunction();
25883 for (const auto &A : F.args()) {
25884 if (A.hasNestAttr())
25885 report_fatal_error("Cannot use segmented stacks with functions that "
25886 "have nested arguments.");
25887 }
25888 }
25889
25890 Result =
25891 DAG.getNode(X86ISD::SEG_ALLOCA, dl, {SPTy, MVT::Other}, {Chain, Size});
25892 Chain = Result.getValue(1);
25893 } else {
25894 SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
25895 Chain = DAG.getNode(X86ISD::DYN_ALLOCA, dl, NodeTys, Chain, Size);
25896 MF.getInfo<X86MachineFunctionInfo>()->setHasDynAlloca(true);
25897
25898 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
25899 Register SPReg = RegInfo->getStackRegister();
25900 SDValue SP = DAG.getCopyFromReg(Chain, dl, SPReg, SPTy);
25901 Chain = SP.getValue(1);
25902
25903 if (Alignment) {
25904 SP = DAG.getNode(
25905 ISD::AND, dl, VT, SP.getValue(0),
25906 DAG.getSignedConstant(~(Alignment->value() - 1ULL), dl, VT));
25907 Chain = DAG.getCopyToReg(Chain, dl, SPReg, SP);
25908 }
25909
25910 Result = SP;
25911 }
25912
25913 Chain = DAG.getCALLSEQ_END(Chain, 0, 0, SDValue(), dl);
25914
25915 SDValue Ops[2] = {Result, Chain};
25916 return DAG.getMergeValues(Ops, dl);
25917}
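// Worked example (added commentary, not part of the upstream source): the
// over-alignment handling above rounds the new stack pointer down with the
// mask ~(Alignment - 1). For a requested alignment of 32 bytes:
//
//   uint64_t Mask  = ~(32ULL - 1);        // 0xffffffffffffffe0
//   uint64_t NewSP = (SP - Size) & Mask;  // rounded down to a 32-byte boundary
//
// Rounding down is safe because the stack grows towards lower addresses, so
// the allocation only gets larger, never smaller.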
25918
25919SDValue X86TargetLowering::LowerVASTART(SDValue Op, SelectionDAG &DAG) const {
25920 MachineFunction &MF = DAG.getMachineFunction();
25921 SDValue Ptr = Op.getOperand(1);
25922 EVT PtrVT = Ptr.getValueType();
25923
25924 X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
25925
25926 const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue();
25927 SDLoc DL(Op);
25928
25929 if (!Subtarget.is64Bit() ||
25930 Subtarget.isCallingConvWin64(MF.getFunction().getCallingConv())) {
25931 // vastart just stores the address of the VarArgsFrameIndex slot into the
25932 // memory location argument.
25933 SDValue FR = DAG.getFrameIndex(FuncInfo->getVarArgsFrameIndex(), PtrVT);
25934 return DAG.getStore(Op.getOperand(0), DL, FR, Ptr, MachinePointerInfo(SV));
25935 }
25936
25937 // __va_list_tag:
25938 // gp_offset (0 - 6 * 8)
25939 // fp_offset (48 - 48 + 8 * 16)
25940 // overflow_arg_area (point to parameters coming in memory).
25941 // reg_save_area
25942 SmallVector<SDValue, 8> MemOps;
25943 SDValue FIN = Op.getOperand(1);
25944 // Store gp_offset
25945 SDValue Store = DAG.getStore(
25946 Op.getOperand(0), DL,
25947 DAG.getConstant(FuncInfo->getVarArgsGPOffset(), DL, MVT::i32), FIN,
25948 MachinePointerInfo(SV));
25949 MemOps.push_back(Store);
25950
25951 // Store fp_offset
25952 FIN = DAG.getMemBasePlusOffset(FIN, TypeSize::getFixed(4), DL);
25953 Store = DAG.getStore(
25954 Op.getOperand(0), DL,
25955 DAG.getConstant(FuncInfo->getVarArgsFPOffset(), DL, MVT::i32), FIN,
25956 MachinePointerInfo(SV, 4));
25957 MemOps.push_back(Store);
25958
25959 // Store ptr to overflow_arg_area
25960 FIN = DAG.getNode(ISD::ADD, DL, PtrVT, FIN, DAG.getIntPtrConstant(4, DL));
25961 SDValue OVFIN = DAG.getFrameIndex(FuncInfo->getVarArgsFrameIndex(), PtrVT);
25962 Store =
25963 DAG.getStore(Op.getOperand(0), DL, OVFIN, FIN, MachinePointerInfo(SV, 8));
25964 MemOps.push_back(Store);
25965
25966 // Store ptr to reg_save_area.
25967 FIN = DAG.getNode(ISD::ADD, DL, PtrVT, FIN, DAG.getIntPtrConstant(
25968 Subtarget.isTarget64BitLP64() ? 8 : 4, DL));
25969 SDValue RSFIN = DAG.getFrameIndex(FuncInfo->getRegSaveFrameIndex(), PtrVT);
25970 Store = DAG.getStore(
25971 Op.getOperand(0), DL, RSFIN, FIN,
25972 MachinePointerInfo(SV, Subtarget.isTarget64BitLP64() ? 16 : 12));
25973 MemOps.push_back(Store);
25974 return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, MemOps);
25975}
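// Layout reference (added commentary, a sketch of the SysV x86-64 va_list the
// stores above populate; offsets shown are for LP64, the X32 ABI has 4-byte
// pointers, so overflow_arg_area/reg_save_area land at offsets 8/12 instead):
//
//   struct __va_list_tag {
//     unsigned gp_offset;        // offset 0:  next GPR slot within [0, 48)
//     unsigned fp_offset;        // offset 4:  next XMM slot within [48, 176)
//     void *overflow_arg_area;   // offset 8:  arguments passed on the stack
//     void *reg_save_area;       // offset 16: spilled argument registers
//   };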
25976
25977SDValue X86TargetLowering::LowerVAARG(SDValue Op, SelectionDAG &DAG) const {
25978 assert(Subtarget.is64Bit() &&
25979 "LowerVAARG only handles 64-bit va_arg!");
25980 assert(Op.getNumOperands() == 4);
25981
25982 MachineFunction &MF = DAG.getMachineFunction();
25983 if (Subtarget.isCallingConvWin64(MF.getFunction().getCallingConv()))
25984 // The Win64 ABI uses char* instead of a structure.
25985 return DAG.expandVAArg(Op.getNode());
25986
25987 SDValue Chain = Op.getOperand(0);
25988 SDValue SrcPtr = Op.getOperand(1);
25989 const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue();
25990 unsigned Align = Op.getConstantOperandVal(3);
25991 SDLoc dl(Op);
25992
25993 EVT ArgVT = Op.getNode()->getValueType(0);
25994 Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext());
25995 uint32_t ArgSize = DAG.getDataLayout().getTypeAllocSize(ArgTy);
25996 uint8_t ArgMode;
25997
25998 // Decide which area this value should be read from.
25999 // TODO: Implement the AMD64 ABI in its entirety. This simple
26000 // selection mechanism works only for the basic types.
26001 assert(ArgVT != MVT::f80 && "va_arg for f80 not yet implemented");
26002 if (ArgVT.isFloatingPoint() && ArgSize <= 16 /*bytes*/) {
26003 ArgMode = 2; // Argument passed in XMM register. Use fp_offset.
26004 } else {
26005 assert(ArgVT.isInteger() && ArgSize <= 32 /*bytes*/ &&
26006 "Unhandled argument type in LowerVAARG");
26007 ArgMode = 1; // Argument passed in GPR64 register(s). Use gp_offset.
26008 }
26009
26010 if (ArgMode == 2) {
26011 // Make sure using fp_offset makes sense.
26012 assert(!Subtarget.useSoftFloat() &&
26013 !(MF.getFunction().hasFnAttribute(Attribute::NoImplicitFloat)) &&
26014 Subtarget.hasSSE1());
26015 }
26016
26017 // Insert VAARG node into the DAG
26018 // VAARG returns two values: Variable Argument Address, Chain
26019 SDValue InstOps[] = {Chain, SrcPtr,
26020 DAG.getTargetConstant(ArgSize, dl, MVT::i32),
26021 DAG.getTargetConstant(ArgMode, dl, MVT::i8),
26022 DAG.getTargetConstant(Align, dl, MVT::i32)};
26023 SDVTList VTs = DAG.getVTList(SrcPtr.getValueType(), MVT::Other);
26024 SDValue VAARG = DAG.getMemIntrinsicNode(
26025 Subtarget.isTarget64BitLP64() ? X86ISD::VAARG_64 : X86ISD::VAARG_X32, dl,
26026 VTs, InstOps, MVT::i64, MachinePointerInfo(SV),
26027 /*Alignment=*/std::nullopt,
26028 MachineMemOperand::MOLoad | MachineMemOperand::MOStore);
26029 Chain = VAARG.getValue(1);
26030
26031 // Load the next argument and return it
26032 return DAG.getLoad(ArgVT, dl, Chain, VAARG, MachinePointerInfo());
26033}
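// Usage note (added commentary, not part of the upstream source): ArgMode
// selects which va_list cursor the VAARG pseudo consumes. Under the SysV ABI:
//
//   va_arg(ap, long)   -> ArgMode = 1, advances gp_offset in the reg_save_area
//   va_arg(ap, double) -> ArgMode = 2, advances fp_offset in the reg_save_area
//
// When the corresponding offset has passed the end of the register save area,
// the pseudo's expansion falls back to overflow_arg_area.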
26034
26035static SDValue LowerVACOPY(SDValue Op, const X86Subtarget &Subtarget,
26036 SelectionDAG &DAG) {
26037 // X86-64 va_list is a struct { i32, i32, i8*, i8* }, except on Windows,
26038 // where a va_list is still an i8*.
26039 assert(Subtarget.is64Bit() && "This code only handles 64-bit va_copy!");
26040 if (Subtarget.isCallingConvWin64(
26041 DAG.getMachineFunction().getFunction().getCallingConv()))
26042 // Probably a Win64 va_copy.
26043 return DAG.expandVACopy(Op.getNode());
26044
26045 SDValue Chain = Op.getOperand(0);
26046 SDValue DstPtr = Op.getOperand(1);
26047 SDValue SrcPtr = Op.getOperand(2);
26048 const Value *DstSV = cast<SrcValueSDNode>(Op.getOperand(3))->getValue();
26049 const Value *SrcSV = cast<SrcValueSDNode>(Op.getOperand(4))->getValue();
26050 SDLoc DL(Op);
26051
26052 return DAG.getMemcpy(
26053 Chain, DL, DstPtr, SrcPtr,
26054 DAG.getIntPtrConstant(Subtarget.isTarget64BitLP64() ? 24 : 16, DL),
26055 Align(Subtarget.isTarget64BitLP64() ? 8 : 4), /*isVolatile*/ false, false,
26056 /*CI=*/nullptr, std::nullopt, MachinePointerInfo(DstSV),
26057 MachinePointerInfo(SrcSV));
26058}
26059
26060// Helper to get immediate/variable SSE shift opcode from other shift opcodes.
26061static unsigned getTargetVShiftUniformOpcode(unsigned Opc, bool IsVariable) {
26062 switch (Opc) {
26063 case ISD::SHL:
26064 case X86ISD::VSHL:
26065 case X86ISD::VSHLI:
26066 return IsVariable ? X86ISD::VSHL : X86ISD::VSHLI;
26067 case ISD::SRL:
26068 case X86ISD::VSRL:
26069 case X86ISD::VSRLI:
26070 return IsVariable ? X86ISD::VSRL : X86ISD::VSRLI;
26071 case ISD::SRA:
26072 case X86ISD::VSRA:
26073 case X86ISD::VSRAI:
26074 return IsVariable ? X86ISD::VSRA : X86ISD::VSRAI;
26075 }
26076 llvm_unreachable("Unknown target vector shift node");
26077}
26078
26079/// Handle vector element shifts where the shift amount is a constant.
26080/// Takes immediate version of shift as input.
26081static SDValue getTargetVShiftByConstNode(unsigned Opc, const SDLoc &dl, MVT VT,
26082 SDValue SrcOp, uint64_t ShiftAmt,
26083 SelectionDAG &DAG) {
26084 MVT ElementType = VT.getVectorElementType();
26085
26086 // Bitcast the source vector to the output type; this is mainly necessary for
26087 // vXi8/vXi64 shifts.
26088 if (VT != SrcOp.getSimpleValueType())
26089 SrcOp = DAG.getBitcast(VT, SrcOp);
26090
26091 // Fold this packed shift into its first operand if ShiftAmt is 0.
26092 if (ShiftAmt == 0)
26093 return SrcOp;
26094
26095 // Check for ShiftAmt >= element width
26096 if (ShiftAmt >= ElementType.getSizeInBits()) {
26097 if (Opc == X86ISD::VSRAI)
26098 ShiftAmt = ElementType.getSizeInBits() - 1;
26099 else
26100 return DAG.getConstant(0, dl, VT);
26101 }
26102
26103 assert((Opc == X86ISD::VSHLI || Opc == X86ISD::VSRLI || Opc == X86ISD::VSRAI)
26104 && "Unknown target vector shift-by-constant node");
26105
26106 // Fold this packed vector shift into a build vector if SrcOp is a
26107 // vector of Constants or UNDEFs.
26108 if (ISD::isBuildVectorOfConstantSDNodes(SrcOp.getNode())) {
26109 unsigned ShiftOpc;
26110 switch (Opc) {
26111 default: llvm_unreachable("Unknown opcode!");
26112 case X86ISD::VSHLI:
26113 ShiftOpc = ISD::SHL;
26114 break;
26115 case X86ISD::VSRLI:
26116 ShiftOpc = ISD::SRL;
26117 break;
26118 case X86ISD::VSRAI:
26119 ShiftOpc = ISD::SRA;
26120 break;
26121 }
26122
26123 SDValue Amt = DAG.getConstant(ShiftAmt, dl, VT);
26124 if (SDValue C = DAG.FoldConstantArithmetic(ShiftOpc, dl, VT, {SrcOp, Amt}))
26125 return C;
26126 }
26127
26128 return DAG.getNode(Opc, dl, VT, SrcOp,
26129 DAG.getTargetConstant(ShiftAmt, dl, MVT::i8));
26130}
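// Illustrative examples (added commentary, not part of the upstream source) of
// the out-of-range handling above, for v4i32 element shifts:
//
//   VSRLI v4i32 X, 32  -> constant zero        (logical shift past the width)
//   VSHLI v4i32 X, 40  -> constant zero
//   VSRAI v4i32 X, 40  -> VSRAI v4i32 X, 31    (clamped: splats the sign bit)
//   VSHLI v4i32 X, 0   -> X                    (folded away entirely)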
26131
26132/// Handle vector element shifts by a splat shift amount
26133static SDValue getTargetVShiftNode(unsigned Opc, const SDLoc &dl, MVT VT,
26134 SDValue SrcOp, SDValue ShAmt, int ShAmtIdx,
26135 const X86Subtarget &Subtarget,
26136 SelectionDAG &DAG) {
26137 MVT AmtVT = ShAmt.getSimpleValueType();
26138 assert(AmtVT.isVector() && "Vector shift type mismatch");
26139 assert(0 <= ShAmtIdx && ShAmtIdx < (int)AmtVT.getVectorNumElements() &&
26140 "Illegal vector splat index");
26141
26142 // Move the splat element to the bottom element.
26143 if (ShAmtIdx != 0) {
26144 SmallVector<int> Mask(AmtVT.getVectorNumElements(), -1);
26145 Mask[0] = ShAmtIdx;
26146 ShAmt = DAG.getVectorShuffle(AmtVT, dl, ShAmt, DAG.getUNDEF(AmtVT), Mask);
26147 }
26148
26149 // Peek through any zext node if we can get back to a 128-bit source.
26150 if (AmtVT.getScalarSizeInBits() == 64 &&
26151 (ShAmt.getOpcode() == ISD::ZERO_EXTEND ||
26152 ShAmt.getOpcode() == ISD::ZERO_EXTEND_VECTOR_INREG) &&
26153 ShAmt.getOperand(0).getValueType().isSimple() &&
26154 ShAmt.getOperand(0).getValueType().is128BitVector()) {
26155 ShAmt = ShAmt.getOperand(0);
26156 AmtVT = ShAmt.getSimpleValueType();
26157 }
26158
26159 // See if we can mask off the upper elements using the existing source node.
26160 // The shift uses the entire lower 64 bits of the amount vector, so no need to
26161 // do this for vXi64 types.
26162 bool IsMasked = false;
26163 if (AmtVT.getScalarSizeInBits() < 64) {
26164 if (ShAmt.getOpcode() == ISD::BUILD_VECTOR ||
26165 ShAmt.getOpcode() == ISD::SCALAR_TO_VECTOR) {
26166 // If the shift amount has come from a scalar, then zero-extend the scalar
26167 // before moving to the vector.
26168 ShAmt = DAG.getZExtOrTrunc(ShAmt.getOperand(0), dl, MVT::i32);
26169 ShAmt = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, ShAmt);
26170 ShAmt = DAG.getNode(X86ISD::VZEXT_MOVL, dl, MVT::v4i32, ShAmt);
26171 AmtVT = MVT::v4i32;
26172 IsMasked = true;
26173 } else if (ShAmt.getOpcode() == ISD::AND) {
26174 // If the shift amount is already masked (e.g. for a rotation modulo),
26175 // we can zero-extend it by setting all the other mask elements to
26176 // zero.
26177 SmallVector<SDValue> MaskElts(
26178 AmtVT.getVectorNumElements(),
26179 DAG.getConstant(0, dl, AmtVT.getScalarType()));
26180 MaskElts[0] = DAG.getAllOnesConstant(dl, AmtVT.getScalarType());
26181 SDValue Mask = DAG.getBuildVector(AmtVT, dl, MaskElts);
26182 if ((Mask = DAG.FoldConstantArithmetic(ISD::AND, dl, AmtVT,
26183 {ShAmt.getOperand(1), Mask}))) {
26184 ShAmt = DAG.getNode(ISD::AND, dl, AmtVT, ShAmt.getOperand(0), Mask);
26185 IsMasked = true;
26186 }
26187 }
26188 }
26189
26190 // Extract if the shift amount vector is larger than 128-bits.
26191 if (AmtVT.getSizeInBits() > 128) {
26192 ShAmt = extract128BitVector(ShAmt, 0, DAG, dl);
26193 AmtVT = ShAmt.getSimpleValueType();
26194 }
26195
26196 // Zero-extend bottom element to v2i64 vector type, either by extension or
26197 // shuffle masking.
26198 if (!IsMasked && AmtVT.getScalarSizeInBits() < 64) {
26199 if (AmtVT == MVT::v4i32 && (ShAmt.getOpcode() == X86ISD::VBROADCAST ||
26200 ShAmt.getOpcode() == X86ISD::VBROADCAST_LOAD)) {
26201 ShAmt = DAG.getNode(X86ISD::VZEXT_MOVL, SDLoc(ShAmt), MVT::v4i32, ShAmt);
26202 } else if (Subtarget.hasSSE41()) {
26203 ShAmt = DAG.getNode(ISD::ZERO_EXTEND_VECTOR_INREG, SDLoc(ShAmt),
26204 MVT::v2i64, ShAmt);
26205 } else {
26206 SDValue ByteShift = DAG.getTargetConstant(
26207 (128 - AmtVT.getScalarSizeInBits()) / 8, SDLoc(ShAmt), MVT::i8);
26208 ShAmt = DAG.getBitcast(MVT::v16i8, ShAmt);
26209 ShAmt = DAG.getNode(X86ISD::VSHLDQ, SDLoc(ShAmt), MVT::v16i8, ShAmt,
26210 ByteShift);
26211 ShAmt = DAG.getNode(X86ISD::VSRLDQ, SDLoc(ShAmt), MVT::v16i8, ShAmt,
26212 ByteShift);
26213 }
26214 }
26215
26216 // Change opcode to non-immediate version.
26217 Opc = getTargetVShiftUniformOpcode(Opc, true);
26218
26219 // The return type has to be a 128-bit type with the same element
26220 // type as the input type.
26221 MVT EltVT = VT.getVectorElementType();
26222 MVT ShVT = MVT::getVectorVT(EltVT, 128 / EltVT.getSizeInBits());
26223
26224 ShAmt = DAG.getBitcast(ShVT, ShAmt);
26225 return DAG.getNode(Opc, dl, VT, SrcOp, ShAmt);
26226}
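// Illustrative sketch (added commentary, not part of the upstream source): on
// a pre-SSE4.1 target the bottom v4i32 shift amount is zero-extended to 64
// bits with a byte-shift pair instead of a PMOVZX-style extend. With a scalar
// size of 32 bits the ByteShift constant above is (128 - 32) / 8 = 12:
//
//   t0: v16i8 = bitcast amt
//   t1: v16i8 = X86ISD::VSHLDQ t0, 12   // pslldq $12: keep only bytes 0..3
//   t2: v16i8 = X86ISD::VSRLDQ t1, 12   // psrldq $12: move them back, zeros above
//
// leaving 64-bit lane 0 equal to the original 32-bit amount.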
26227
26228/// Return Mask with the necessary casting or extending
26229/// for \p Mask according to \p MaskVT when lowering masking intrinsics
26230static SDValue getMaskNode(SDValue Mask, MVT MaskVT,
26231 const X86Subtarget &Subtarget, SelectionDAG &DAG,
26232 const SDLoc &dl) {
26233
26234 if (isAllOnesConstant(Mask))
26235 return DAG.getConstant(1, dl, MaskVT);
26236 if (X86::isZeroNode(Mask))
26237 return DAG.getConstant(0, dl, MaskVT);
26238
26239 assert(MaskVT.bitsLE(Mask.getSimpleValueType()) && "Unexpected mask size!");
26240
26241 if (Mask.getSimpleValueType() == MVT::i64 && Subtarget.is32Bit()) {
26242 assert(MaskVT == MVT::v64i1 && "Expected v64i1 mask!");
26243 assert(Subtarget.hasBWI() && "Expected AVX512BW target!");
26244 // In 32-bit mode, a bitcast of i64 is illegal, so extend/split it instead.
26245 SDValue Lo, Hi;
26246 std::tie(Lo, Hi) = DAG.SplitScalar(Mask, dl, MVT::i32, MVT::i32);
26247 Lo = DAG.getBitcast(MVT::v32i1, Lo);
26248 Hi = DAG.getBitcast(MVT::v32i1, Hi);
26249 return DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v64i1, Lo, Hi);
26250 } else {
26251 MVT BitcastVT = MVT::getVectorVT(MVT::i1,
26252 Mask.getSimpleValueType().getSizeInBits());
26253 // When MaskVT equals v2i1 or v4i1, the low 2 or 4 elements
26254 // are extracted by EXTRACT_SUBVECTOR.
26255 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MaskVT,
26256 DAG.getBitcast(BitcastVT, Mask),
26257 DAG.getVectorIdxConstant(0, dl));
26258 }
26259}
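// Illustrative sketch (added commentary, not part of the upstream source): for
// a 64-bit mask argument on a 32-bit AVX512BW target, where bitcasting i64 is
// illegal, the path above produces:
//
//   lo:i32, hi:i32 = split the i64 mask
//   Lo: v32i1 = bitcast lo
//   Hi: v32i1 = bitcast hi
//   v64i1     = concat_vectors Lo, Hi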
26260
26261/// Return (and \p Op, \p Mask) for compare instructions or
26262/// (vselect \p Mask, \p Op, \p PreservedSrc) for others along with the
26263/// necessary casting or extending for \p Mask when lowering masking intrinsics
26264 static SDValue getVectorMaskingNode(SDValue Op, SDValue Mask,
26265 SDValue PreservedSrc,
26266 const X86Subtarget &Subtarget,
26267 SelectionDAG &DAG) {
26268 MVT VT = Op.getSimpleValueType();
26269 MVT MaskVT = MVT::getVectorVT(MVT::i1, VT.getVectorNumElements());
26270 unsigned OpcodeSelect = ISD::VSELECT;
26271 SDLoc dl(Op);
26272
26273 if (isAllOnesConstant(Mask))
26274 return Op;
26275
26276 SDValue VMask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);
26277
26278 if (PreservedSrc.isUndef())
26279 PreservedSrc = getZeroVector(VT, Subtarget, DAG, dl);
26280 return DAG.getNode(OpcodeSelect, dl, VT, VMask, Op, PreservedSrc);
26281}
26282
26283/// Creates an SDNode for a predicated scalar operation.
26284/// \returns (X86vselect \p Mask, \p Op, \p PreservedSrc).
26285/// The mask comes in as MVT::i8 and should be transformed
26286/// to MVT::v1i1 when lowering masking intrinsics.
26287/// The main difference between ScalarMaskingNode and VectorMaskingNode is using
26288/// "X86select" instead of "vselect". We just can't create the "vselect" node
26289/// for a scalar instruction.
26290 static SDValue getScalarMaskingNode(SDValue Op, SDValue Mask,
26291 SDValue PreservedSrc,
26292 const X86Subtarget &Subtarget,
26293 SelectionDAG &DAG) {
26294 auto *MaskConst = dyn_cast<ConstantSDNode>(Mask);
26295 if (MaskConst && (MaskConst->getZExtValue() & 0x1))
26296 return Op;
26297
26298 MVT VT = Op.getSimpleValueType();
26299 SDLoc dl(Op);
26300
26301 assert(Mask.getValueType() == MVT::i8 && "Unexpected type");
26302 SDValue IMask = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v1i1,
26303 DAG.getBitcast(MVT::v8i1, Mask),
26304 DAG.getVectorIdxConstant(0, dl));
26305 if (Op.getOpcode() == X86ISD::FSETCCM ||
26306 Op.getOpcode() == X86ISD::FSETCCM_SAE ||
26307 Op.getOpcode() == X86ISD::VFPCLASSS)
26308 return DAG.getNode(ISD::AND, dl, VT, Op, IMask);
26309
26310 if (PreservedSrc.isUndef())
26311 PreservedSrc = getZeroVector(VT, Subtarget, DAG, dl);
26312
26313 if (MaskConst) {
26314 assert((MaskConst->getZExtValue() & 0x1) == 0 && "Expected false mask");
26315 // Discard op and blend passthrough with scalar op src/dst.
26316 SmallVector<int> ShuffleMask(VT.getVectorNumElements());
26317 std::iota(ShuffleMask.begin(), ShuffleMask.end(), 0);
26318 ShuffleMask[0] = VT.getVectorNumElements();
26319 return DAG.getVectorShuffle(VT, dl, Op.getOperand(0), PreservedSrc,
26320 ShuffleMask);
26321 }
26322
26323 return DAG.getNode(X86ISD::SELECTS, dl, VT, IMask, Op, PreservedSrc);
26324}
26325
26326 static int getSEHRegistrationNodeSize(const Function *Fn) {
26327 if (!Fn->hasPersonalityFn())
26328 report_fatal_error(
26329 "querying registration node size for function without personality");
26330 // The RegNodeSize is 6 32-bit words for SEH and 4 for C++ EH. See
26331 // WinEHStatePass for the full struct definition.
26332 switch (classifyEHPersonality(Fn->getPersonalityFn())) {
26333 case EHPersonality::MSVC_X86SEH: return 24;
26334 case EHPersonality::MSVC_CXX: return 16;
26335 default: break;
26336 }
26338 "can only recover FP for 32-bit MSVC EH personality functions");
26339}
26340
26341/// When the MSVC runtime transfers control to us, either to an outlined
26342/// function or when returning to a parent frame after catching an exception, we
26343/// recover the parent frame pointer by doing arithmetic on the incoming EBP.
26344/// Here's the math:
26345/// RegNodeBase = EntryEBP - RegNodeSize
26346/// ParentFP = RegNodeBase - ParentFrameOffset
26347/// Subtracting RegNodeSize takes us to the offset of the registration node, and
26348/// subtracting the offset (negative on x86) takes us back to the parent FP.
26349 static SDValue recoverFramePointer(SelectionDAG &DAG, const Function *Fn,
26350 SDValue EntryEBP) {
26352 SDLoc dl;
26353
26354 // It's possible that the parent function no longer has a personality function
26355 // if the exceptional code was optimized away, in which case we just return
26356 // the incoming EBP.
26357 if (!Fn->hasPersonalityFn())
26358 return EntryEBP;
26359
26360 // Get an MCSymbol that will ultimately resolve to the frame offset of the EH
26361 // registration, or the .set_setframe offset.
26364 MVT PtrVT = EntryEBP.getValueType().getSimpleVT();
26365 SDValue OffsetSymVal = DAG.getMCSymbol(OffsetSym, PtrVT);
26366 SDValue ParentFrameOffset =
26367 DAG.getNode(ISD::LOCAL_RECOVER, dl, PtrVT, OffsetSymVal);
26368
26369 // Return EntryEBP + ParentFrameOffset for x64. This adjusts from RSP after
26370 // prologue to RBP in the parent function.
26371 const X86Subtarget &Subtarget = DAG.getSubtarget<X86Subtarget>();
26372 if (Subtarget.is64Bit())
26373 return DAG.getNode(ISD::ADD, dl, PtrVT, EntryEBP, ParentFrameOffset);
26374
26375 int RegNodeSize = getSEHRegistrationNodeSize(Fn);
26376 // RegNodeBase = EntryEBP - RegNodeSize
26377 // ParentFP = RegNodeBase - ParentFrameOffset
26378 SDValue RegNodeBase = DAG.getNode(ISD::SUB, dl, PtrVT, EntryEBP,
26379 DAG.getConstant(RegNodeSize, dl, PtrVT));
26380 return DAG.getNode(ISD::SUB, dl, PtrVT, RegNodeBase, ParentFrameOffset);
26381}
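// Worked example (added commentary; the ParentFrameOffset value is made up for
// illustration): for the 32-bit MSVC C++ EH personality, RegNodeSize is 16, so
// with EntryEBP = 0x1000 and a recorded ParentFrameOffset of -32:
//
//   RegNodeBase = 0x1000 - 16    = 0x0ff0
//   ParentFP    = 0x0ff0 - (-32) = 0x1010
//
// i.e. subtracting the (negative) offset walks back up to the parent frame.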
26382
26383SDValue X86TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
26384 SelectionDAG &DAG) const {
26385 // Helper to detect if the operand is CUR_DIRECTION rounding mode.
26386 auto isRoundModeCurDirection = [](SDValue Rnd) {
26387 if (auto *C = dyn_cast<ConstantSDNode>(Rnd))
26388 return C->getAPIntValue() == X86::STATIC_ROUNDING::CUR_DIRECTION;
26389
26390 return false;
26391 };
26392 auto isRoundModeSAE = [](SDValue Rnd) {
26393 if (auto *C = dyn_cast<ConstantSDNode>(Rnd)) {
26394 unsigned RC = C->getZExtValue();
26395 if (RC & X86::STATIC_ROUNDING::NO_EXC) {
26396 // Clear the NO_EXC bit and check remaining bits.
26397 RC ^= X86::STATIC_ROUNDING::NO_EXC;
26398 // As a convenience we allow no other bits or explicitly
26399 // current direction.
26400 return RC == 0 || RC == X86::STATIC_ROUNDING::CUR_DIRECTION;
26401 }
26402 }
26403
26404 return false;
26405 };
26406 auto isRoundModeSAEToX = [](SDValue Rnd, unsigned &RC) {
26407 if (auto *C = dyn_cast<ConstantSDNode>(Rnd)) {
26408 RC = C->getZExtValue();
26409 if (RC & X86::STATIC_ROUNDING::NO_EXC) {
26410 // Clear the NO_EXC bit and check remaining bits.
26411 RC ^= X86::STATIC_ROUNDING::NO_EXC;
26412 return RC == X86::STATIC_ROUNDING::TO_NEAREST_INT ||
26413 RC == X86::STATIC_ROUNDING::TO_NEG_INF ||
26414 RC == X86::STATIC_ROUNDING::TO_POS_INF ||
26415 RC == X86::STATIC_ROUNDING::TO_ZERO;
26416 }
26417 }
26418
26419 return false;
26420 };
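// Usage note (added commentary, a hedged sketch; the immediates follow the
// _MM_FROUND_* convention from the intrinsics headers): for an immediate of
// 9 = _MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC, isRoundModeSAEToX clears the
// NO_EXC bit and reports RC = 1 (round toward -inf); a plain
// _MM_FROUND_CUR_DIRECTION (4) is matched by isRoundModeCurDirection; and an
// immediate of 8 (_MM_FROUND_NO_EXC alone) is accepted by isRoundModeSAE.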
26421
26422 SDLoc dl(Op);
26423 unsigned IntNo = Op.getConstantOperandVal(0);
26424 MVT VT = Op.getSimpleValueType();
26425 const IntrinsicData* IntrData = getIntrinsicWithoutChain(IntNo);
26426
26427 // Propagate flags from original node to transformed node(s).
26428 SelectionDAG::FlagInserter FlagsInserter(DAG, Op->getFlags());
26429
26430 if (IntrData) {
26431 switch(IntrData->Type) {
26432 case INTR_TYPE_1OP: {
26433 // We specify 2 possible opcodes for intrinsics with rounding modes.
26434 // First, we check if the intrinsic may have a non-default rounding mode
26435 // (IntrData->Opc1 != 0), then we check the rounding mode operand.
26436 unsigned IntrWithRoundingModeOpcode = IntrData->Opc1;
26437 if (IntrWithRoundingModeOpcode != 0) {
26438 SDValue Rnd = Op.getOperand(2);
26439 unsigned RC = 0;
26440 if (isRoundModeSAEToX(Rnd, RC))
26441 return DAG.getNode(IntrWithRoundingModeOpcode, dl, Op.getValueType(),
26442 Op.getOperand(1),
26443 DAG.getTargetConstant(RC, dl, MVT::i32));
26444 if (!isRoundModeCurDirection(Rnd))
26445 return SDValue();
26446 }
26447 return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(),
26448 Op.getOperand(1));
26449 }
26450 case INTR_TYPE_1OP_SAE: {
26451 SDValue Sae = Op.getOperand(2);
26452
26453 unsigned Opc;
26454 if (isRoundModeCurDirection(Sae))
26455 Opc = IntrData->Opc0;
26456 else if (isRoundModeSAE(Sae))
26457 Opc = IntrData->Opc1;
26458 else
26459 return SDValue();
26460
26461 return DAG.getNode(Opc, dl, Op.getValueType(), Op.getOperand(1));
26462 }
26463 case INTR_TYPE_2OP: {
26464 SDValue Src2 = Op.getOperand(2);
26465
26466 // We specify 2 possible opcodes for intrinsics with rounding modes.
26467 // First, we check if the intrinsic may have a non-default rounding mode
26468 // (IntrData->Opc1 != 0), then we check the rounding mode operand.
26469 unsigned IntrWithRoundingModeOpcode = IntrData->Opc1;
26470 if (IntrWithRoundingModeOpcode != 0) {
26471 SDValue Rnd = Op.getOperand(3);
26472 unsigned RC = 0;
26473 if (isRoundModeSAEToX(Rnd, RC))
26474 return DAG.getNode(IntrWithRoundingModeOpcode, dl, Op.getValueType(),
26475 Op.getOperand(1), Src2,
26476 DAG.getTargetConstant(RC, dl, MVT::i32));
26477 if (!isRoundModeCurDirection(Rnd))
26478 return SDValue();
26479 }
26480
26481 return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(),
26482 Op.getOperand(1), Src2);
26483 }
26484 case INTR_TYPE_2OP_SAE: {
26485 SDValue Sae = Op.getOperand(3);
26486
26487 unsigned Opc;
26488 if (isRoundModeCurDirection(Sae))
26489 Opc = IntrData->Opc0;
26490 else if (isRoundModeSAE(Sae))
26491 Opc = IntrData->Opc1;
26492 else
26493 return SDValue();
26494
26495 return DAG.getNode(Opc, dl, Op.getValueType(), Op.getOperand(1),
26496 Op.getOperand(2));
26497 }
26498 case INTR_TYPE_3OP:
26499 case INTR_TYPE_3OP_IMM8: {
26500 SDValue Src1 = Op.getOperand(1);
26501 SDValue Src2 = Op.getOperand(2);
26502 SDValue Src3 = Op.getOperand(3);
26503
26504 if (IntrData->Type == INTR_TYPE_3OP_IMM8 &&
26505 Src3.getValueType() != MVT::i8) {
26506 Src3 = DAG.getTargetConstant(Src3->getAsZExtVal() & 0xff, dl, MVT::i8);
26507 }
26508
26509 // We specify 2 possible opcodes for intrinsics with rounding modes.
26510 // First, we check if the intrinsic may have a non-default rounding mode
26511 // (IntrData->Opc1 != 0), then we check the rounding mode operand.
26512 unsigned IntrWithRoundingModeOpcode = IntrData->Opc1;
26513 if (IntrWithRoundingModeOpcode != 0) {
26514 SDValue Rnd = Op.getOperand(4);
26515 unsigned RC = 0;
26516 if (isRoundModeSAEToX(Rnd, RC))
26517 return DAG.getNode(IntrWithRoundingModeOpcode, dl, Op.getValueType(),
26518 Src1, Src2, Src3,
26519 DAG.getTargetConstant(RC, dl, MVT::i32));
26520 if (!isRoundModeCurDirection(Rnd))
26521 return SDValue();
26522 }
26523
26524 return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(),
26525 {Src1, Src2, Src3});
26526 }
26527 case INTR_TYPE_4OP_IMM8: {
26528 assert(Op.getOperand(4)->getOpcode() == ISD::TargetConstant);
26529 SDValue Src4 = Op.getOperand(4);
26530 if (Src4.getValueType() != MVT::i8) {
26531 Src4 = DAG.getTargetConstant(Src4->getAsZExtVal() & 0xff, dl, MVT::i8);
26532 }
26533
26534 return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(),
26535 Op.getOperand(1), Op.getOperand(2), Op.getOperand(3),
26536 Src4);
26537 }
26538 case INTR_TYPE_1OP_MASK: {
26539 SDValue Src = Op.getOperand(1);
26540 SDValue PassThru = Op.getOperand(2);
26541 SDValue Mask = Op.getOperand(3);
26542 // We add rounding mode to the Node when
26543 // - RC Opcode is specified and
26544 // - RC is not "current direction".
26545 unsigned IntrWithRoundingModeOpcode = IntrData->Opc1;
26546 if (IntrWithRoundingModeOpcode != 0) {
26547 SDValue Rnd = Op.getOperand(4);
26548 unsigned RC = 0;
26549 if (isRoundModeSAEToX(Rnd, RC))
26550 return getVectorMaskingNode(
26551 DAG.getNode(IntrWithRoundingModeOpcode, dl, Op.getValueType(),
26552 Src, DAG.getTargetConstant(RC, dl, MVT::i32)),
26553 Mask, PassThru, Subtarget, DAG);
26554 if (!isRoundModeCurDirection(Rnd))
26555 return SDValue();
26556 }
26557 return getVectorMaskingNode(
26558 DAG.getNode(IntrData->Opc0, dl, VT, Src), Mask, PassThru,
26559 Subtarget, DAG);
26560 }
26561 case INTR_TYPE_1OP_MASK_SAE: {
26562 SDValue Src = Op.getOperand(1);
26563 SDValue PassThru = Op.getOperand(2);
26564 SDValue Mask = Op.getOperand(3);
26565 SDValue Rnd = Op.getOperand(4);
26566
26567 unsigned Opc;
26568 if (isRoundModeCurDirection(Rnd))
26569 Opc = IntrData->Opc0;
26570 else if (isRoundModeSAE(Rnd))
26571 Opc = IntrData->Opc1;
26572 else
26573 return SDValue();
26574
26575 return getVectorMaskingNode(DAG.getNode(Opc, dl, VT, Src), Mask, PassThru,
26576 Subtarget, DAG);
26577 }
26578 case INTR_TYPE_SCALAR_MASK: {
26579 SDValue Src1 = Op.getOperand(1);
26580 SDValue Src2 = Op.getOperand(2);
26581 SDValue passThru = Op.getOperand(3);
26582 SDValue Mask = Op.getOperand(4);
26583 unsigned IntrWithRoundingModeOpcode = IntrData->Opc1;
26584 // There are 2 kinds of intrinsics in this group:
26585 // (1) With suppress-all-exceptions (sae) or rounding mode - 6 operands
26586 // (2) With rounding mode and sae - 7 operands.
26587 bool HasRounding = IntrWithRoundingModeOpcode != 0;
26588 if (Op.getNumOperands() == (5U + HasRounding)) {
26589 if (HasRounding) {
26590 SDValue Rnd = Op.getOperand(5);
26591 unsigned RC = 0;
26592 if (isRoundModeSAEToX(Rnd, RC))
26593 return getScalarMaskingNode(
26594 DAG.getNode(IntrWithRoundingModeOpcode, dl, VT, Src1, Src2,
26595 DAG.getTargetConstant(RC, dl, MVT::i32)),
26596 Mask, passThru, Subtarget, DAG);
26597 if (!isRoundModeCurDirection(Rnd))
26598 return SDValue();
26599 }
26600 return getScalarMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, Src1,
26601 Src2),
26602 Mask, passThru, Subtarget, DAG);
26603 }
26604
26605 assert(Op.getNumOperands() == (6U + HasRounding) &&
26606 "Unexpected intrinsic form");
26607 SDValue RoundingMode = Op.getOperand(5);
26608 unsigned Opc = IntrData->Opc0;
26609 if (HasRounding) {
26610 SDValue Sae = Op.getOperand(6);
26611 if (isRoundModeSAE(Sae))
26612 Opc = IntrWithRoundingModeOpcode;
26613 else if (!isRoundModeCurDirection(Sae))
26614 return SDValue();
26615 }
26616 return getScalarMaskingNode(DAG.getNode(Opc, dl, VT, Src1,
26617 Src2, RoundingMode),
26618 Mask, passThru, Subtarget, DAG);
26619 }
26620 case INTR_TYPE_SCALAR_MASK_RND: {
26621 SDValue Src1 = Op.getOperand(1);
26622 SDValue Src2 = Op.getOperand(2);
26623 SDValue passThru = Op.getOperand(3);
26624 SDValue Mask = Op.getOperand(4);
26625 SDValue Rnd = Op.getOperand(5);
26626
26627 SDValue NewOp;
26628 unsigned RC = 0;
26629 if (isRoundModeCurDirection(Rnd))
26630 NewOp = DAG.getNode(IntrData->Opc0, dl, VT, Src1, Src2);
26631 else if (isRoundModeSAEToX(Rnd, RC))
26632 NewOp = DAG.getNode(IntrData->Opc1, dl, VT, Src1, Src2,
26633 DAG.getTargetConstant(RC, dl, MVT::i32));
26634 else
26635 return SDValue();
26636
26637 return getScalarMaskingNode(NewOp, Mask, passThru, Subtarget, DAG);
26638 }
26639 case INTR_TYPE_SCALAR_MASK_SAE: {
26640 SDValue Src1 = Op.getOperand(1);
26641 SDValue Src2 = Op.getOperand(2);
26642 SDValue passThru = Op.getOperand(3);
26643 SDValue Mask = Op.getOperand(4);
26644 SDValue Sae = Op.getOperand(5);
26645 unsigned Opc;
26646 if (isRoundModeCurDirection(Sae))
26647 Opc = IntrData->Opc0;
26648 else if (isRoundModeSAE(Sae))
26649 Opc = IntrData->Opc1;
26650 else
26651 return SDValue();
26652
26653 return getScalarMaskingNode(DAG.getNode(Opc, dl, VT, Src1, Src2),
26654 Mask, passThru, Subtarget, DAG);
26655 }
26656 case INTR_TYPE_2OP_MASK: {
26657 SDValue Src1 = Op.getOperand(1);
26658 SDValue Src2 = Op.getOperand(2);
26659 SDValue PassThru = Op.getOperand(3);
26660 SDValue Mask = Op.getOperand(4);
26661 SDValue NewOp;
26662 if (IntrData->Opc1 != 0) {
26663 SDValue Rnd = Op.getOperand(5);
26664 unsigned RC = 0;
26665 if (isRoundModeSAEToX(Rnd, RC))
26666 NewOp = DAG.getNode(IntrData->Opc1, dl, VT, Src1, Src2,
26667 DAG.getTargetConstant(RC, dl, MVT::i32));
26668 else if (!isRoundModeCurDirection(Rnd))
26669 return SDValue();
26670 }
26671 if (!NewOp)
26672 NewOp = DAG.getNode(IntrData->Opc0, dl, VT, Src1, Src2);
26673 return getVectorMaskingNode(NewOp, Mask, PassThru, Subtarget, DAG);
26674 }
26675 case INTR_TYPE_2OP_MASK_SAE: {
26676 SDValue Src1 = Op.getOperand(1);
26677 SDValue Src2 = Op.getOperand(2);
26678 SDValue PassThru = Op.getOperand(3);
26679 SDValue Mask = Op.getOperand(4);
26680
26681 unsigned Opc = IntrData->Opc0;
26682 if (IntrData->Opc1 != 0) {
26683 SDValue Sae = Op.getOperand(5);
26684 if (isRoundModeSAE(Sae))
26685 Opc = IntrData->Opc1;
26686 else if (!isRoundModeCurDirection(Sae))
26687 return SDValue();
26688 }
26689
26690 return getVectorMaskingNode(DAG.getNode(Opc, dl, VT, Src1, Src2),
26691 Mask, PassThru, Subtarget, DAG);
26692 }
26693 case INTR_TYPE_3OP_SCALAR_MASK_SAE: {
26694 SDValue Src1 = Op.getOperand(1);
26695 SDValue Src2 = Op.getOperand(2);
26696 SDValue Src3 = Op.getOperand(3);
26697 SDValue PassThru = Op.getOperand(4);
26698 SDValue Mask = Op.getOperand(5);
26699 SDValue Sae = Op.getOperand(6);
26700 unsigned Opc;
26701 if (isRoundModeCurDirection(Sae))
26702 Opc = IntrData->Opc0;
26703 else if (isRoundModeSAE(Sae))
26704 Opc = IntrData->Opc1;
26705 else
26706 return SDValue();
26707
26708 return getScalarMaskingNode(DAG.getNode(Opc, dl, VT, Src1, Src2, Src3),
26709 Mask, PassThru, Subtarget, DAG);
26710 }
26711 case INTR_TYPE_3OP_MASK_SAE: {
26712 SDValue Src1 = Op.getOperand(1);
26713 SDValue Src2 = Op.getOperand(2);
26714 SDValue Src3 = Op.getOperand(3);
26715 SDValue PassThru = Op.getOperand(4);
26716 SDValue Mask = Op.getOperand(5);
26717
26718 unsigned Opc = IntrData->Opc0;
26719 if (IntrData->Opc1 != 0) {
26720 SDValue Sae = Op.getOperand(6);
26721 if (isRoundModeSAE(Sae))
26722 Opc = IntrData->Opc1;
26723 else if (!isRoundModeCurDirection(Sae))
26724 return SDValue();
26725 }
26726 return getVectorMaskingNode(DAG.getNode(Opc, dl, VT, Src1, Src2, Src3),
26727 Mask, PassThru, Subtarget, DAG);
26728 }
26729 case BLENDV: {
26730 SDValue Src1 = Op.getOperand(1);
26731 SDValue Src2 = Op.getOperand(2);
26732 SDValue Src3 = Op.getOperand(3);
26733
26734 EVT MaskVT = Src3.getValueType().changeVectorElementTypeToInteger();
26735 Src3 = DAG.getBitcast(MaskVT, Src3);
26736
26737 // Reverse the operands to match VSELECT order.
26738 return DAG.getNode(IntrData->Opc0, dl, VT, Src3, Src2, Src1);
26739 }
26740 case VPERM_2OP : {
26741 SDValue Src1 = Op.getOperand(1);
26742 SDValue Src2 = Op.getOperand(2);
26743
26744 // Swap Src1 and Src2 in the node creation.
26745 return DAG.getNode(IntrData->Opc0, dl, VT, Src2, Src1);
26746 }
26747 case CFMA_OP_MASKZ:
26748 case CFMA_OP_MASK: {
26749 SDValue Src1 = Op.getOperand(1);
26750 SDValue Src2 = Op.getOperand(2);
26751 SDValue Src3 = Op.getOperand(3);
26752 SDValue Mask = Op.getOperand(4);
26753 MVT VT = Op.getSimpleValueType();
26754
26755 SDValue PassThru = Src3;
26756 if (IntrData->Type == CFMA_OP_MASKZ)
26757 PassThru = getZeroVector(VT, Subtarget, DAG, dl);
26758
26759 // We add rounding mode to the Node when
26760 // - RC Opcode is specified and
26761 // - RC is not "current direction".
26762 SDValue NewOp;
26763 if (IntrData->Opc1 != 0) {
26764 SDValue Rnd = Op.getOperand(5);
26765 unsigned RC = 0;
26766 if (isRoundModeSAEToX(Rnd, RC))
26767 NewOp = DAG.getNode(IntrData->Opc1, dl, VT, Src1, Src2, Src3,
26768 DAG.getTargetConstant(RC, dl, MVT::i32));
26769 else if (!isRoundModeCurDirection(Rnd))
26770 return SDValue();
26771 }
26772 if (!NewOp)
26773 NewOp = DAG.getNode(IntrData->Opc0, dl, VT, Src1, Src2, Src3);
26774 if (IntrData->Opc0 == X86ISD::VFMADDCSH ||
26775 IntrData->Opc0 == X86ISD::VFCMADDCSH)
26776 return getScalarMaskingNode(NewOp, Mask, PassThru, Subtarget, DAG);
26777 return getVectorMaskingNode(NewOp, Mask, PassThru, Subtarget, DAG);
26778 }
26779 case IFMA_OP:
26780 // NOTE: We need to swizzle the operands to pass the multiply operands
26781 // first.
26782 return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(),
26783 Op.getOperand(2), Op.getOperand(3), Op.getOperand(1));
26784 case FPCLASSS: {
26785 SDValue Src1 = Op.getOperand(1);
26786 SDValue Imm = Op.getOperand(2);
26787 SDValue Mask = Op.getOperand(3);
26788 SDValue FPclass = DAG.getNode(IntrData->Opc0, dl, MVT::v1i1, Src1, Imm);
26789 SDValue FPclassMask = getScalarMaskingNode(FPclass, Mask, SDValue(),
26790 Subtarget, DAG);
26791 // Need to fill with zeros to ensure the bitcast will produce zeroes
26792 // for the upper bits. An EXTRACT_ELEMENT here wouldn't guarantee that.
26793 SDValue Ins = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, MVT::v8i1,
26794 DAG.getConstant(0, dl, MVT::v8i1), FPclassMask,
26795 DAG.getVectorIdxConstant(0, dl));
26796 return DAG.getBitcast(MVT::i8, Ins);
26797 }
26798
26799 case CMP_MASK_CC: {
26800 MVT MaskVT = Op.getSimpleValueType();
26801 SDValue CC = Op.getOperand(3);
26802 SDValue Mask = Op.getOperand(4);
26803 // We specify 2 possible opcodes for intrinsics with rounding modes.
26804 // First, we check if the intrinsic may have a non-default rounding mode
26805 // (IntrData->Opc1 != 0), then we check the rounding mode operand.
26806 if (IntrData->Opc1 != 0) {
26807 SDValue Sae = Op.getOperand(5);
26808 if (isRoundModeSAE(Sae))
26809 return DAG.getNode(IntrData->Opc1, dl, MaskVT, Op.getOperand(1),
26810 Op.getOperand(2), CC, Mask, Sae);
26811 if (!isRoundModeCurDirection(Sae))
26812 return SDValue();
26813 }
26814 // Default rounding mode.
26815 return DAG.getNode(IntrData->Opc0, dl, MaskVT,
26816 {Op.getOperand(1), Op.getOperand(2), CC, Mask});
26817 }
26818 case CMP_MASK_SCALAR_CC: {
26819 SDValue Src1 = Op.getOperand(1);
26820 SDValue Src2 = Op.getOperand(2);
26821 SDValue CC = Op.getOperand(3);
26822 SDValue Mask = Op.getOperand(4);
26823
26824 SDValue Cmp;
26825 if (IntrData->Opc1 != 0) {
26826 SDValue Sae = Op.getOperand(5);
26827 if (isRoundModeSAE(Sae))
26828 Cmp = DAG.getNode(IntrData->Opc1, dl, MVT::v1i1, Src1, Src2, CC, Sae);
26829 else if (!isRoundModeCurDirection(Sae))
26830 return SDValue();
26831 }
26832 // Default rounding mode.
26833 if (!Cmp.getNode())
26834 Cmp = DAG.getNode(IntrData->Opc0, dl, MVT::v1i1, Src1, Src2, CC);
26835
26836 SDValue CmpMask = getScalarMaskingNode(Cmp, Mask, SDValue(),
26837 Subtarget, DAG);
26838 // Need to fill with zeros to ensure the bitcast will produce zeroes
26839 // for the upper bits. An EXTRACT_ELEMENT here wouldn't guarantee that.
26840 SDValue Ins = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, MVT::v8i1,
26841 DAG.getConstant(0, dl, MVT::v8i1), CmpMask,
26842 DAG.getVectorIdxConstant(0, dl));
26843 return DAG.getBitcast(MVT::i8, Ins);
26844 }
26845 case COMI: { // Comparison intrinsics
26846 ISD::CondCode CC = (ISD::CondCode)IntrData->Opc1;
26847 SDValue LHS = Op.getOperand(1);
26848 SDValue RHS = Op.getOperand(2);
26849 // Some conditions require the operands to be swapped.
26850 if (CC == ISD::SETLT || CC == ISD::SETLE)
26851 std::swap(LHS, RHS);
26852
26853 // For AVX10.2, support EQ and NE.
26854 bool HasAVX10_2_COMX =
26855 Subtarget.hasAVX10_2() && (CC == ISD::SETEQ || CC == ISD::SETNE);
26856
26857 // AVX10.2 COMPARE supports only v2f64, v4f32 or v8f16.
26858 // For BF16 types we need to fall back.
26859 bool HasAVX10_2_COMX_Ty = (LHS.getSimpleValueType() != MVT::v8bf16);
26860
26861 auto ComiOpCode = IntrData->Opc0;
26862 auto isUnordered = (ComiOpCode == X86ISD::UCOMI);
26863
26864 if (HasAVX10_2_COMX && HasAVX10_2_COMX_Ty)
26865 ComiOpCode = isUnordered ? X86ISD::UCOMX : X86ISD::COMX;
26866
26867 SDValue Comi = DAG.getNode(ComiOpCode, dl, MVT::i32, LHS, RHS);
26868
26869 SDValue SetCC;
26870 switch (CC) {
26871 case ISD::SETEQ: {
26872 SetCC = getSETCC(X86::COND_E, Comi, dl, DAG);
26873 if (HasAVX10_2_COMX && HasAVX10_2_COMX_Ty) // ZF == 1
26874 break;
26875 // (ZF = 1 and PF = 0)
26876 SDValue SetNP = getSETCC(X86::COND_NP, Comi, dl, DAG);
26877 SetCC = DAG.getNode(ISD::AND, dl, MVT::i8, SetCC, SetNP);
26878 break;
26879 }
26880 case ISD::SETNE: {
26881 SetCC = getSETCC(X86::COND_NE, Comi, dl, DAG);
26882 if (HasAVX10_2_COMX && HasAVX10_2_COMX_Ty) // ZF == 0
26883 break;
26884 // (ZF = 0 or PF = 1)
26885 SDValue SetP = getSETCC(X86::COND_P, Comi, dl, DAG);
26886 SetCC = DAG.getNode(ISD::OR, dl, MVT::i8, SetCC, SetP);
26887 break;
26888 }
26889 case ISD::SETGT: // (CF = 0 and ZF = 0)
26890 case ISD::SETLT: { // Condition opposite to GT. Operands swapped above.
26891 SetCC = getSETCC(X86::COND_A, Comi, dl, DAG);
26892 break;
26893 }
26894 case ISD::SETGE: // CF = 0
26895 case ISD::SETLE: // Condition opposite to GE. Operands swapped above.
26896 SetCC = getSETCC(X86::COND_AE, Comi, dl, DAG);
26897 break;
26898 default:
26899 llvm_unreachable("Unexpected illegal condition!");
26900 }
26901 return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, SetCC);
26902 }
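// Illustrative sketch (added commentary, a hedged example of typical output):
// COMI/UCOMI set ZF=PF=CF=1 for unordered operands, so ZF alone cannot tell
// "equal" from "NaN". Without AVX10.2 the SETEQ path above therefore emits:
//
//   ucomiss %xmm1, %xmm0
//   sete    %al               // ZF == 1
//   setnp   %cl               // PF == 0
//   andb    %cl, %al          // equal and ordered
//
// whereas the AVX10.2 COMX/UCOMX forms encode this in ZF directly.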
26903 case COMI_RM: { // Comparison intrinsics with Sae
26904 SDValue LHS = Op.getOperand(1);
26905 SDValue RHS = Op.getOperand(2);
26906 unsigned CondVal = Op.getConstantOperandVal(3);
26907 SDValue Sae = Op.getOperand(4);
26908
26909 SDValue FCmp;
26910 if (isRoundModeCurDirection(Sae))
26911 FCmp = DAG.getNode(X86ISD::FSETCCM, dl, MVT::v1i1, LHS, RHS,
26912 DAG.getTargetConstant(CondVal, dl, MVT::i8));
26913 else if (isRoundModeSAE(Sae))
26914 FCmp = DAG.getNode(X86ISD::FSETCCM_SAE, dl, MVT::v1i1, LHS, RHS,
26915 DAG.getTargetConstant(CondVal, dl, MVT::i8), Sae);
26916 else
26917 return SDValue();
26918 // Need to fill with zeros to ensure the bitcast will produce zeroes
26919 // for the upper bits. An EXTRACT_ELEMENT here wouldn't guarantee that.
26920 SDValue Ins = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, MVT::v16i1,
26921 DAG.getConstant(0, dl, MVT::v16i1), FCmp,
26922 DAG.getVectorIdxConstant(0, dl));
26923 return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32,
26924 DAG.getBitcast(MVT::i16, Ins));
26925 }
26926 case VSHIFT: {
26927 SDValue SrcOp = Op.getOperand(1);
26928 SDValue ShAmt = Op.getOperand(2);
26929 assert(ShAmt.getValueType() == MVT::i32 &&
26930 "Unexpected VSHIFT amount type");
26931
26932 // Catch shift-by-constant.
26933 if (auto *CShAmt = dyn_cast<ConstantSDNode>(ShAmt))
26934 return getTargetVShiftByConstNode(IntrData->Opc0, dl,
26935 Op.getSimpleValueType(), SrcOp,
26936 CShAmt->getZExtValue(), DAG);
26937
26938 ShAmt = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, ShAmt);
26939 return getTargetVShiftNode(IntrData->Opc0, dl, Op.getSimpleValueType(),
26940 SrcOp, ShAmt, 0, Subtarget, DAG);
26941 }
26942 case COMPRESS_EXPAND_IN_REG: {
26943 SDValue Mask = Op.getOperand(3);
26944 SDValue DataToCompress = Op.getOperand(1);
26945 SDValue PassThru = Op.getOperand(2);
26946 if (ISD::isBuildVectorAllOnes(Mask.getNode())) // return data as is
26947 return Op.getOperand(1);
26948
26949 // Avoid false dependency.
26950 if (PassThru.isUndef())
26951 PassThru = getZeroVector(VT, Subtarget, DAG, dl);
26952
26953 return DAG.getNode(IntrData->Opc0, dl, VT, DataToCompress, PassThru,
26954 Mask);
26955 }
26956 case FIXUPIMM:
26957 case FIXUPIMM_MASKZ: {
26958 SDValue Src1 = Op.getOperand(1);
26959 SDValue Src2 = Op.getOperand(2);
26960 SDValue Src3 = Op.getOperand(3);
26961 SDValue Imm = Op.getOperand(4);
26962 SDValue Mask = Op.getOperand(5);
26963 SDValue Passthru = (IntrData->Type == FIXUPIMM)
26964 ? Src1
26965 : getZeroVector(VT, Subtarget, DAG, dl);
26966
26967 unsigned Opc = IntrData->Opc0;
26968 if (IntrData->Opc1 != 0) {
26969 SDValue Sae = Op.getOperand(6);
26970 if (isRoundModeSAE(Sae))
26971 Opc = IntrData->Opc1;
26972 else if (!isRoundModeCurDirection(Sae))
26973 return SDValue();
26974 }
26975
26976 SDValue FixupImm = DAG.getNode(Opc, dl, VT, Src1, Src2, Src3, Imm);
26977
26979 return getVectorMaskingNode(FixupImm, Mask, Passthru, Subtarget, DAG);
26980
26981 return getScalarMaskingNode(FixupImm, Mask, Passthru, Subtarget, DAG);
26982 }
26983 case ROUNDP: {
26984 assert(IntrData->Opc0 == X86ISD::VRNDSCALE && "Unexpected opcode");
26985 // Clear the upper bits of the rounding immediate so that the legacy
26986 // intrinsic can't trigger the scaling behavior of VRNDSCALE.
26987 uint64_t Round = Op.getConstantOperandVal(2);
26988 SDValue RoundingMode = DAG.getTargetConstant(Round & 0xf, dl, MVT::i32);
26989 return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(),
26990 Op.getOperand(1), RoundingMode);
26991 }
26992 case ROUNDS: {
26993 assert(IntrData->Opc0 == X86ISD::VRNDSCALES && "Unexpected opcode");
26994 // Clear the upper bits of the rounding immediate so that the legacy
26995 // intrinsic can't trigger the scaling behavior of VRNDSCALE.
26996 uint64_t Round = Op.getConstantOperandVal(3);
26997 SDValue RoundingMode = DAG.getTargetConstant(Round & 0xf, dl, MVT::i32);
26998 return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(),
26999 Op.getOperand(1), Op.getOperand(2), RoundingMode);
27000 }
27001 case BEXTRI: {
27002 assert(IntrData->Opc0 == X86ISD::BEXTRI && "Unexpected opcode");
27003
27004 uint64_t Imm = Op.getConstantOperandVal(2);
27005 SDValue Control = DAG.getTargetConstant(Imm & 0xffff, dl,
27006 Op.getValueType());
27007 return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(),
27008 Op.getOperand(1), Control);
27009 }
27010 // ADC/SBB
27011 case ADX: {
27012 SDVTList CFVTs = DAG.getVTList(Op->getValueType(0), MVT::i32);
27013 SDVTList VTs = DAG.getVTList(Op.getOperand(2).getValueType(), MVT::i32);
27014
27015 SDValue Res;
27016 // If the carry-in is zero, then we should just use ADD/SUB instead of
27017 // ADC/SBB.
27018 if (isNullConstant(Op.getOperand(1))) {
27019 Res = DAG.getNode(IntrData->Opc1, dl, VTs, Op.getOperand(2),
27020 Op.getOperand(3));
27021 } else {
27022 SDValue GenCF = DAG.getNode(X86ISD::ADD, dl, CFVTs, Op.getOperand(1),
27023 DAG.getAllOnesConstant(dl, MVT::i8));
27024 Res = DAG.getNode(IntrData->Opc0, dl, VTs, Op.getOperand(2),
27025 Op.getOperand(3), GenCF.getValue(1));
27026 }
27027 SDValue SetCC = getSETCC(X86::COND_B, Res.getValue(1), dl, DAG);
27028 SDValue Results[] = { SetCC, Res };
27029 return DAG.getMergeValues(Results, dl);
27030 }
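// Usage note (added commentary, not part of the upstream source): for the ADX
// addcarry/subborrow intrinsics the carry-in arrives as an i8 value rather
// than as a flag. When it is non-zero or variable, the sequence above
// materializes CF with "add i8 c, -1" (CF = 1 iff c != 0) and feeds it to
// ADC/SBB; when the carry-in is a constant zero, the ADC/SBB degrades to a
// plain ADD/SUB and the flag setup disappears.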
27031 case CVTPD2PS_MASK:
27032 case CVTPD2DQ_MASK:
27033 case CVTQQ2PS_MASK:
27034 case TRUNCATE_TO_REG: {
27035 SDValue Src = Op.getOperand(1);
27036 SDValue PassThru = Op.getOperand(2);
27037 SDValue Mask = Op.getOperand(3);
27038
27039 if (isAllOnesConstant(Mask))
27040 return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(), Src);
27041
27042 MVT SrcVT = Src.getSimpleValueType();
27043 MVT MaskVT = MVT::getVectorVT(MVT::i1, SrcVT.getVectorNumElements());
27044 Mask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);
27045 return DAG.getNode(IntrData->Opc1, dl, Op.getValueType(),
27046 {Src, PassThru, Mask});
27047 }
27048 case TRUNCATE2_TO_REG: {
27049 SDValue Src = Op.getOperand(1);
27050 SDValue Src2 = Op.getOperand(2);
27051 SDValue PassThru = Op.getOperand(3);
27052 SDValue Mask = Op.getOperand(4);
27053
27054 if (isAllOnesConstant(Mask))
27055 return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(), {Src, Src2});
27056
27057 MVT Src2VT = Src2.getSimpleValueType();
27058 MVT MaskVT = MVT::getVectorVT(MVT::i1, Src2VT.getVectorNumElements());
27059 Mask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);
27060 return DAG.getNode(IntrData->Opc1, dl, Op.getValueType(),
27061 {Src, Src2, PassThru, Mask});
27062 }
27063 case CVTPS2PH_MASK: {
27064 SDValue Src = Op.getOperand(1);
27065 SDValue Rnd = Op.getOperand(2);
27066 SDValue PassThru = Op.getOperand(3);
27067 SDValue Mask = Op.getOperand(4);
27068
27069 unsigned RC = 0;
27070 unsigned Opc = IntrData->Opc0;
27071 bool SAE = Src.getValueType().is512BitVector() &&
27072 (isRoundModeSAEToX(Rnd, RC) || isRoundModeSAE(Rnd));
27073 if (SAE) {
27075 Rnd = DAG.getTargetConstant(RC, dl, MVT::i32);
27076 }
27077
27078 if (isAllOnesConstant(Mask))
27079 return DAG.getNode(Opc, dl, Op.getValueType(), Src, Rnd);
27080
27081 if (SAE)
27083 else
27084 Opc = IntrData->Opc1;
27085 MVT SrcVT = Src.getSimpleValueType();
27086 MVT MaskVT = MVT::getVectorVT(MVT::i1, SrcVT.getVectorNumElements());
27087 Mask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);
27088 return DAG.getNode(Opc, dl, Op.getValueType(), Src, Rnd, PassThru, Mask);
27089 }
27090 case CVTNEPS2BF16_MASK: {
27091 SDValue Src = Op.getOperand(1);
27092 SDValue PassThru = Op.getOperand(2);
27093 SDValue Mask = Op.getOperand(3);
27094
27095 if (ISD::isBuildVectorAllOnes(Mask.getNode()))
27096 return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(), Src);
27097
27098 // Break false dependency.
27099 if (PassThru.isUndef())
27100 PassThru = DAG.getConstant(0, dl, PassThru.getValueType());
27101
27102 return DAG.getNode(IntrData->Opc1, dl, Op.getValueType(), Src, PassThru,
27103 Mask);
27104 }
27105 default:
27106 break;
27107 }
27108 }
27109
27110 switch (IntNo) {
27111 default: return SDValue(); // Don't custom lower most intrinsics.
27112
27113 // ptest and testp intrinsics. The intrinsics these come from are designed to
27114 // return an integer value, not just an instruction, so lower them to the ptest
27115 // or testp pattern and a setcc for the result.
27116 case Intrinsic::x86_avx512_ktestc_b:
27117 case Intrinsic::x86_avx512_ktestc_w:
27118 case Intrinsic::x86_avx512_ktestc_d:
27119 case Intrinsic::x86_avx512_ktestc_q:
27120 case Intrinsic::x86_avx512_ktestz_b:
27121 case Intrinsic::x86_avx512_ktestz_w:
27122 case Intrinsic::x86_avx512_ktestz_d:
27123 case Intrinsic::x86_avx512_ktestz_q:
27124 case Intrinsic::x86_sse41_ptestz:
27125 case Intrinsic::x86_sse41_ptestc:
27126 case Intrinsic::x86_sse41_ptestnzc:
27127 case Intrinsic::x86_avx_ptestz_256:
27128 case Intrinsic::x86_avx_ptestc_256:
27129 case Intrinsic::x86_avx_ptestnzc_256:
27130 case Intrinsic::x86_avx_vtestz_ps:
27131 case Intrinsic::x86_avx_vtestc_ps:
27132 case Intrinsic::x86_avx_vtestnzc_ps:
27133 case Intrinsic::x86_avx_vtestz_pd:
27134 case Intrinsic::x86_avx_vtestc_pd:
27135 case Intrinsic::x86_avx_vtestnzc_pd:
27136 case Intrinsic::x86_avx_vtestz_ps_256:
27137 case Intrinsic::x86_avx_vtestc_ps_256:
27138 case Intrinsic::x86_avx_vtestnzc_ps_256:
27139 case Intrinsic::x86_avx_vtestz_pd_256:
27140 case Intrinsic::x86_avx_vtestc_pd_256:
27141 case Intrinsic::x86_avx_vtestnzc_pd_256: {
27142 unsigned TestOpc = X86ISD::PTEST;
27143 X86::CondCode X86CC;
27144 switch (IntNo) {
27145 default: llvm_unreachable("Bad fallthrough in Intrinsic lowering.");
27146 case Intrinsic::x86_avx512_ktestc_b:
27147 case Intrinsic::x86_avx512_ktestc_w:
27148 case Intrinsic::x86_avx512_ktestc_d:
27149 case Intrinsic::x86_avx512_ktestc_q:
27150 // CF = 1
27151 TestOpc = X86ISD::KTEST;
27152 X86CC = X86::COND_B;
27153 break;
27154 case Intrinsic::x86_avx512_ktestz_b:
27155 case Intrinsic::x86_avx512_ktestz_w:
27156 case Intrinsic::x86_avx512_ktestz_d:
27157 case Intrinsic::x86_avx512_ktestz_q:
27158 TestOpc = X86ISD::KTEST;
27159 X86CC = X86::COND_E;
27160 break;
27161 case Intrinsic::x86_avx_vtestz_ps:
27162 case Intrinsic::x86_avx_vtestz_pd:
27163 case Intrinsic::x86_avx_vtestz_ps_256:
27164 case Intrinsic::x86_avx_vtestz_pd_256:
27165 TestOpc = X86ISD::TESTP;
27166 [[fallthrough]];
27167 case Intrinsic::x86_sse41_ptestz:
27168 case Intrinsic::x86_avx_ptestz_256:
27169 // ZF = 1
27170 X86CC = X86::COND_E;
27171 break;
27172 case Intrinsic::x86_avx_vtestc_ps:
27173 case Intrinsic::x86_avx_vtestc_pd:
27174 case Intrinsic::x86_avx_vtestc_ps_256:
27175 case Intrinsic::x86_avx_vtestc_pd_256:
27176 TestOpc = X86ISD::TESTP;
27177 [[fallthrough]];
27178 case Intrinsic::x86_sse41_ptestc:
27179 case Intrinsic::x86_avx_ptestc_256:
27180 // CF = 1
27181 X86CC = X86::COND_B;
27182 break;
27183 case Intrinsic::x86_avx_vtestnzc_ps:
27184 case Intrinsic::x86_avx_vtestnzc_pd:
27185 case Intrinsic::x86_avx_vtestnzc_ps_256:
27186 case Intrinsic::x86_avx_vtestnzc_pd_256:
27187 TestOpc = X86ISD::TESTP;
27188 [[fallthrough]];
27189 case Intrinsic::x86_sse41_ptestnzc:
27190 case Intrinsic::x86_avx_ptestnzc_256:
27191 // ZF and CF = 0
27192 X86CC = X86::COND_A;
27193 break;
27194 }
27195
27196 SDValue LHS = Op.getOperand(1);
27197 SDValue RHS = Op.getOperand(2);
27198 SDValue Test = DAG.getNode(TestOpc, dl, MVT::i32, LHS, RHS);
27199 SDValue SetCC = getSETCC(X86CC, Test, dl, DAG);
27200 return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, SetCC);
27201 }
27202
27203 case Intrinsic::x86_sse42_pcmpistria128:
27204 case Intrinsic::x86_sse42_pcmpestria128:
27205 case Intrinsic::x86_sse42_pcmpistric128:
27206 case Intrinsic::x86_sse42_pcmpestric128:
27207 case Intrinsic::x86_sse42_pcmpistrio128:
27208 case Intrinsic::x86_sse42_pcmpestrio128:
27209 case Intrinsic::x86_sse42_pcmpistris128:
27210 case Intrinsic::x86_sse42_pcmpestris128:
27211 case Intrinsic::x86_sse42_pcmpistriz128:
27212 case Intrinsic::x86_sse42_pcmpestriz128: {
27213 unsigned Opcode;
27214 X86::CondCode X86CC;
27215 switch (IntNo) {
27216 default: llvm_unreachable("Impossible intrinsic"); // Can't reach here.
27217 case Intrinsic::x86_sse42_pcmpistria128:
27218 Opcode = X86ISD::PCMPISTR;
27219 X86CC = X86::COND_A;
27220 break;
27221 case Intrinsic::x86_sse42_pcmpestria128:
27222 Opcode = X86ISD::PCMPESTR;
27223 X86CC = X86::COND_A;
27224 break;
27225 case Intrinsic::x86_sse42_pcmpistric128:
27226 Opcode = X86ISD::PCMPISTR;
27227 X86CC = X86::COND_B;
27228 break;
27229 case Intrinsic::x86_sse42_pcmpestric128:
27230 Opcode = X86ISD::PCMPESTR;
27231 X86CC = X86::COND_B;
27232 break;
27233 case Intrinsic::x86_sse42_pcmpistrio128:
27234 Opcode = X86ISD::PCMPISTR;
27235 X86CC = X86::COND_O;
27236 break;
27237 case Intrinsic::x86_sse42_pcmpestrio128:
27238 Opcode = X86ISD::PCMPESTR;
27239 X86CC = X86::COND_O;
27240 break;
27241 case Intrinsic::x86_sse42_pcmpistris128:
27242 Opcode = X86ISD::PCMPISTR;
27243 X86CC = X86::COND_S;
27244 break;
27245 case Intrinsic::x86_sse42_pcmpestris128:
27246 Opcode = X86ISD::PCMPESTR;
27247 X86CC = X86::COND_S;
27248 break;
27249 case Intrinsic::x86_sse42_pcmpistriz128:
27250 Opcode = X86ISD::PCMPISTR;
27251 X86CC = X86::COND_E;
27252 break;
27253 case Intrinsic::x86_sse42_pcmpestriz128:
27254 Opcode = X86ISD::PCMPESTR;
27255 X86CC = X86::COND_E;
27256 break;
27257 }
27258 SmallVector<SDValue, 5> NewOps(llvm::drop_begin(Op->ops()));
27259 SDVTList VTs = DAG.getVTList(MVT::i32, MVT::v16i8, MVT::i32);
27260 SDValue PCMP = DAG.getNode(Opcode, dl, VTs, NewOps).getValue(2);
27261 SDValue SetCC = getSETCC(X86CC, PCMP, dl, DAG);
27262 return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, SetCC);
27263 }
27264
27265 case Intrinsic::x86_sse42_pcmpistri128:
27266 case Intrinsic::x86_sse42_pcmpestri128: {
27267 unsigned Opcode;
27268 if (IntNo == Intrinsic::x86_sse42_pcmpistri128)
27269 Opcode = X86ISD::PCMPISTR;
27270 else
27271 Opcode = X86ISD::PCMPESTR;
27272
27273 SmallVector<SDValue, 5> NewOps(llvm::drop_begin(Op->ops()));
27274 SDVTList VTs = DAG.getVTList(MVT::i32, MVT::v16i8, MVT::i32);
27275 return DAG.getNode(Opcode, dl, VTs, NewOps);
27276 }
27277
27278 case Intrinsic::x86_sse42_pcmpistrm128:
27279 case Intrinsic::x86_sse42_pcmpestrm128: {
27280 unsigned Opcode;
27281 if (IntNo == Intrinsic::x86_sse42_pcmpistrm128)
27282 Opcode = X86ISD::PCMPISTR;
27283 else
27284 Opcode = X86ISD::PCMPESTR;
27285
27286 SmallVector<SDValue, 5> NewOps(llvm::drop_begin(Op->ops()));
27287 SDVTList VTs = DAG.getVTList(MVT::i32, MVT::v16i8, MVT::i32);
27288 return DAG.getNode(Opcode, dl, VTs, NewOps).getValue(1);
27289 }
27290
27291 case Intrinsic::eh_sjlj_lsda: {
27292 MachineFunction &MF = DAG.getMachineFunction();
27293 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
27294 MVT PtrVT = TLI.getPointerTy(DAG.getDataLayout());
27295 auto &Context = MF.getContext();
27296 MCSymbol *S = Context.getOrCreateSymbol(Twine("GCC_except_table") +
27297 Twine(MF.getFunctionNumber()));
27298 return DAG.getNode(getGlobalWrapperKind(nullptr, /*OpFlags=*/0), dl, VT,
27299 DAG.getMCSymbol(S, PtrVT));
27300 }
27301
27302 case Intrinsic::x86_seh_lsda: {
27303 // Compute the symbol for the LSDA. We know it'll get emitted later.
27304 MachineFunction &MF = DAG.getMachineFunction();
27305 SDValue Op1 = Op.getOperand(1);
27306 auto *Fn = cast<Function>(cast<GlobalAddressSDNode>(Op1)->getGlobal());
27309
27310 // Generate a simple absolute symbol reference. This intrinsic is only
27311 // supported on 32-bit Windows, which isn't PIC.
27312 SDValue Result = DAG.getMCSymbol(LSDASym, VT);
27313 return DAG.getNode(X86ISD::Wrapper, dl, VT, Result);
27314 }
27315
27316 case Intrinsic::eh_recoverfp: {
27317 SDValue FnOp = Op.getOperand(1);
27318 SDValue IncomingFPOp = Op.getOperand(2);
27319 GlobalAddressSDNode *GSD = dyn_cast<GlobalAddressSDNode>(FnOp);
27320 auto *Fn = dyn_cast_or_null<Function>(GSD ? GSD->getGlobal() : nullptr);
27321 if (!Fn)
27323 "llvm.eh.recoverfp must take a function as the first argument");
27324 return recoverFramePointer(DAG, Fn, IncomingFPOp);
27325 }
27326
27327 case Intrinsic::localaddress: {
27328 // Returns one of the stack, base, or frame pointer registers, depending on
27329 // which is used to reference local variables.
27330 MachineFunction &MF = DAG.getMachineFunction();
27331 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
27332 Register Reg;
27333 if (RegInfo->hasBasePointer(MF))
27334 Reg = RegInfo->getBaseRegister();
27335 else { // Handles the SP or FP case.
27336 bool CantUseFP = RegInfo->hasStackRealignment(MF);
27337 if (CantUseFP)
27338 Reg = RegInfo->getPtrSizedStackRegister(MF);
27339 else
27340 Reg = RegInfo->getPtrSizedFrameRegister(MF);
27341 }
27342 return DAG.getCopyFromReg(DAG.getEntryNode(), dl, Reg, VT);
27343 }
27344 case Intrinsic::x86_avx512_vp2intersect_q_512:
27345 case Intrinsic::x86_avx512_vp2intersect_q_256:
27346 case Intrinsic::x86_avx512_vp2intersect_q_128:
27347 case Intrinsic::x86_avx512_vp2intersect_d_512:
27348 case Intrinsic::x86_avx512_vp2intersect_d_256:
27349 case Intrinsic::x86_avx512_vp2intersect_d_128: {
27350 SDLoc DL(Op);
27351 MVT MaskVT = Op.getSimpleValueType();
27352 SDVTList VTs = DAG.getVTList(MVT::Untyped, MVT::Other);
27353 SDValue Operation = DAG.getNode(X86ISD::VP2INTERSECT, DL, VTs,
27354 Op.getOperand(1), Op.getOperand(2));
27355 SDValue Result0 =
27356 DAG.getTargetExtractSubreg(X86::sub_mask_0, DL, MaskVT, Operation);
27357 SDValue Result1 =
27358 DAG.getTargetExtractSubreg(X86::sub_mask_1, DL, MaskVT, Operation);
27359 return DAG.getMergeValues({Result0, Result1}, DL);
27360 }
27361 case Intrinsic::x86_mmx_pslli_w:
27362 case Intrinsic::x86_mmx_pslli_d:
27363 case Intrinsic::x86_mmx_pslli_q:
27364 case Intrinsic::x86_mmx_psrli_w:
27365 case Intrinsic::x86_mmx_psrli_d:
27366 case Intrinsic::x86_mmx_psrli_q:
27367 case Intrinsic::x86_mmx_psrai_w:
27368 case Intrinsic::x86_mmx_psrai_d: {
27369 SDLoc DL(Op);
27370 SDValue ShAmt = Op.getOperand(2);
27371 // If the argument is a constant, convert it to a target constant.
27372 if (auto *C = dyn_cast<ConstantSDNode>(ShAmt)) {
27373 // Clamp out of bounds shift amounts since they will otherwise be masked
27374 // to 8-bits which may make it no longer out of bounds.
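 // (For example, a requested shift amount of 256 would be truncated to 0 by the
 // instruction encoding; clamping to 255 keeps it out of range for every MMX
 // element width, so it still behaves as an out-of-range shift.)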
27375 unsigned ShiftAmount = C->getAPIntValue().getLimitedValue(255);
27376 if (ShiftAmount == 0)
27377 return Op.getOperand(1);
27378
27379 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, Op.getValueType(),
27380 Op.getOperand(0), Op.getOperand(1),
27381 DAG.getTargetConstant(ShiftAmount, DL, MVT::i32));
27382 }
27383
27384 unsigned NewIntrinsic;
27385 switch (IntNo) {
27386 default: llvm_unreachable("Impossible intrinsic"); // Can't reach here.
27387 case Intrinsic::x86_mmx_pslli_w:
27388 NewIntrinsic = Intrinsic::x86_mmx_psll_w;
27389 break;
27390 case Intrinsic::x86_mmx_pslli_d:
27391 NewIntrinsic = Intrinsic::x86_mmx_psll_d;
27392 break;
27393 case Intrinsic::x86_mmx_pslli_q:
27394 NewIntrinsic = Intrinsic::x86_mmx_psll_q;
27395 break;
27396 case Intrinsic::x86_mmx_psrli_w:
27397 NewIntrinsic = Intrinsic::x86_mmx_psrl_w;
27398 break;
27399 case Intrinsic::x86_mmx_psrli_d:
27400 NewIntrinsic = Intrinsic::x86_mmx_psrl_d;
27401 break;
27402 case Intrinsic::x86_mmx_psrli_q:
27403 NewIntrinsic = Intrinsic::x86_mmx_psrl_q;
27404 break;
27405 case Intrinsic::x86_mmx_psrai_w:
27406 NewIntrinsic = Intrinsic::x86_mmx_psra_w;
27407 break;
27408 case Intrinsic::x86_mmx_psrai_d:
27409 NewIntrinsic = Intrinsic::x86_mmx_psra_d;
27410 break;
27411 }
27412
27413 // The vector shift intrinsics with scalars use 32-bit shift amounts, but
27414 // the sse2/mmx shift instructions read 64 bits. Copy the 32 bits to an
27415 // MMX register.
27416 ShAmt = DAG.getNode(X86ISD::MMX_MOVW2D, DL, MVT::x86mmx, ShAmt);
27417 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, Op.getValueType(),
27418 DAG.getTargetConstant(NewIntrinsic, DL,
27419 getPointerTy(DAG.getDataLayout())),
27420 Op.getOperand(1), ShAmt);
27421 }
27422 case Intrinsic::thread_pointer: {
27423 if (Subtarget.isTargetELF()) {
27424 SDLoc dl(Op);
27425 EVT PtrVT = Op.getValueType();
27426 // Get the Thread Pointer, which is %gs:0 (32-bit) or %fs:0 (64-bit).
27427 Value *Ptr = Constant::getNullValue(PointerType::get(
27428 *DAG.getContext(), Subtarget.is64Bit() ? X86AS::FS : X86AS::GS));
27429 return DAG.getLoad(PtrVT, dl, DAG.getEntryNode(),
27430 DAG.getIntPtrConstant(0, dl), MachinePointerInfo(Ptr));
27431 }
27433 "Target OS doesn't support __builtin_thread_pointer() yet.");
27434 }
27435 }
27436}
27437
27438 static SDValue getAVX2GatherNode(unsigned Opc, SDValue Op, SelectionDAG &DAG,
27439 SDValue Src, SDValue Mask, SDValue Base,
27440 SDValue Index, SDValue ScaleOp, SDValue Chain,
27441 const X86Subtarget &Subtarget) {
27442 SDLoc dl(Op);
27443 auto *C = dyn_cast<ConstantSDNode>(ScaleOp);
27444 // Scale must be constant.
27445 if (!C)
27446 return SDValue();
27447 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
27448 SDValue Scale = DAG.getTargetConstant(C->getZExtValue(), dl,
27449 TLI.getPointerTy(DAG.getDataLayout()));
27450 EVT MaskVT = Mask.getValueType().changeVectorElementTypeToInteger();
27451 SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::Other);
27452 // If source is undef or we know it won't be used, use a zero vector
27453 // to break register dependency.
27454 // TODO: use undef instead and let BreakFalseDeps deal with it?
27455 if (Src.isUndef() || ISD::isBuildVectorAllOnes(Mask.getNode()))
27456 Src = getZeroVector(Op.getSimpleValueType(), Subtarget, DAG, dl);
27457
27458 // Cast mask to an integer type.
27459 Mask = DAG.getBitcast(MaskVT, Mask);
27460
27461 MemIntrinsicSDNode *MemIntr = cast<MemIntrinsicSDNode>(Op);
27462
27463 SDValue Ops[] = {Chain, Src, Mask, Base, Index, Scale };
27464 SDValue Res =
27465 DAG.getMemIntrinsicNode(Opc, dl, VTs, Ops,
27466 MemIntr->getMemoryVT(), MemIntr->getMemOperand());
27467 return DAG.getMergeValues({Res, Res.getValue(1)}, dl);
27468}
27469
27470 static SDValue getGatherNode(SDValue Op, SelectionDAG &DAG,
27471 SDValue Src, SDValue Mask, SDValue Base,
27472 SDValue Index, SDValue ScaleOp, SDValue Chain,
27473 const X86Subtarget &Subtarget) {
27474 MVT VT = Op.getSimpleValueType();
27475 SDLoc dl(Op);
27476 auto *C = dyn_cast<ConstantSDNode>(ScaleOp);
27477 // Scale must be constant.
27478 if (!C)
27479 return SDValue();
27480 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
27481 SDValue Scale = DAG.getTargetConstant(C->getZExtValue(), dl,
27482 TLI.getPointerTy(DAG.getDataLayout()));
27483 unsigned MinElts = std::min(Index.getSimpleValueType().getVectorNumElements(),
27484 Src.getSimpleValueType().getVectorNumElements());
27485 MVT MaskVT = MVT::getVectorVT(MVT::i1, MinElts);
27486
27487 // We support two versions of the gather intrinsics. One with scalar mask and
27488 // one with vXi1 mask. Convert scalar to vXi1 if necessary.
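 // (For example, an i8 mask operand from the scalar-mask form is converted by
 // getMaskNode into the v8i1 operand that the gather node expects.)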
27489 if (Mask.getValueType() != MaskVT)
27490 Mask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);
27491
27492 SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::Other);
27493 // If source is undef or we know it won't be used, use a zero vector
27494 // to break register dependency.
27495 // TODO: use undef instead and let BreakFalseDeps deal with it?
27496 if (Src.isUndef() || ISD::isBuildVectorAllOnes(Mask.getNode()))
27497 Src = getZeroVector(Op.getSimpleValueType(), Subtarget, DAG, dl);
27498
27499 MemIntrinsicSDNode *MemIntr = cast<MemIntrinsicSDNode>(Op);
27500
27501 SDValue Ops[] = {Chain, Src, Mask, Base, Index, Scale };
27502 SDValue Res =
27503 DAG.getMemIntrinsicNode(X86ISD::MGATHER, dl, VTs, Ops,
27504 MemIntr->getMemoryVT(), MemIntr->getMemOperand());
27505 return DAG.getMergeValues({Res, Res.getValue(1)}, dl);
27506}
27507
27508 static SDValue getScatterNode(unsigned Opc, SDValue Op, SelectionDAG &DAG,
27509 SDValue Src, SDValue Mask, SDValue Base,
27510 SDValue Index, SDValue ScaleOp, SDValue Chain,
27511 const X86Subtarget &Subtarget) {
27512 SDLoc dl(Op);
27513 auto *C = dyn_cast<ConstantSDNode>(ScaleOp);
27514 // Scale must be constant.
27515 if (!C)
27516 return SDValue();
27517 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
27518 SDValue Scale = DAG.getTargetConstant(C->getZExtValue(), dl,
27519 TLI.getPointerTy(DAG.getDataLayout()));
27520 unsigned MinElts = std::min(Index.getSimpleValueType().getVectorNumElements(),
27521 Src.getSimpleValueType().getVectorNumElements());
27522 MVT MaskVT = MVT::getVectorVT(MVT::i1, MinElts);
27523
27524 // We support two versions of the scatter intrinsics. One with scalar mask and
27525 // one with vXi1 mask. Convert scalar to vXi1 if necessary.
27526 if (Mask.getValueType() != MaskVT)
27527 Mask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);
27528
27529 MemIntrinsicSDNode *MemIntr = cast<MemIntrinsicSDNode>(Op);
27530
27531 SDVTList VTs = DAG.getVTList(MVT::Other);
27532 SDValue Ops[] = {Chain, Src, Mask, Base, Index, Scale};
27533 SDValue Res =
27534 DAG.getMemIntrinsicNode(Opc, dl, VTs, Ops,
27535 MemIntr->getMemoryVT(), MemIntr->getMemOperand());
27536 return Res;
27537}
27538
27539 static SDValue getPrefetchNode(unsigned Opc, SDValue Op, SelectionDAG &DAG,
27540 SDValue Mask, SDValue Base, SDValue Index,
27541 SDValue ScaleOp, SDValue Chain,
27542 const X86Subtarget &Subtarget) {
27543 SDLoc dl(Op);
27544 auto *C = dyn_cast<ConstantSDNode>(ScaleOp);
27545 // Scale must be constant.
27546 if (!C)
27547 return SDValue();
27548 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
27549 SDValue Scale = DAG.getTargetConstant(C->getZExtValue(), dl,
27550 TLI.getPointerTy(DAG.getDataLayout()));
27551 SDValue Disp = DAG.getTargetConstant(0, dl, MVT::i32);
27552 SDValue Segment = DAG.getRegister(0, MVT::i32);
27553 MVT MaskVT =
27554 MVT::getVectorVT(MVT::i1, Index.getSimpleValueType().getVectorNumElements());
27555 SDValue VMask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);
27556 SDValue Ops[] = {VMask, Base, Scale, Index, Disp, Segment, Chain};
27557 SDNode *Res = DAG.getMachineNode(Opc, dl, MVT::Other, Ops);
27558 return SDValue(Res, 0);
27559}
27560
27561/// Handles the lowering of builtin intrinsics with chain that return their
27562/// value into registers EDX:EAX.
27563 /// If operand SrcReg is a valid register identifier, then operand 2 of N is
27564/// copied to SrcReg. The assumption is that SrcReg is an implicit input to
27565/// TargetOpcode.
27566/// Returns a Glue value which can be used to add extra copy-from-reg if the
27567 /// expanded intrinsic implicitly defines extra registers (i.e. not just
27568/// EDX:EAX).
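/// For example, RDTSC is expanded with SrcReg == 0 (no extra copy), while
/// RDPMC, RDPRU and XGETBV pass X86::ECX here so that the selector value in
/// operand 2 is placed in ECX before the machine node executes.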
27570 SelectionDAG &DAG,
27571 unsigned TargetOpcode,
27572 unsigned SrcReg,
27573 const X86Subtarget &Subtarget,
27574 SmallVectorImpl<SDValue> &Results) {
27575 SDValue Chain = N->getOperand(0);
27576 SDValue Glue;
27577
27578 if (SrcReg) {
27579 assert(N->getNumOperands() == 3 && "Unexpected number of operands!");
27580 Chain = DAG.getCopyToReg(Chain, DL, SrcReg, N->getOperand(2), Glue);
27581 Glue = Chain.getValue(1);
27582 }
27583
27584 SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Glue);
27585 SDValue N1Ops[] = {Chain, Glue};
27586 SDNode *N1 = DAG.getMachineNode(
27587 TargetOpcode, DL, Tys, ArrayRef<SDValue>(N1Ops, Glue.getNode() ? 2 : 1));
27588 Chain = SDValue(N1, 0);
27589
27590 // Reads the content of XCR and returns it in registers EDX:EAX.
27591 SDValue LO, HI;
27592 if (Subtarget.is64Bit()) {
27593 LO = DAG.getCopyFromReg(Chain, DL, X86::RAX, MVT::i64, SDValue(N1, 1));
27594 HI = DAG.getCopyFromReg(LO.getValue(1), DL, X86::RDX, MVT::i64,
27595 LO.getValue(2));
27596 } else {
27597 LO = DAG.getCopyFromReg(Chain, DL, X86::EAX, MVT::i32, SDValue(N1, 1));
27598 HI = DAG.getCopyFromReg(LO.getValue(1), DL, X86::EDX, MVT::i32,
27599 LO.getValue(2));
27600 }
27601 Chain = HI.getValue(1);
27602 Glue = HI.getValue(2);
27603
27604 if (Subtarget.is64Bit()) {
27605 // Merge the two 32-bit values into a 64-bit one.
27606 SDValue Tmp = DAG.getNode(ISD::SHL, DL, MVT::i64, HI,
27607 DAG.getConstant(32, DL, MVT::i8));
27608 Results.push_back(DAG.getNode(ISD::OR, DL, MVT::i64, LO, Tmp));
27609 Results.push_back(Chain);
27610 return Glue;
27611 }
27612
27613 // Use a buildpair to merge the two 32-bit values into a 64-bit one.
27614 SDValue Ops[] = { LO, HI };
27615 SDValue Pair = DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i64, Ops);
27616 Results.push_back(Pair);
27617 Results.push_back(Chain);
27618 return Glue;
27619}
27620
27621/// Handles the lowering of builtin intrinsics that read the time stamp counter
27622/// (x86_rdtsc and x86_rdtscp). This function is also used to custom lower
27623/// READCYCLECOUNTER nodes.
27624static void getReadTimeStampCounter(SDNode *N, const SDLoc &DL, unsigned Opcode,
27625 SelectionDAG &DAG,
27626 const X86Subtarget &Subtarget,
27627 SmallVectorImpl<SDValue> &Results) {
27628 // The processor's time-stamp counter (a 64-bit MSR) is stored into the
27629 // EDX:EAX registers. EDX is loaded with the high-order 32 bits of the MSR
27630 // and the EAX register is loaded with the low-order 32 bits.
27631 SDValue Glue = expandIntrinsicWChainHelper(N, DL, DAG, Opcode,
27632 /* NoRegister */0, Subtarget,
27633 Results);
27634 if (Opcode != X86::RDTSCP)
27635 return;
27636
27637 SDValue Chain = Results[1];
27638 // Instruction RDTSCP loads the IA32:TSC_AUX_MSR (address C000_0103H) into
27639 // the ECX register. Add 'ecx' explicitly to the chain.
27640 SDValue ecx = DAG.getCopyFromReg(Chain, DL, X86::ECX, MVT::i32, Glue);
27641 Results[1] = ecx;
27642 Results.push_back(ecx.getValue(1));
27643}
27644
27646 SelectionDAG &DAG) {
27647 SmallVector<SDValue, 3> Results;
27648 SDLoc DL(Op);
27649 getReadTimeStampCounter(Op.getNode(), DL, X86::RDTSC, DAG, Subtarget,
27650 Results);
27651 return DAG.getMergeValues(Results, DL);
27652}
27653
27654 static SDValue MarkEHRegistrationNode(SDValue Op, SelectionDAG &DAG) {
27655 MachineFunction &MF = DAG.getMachineFunction();
27656 SDValue Chain = Op.getOperand(0);
27657 SDValue RegNode = Op.getOperand(2);
27658 WinEHFuncInfo *EHInfo = MF.getWinEHFuncInfo();
27659 if (!EHInfo)
27660 report_fatal_error("EH registrations only live in functions using WinEH");
27661
27662 // Cast the operand to an alloca, and remember the frame index.
27663 auto *FINode = dyn_cast<FrameIndexSDNode>(RegNode);
27664 if (!FINode)
27665 report_fatal_error("llvm.x86.seh.ehregnode expects a static alloca");
27666 EHInfo->EHRegNodeFrameIndex = FINode->getIndex();
27667
27668 // Return the chain operand without making any DAG nodes.
27669 return Chain;
27670}
27671
27672 static SDValue MarkEHGuard(SDValue Op, SelectionDAG &DAG) {
27673 MachineFunction &MF = DAG.getMachineFunction();
27674 SDValue Chain = Op.getOperand(0);
27675 SDValue EHGuard = Op.getOperand(2);
27676 WinEHFuncInfo *EHInfo = MF.getWinEHFuncInfo();
27677 if (!EHInfo)
27678 report_fatal_error("EHGuard only live in functions using WinEH");
27679
27680 // Cast the operand to an alloca, and remember the frame index.
27681 auto *FINode = dyn_cast<FrameIndexSDNode>(EHGuard);
27682 if (!FINode)
27683 report_fatal_error("llvm.x86.seh.ehguard expects a static alloca");
27684 EHInfo->EHGuardFrameIndex = FINode->getIndex();
27685
27686 // Return the chain operand without making any DAG nodes.
27687 return Chain;
27688}
27689
27690/// Emit Truncating Store with signed or unsigned saturation.
27691static SDValue
27692EmitTruncSStore(bool SignedSat, SDValue Chain, const SDLoc &DL, SDValue Val,
27693 SDValue Ptr, EVT MemVT, MachineMemOperand *MMO,
27694 SelectionDAG &DAG) {
27695 SDVTList VTs = DAG.getVTList(MVT::Other);
27696 SDValue Undef = DAG.getUNDEF(Ptr.getValueType());
27697 SDValue Ops[] = { Chain, Val, Ptr, Undef };
27698 unsigned Opc = SignedSat ? X86ISD::VTRUNCSTORES : X86ISD::VTRUNCSTOREUS;
27699 return DAG.getMemIntrinsicNode(Opc, DL, VTs, Ops, MemVT, MMO);
27700}
27701
27702/// Emit Masked Truncating Store with signed or unsigned saturation.
27703static SDValue EmitMaskedTruncSStore(bool SignedSat, SDValue Chain,
27704 const SDLoc &DL,
27705 SDValue Val, SDValue Ptr, SDValue Mask, EVT MemVT,
27706 MachineMemOperand *MMO, SelectionDAG &DAG) {
27707 SDVTList VTs = DAG.getVTList(MVT::Other);
27708 SDValue Ops[] = { Chain, Val, Ptr, Mask };
27709 unsigned Opc = SignedSat ? X86ISD::VMTRUNCSTORES : X86ISD::VMTRUNCSTOREUS;
27710 return DAG.getMemIntrinsicNode(Opc, DL, VTs, Ops, MemVT, MMO);
27711}
27712
27714 const MachineFunction &MF) {
27715 if (!Subtarget.is64Bit())
27716 return false;
27717 // 64-bit targets support extended Swift async frame setup,
27718 // except for targets that use the windows 64 prologue.
27719 return !MF.getTarget().getMCAsmInfo()->usesWindowsCFI();
27720}
27721
27723 SelectionDAG &DAG) {
27724 unsigned IntNo = Op.getConstantOperandVal(1);
27725 const IntrinsicData *IntrData = getIntrinsicWithChain(IntNo);
27726 if (!IntrData) {
27727 switch (IntNo) {
27728
27729 case Intrinsic::swift_async_context_addr: {
27730 SDLoc dl(Op);
27731 auto &MF = DAG.getMachineFunction();
27732 auto *X86FI = MF.getInfo<X86MachineFunctionInfo>();
27733 if (X86::isExtendedSwiftAsyncFrameSupported(Subtarget, MF)) {
27735 X86FI->setHasSwiftAsyncContext(true);
27736 SDValue Chain = Op->getOperand(0);
27737 SDValue CopyRBP = DAG.getCopyFromReg(Chain, dl, X86::RBP, MVT::i64);
27738 SDValue Result =
27739 SDValue(DAG.getMachineNode(X86::SUB64ri32, dl, MVT::i64, CopyRBP,
27740 DAG.getTargetConstant(8, dl, MVT::i32)),
27741 0);
27742 // Return { result, chain }.
27743 return DAG.getNode(ISD::MERGE_VALUES, dl, Op->getVTList(), Result,
27744 CopyRBP.getValue(1));
27745 } else {
27746 // No special extended frame, create or reuse an existing stack slot.
27747 int PtrSize = Subtarget.is64Bit() ? 8 : 4;
27748 if (!X86FI->getSwiftAsyncContextFrameIdx())
27749 X86FI->setSwiftAsyncContextFrameIdx(
27750 MF.getFrameInfo().CreateStackObject(PtrSize, Align(PtrSize),
27751 false));
27752 SDValue Result =
27753 DAG.getFrameIndex(*X86FI->getSwiftAsyncContextFrameIdx(),
27754 PtrSize == 8 ? MVT::i64 : MVT::i32);
27755 // Return { result, chain }.
27756 return DAG.getNode(ISD::MERGE_VALUES, dl, Op->getVTList(), Result,
27757 Op->getOperand(0));
27758 }
27759 }
27760
27761 case llvm::Intrinsic::x86_seh_ehregnode:
27762 return MarkEHRegistrationNode(Op, DAG);
27763 case llvm::Intrinsic::x86_seh_ehguard:
27764 return MarkEHGuard(Op, DAG);
27765 case llvm::Intrinsic::x86_rdpkru: {
27766 SDLoc dl(Op);
27767 SDVTList VTs = DAG.getVTList(MVT::i32, MVT::Other);
27768 // Create a RDPKRU node and pass 0 to the ECX parameter.
27769 return DAG.getNode(X86ISD::RDPKRU, dl, VTs, Op.getOperand(0),
27770 DAG.getConstant(0, dl, MVT::i32));
27771 }
27772 case llvm::Intrinsic::x86_wrpkru: {
27773 SDLoc dl(Op);
27774 // Create a WRPKRU node, pass the input to the EAX parameter, and pass 0
27775 // to the EDX and ECX parameters.
27776 return DAG.getNode(X86ISD::WRPKRU, dl, MVT::Other,
27777 Op.getOperand(0), Op.getOperand(2),
27778 DAG.getConstant(0, dl, MVT::i32),
27779 DAG.getConstant(0, dl, MVT::i32));
27780 }
27781 case llvm::Intrinsic::asan_check_memaccess: {
27782 // Mark this as adjustsStack because it will be lowered to a call.
27783 DAG.getMachineFunction().getFrameInfo().setAdjustsStack(true);
27784 // Don't do anything here, we will expand these intrinsics out later.
27785 return Op;
27786 }
27787 case llvm::Intrinsic::x86_flags_read_u32:
27788 case llvm::Intrinsic::x86_flags_read_u64:
27789 case llvm::Intrinsic::x86_flags_write_u32:
27790 case llvm::Intrinsic::x86_flags_write_u64: {
27791 // We need a frame pointer because this will get lowered to a PUSH/POP
27792 // sequence.
27793 MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
27794 MFI.setHasCopyImplyingStackAdjustment(true);
27795 // Don't do anything here, we will expand these intrinsics out later
27796 // during FinalizeISel in EmitInstrWithCustomInserter.
27797 return Op;
27798 }
27799 case Intrinsic::x86_lwpins32:
27800 case Intrinsic::x86_lwpins64:
27801 case Intrinsic::x86_umwait:
27802 case Intrinsic::x86_tpause: {
27803 SDLoc dl(Op);
27804 SDValue Chain = Op->getOperand(0);
27805 SDVTList VTs = DAG.getVTList(MVT::i32, MVT::Other);
27806 unsigned Opcode;
27807
27808 switch (IntNo) {
27809 default: llvm_unreachable("Impossible intrinsic");
27810 case Intrinsic::x86_umwait:
27811 Opcode = X86ISD::UMWAIT;
27812 break;
27813 case Intrinsic::x86_tpause:
27814 Opcode = X86ISD::TPAUSE;
27815 break;
27816 case Intrinsic::x86_lwpins32:
27817 case Intrinsic::x86_lwpins64:
27818 Opcode = X86ISD::LWPINS;
27819 break;
27820 }
27821
27822 SDValue Operation =
27823 DAG.getNode(Opcode, dl, VTs, Chain, Op->getOperand(2),
27824 Op->getOperand(3), Op->getOperand(4));
27825 SDValue SetCC = getSETCC(X86::COND_B, Operation.getValue(0), dl, DAG);
27826 return DAG.getNode(ISD::MERGE_VALUES, dl, Op->getVTList(), SetCC,
27827 Operation.getValue(1));
27828 }
27829 case Intrinsic::x86_enqcmd:
27830 case Intrinsic::x86_enqcmds: {
27831 SDLoc dl(Op);
27832 SDValue Chain = Op.getOperand(0);
27833 SDVTList VTs = DAG.getVTList(MVT::i32, MVT::Other);
27834 unsigned Opcode;
27835 switch (IntNo) {
27836 default: llvm_unreachable("Impossible intrinsic!");
27837 case Intrinsic::x86_enqcmd:
27838 Opcode = X86ISD::ENQCMD;
27839 break;
27840 case Intrinsic::x86_enqcmds:
27841 Opcode = X86ISD::ENQCMDS;
27842 break;
27843 }
27844 SDValue Operation = DAG.getNode(Opcode, dl, VTs, Chain, Op.getOperand(2),
27845 Op.getOperand(3));
27846 SDValue SetCC = getSETCC(X86::COND_E, Operation.getValue(0), dl, DAG);
27847 return DAG.getNode(ISD::MERGE_VALUES, dl, Op->getVTList(), SetCC,
27848 Operation.getValue(1));
27849 }
27850 case Intrinsic::x86_aesenc128kl:
27851 case Intrinsic::x86_aesdec128kl:
27852 case Intrinsic::x86_aesenc256kl:
27853 case Intrinsic::x86_aesdec256kl: {
27854 SDLoc DL(Op);
27855 SDVTList VTs = DAG.getVTList(MVT::v2i64, MVT::i32, MVT::Other);
27856 SDValue Chain = Op.getOperand(0);
27857 unsigned Opcode;
27858
27859 switch (IntNo) {
27860 default: llvm_unreachable("Impossible intrinsic");
27861 case Intrinsic::x86_aesenc128kl:
27862 Opcode = X86ISD::AESENC128KL;
27863 break;
27864 case Intrinsic::x86_aesdec128kl:
27865 Opcode = X86ISD::AESDEC128KL;
27866 break;
27867 case Intrinsic::x86_aesenc256kl:
27868 Opcode = X86ISD::AESENC256KL;
27869 break;
27870 case Intrinsic::x86_aesdec256kl:
27871 Opcode = X86ISD::AESDEC256KL;
27872 break;
27873 }
27874
27875 MemIntrinsicSDNode *MemIntr = cast<MemIntrinsicSDNode>(Op);
27876 MachineMemOperand *MMO = MemIntr->getMemOperand();
27877 EVT MemVT = MemIntr->getMemoryVT();
27878 SDValue Operation = DAG.getMemIntrinsicNode(
27879 Opcode, DL, VTs, {Chain, Op.getOperand(2), Op.getOperand(3)}, MemVT,
27880 MMO);
27881 SDValue ZF = getSETCC(X86::COND_E, Operation.getValue(1), DL, DAG);
27882
27883 return DAG.getNode(ISD::MERGE_VALUES, DL, Op->getVTList(),
27884 {ZF, Operation.getValue(0), Operation.getValue(2)});
27885 }
27886 case Intrinsic::x86_aesencwide128kl:
27887 case Intrinsic::x86_aesdecwide128kl:
27888 case Intrinsic::x86_aesencwide256kl:
27889 case Intrinsic::x86_aesdecwide256kl: {
27890 SDLoc DL(Op);
27891 SDVTList VTs = DAG.getVTList(
27892 {MVT::i32, MVT::v2i64, MVT::v2i64, MVT::v2i64, MVT::v2i64, MVT::v2i64,
27893 MVT::v2i64, MVT::v2i64, MVT::v2i64, MVT::Other});
27894 SDValue Chain = Op.getOperand(0);
27895 unsigned Opcode;
27896
27897 switch (IntNo) {
27898 default: llvm_unreachable("Impossible intrinsic");
27899 case Intrinsic::x86_aesencwide128kl:
27900 Opcode = X86ISD::AESENCWIDE128KL;
27901 break;
27902 case Intrinsic::x86_aesdecwide128kl:
27903 Opcode = X86ISD::AESDECWIDE128KL;
27904 break;
27905 case Intrinsic::x86_aesencwide256kl:
27906 Opcode = X86ISD::AESENCWIDE256KL;
27907 break;
27908 case Intrinsic::x86_aesdecwide256kl:
27909 Opcode = X86ISD::AESDECWIDE256KL;
27910 break;
27911 }
27912
27913 MemIntrinsicSDNode *MemIntr = cast<MemIntrinsicSDNode>(Op);
27914 MachineMemOperand *MMO = MemIntr->getMemOperand();
27915 EVT MemVT = MemIntr->getMemoryVT();
27916 SDValue Operation = DAG.getMemIntrinsicNode(
27917 Opcode, DL, VTs,
27918 {Chain, Op.getOperand(2), Op.getOperand(3), Op.getOperand(4),
27919 Op.getOperand(5), Op.getOperand(6), Op.getOperand(7),
27920 Op.getOperand(8), Op.getOperand(9), Op.getOperand(10)},
27921 MemVT, MMO);
27922 SDValue ZF = getSETCC(X86::COND_E, Operation.getValue(0), DL, DAG);
27923
27924 return DAG.getNode(ISD::MERGE_VALUES, DL, Op->getVTList(),
27925 {ZF, Operation.getValue(1), Operation.getValue(2),
27926 Operation.getValue(3), Operation.getValue(4),
27927 Operation.getValue(5), Operation.getValue(6),
27928 Operation.getValue(7), Operation.getValue(8),
27929 Operation.getValue(9)});
27930 }
27931 case Intrinsic::x86_testui: {
27932 SDLoc dl(Op);
27933 SDValue Chain = Op.getOperand(0);
27934 SDVTList VTs = DAG.getVTList(MVT::i32, MVT::Other);
27935 SDValue Operation = DAG.getNode(X86ISD::TESTUI, dl, VTs, Chain);
27936 SDValue SetCC = getSETCC(X86::COND_B, Operation.getValue(0), dl, DAG);
27937 return DAG.getNode(ISD::MERGE_VALUES, dl, Op->getVTList(), SetCC,
27938 Operation.getValue(1));
27939 }
27940 case Intrinsic::x86_t2rpntlvwz0rs_internal:
27941 case Intrinsic::x86_t2rpntlvwz0rst1_internal:
27942 case Intrinsic::x86_t2rpntlvwz1rs_internal:
27943 case Intrinsic::x86_t2rpntlvwz1rst1_internal:
27944 case Intrinsic::x86_t2rpntlvwz0_internal:
27945 case Intrinsic::x86_t2rpntlvwz0t1_internal:
27946 case Intrinsic::x86_t2rpntlvwz1_internal:
27947 case Intrinsic::x86_t2rpntlvwz1t1_internal: {
27948 auto *X86MFI = DAG.getMachineFunction().getInfo<X86MachineFunctionInfo>();
27949 X86MFI->setAMXProgModel(AMXProgModelEnum::ManagedRA);
27950 unsigned IntNo = Op.getConstantOperandVal(1);
27951 unsigned Opc = 0;
27952 switch (IntNo) {
27953 default:
27954 llvm_unreachable("Unexpected intrinsic!");
27955 case Intrinsic::x86_t2rpntlvwz0_internal:
27956 Opc = X86::PT2RPNTLVWZ0V;
27957 break;
27958 case Intrinsic::x86_t2rpntlvwz0t1_internal:
27959 Opc = X86::PT2RPNTLVWZ0T1V;
27960 break;
27961 case Intrinsic::x86_t2rpntlvwz1_internal:
27962 Opc = X86::PT2RPNTLVWZ1V;
27963 break;
27964 case Intrinsic::x86_t2rpntlvwz1t1_internal:
27965 Opc = X86::PT2RPNTLVWZ1T1V;
27966 break;
27967 case Intrinsic::x86_t2rpntlvwz0rs_internal:
27968 Opc = X86::PT2RPNTLVWZ0RSV;
27969 break;
27970 case Intrinsic::x86_t2rpntlvwz0rst1_internal:
27971 Opc = X86::PT2RPNTLVWZ0RST1V;
27972 break;
27973 case Intrinsic::x86_t2rpntlvwz1rs_internal:
27974 Opc = X86::PT2RPNTLVWZ1RSV;
27975 break;
27976 case Intrinsic::x86_t2rpntlvwz1rst1_internal:
27977 Opc = X86::PT2RPNTLVWZ1RST1V;
27978 break;
27979 }
27980
27981 SDLoc DL(Op);
27982 SDVTList VTs = DAG.getVTList(MVT::Untyped, MVT::Other);
27983
27984 SDValue Ops[] = {Op.getOperand(2), // Row
27985 Op.getOperand(3), // Col0
27986 Op.getOperand(4), // Col1
27987 Op.getOperand(5), // Base
27988 DAG.getTargetConstant(1, DL, MVT::i8), // Scale
27989 Op.getOperand(6), // Index
27990 DAG.getTargetConstant(0, DL, MVT::i32), // Disp
27991 DAG.getRegister(0, MVT::i16), // Segment
27992 Op.getOperand(0)}; // Chain
27993
27994 MachineSDNode *Res = DAG.getMachineNode(Opc, DL, VTs, Ops);
27995 SDValue Res0 = DAG.getTargetExtractSubreg(X86::sub_t0, DL, MVT::x86amx,
27996 SDValue(Res, 0));
27997 SDValue Res1 = DAG.getTargetExtractSubreg(X86::sub_t1, DL, MVT::x86amx,
27998 SDValue(Res, 0));
27999 return DAG.getMergeValues({Res0, Res1, SDValue(Res, 1)}, DL);
28000 }
28001 case Intrinsic::x86_atomic_bts_rm:
28002 case Intrinsic::x86_atomic_btc_rm:
28003 case Intrinsic::x86_atomic_btr_rm: {
28004 SDLoc DL(Op);
28005 MVT VT = Op.getSimpleValueType();
28006 SDValue Chain = Op.getOperand(0);
28007 SDValue Op1 = Op.getOperand(2);
28008 SDValue Op2 = Op.getOperand(3);
28009 unsigned Opc = IntNo == Intrinsic::x86_atomic_bts_rm ? X86ISD::LBTS_RM
28010 : IntNo == Intrinsic::x86_atomic_btc_rm ? X86ISD::LBTC_RM
28011 : X86ISD::LBTR_RM;
28012 MachineMemOperand *MMO = cast<MemIntrinsicSDNode>(Op)->getMemOperand();
28013 SDValue Res =
28014 DAG.getMemIntrinsicNode(Opc, DL, DAG.getVTList(MVT::i32, MVT::Other),
28015 {Chain, Op1, Op2}, VT, MMO);
28016 Chain = Res.getValue(1);
28017 Res = DAG.getZExtOrTrunc(getSETCC(X86::COND_B, Res, DL, DAG), DL, VT);
28018 return DAG.getNode(ISD::MERGE_VALUES, DL, Op->getVTList(), Res, Chain);
28019 }
28020 case Intrinsic::x86_atomic_bts:
28021 case Intrinsic::x86_atomic_btc:
28022 case Intrinsic::x86_atomic_btr: {
28023 SDLoc DL(Op);
28024 MVT VT = Op.getSimpleValueType();
28025 SDValue Chain = Op.getOperand(0);
28026 SDValue Op1 = Op.getOperand(2);
28027 SDValue Op2 = Op.getOperand(3);
28028 unsigned Opc = IntNo == Intrinsic::x86_atomic_bts ? X86ISD::LBTS
28029 : IntNo == Intrinsic::x86_atomic_btc ? X86ISD::LBTC
28030 : X86ISD::LBTR;
28031 SDValue Size = DAG.getConstant(VT.getScalarSizeInBits(), DL, MVT::i32);
28032 MachineMemOperand *MMO = cast<MemIntrinsicSDNode>(Op)->getMemOperand();
28033 SDValue Res =
28034 DAG.getMemIntrinsicNode(Opc, DL, DAG.getVTList(MVT::i32, MVT::Other),
28035 {Chain, Op1, Op2, Size}, VT, MMO);
28036 Chain = Res.getValue(1);
28037 Res = DAG.getZExtOrTrunc(getSETCC(X86::COND_B, Res, DL, DAG), DL, VT);
28038 unsigned Imm = Op2->getAsZExtVal();
28039 if (Imm)
28040 Res = DAG.getNode(ISD::SHL, DL, VT, Res,
28041 DAG.getShiftAmountConstant(Imm, VT, DL));
28042 return DAG.getNode(ISD::MERGE_VALUES, DL, Op->getVTList(), Res, Chain);
28043 }
28044 case Intrinsic::x86_cmpccxadd32:
28045 case Intrinsic::x86_cmpccxadd64: {
28046 SDLoc DL(Op);
28047 SDValue Chain = Op.getOperand(0);
28048 SDValue Addr = Op.getOperand(2);
28049 SDValue Src1 = Op.getOperand(3);
28050 SDValue Src2 = Op.getOperand(4);
28051 SDValue CC = Op.getOperand(5);
28052 MachineMemOperand *MMO = cast<MemIntrinsicSDNode>(Op)->getMemOperand();
28053 SDValue Operation = DAG.getMemIntrinsicNode(
28054 X86ISD::CMPCCXADD, DL, Op->getVTList(), {Chain, Addr, Src1, Src2, CC},
28055 MVT::i32, MMO);
28056 return Operation;
28057 }
28058 case Intrinsic::x86_aadd32:
28059 case Intrinsic::x86_aadd64:
28060 case Intrinsic::x86_aand32:
28061 case Intrinsic::x86_aand64:
28062 case Intrinsic::x86_aor32:
28063 case Intrinsic::x86_aor64:
28064 case Intrinsic::x86_axor32:
28065 case Intrinsic::x86_axor64: {
28066 SDLoc DL(Op);
28067 SDValue Chain = Op.getOperand(0);
28068 SDValue Op1 = Op.getOperand(2);
28069 SDValue Op2 = Op.getOperand(3);
28070 MVT VT = Op2.getSimpleValueType();
28071 unsigned Opc = 0;
28072 switch (IntNo) {
28073 default:
28074 llvm_unreachable("Unknown Intrinsic");
28075 case Intrinsic::x86_aadd32:
28076 case Intrinsic::x86_aadd64:
28077 Opc = X86ISD::AADD;
28078 break;
28079 case Intrinsic::x86_aand32:
28080 case Intrinsic::x86_aand64:
28081 Opc = X86ISD::AAND;
28082 break;
28083 case Intrinsic::x86_aor32:
28084 case Intrinsic::x86_aor64:
28085 Opc = X86ISD::AOR;
28086 break;
28087 case Intrinsic::x86_axor32:
28088 case Intrinsic::x86_axor64:
28089 Opc = X86ISD::AXOR;
28090 break;
28091 }
28092 MachineMemOperand *MMO = cast<MemSDNode>(Op)->getMemOperand();
28093 return DAG.getMemIntrinsicNode(Opc, DL, Op->getVTList(),
28094 {Chain, Op1, Op2}, VT, MMO);
28095 }
28096 case Intrinsic::x86_atomic_add_cc:
28097 case Intrinsic::x86_atomic_sub_cc:
28098 case Intrinsic::x86_atomic_or_cc:
28099 case Intrinsic::x86_atomic_and_cc:
28100 case Intrinsic::x86_atomic_xor_cc: {
28101 SDLoc DL(Op);
28102 SDValue Chain = Op.getOperand(0);
28103 SDValue Op1 = Op.getOperand(2);
28104 SDValue Op2 = Op.getOperand(3);
28105 X86::CondCode CC = (X86::CondCode)Op.getConstantOperandVal(4);
28106 MVT VT = Op2.getSimpleValueType();
28107 unsigned Opc = 0;
28108 switch (IntNo) {
28109 default:
28110 llvm_unreachable("Unknown Intrinsic");
28111 case Intrinsic::x86_atomic_add_cc:
28112 Opc = X86ISD::LADD;
28113 break;
28114 case Intrinsic::x86_atomic_sub_cc:
28115 Opc = X86ISD::LSUB;
28116 break;
28117 case Intrinsic::x86_atomic_or_cc:
28118 Opc = X86ISD::LOR;
28119 break;
28120 case Intrinsic::x86_atomic_and_cc:
28121 Opc = X86ISD::LAND;
28122 break;
28123 case Intrinsic::x86_atomic_xor_cc:
28124 Opc = X86ISD::LXOR;
28125 break;
28126 }
28127 MachineMemOperand *MMO = cast<MemIntrinsicSDNode>(Op)->getMemOperand();
28128 SDValue LockArith =
28129 DAG.getMemIntrinsicNode(Opc, DL, DAG.getVTList(MVT::i32, MVT::Other),
28130 {Chain, Op1, Op2}, VT, MMO);
28131 Chain = LockArith.getValue(1);
28132 return DAG.getMergeValues({getSETCC(CC, LockArith, DL, DAG), Chain}, DL);
28133 }
28134 }
28135 return SDValue();
28136 }
28137
28138 SDLoc dl(Op);
28139 switch(IntrData->Type) {
28140 default: llvm_unreachable("Unknown Intrinsic Type");
28141 case RDSEED:
28142 case RDRAND: {
28143 // Emit the node with the right value type.
28144 SDVTList VTs = DAG.getVTList(Op->getValueType(0), MVT::i32, MVT::Other);
28145 SDValue Result = DAG.getNode(IntrData->Opc0, dl, VTs, Op.getOperand(0));
28146
28147 // If the value returned by RDRAND/RDSEED was valid (CF=1), return 1.
28148 // Otherwise return the value from Rand, which is always 0, cast to i32.
28149 SDValue Ops[] = {DAG.getZExtOrTrunc(Result, dl, Op->getValueType(1)),
28150 DAG.getConstant(1, dl, Op->getValueType(1)),
28151 DAG.getTargetConstant(X86::COND_B, dl, MVT::i8),
28152 SDValue(Result.getNode(), 1)};
28153 SDValue isValid = DAG.getNode(X86ISD::CMOV, dl, Op->getValueType(1), Ops);
28154
28155 // Return { result, isValid, chain }.
28156 return DAG.getNode(ISD::MERGE_VALUES, dl, Op->getVTList(), Result, isValid,
28157 SDValue(Result.getNode(), 2));
28158 }
28159 case GATHER_AVX2: {
28160 SDValue Chain = Op.getOperand(0);
28161 SDValue Src = Op.getOperand(2);
28162 SDValue Base = Op.getOperand(3);
28163 SDValue Index = Op.getOperand(4);
28164 SDValue Mask = Op.getOperand(5);
28165 SDValue Scale = Op.getOperand(6);
28166 return getAVX2GatherNode(IntrData->Opc0, Op, DAG, Src, Mask, Base, Index,
28167 Scale, Chain, Subtarget);
28168 }
28169 case GATHER: {
28170 //gather(v1, mask, index, base, scale);
28171 SDValue Chain = Op.getOperand(0);
28172 SDValue Src = Op.getOperand(2);
28173 SDValue Base = Op.getOperand(3);
28174 SDValue Index = Op.getOperand(4);
28175 SDValue Mask = Op.getOperand(5);
28176 SDValue Scale = Op.getOperand(6);
28177 return getGatherNode(Op, DAG, Src, Mask, Base, Index, Scale,
28178 Chain, Subtarget);
28179 }
28180 case SCATTER: {
28181 //scatter(base, mask, index, v1, scale);
28182 SDValue Chain = Op.getOperand(0);
28183 SDValue Base = Op.getOperand(2);
28184 SDValue Mask = Op.getOperand(3);
28185 SDValue Index = Op.getOperand(4);
28186 SDValue Src = Op.getOperand(5);
28187 SDValue Scale = Op.getOperand(6);
28188 return getScatterNode(IntrData->Opc0, Op, DAG, Src, Mask, Base, Index,
28189 Scale, Chain, Subtarget);
28190 }
28191 case PREFETCH: {
28192 const APInt &HintVal = Op.getConstantOperandAPInt(6);
28193 assert((HintVal == 2 || HintVal == 3) &&
28194 "Wrong prefetch hint in intrinsic: should be 2 or 3");
28195 unsigned Opcode = (HintVal == 2 ? IntrData->Opc1 : IntrData->Opc0);
28196 SDValue Chain = Op.getOperand(0);
28197 SDValue Mask = Op.getOperand(2);
28198 SDValue Index = Op.getOperand(3);
28199 SDValue Base = Op.getOperand(4);
28200 SDValue Scale = Op.getOperand(5);
28201 return getPrefetchNode(Opcode, Op, DAG, Mask, Base, Index, Scale, Chain,
28202 Subtarget);
28203 }
28204 // Read Time Stamp Counter (RDTSC) and Processor ID (RDTSCP).
28205 case RDTSC: {
28206 SmallVector<SDValue, 2> Results;
28207 getReadTimeStampCounter(Op.getNode(), dl, IntrData->Opc0, DAG, Subtarget,
28208 Results);
28209 return DAG.getMergeValues(Results, dl);
28210 }
28211 // Read Performance Monitoring Counters.
28212 case RDPMC:
28213 // Read Processor Register.
28214 case RDPRU:
28215 // GetExtended Control Register.
28216 case XGETBV: {
28217 SmallVector<SDValue, 2> Results;
28218
28219 // RDPMC uses ECX to select the index of the performance counter to read.
28220 // RDPRU uses ECX to select the processor register to read.
28221 // XGETBV uses ECX to select the index of the XCR register to return.
28222 // The result is stored into registers EDX:EAX.
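 // (For example, xgetbv with ECX == 0 returns XCR0, the extended control
 // register that reports which state components XSAVE manages.)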
28223 expandIntrinsicWChainHelper(Op.getNode(), dl, DAG, IntrData->Opc0, X86::ECX,
28224 Subtarget, Results);
28225 return DAG.getMergeValues(Results, dl);
28226 }
28227 // XTEST intrinsics.
28228 case XTEST: {
28229 SDVTList VTs = DAG.getVTList(Op->getValueType(0), MVT::Other);
28230 SDValue InTrans = DAG.getNode(IntrData->Opc0, dl, VTs, Op.getOperand(0));
28231
28232 SDValue SetCC = getSETCC(X86::COND_NE, InTrans, dl, DAG);
28233 SDValue Ret = DAG.getNode(ISD::ZERO_EXTEND, dl, Op->getValueType(0), SetCC);
28234 return DAG.getNode(ISD::MERGE_VALUES, dl, Op->getVTList(),
28235 Ret, SDValue(InTrans.getNode(), 1));
28236 }
28237 case TRUNCATE_TO_MEM_VI8:
28238 case TRUNCATE_TO_MEM_VI16:
28239 case TRUNCATE_TO_MEM_VI32: {
28240 SDValue Mask = Op.getOperand(4);
28241 SDValue DataToTruncate = Op.getOperand(3);
28242 SDValue Addr = Op.getOperand(2);
28243 SDValue Chain = Op.getOperand(0);
28244
28246 assert(MemIntr && "Expected MemIntrinsicSDNode!");
28247
28248 EVT MemVT = MemIntr->getMemoryVT();
28249
28250 uint16_t TruncationOp = IntrData->Opc0;
28251 switch (TruncationOp) {
28252 case X86ISD::VTRUNC: {
28253 if (isAllOnesConstant(Mask)) // return just a truncate store
28254 return DAG.getTruncStore(Chain, dl, DataToTruncate, Addr, MemVT,
28255 MemIntr->getMemOperand());
28256
28257 MVT MaskVT = MVT::getVectorVT(MVT::i1, MemVT.getVectorNumElements());
28258 SDValue VMask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);
28259 SDValue Offset = DAG.getUNDEF(VMask.getValueType());
28260
28261 return DAG.getMaskedStore(Chain, dl, DataToTruncate, Addr, Offset, VMask,
28262 MemVT, MemIntr->getMemOperand(), ISD::UNINDEXED,
28263 true /* truncating */);
28264 }
28265 case X86ISD::VTRUNCUS:
28266 case X86ISD::VTRUNCS: {
28267 bool IsSigned = (TruncationOp == X86ISD::VTRUNCS);
28268 if (isAllOnesConstant(Mask))
28269 return EmitTruncSStore(IsSigned, Chain, dl, DataToTruncate, Addr, MemVT,
28270 MemIntr->getMemOperand(), DAG);
28271
28272 MVT MaskVT = MVT::getVectorVT(MVT::i1, MemVT.getVectorNumElements());
28273 SDValue VMask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);
28274
28275 return EmitMaskedTruncSStore(IsSigned, Chain, dl, DataToTruncate, Addr,
28276 VMask, MemVT, MemIntr->getMemOperand(), DAG);
28277 }
28278 default:
28279 llvm_unreachable("Unsupported truncstore intrinsic");
28280 }
28281 }
28282 case INTR_TYPE_CAST_MMX:
28283 return SDValue(); // handled in combineINTRINSIC_*
28284 }
28285}
28286
28287SDValue X86TargetLowering::LowerRETURNADDR(SDValue Op,
28288 SelectionDAG &DAG) const {
28289 MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
28290 MFI.setReturnAddressIsTaken(true);
28291
28292 unsigned Depth = Op.getConstantOperandVal(0);
28293 SDLoc dl(Op);
28294 EVT PtrVT = Op.getValueType();
28295
28296 if (Depth > 0) {
28297 SDValue FrameAddr = LowerFRAMEADDR(Op, DAG);
28298 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
28299 SDValue Offset = DAG.getConstant(RegInfo->getSlotSize(), dl, PtrVT);
28300 return DAG.getLoad(PtrVT, dl, DAG.getEntryNode(),
28301 DAG.getNode(ISD::ADD, dl, PtrVT, FrameAddr, Offset),
28302 MachinePointerInfo());
28303 }
28304
28305 // Just load the return address.
28306 SDValue RetAddrFI = getReturnAddressFrameIndex(DAG);
28307 return DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), RetAddrFI,
28308 MachinePointerInfo());
28309}
28310
28311SDValue X86TargetLowering::LowerADDROFRETURNADDR(SDValue Op,
28312 SelectionDAG &DAG) const {
28313 DAG.getMachineFunction().getFrameInfo().setReturnAddressIsTaken(true);
28314 return getReturnAddressFrameIndex(DAG);
28315}
28316
28317SDValue X86TargetLowering::LowerFRAMEADDR(SDValue Op, SelectionDAG &DAG) const {
28318 MachineFunction &MF = DAG.getMachineFunction();
28319 MachineFrameInfo &MFI = MF.getFrameInfo();
28320 X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
28321 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
28322 EVT VT = Op.getValueType();
28323
28324 MFI.setFrameAddressIsTaken(true);
28325
28326 if (MF.getTarget().getMCAsmInfo()->usesWindowsCFI()) {
28327 // Depth > 0 makes no sense on targets which use Windows unwind codes. It
28328 // is not possible to crawl up the stack without looking at the unwind codes
28329 // simultaneously.
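 // As a consequence, the same fixed frame slot is returned below regardless of
 // the requested depth.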
28330 int FrameAddrIndex = FuncInfo->getFAIndex();
28331 if (!FrameAddrIndex) {
28332 // Set up a frame object for the return address.
28333 unsigned SlotSize = RegInfo->getSlotSize();
28334 FrameAddrIndex = MF.getFrameInfo().CreateFixedObject(
28335 SlotSize, /*SPOffset=*/0, /*IsImmutable=*/false);
28336 FuncInfo->setFAIndex(FrameAddrIndex);
28337 }
28338 return DAG.getFrameIndex(FrameAddrIndex, VT);
28339 }
28340
28341 Register FrameReg =
28342 RegInfo->getPtrSizedFrameRegister(DAG.getMachineFunction());
28343 SDLoc dl(Op); // FIXME probably not meaningful
28344 unsigned Depth = Op.getConstantOperandVal(0);
28345 assert(((FrameReg == X86::RBP && VT == MVT::i64) ||
28346 (FrameReg == X86::EBP && VT == MVT::i32)) &&
28347 "Invalid Frame Register!");
28348 SDValue FrameAddr = DAG.getCopyFromReg(DAG.getEntryNode(), dl, FrameReg, VT);
28349 while (Depth--)
28350 FrameAddr = DAG.getLoad(VT, dl, DAG.getEntryNode(), FrameAddr,
28351 MachinePointerInfo());
28352 return FrameAddr;
28353}
28354
28355// FIXME? Maybe this could be a TableGen attribute on some registers and
28356// this table could be generated automatically from RegInfo.
28358 const MachineFunction &MF) const {
28359 const TargetFrameLowering &TFI = *Subtarget.getFrameLowering();
28360
28362 .Case("esp", X86::ESP)
28363 .Case("rsp", X86::RSP)
28364 .Case("ebp", X86::EBP)
28365 .Case("rbp", X86::RBP)
28366 .Case("r14", X86::R14)
28367 .Case("r15", X86::R15)
28368 .Default(0);
28369
28370 if (Reg == X86::EBP || Reg == X86::RBP) {
28371 if (!TFI.hasFP(MF))
28372 report_fatal_error("register " + StringRef(RegName) +
28373 " is allocatable: function has no frame pointer");
28374#ifndef NDEBUG
28375 else {
28376 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
28377 Register FrameReg = RegInfo->getPtrSizedFrameRegister(MF);
28378 assert((FrameReg == X86::EBP || FrameReg == X86::RBP) &&
28379 "Invalid Frame Register!");
28380 }
28381#endif
28382 }
28383
28384 return Reg;
28385}
28386
28387SDValue X86TargetLowering::LowerFRAME_TO_ARGS_OFFSET(SDValue Op,
28388 SelectionDAG &DAG) const {
28389 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
28390 return DAG.getIntPtrConstant(2 * RegInfo->getSlotSize(), SDLoc(Op));
28391}
28392
28393 Register X86TargetLowering::getExceptionPointerRegister(
28394 const Constant *PersonalityFn) const {
28395 if (classifyEHPersonality(PersonalityFn) == EHPersonality::CoreCLR)
28396 return Subtarget.isTarget64BitLP64() ? X86::RDX : X86::EDX;
28397
28398 return Subtarget.isTarget64BitLP64() ? X86::RAX : X86::EAX;
28399}
28400
28401 Register X86TargetLowering::getExceptionSelectorRegister(
28402 const Constant *PersonalityFn) const {
28403 // Funclet personalities don't use selectors (the runtime does the selection).
28404 if (isFuncletEHPersonality(classifyEHPersonality(PersonalityFn)))
28405 return X86::NoRegister;
28406 return Subtarget.isTarget64BitLP64() ? X86::RDX : X86::EDX;
28407}
28408
28409 bool X86TargetLowering::needsFixedCatchObjects() const {
28410 return Subtarget.isTargetWin64();
28411}
28412
28413SDValue X86TargetLowering::LowerEH_RETURN(SDValue Op, SelectionDAG &DAG) const {
28414 SDValue Chain = Op.getOperand(0);
28415 SDValue Offset = Op.getOperand(1);
28416 SDValue Handler = Op.getOperand(2);
28417 SDLoc dl (Op);
28418
28419 EVT PtrVT = getPointerTy(DAG.getDataLayout());
28420 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
28421 Register FrameReg = RegInfo->getFrameRegister(DAG.getMachineFunction());
28422 assert(((FrameReg == X86::RBP && PtrVT == MVT::i64) ||
28423 (FrameReg == X86::EBP && PtrVT == MVT::i32)) &&
28424 "Invalid Frame Register!");
28425 SDValue Frame = DAG.getCopyFromReg(DAG.getEntryNode(), dl, FrameReg, PtrVT);
28426 Register StoreAddrReg = (PtrVT == MVT::i64) ? X86::RCX : X86::ECX;
28427
28428 SDValue StoreAddr = DAG.getNode(ISD::ADD, dl, PtrVT, Frame,
28429 DAG.getIntPtrConstant(RegInfo->getSlotSize(),
28430 dl));
28431 StoreAddr = DAG.getNode(ISD::ADD, dl, PtrVT, StoreAddr, Offset);
28432 Chain = DAG.getStore(Chain, dl, Handler, StoreAddr, MachinePointerInfo());
28433 Chain = DAG.getCopyToReg(Chain, dl, StoreAddrReg, StoreAddr);
28434
28435 return DAG.getNode(X86ISD::EH_RETURN, dl, MVT::Other, Chain,
28436 DAG.getRegister(StoreAddrReg, PtrVT));
28437}
28438
28439SDValue X86TargetLowering::lowerEH_SJLJ_SETJMP(SDValue Op,
28440 SelectionDAG &DAG) const {
28441 SDLoc DL(Op);
28442 // If the subtarget is not 64bit, we may need the global base reg
28443 // after isel expand pseudo, i.e., after CGBR pass ran.
28444 // Therefore, ask for the GlobalBaseReg now, so that the pass
28445 // inserts the code for us in case we need it.
28446 // Otherwise, we will end up in a situation where we will
28447 // reference a virtual register that is not defined!
28448 if (!Subtarget.is64Bit()) {
28449 const X86InstrInfo *TII = Subtarget.getInstrInfo();
28450 (void)TII->getGlobalBaseReg(&DAG.getMachineFunction());
28451 }
28452 return DAG.getNode(X86ISD::EH_SJLJ_SETJMP, DL,
28453 DAG.getVTList(MVT::i32, MVT::Other),
28454 Op.getOperand(0), Op.getOperand(1));
28455}
28456
28457SDValue X86TargetLowering::lowerEH_SJLJ_LONGJMP(SDValue Op,
28458 SelectionDAG &DAG) const {
28459 SDLoc DL(Op);
28460 return DAG.getNode(X86ISD::EH_SJLJ_LONGJMP, DL, MVT::Other,
28461 Op.getOperand(0), Op.getOperand(1));
28462}
28463
28464SDValue X86TargetLowering::lowerEH_SJLJ_SETUP_DISPATCH(SDValue Op,
28465 SelectionDAG &DAG) const {
28466 SDLoc DL(Op);
28467 return DAG.getNode(X86ISD::EH_SJLJ_SETUP_DISPATCH, DL, MVT::Other,
28468 Op.getOperand(0));
28469}
28470
28471 static SDValue LowerADJUST_TRAMPOLINE(SDValue Op, SelectionDAG &DAG) {
28472 return Op.getOperand(0);
28473}
28474
28475SDValue X86TargetLowering::LowerINIT_TRAMPOLINE(SDValue Op,
28476 SelectionDAG &DAG) const {
28477 SDValue Root = Op.getOperand(0);
28478 SDValue Trmp = Op.getOperand(1); // trampoline
28479 SDValue FPtr = Op.getOperand(2); // nested function
28480 SDValue Nest = Op.getOperand(3); // 'nest' parameter value
28481 SDLoc dl (Op);
28482
28483 const Value *TrmpAddr = cast<SrcValueSDNode>(Op.getOperand(4))->getValue();
28484 const TargetRegisterInfo *TRI = Subtarget.getRegisterInfo();
28485
28486 if (Subtarget.is64Bit()) {
28487 SDValue OutChains[6];
28488
28489 // Large code-model.
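 // The trampoline written below disassembles roughly as:
 //   movabsq $<nested function>, %r11
 //   movabsq $<nest value>, %r10
 //   jmpq    *%r11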
28490 const unsigned char JMP64r = 0xFF; // 64-bit jmp through register opcode.
28491 const unsigned char MOV64ri = 0xB8; // X86::MOV64ri opcode.
28492
28493 const unsigned char N86R10 = TRI->getEncodingValue(X86::R10) & 0x7;
28494 const unsigned char N86R11 = TRI->getEncodingValue(X86::R11) & 0x7;
28495
28496 const unsigned char REX_WB = 0x40 | 0x08 | 0x01; // REX prefix
28497
28498 // Load the pointer to the nested function into R11.
28499 unsigned OpCode = ((MOV64ri | N86R11) << 8) | REX_WB; // movabsq r11
28500 SDValue Addr = Trmp;
28501 OutChains[0] = DAG.getStore(Root, dl, DAG.getConstant(OpCode, dl, MVT::i16),
28502 Addr, MachinePointerInfo(TrmpAddr));
28503
28504 Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp,
28505 DAG.getConstant(2, dl, MVT::i64));
28506 OutChains[1] = DAG.getStore(Root, dl, FPtr, Addr,
28507 MachinePointerInfo(TrmpAddr, 2), Align(2));
28508
28509 // Load the 'nest' parameter value into R10.
28510 // R10 is specified in X86CallingConv.td
28511 OpCode = ((MOV64ri | N86R10) << 8) | REX_WB; // movabsq r10
28512 Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp,
28513 DAG.getConstant(10, dl, MVT::i64));
28514 OutChains[2] = DAG.getStore(Root, dl, DAG.getConstant(OpCode, dl, MVT::i16),
28515 Addr, MachinePointerInfo(TrmpAddr, 10));
28516
28517 Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp,
28518 DAG.getConstant(12, dl, MVT::i64));
28519 OutChains[3] = DAG.getStore(Root, dl, Nest, Addr,
28520 MachinePointerInfo(TrmpAddr, 12), Align(2));
28521
28522 // Jump to the nested function.
28523 OpCode = (JMP64r << 8) | REX_WB; // jmpq *...
28524 Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp,
28525 DAG.getConstant(20, dl, MVT::i64));
28526 OutChains[4] = DAG.getStore(Root, dl, DAG.getConstant(OpCode, dl, MVT::i16),
28527 Addr, MachinePointerInfo(TrmpAddr, 20));
28528
28529 unsigned char ModRM = N86R11 | (4 << 3) | (3 << 6); // ...r11
28530 Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp,
28531 DAG.getConstant(22, dl, MVT::i64));
28532 OutChains[5] = DAG.getStore(Root, dl, DAG.getConstant(ModRM, dl, MVT::i8),
28533 Addr, MachinePointerInfo(TrmpAddr, 22));
28534
28535 return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, OutChains);
28536 } else {
28537 const Function *Func =
28538 cast<Function>(cast<SrcValueSDNode>(Op.getOperand(5))->getValue());
28539 CallingConv::ID CC = Func->getCallingConv();
28540 unsigned NestReg;
28541
28542 switch (CC) {
28543 default:
28544 llvm_unreachable("Unsupported calling convention");
28545 case CallingConv::C:
28546 case CallingConv::X86_StdCall: {
28547 // Pass 'nest' parameter in ECX.
28548 // Must be kept in sync with X86CallingConv.td
28549 NestReg = X86::ECX;
28550
28551 // Check that ECX wasn't needed by an 'inreg' parameter.
28552 FunctionType *FTy = Func->getFunctionType();
28553 const AttributeList &Attrs = Func->getAttributes();
28554
28555 if (!Attrs.isEmpty() && !Func->isVarArg()) {
28556 unsigned InRegCount = 0;
28557 unsigned Idx = 0;
28558
28559 for (FunctionType::param_iterator I = FTy->param_begin(),
28560 E = FTy->param_end(); I != E; ++I, ++Idx)
28561 if (Attrs.hasParamAttr(Idx, Attribute::InReg)) {
28562 const DataLayout &DL = DAG.getDataLayout();
28563 // FIXME: should only count parameters that are lowered to integers.
28564 InRegCount += (DL.getTypeSizeInBits(*I) + 31) / 32;
28565 }
28566
28567 if (InRegCount > 2) {
28568 report_fatal_error("Nest register in use - reduce number of inreg"
28569 " parameters!");
28570 }
28571 }
28572 break;
28573 }
28574 case CallingConv::X86_FastCall:
28575 case CallingConv::X86_ThisCall:
28576 case CallingConv::Fast:
28577 case CallingConv::Tail:
28578 case CallingConv::SwiftTail:
28579 // Pass 'nest' parameter in EAX.
28580 // Must be kept in sync with X86CallingConv.td
28581 NestReg = X86::EAX;
28582 break;
28583 }
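 // The 10-byte trampoline written below disassembles roughly as:
 //   movl $<nest value>, %ecx   (or %eax, depending on the calling convention)
 //   jmp  <nested function>     (PC-relative rel32 displacement)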
28584
28585 SDValue OutChains[4];
28586 SDValue Addr, Disp;
28587
28588 Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp,
28589 DAG.getConstant(10, dl, MVT::i32));
28590 Disp = DAG.getNode(ISD::SUB, dl, MVT::i32, FPtr, Addr);
28591
28592 // This is storing the opcode for MOV32ri.
28593 const unsigned char MOV32ri = 0xB8; // X86::MOV32ri's opcode byte.
28594 const unsigned char N86Reg = TRI->getEncodingValue(NestReg) & 0x7;
28595 OutChains[0] =
28596 DAG.getStore(Root, dl, DAG.getConstant(MOV32ri | N86Reg, dl, MVT::i8),
28597 Trmp, MachinePointerInfo(TrmpAddr));
28598
28599 Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp,
28600 DAG.getConstant(1, dl, MVT::i32));
28601 OutChains[1] = DAG.getStore(Root, dl, Nest, Addr,
28602 MachinePointerInfo(TrmpAddr, 1), Align(1));
28603
28604 const unsigned char JMP = 0xE9; // jmp <32bit dst> opcode.
28605 Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp,
28606 DAG.getConstant(5, dl, MVT::i32));
28607 OutChains[2] =
28608 DAG.getStore(Root, dl, DAG.getConstant(JMP, dl, MVT::i8), Addr,
28609 MachinePointerInfo(TrmpAddr, 5), Align(1));
28610
28611 Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp,
28612 DAG.getConstant(6, dl, MVT::i32));
28613 OutChains[3] = DAG.getStore(Root, dl, Disp, Addr,
28614 MachinePointerInfo(TrmpAddr, 6), Align(1));
28615
28616 return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, OutChains);
28617 }
28618}
28619
28620SDValue X86TargetLowering::LowerGET_ROUNDING(SDValue Op,
28621 SelectionDAG &DAG) const {
28622 /*
28623 The rounding mode is in bits 11:10 of the FP control word (FPCW), and has the following
28624 settings:
28625 00 Round to nearest
28626 01 Round to -inf
28627 10 Round to +inf
28628 11 Round to 0
28629
28630 GET_ROUNDING, on the other hand, expects the following:
28631 -1 Undefined
28632 0 Round to 0
28633 1 Round to nearest
28634 2 Round to +inf
28635 3 Round to -inf
28636
28637 To perform the conversion, we use a packed lookup table of the four 2-bit
28638 values that we can index by FPCW[11:10]
28639 0x2d --> (0b00,10,11,01) --> (0,2,3,1) >> FPCW[11:10]
28640
28641 (0x2d >> ((FPSR & 0xc00) >> 9)) & 3
28642 */
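 // Worked example: if FPCW[11:10] == 01 (round toward -inf), then
 // (FPCW & 0xc00) == 0x400, shifting right by 9 gives 2, and
 // (0x2d >> 2) & 3 == 3, which is GET_ROUNDING's encoding for round to -inf.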
28643
28644 MachineFunction &MF = DAG.getMachineFunction();
28645 MVT VT = Op.getSimpleValueType();
28646 SDLoc DL(Op);
28647
28648 // Save FP Control Word to stack slot
28649 int SSFI = MF.getFrameInfo().CreateStackObject(2, Align(2), false);
28650 SDValue StackSlot =
28651 DAG.getFrameIndex(SSFI, getPointerTy(DAG.getDataLayout()));
28652
28653 MachinePointerInfo MPI = MachinePointerInfo::getFixedStack(MF, SSFI);
28654
28655 SDValue Chain = Op.getOperand(0);
28656 SDValue Ops[] = {Chain, StackSlot};
28657 Chain = DAG.getMemIntrinsicNode(X86ISD::FNSTCW16m, DL,
28658 DAG.getVTList(MVT::Other), Ops, MVT::i16, MPI,
28659 Align(2), MachineMemOperand::MOStore);
28660
28661 // Load FP Control Word from stack slot
28662 SDValue CWD = DAG.getLoad(MVT::i16, DL, Chain, StackSlot, MPI, Align(2));
28663 Chain = CWD.getValue(1);
28664
28665 // Mask and turn the control bits into a shift for the lookup table.
28666 SDValue Shift =
28667 DAG.getNode(ISD::SRL, DL, MVT::i16,
28668 DAG.getNode(ISD::AND, DL, MVT::i16,
28669 CWD, DAG.getConstant(0xc00, DL, MVT::i16)),
28670 DAG.getConstant(9, DL, MVT::i8));
28671 Shift = DAG.getNode(ISD::TRUNCATE, DL, MVT::i8, Shift);
28672
28673 SDValue LUT = DAG.getConstant(0x2d, DL, MVT::i32);
28674 SDValue RetVal =
28675 DAG.getNode(ISD::AND, DL, MVT::i32,
28676 DAG.getNode(ISD::SRL, DL, MVT::i32, LUT, Shift),
28677 DAG.getConstant(3, DL, MVT::i32));
28678
28679 RetVal = DAG.getZExtOrTrunc(RetVal, DL, VT);
28680
28681 return DAG.getMergeValues({RetVal, Chain}, DL);
28682}
28683
28684SDValue X86TargetLowering::LowerSET_ROUNDING(SDValue Op,
28685 SelectionDAG &DAG) const {
28686 MachineFunction &MF = DAG.getMachineFunction();
28687 SDLoc DL(Op);
28688 SDValue Chain = Op.getNode()->getOperand(0);
28689
28690 // FP control word may be set only from data in memory. So we need to allocate
28691 // stack space to save/load FP control word.
28692 int OldCWFrameIdx = MF.getFrameInfo().CreateStackObject(4, Align(4), false);
28693 SDValue StackSlot =
28694 DAG.getFrameIndex(OldCWFrameIdx, getPointerTy(DAG.getDataLayout()));
28695 MachinePointerInfo MPI = MachinePointerInfo::getFixedStack(MF, OldCWFrameIdx);
28696 MachineMemOperand *MMO =
28697 MF.getMachineMemOperand(MPI, MachineMemOperand::MOStore, 2, Align(2));
28698
28699 // Store FP control word into memory.
28700 SDValue Ops[] = {Chain, StackSlot};
28701 Chain = DAG.getMemIntrinsicNode(
28702 X86ISD::FNSTCW16m, DL, DAG.getVTList(MVT::Other), Ops, MVT::i16, MMO);
28703
28704 // Load FP Control Word from stack slot and clear RM field (bits 11:10).
28705 SDValue CWD = DAG.getLoad(MVT::i16, DL, Chain, StackSlot, MPI);
28706 Chain = CWD.getValue(1);
28707 CWD = DAG.getNode(ISD::AND, DL, MVT::i16, CWD.getValue(0),
28708 DAG.getConstant(0xf3ff, DL, MVT::i16));
28709
28710 // Calculate new rounding mode.
28711 SDValue NewRM = Op.getNode()->getOperand(1);
28712 SDValue RMBits;
28713 if (auto *CVal = dyn_cast<ConstantSDNode>(NewRM)) {
28714 uint64_t RM = CVal->getZExtValue();
28715 int FieldVal = X86::getRoundingModeX86(RM);
28716
28717 if (FieldVal == X86::rmInvalid) {
28718 FieldVal = X86::rmToNearest;
28719 LLVMContext &C = MF.getFunction().getContext();
28720 C.diagnose(DiagnosticInfoUnsupported(
28721 MF.getFunction(), "rounding mode is not supported by X86 hardware",
28722 DiagnosticLocation(DL.getDebugLoc()), DS_Error));
28723 }
28724 RMBits = DAG.getConstant(FieldVal, DL, MVT::i16);
28725 } else {
28726 // Need to convert argument into bits of control word:
28727 // 0 Round to 0 -> 11
28728 // 1 Round to nearest -> 00
28729 // 2 Round to +inf -> 10
28730 // 3 Round to -inf -> 01
28731 // The 2-bit value then needs to be shifted so that it occupies bits 11:10.
28732 // To make the conversion, put all these values into a value 0xc9 and shift
28733 // it left depending on the rounding mode:
28734 // (0xc9 << 4) & 0xc00 = X86::rmTowardZero
28735 // (0xc9 << 6) & 0xc00 = X86::rmToNearest
28736 // ...
28737 // (0xc9 << (2 * NewRM + 4)) & 0xc00
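    // For each possible NewRM value this works out to:
    //   NewRM = 0 (to zero):    (0xc9 <<  4) & 0xc00 = 0xc00 -> RC = 11
    //   NewRM = 1 (to nearest): (0xc9 <<  6) & 0xc00 = 0x000 -> RC = 00
    //   NewRM = 2 (to +inf):    (0xc9 <<  8) & 0xc00 = 0x800 -> RC = 10
    //   NewRM = 3 (to -inf):    (0xc9 << 10) & 0xc00 = 0x400 -> RC = 01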
28738 SDValue ShiftValue =
28739 DAG.getNode(ISD::TRUNCATE, DL, MVT::i8,
28740 DAG.getNode(ISD::ADD, DL, MVT::i32,
28741 DAG.getNode(ISD::SHL, DL, MVT::i32, NewRM,
28742 DAG.getConstant(1, DL, MVT::i8)),
28743 DAG.getConstant(4, DL, MVT::i32)));
28744 SDValue Shifted =
28745 DAG.getNode(ISD::SHL, DL, MVT::i16, DAG.getConstant(0xc9, DL, MVT::i16),
28746 ShiftValue);
28747 RMBits = DAG.getNode(ISD::AND, DL, MVT::i16, Shifted,
28748 DAG.getConstant(0xc00, DL, MVT::i16));
28749 }
28750
28751 // Update rounding mode bits and store the new FP Control Word into stack.
28752 CWD = DAG.getNode(ISD::OR, DL, MVT::i16, CWD, RMBits);
28753 Chain = DAG.getStore(Chain, DL, CWD, StackSlot, MPI, Align(2));
28754
28755 // Load FP control word from the slot.
28756 SDValue OpsLD[] = {Chain, StackSlot};
28757 MachineMemOperand *MMOL =
28758 MF.getMachineMemOperand(MPI, MachineMemOperand::MOLoad, 2, Align(2));
28759 Chain = DAG.getMemIntrinsicNode(
28760 X86ISD::FLDCW16m, DL, DAG.getVTList(MVT::Other), OpsLD, MVT::i16, MMOL);
28761
28762 // If target supports SSE, set MXCSR as well. Rounding mode is encoded in the
28763 // same way but in bits 14:13.
28764 if (Subtarget.hasSSE1()) {
28765 // Store MXCSR into memory.
28766 Chain = DAG.getNode(
28767 ISD::INTRINSIC_VOID, DL, DAG.getVTList(MVT::Other), Chain,
28768 DAG.getTargetConstant(Intrinsic::x86_sse_stmxcsr, DL, MVT::i32),
28769 StackSlot);
28770
28771 // Load MXCSR from stack slot and clear RM field (bits 14:13).
28772 SDValue CWD = DAG.getLoad(MVT::i32, DL, Chain, StackSlot, MPI);
28773 Chain = CWD.getValue(1);
28774 CWD = DAG.getNode(ISD::AND, DL, MVT::i32, CWD.getValue(0),
28775 DAG.getConstant(0xffff9fff, DL, MVT::i32));
28776
28777 // Shift X87 RM bits from 11:10 to 14:13.
28778 RMBits = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i32, RMBits);
28779 RMBits = DAG.getNode(ISD::SHL, DL, MVT::i32, RMBits,
28780 DAG.getConstant(3, DL, MVT::i8));
28781
28782 // Update rounding mode bits and store the new FP Control Word into stack.
28783 CWD = DAG.getNode(ISD::OR, DL, MVT::i32, CWD, RMBits);
28784 Chain = DAG.getStore(Chain, DL, CWD, StackSlot, MPI, Align(4));
28785
28786 // Load MXCSR from the slot.
28787 Chain = DAG.getNode(
28788 ISD::INTRINSIC_VOID, DL, DAG.getVTList(MVT::Other), Chain,
28789 DAG.getTargetConstant(Intrinsic::x86_sse_ldmxcsr, DL, MVT::i32),
28790 StackSlot);
28791 }
28792
28793 return Chain;
28794}
28795
28796const unsigned X87StateSize = 28;
28797const unsigned FPStateSize = 32;
28798[[maybe_unused]] const unsigned FPStateSizeInBits = FPStateSize * 8;
28799
28800SDValue X86TargetLowering::LowerGET_FPENV_MEM(SDValue Op,
28801 SelectionDAG &DAG) const {
28802 MachineFunction &MF = DAG.getMachineFunction();
28803 SDLoc DL(Op);
28804 SDValue Chain = Op->getOperand(0);
28805 SDValue Ptr = Op->getOperand(1);
28806 auto *Node = cast<FPStateAccessSDNode>(Op);
28807 EVT MemVT = Node->getMemoryVT();
28809 MachineMemOperand *MMO = cast<FPStateAccessSDNode>(Op)->getMemOperand();
28810
28811 // Get the x87 state, if it is present.
28812 if (Subtarget.hasX87()) {
28813 Chain =
28814 DAG.getMemIntrinsicNode(X86ISD::FNSTENVm, DL, DAG.getVTList(MVT::Other),
28815 {Chain, Ptr}, MemVT, MMO);
28816
28817 // FNSTENV changes the exception mask, so load back the stored environment.
28818 MachineMemOperand::Flags NewFlags =
28819 MachineMemOperand::MOLoad |
28820 (MMO->getFlags() & ~MachineMemOperand::MOStore);
28821 MMO = MF.getMachineMemOperand(MMO, NewFlags);
28822 Chain =
28823 DAG.getMemIntrinsicNode(X86ISD::FLDENVm, DL, DAG.getVTList(MVT::Other),
28824 {Chain, Ptr}, MemVT, MMO);
28825 }
28826
28827 // If target supports SSE, get MXCSR as well.
28828 if (Subtarget.hasSSE1()) {
28829 // Get pointer to the MXCSR location in memory.
28830 MVT PtrVT = getPointerTy(DAG.getDataLayout());
28831 SDValue MXCSRAddr = DAG.getNode(ISD::ADD, DL, PtrVT, Ptr,
28832 DAG.getConstant(X87StateSize, DL, PtrVT));
28833 // Store MXCSR into memory.
28834 Chain = DAG.getNode(
28835 ISD::INTRINSIC_VOID, DL, DAG.getVTList(MVT::Other), Chain,
28836 DAG.getTargetConstant(Intrinsic::x86_sse_stmxcsr, DL, MVT::i32),
28837 MXCSRAddr);
28838 }
28839
28840 return Chain;
28841}
28842
28843 static SDValue createSetFPEnvNodes(SDValue Ptr, SDValue Chain, const SDLoc &DL,
28844 EVT MemVT, MachineMemOperand *MMO,
28845 SelectionDAG &DAG,
28846 const X86Subtarget &Subtarget) {
28847 // Set the x87 state, if it is present.
28848 if (Subtarget.hasX87())
28849 Chain =
28850 DAG.getMemIntrinsicNode(X86ISD::FLDENVm, DL, DAG.getVTList(MVT::Other),
28851 {Chain, Ptr}, MemVT, MMO);
28852 // If target supports SSE, set MXCSR as well.
28853 if (Subtarget.hasSSE1()) {
28854 // Get pointer to the MXCSR location in memory.
28855 MVT PtrVT = DAG.getTargetLoweringInfo().getPointerTy(DAG.getDataLayout());
28856 SDValue MXCSRAddr = DAG.getNode(ISD::ADD, DL, PtrVT, Ptr,
28857 DAG.getConstant(X87StateSize, DL, PtrVT));
28858 // Load MXCSR from memory.
28859 Chain = DAG.getNode(
28860 ISD::INTRINSIC_VOID, DL, DAG.getVTList(MVT::Other), Chain,
28861 DAG.getTargetConstant(Intrinsic::x86_sse_ldmxcsr, DL, MVT::i32),
28862 MXCSRAddr);
28863 }
28864 return Chain;
28865}
28866
28867SDValue X86TargetLowering::LowerSET_FPENV_MEM(SDValue Op,
28868 SelectionDAG &DAG) const {
28869 SDLoc DL(Op);
28870 SDValue Chain = Op->getOperand(0);
28871 SDValue Ptr = Op->getOperand(1);
28872 auto *Node = cast<FPStateAccessSDNode>(Op);
28873 EVT MemVT = Node->getMemoryVT();
28875 MachineMemOperand *MMO = cast<FPStateAccessSDNode>(Op)->getMemOperand();
28876 return createSetFPEnvNodes(Ptr, Chain, DL, MemVT, MMO, DAG, Subtarget);
28877}
28878
28879SDValue X86TargetLowering::LowerRESET_FPENV(SDValue Op,
28880 SelectionDAG &DAG) const {
28881 MachineFunction &MF = DAG.getMachineFunction();
28882 SDLoc DL(Op);
28883 SDValue Chain = Op.getNode()->getOperand(0);
28884
28885 IntegerType *ItemTy = Type::getInt32Ty(*DAG.getContext());
28886 ArrayType *FPEnvTy = ArrayType::get(ItemTy, 8);
28887 SmallVector<Constant *, 8> FPEnvVals;
28888
28889 // x87 FPU Control Word: masks all floating-point exceptions, sets rounding
28890 // to nearest. FPU precision is set to 53 bits on Windows and to 64 bits
28891 // otherwise, for compatibility with glibc.
28892 unsigned X87CW = Subtarget.isTargetWindowsMSVC() ? 0x27F : 0x37F;
28893 FPEnvVals.push_back(ConstantInt::get(ItemTy, X87CW));
28894 Constant *Zero = ConstantInt::get(ItemTy, 0);
28895 for (unsigned I = 0; I < 6; ++I)
28896 FPEnvVals.push_back(Zero);
28897
28898 // MXCSR: masks all floating-point exceptions, sets rounding to nearest, clears
28899 // all exception flags, and sets DAZ and FTZ to 0.
28900 FPEnvVals.push_back(ConstantInt::get(ItemTy, 0x1F80));
28901 Constant *FPEnvBits = ConstantArray::get(FPEnvTy, FPEnvVals);
28902 MVT PtrVT = DAG.getTargetLoweringInfo().getPointerTy(DAG.getDataLayout());
28903 SDValue Env = DAG.getConstantPool(FPEnvBits, PtrVT);
28904 MachinePointerInfo MPI =
28905 MachinePointerInfo::getConstantPool(DAG.getMachineFunction());
28906 MachineMemOperand *MMO = MF.getMachineMemOperand(
28908
28909 return createSetFPEnvNodes(Env, Chain, DL, MVT::i32, MMO, DAG, Subtarget);
28910}
28911
28912// Generate a GFNI gf2p8affine bitmask for vXi8 bitreverse/shift/rotate.
28913uint64_t getGFNICtrlImm(unsigned Opcode, unsigned Amt = 0) {
28914 assert((Amt < 8) && "Shift/Rotation amount out of range");
28915 switch (Opcode) {
28916 case ISD::BITREVERSE:
28917 return 0x8040201008040201ULL;
28918 case ISD::SHL:
28919 return ((0x0102040810204080ULL >> (Amt)) &
28920 (0x0101010101010101ULL * (0xFF >> (Amt))));
28921 case ISD::SRL:
28922 return ((0x0102040810204080ULL << (Amt)) &
28923 (0x0101010101010101ULL * ((0xFF << (Amt)) & 0xFF)));
28924 case ISD::SRA:
28925 return (getGFNICtrlImm(ISD::SRL, Amt) |
28926 (0x8080808080808080ULL >> (64 - (8 * Amt))));
28927 case ISD::ROTL:
28928 return getGFNICtrlImm(ISD::SRL, 8 - Amt) | getGFNICtrlImm(ISD::SHL, Amt);
28929 case ISD::ROTR:
28930 return getGFNICtrlImm(ISD::SHL, 8 - Amt) | getGFNICtrlImm(ISD::SRL, Amt);
28931 }
28932 llvm_unreachable("Unsupported GFNI opcode");
28933}
28934
28935// Generate a GFNI gf2p8affine bitmask for vXi8 bitreverse/shift/rotate.
28936SDValue getGFNICtrlMask(unsigned Opcode, SelectionDAG &DAG, const SDLoc &DL,
28937 MVT VT, unsigned Amt = 0) {
28938 assert(VT.getVectorElementType() == MVT::i8 &&
28939 (VT.getSizeInBits() % 64) == 0 && "Illegal GFNI control type");
28940 uint64_t Imm = getGFNICtrlImm(Opcode, Amt);
28941 SmallVector<SDValue> MaskBits;
28942 for (unsigned I = 0, E = VT.getSizeInBits(); I != E; I += 8) {
28943 uint64_t Bits = (Imm >> (I % 64)) & 255;
28944 MaskBits.push_back(DAG.getConstant(Bits, DL, MVT::i8));
28945 }
28946 return DAG.getBuildVector(VT, DL, MaskBits);
28947}
28948
28949 /// Lower a vector CTLZ using the natively supported vector CTLZ instruction.
28950 //
28951 // i8/i16 vectors are implemented using the dword LZCNT vector instruction
28952 // ( sub(trunc(lzcnt(zext32(x)))) ). In case zext32(x) is illegal,
28953 // split the vector, perform the operation on its Lo and Hi parts and
28954 // concatenate the results.
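// For example, for a v16i8 element x = 0x10: zext32(x) = 0x00000010,
// lzcnt(0x00000010) = 27, and subtracting the delta 32 - 8 = 24 gives 3,
// which matches ctlz(i8 0x10).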
28955 static SDValue LowerVectorCTLZ_AVX512CDI(SDValue Op, SelectionDAG &DAG,
28956 const X86Subtarget &Subtarget) {
28957 assert(Op.getOpcode() == ISD::CTLZ);
28958 SDLoc dl(Op);
28959 MVT VT = Op.getSimpleValueType();
28960 MVT EltVT = VT.getVectorElementType();
28961 unsigned NumElems = VT.getVectorNumElements();
28962
28963 assert((EltVT == MVT::i8 || EltVT == MVT::i16) &&
28964 "Unsupported element type");
28965
28966 // Split the vector; its Lo and Hi parts will be handled in the next iteration.
28967 if (NumElems > 16 ||
28968 (NumElems == 16 && !Subtarget.canExtendTo512DQ()))
28969 return splitVectorIntUnary(Op, DAG, dl);
28970
28971 MVT NewVT = MVT::getVectorVT(MVT::i32, NumElems);
28972 assert((NewVT.is256BitVector() || NewVT.is512BitVector()) &&
28973 "Unsupported value type for operation");
28974
28975 // Use the natively supported vector instruction vplzcntd.
28976 Op = DAG.getNode(ISD::ZERO_EXTEND, dl, NewVT, Op.getOperand(0));
28977 SDValue CtlzNode = DAG.getNode(ISD::CTLZ, dl, NewVT, Op);
28978 SDValue TruncNode = DAG.getNode(ISD::TRUNCATE, dl, VT, CtlzNode);
28979 SDValue Delta = DAG.getConstant(32 - EltVT.getSizeInBits(), dl, VT);
28980
28981 return DAG.getNode(ISD::SUB, dl, VT, TruncNode, Delta);
28982}
28983
28984// Lower CTLZ using a PSHUFB lookup table implementation.
28985 static SDValue LowerVectorCTLZInRegLUT(SDValue Op, const SDLoc &DL,
28986 const X86Subtarget &Subtarget,
28987 SelectionDAG &DAG) {
28988 MVT VT = Op.getSimpleValueType();
28989 int NumElts = VT.getVectorNumElements();
28990 int NumBytes = NumElts * (VT.getScalarSizeInBits() / 8);
28991 MVT CurrVT = MVT::getVectorVT(MVT::i8, NumBytes);
28992
28993 // Per-nibble leading zero PSHUFB lookup table.
28994 const int LUT[16] = {/* 0 */ 4, /* 1 */ 3, /* 2 */ 2, /* 3 */ 2,
28995 /* 4 */ 1, /* 5 */ 1, /* 6 */ 1, /* 7 */ 1,
28996 /* 8 */ 0, /* 9 */ 0, /* a */ 0, /* b */ 0,
28997 /* c */ 0, /* d */ 0, /* e */ 0, /* f */ 0};
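  // For example, byte 0x1a: hi nibble = 0x1 (LUT = 3, non-zero), so the result
  // is just 3 = ctlz(i8 0x1a). Byte 0x05: hi nibble = 0x0 (LUT = 4, and HiZ is
  // set) plus lo nibble = 0x5 (LUT = 1) gives 4 + 1 = 5 = ctlz(i8 0x05).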
28998
28999 SmallVector<SDValue, 64> LUTVec;
29000 for (int i = 0; i < NumBytes; ++i)
29001 LUTVec.push_back(DAG.getConstant(LUT[i % 16], DL, MVT::i8));
29002 SDValue InRegLUT = DAG.getBuildVector(CurrVT, DL, LUTVec);
29003
29004 // Begin by bitcasting the input to byte vector, then split those bytes
29005 // into lo/hi nibbles and use the PSHUFB LUT to perform CTLZ on each of them.
29006 // If the hi input nibble is zero then we add both results together, otherwise
29007 // we just take the hi result (by masking the lo result to zero before the
29008 // add).
29009 SDValue Op0 = DAG.getBitcast(CurrVT, Op.getOperand(0));
29010 SDValue Zero = DAG.getConstant(0, DL, CurrVT);
29011
29012 SDValue NibbleShift = DAG.getConstant(0x4, DL, CurrVT);
29013 SDValue Lo = Op0;
29014 SDValue Hi = DAG.getNode(ISD::SRL, DL, CurrVT, Op0, NibbleShift);
29015 SDValue HiZ;
29016 if (CurrVT.is512BitVector()) {
29017 MVT MaskVT = MVT::getVectorVT(MVT::i1, CurrVT.getVectorNumElements());
29018 HiZ = DAG.getSetCC(DL, MaskVT, Hi, Zero, ISD::SETEQ);
29019 HiZ = DAG.getNode(ISD::SIGN_EXTEND, DL, CurrVT, HiZ);
29020 } else {
29021 HiZ = DAG.getSetCC(DL, CurrVT, Hi, Zero, ISD::SETEQ);
29022 }
29023
29024 Lo = DAG.getNode(X86ISD::PSHUFB, DL, CurrVT, InRegLUT, Lo);
29025 Hi = DAG.getNode(X86ISD::PSHUFB, DL, CurrVT, InRegLUT, Hi);
29026 Lo = DAG.getNode(ISD::AND, DL, CurrVT, Lo, HiZ);
29027 SDValue Res = DAG.getNode(ISD::ADD, DL, CurrVT, Lo, Hi);
29028
29029 // Merge the result from vXi8 back to VT, working on the lo/hi halves
29030 // of the current vector width in the same way we did for the nibbles.
29031 // If the upper half of the input element is zero then add the halves'
29032 // leading zero counts together, otherwise just use the upper half's.
29033 // Double the width of the result until we are at target width.
29034 while (CurrVT != VT) {
29035 int CurrScalarSizeInBits = CurrVT.getScalarSizeInBits();
29036 int CurrNumElts = CurrVT.getVectorNumElements();
29037 MVT NextSVT = MVT::getIntegerVT(CurrScalarSizeInBits * 2);
29038 MVT NextVT = MVT::getVectorVT(NextSVT, CurrNumElts / 2);
29039 SDValue Shift = DAG.getConstant(CurrScalarSizeInBits, DL, NextVT);
29040
29041 // Check if the upper half of the input element is zero.
29042 if (CurrVT.is512BitVector()) {
29043 MVT MaskVT = MVT::getVectorVT(MVT::i1, CurrVT.getVectorNumElements());
29044 HiZ = DAG.getSetCC(DL, MaskVT, DAG.getBitcast(CurrVT, Op0),
29045 DAG.getBitcast(CurrVT, Zero), ISD::SETEQ);
29046 HiZ = DAG.getNode(ISD::SIGN_EXTEND, DL, CurrVT, HiZ);
29047 } else {
29048 HiZ = DAG.getSetCC(DL, CurrVT, DAG.getBitcast(CurrVT, Op0),
29049 DAG.getBitcast(CurrVT, Zero), ISD::SETEQ);
29050 }
29051 HiZ = DAG.getBitcast(NextVT, HiZ);
29052
29053 // Move the upper/lower halves to the lower bits as we'll be extending to
29054 // NextVT. Mask the lower result to zero if HiZ is true and add the results
29055 // together.
29056 SDValue ResNext = Res = DAG.getBitcast(NextVT, Res);
29057 SDValue R0 = DAG.getNode(ISD::SRL, DL, NextVT, ResNext, Shift);
29058 SDValue R1 = DAG.getNode(ISD::SRL, DL, NextVT, HiZ, Shift);
29059 R1 = DAG.getNode(ISD::AND, DL, NextVT, ResNext, R1);
29060 Res = DAG.getNode(ISD::ADD, DL, NextVT, R0, R1);
29061 CurrVT = NextVT;
29062 }
29063
29064 return Res;
29065}
29066
29067 static SDValue LowerVectorCTLZ(SDValue Op, const SDLoc &DL,
29068 const X86Subtarget &Subtarget,
29069 SelectionDAG &DAG) {
29070 MVT VT = Op.getSimpleValueType();
29071
29072 if (Subtarget.hasCDI() &&
29073 // vXi8 vectors need to be promoted to 512-bits for vXi32.
29074 (Subtarget.canExtendTo512DQ() || VT.getVectorElementType() != MVT::i8))
29075 return LowerVectorCTLZ_AVX512CDI(Op, DAG, Subtarget);
29076
29077 // Decompose 256-bit ops into smaller 128-bit ops.
29078 if (VT.is256BitVector() && !Subtarget.hasInt256())
29079 return splitVectorIntUnary(Op, DAG, DL);
29080
29081 // Decompose 512-bit ops into smaller 256-bit ops.
29082 if (VT.is512BitVector() && !Subtarget.hasBWI())
29083 return splitVectorIntUnary(Op, DAG, DL);
29084
29085 assert(Subtarget.hasSSSE3() && "Expected SSSE3 support for PSHUFB");
29086 return LowerVectorCTLZInRegLUT(Op, DL, Subtarget, DAG);
29087}
29088
29089 static SDValue LowerVectorCTLZ_GFNI(SDValue Op, const SDLoc &DL,
29090 SelectionDAG &DAG,
29091 const X86Subtarget &Subtarget) {
29092 MVT VT = Op.getSimpleValueType();
29093 SDValue Input = Op.getOperand(0);
29094
29095 assert(VT.isVector() && VT.getVectorElementType() == MVT::i8 &&
29096 "Expected vXi8 input for GFNI-based CTLZ lowering");
29097
29098 SDValue Reversed = DAG.getNode(ISD::BITREVERSE, DL, VT, Input);
29099
29100 SDValue Neg = DAG.getNegative(Reversed, DL, VT);
29101 SDValue Filtered = DAG.getNode(ISD::AND, DL, VT, Reversed, Neg);
29102
29103 MVT VT64 = MVT::getVectorVT(MVT::i64, VT.getSizeInBits() / 64);
29104 SDValue CTTZConst = DAG.getConstant(0xAACCF0FF00000000ULL, DL, VT64);
29105 SDValue CTTZMatrix = DAG.getBitcast(VT, CTTZConst);
29106
29107 SDValue LZCNT =
29108 DAG.getNode(X86ISD::GF2P8AFFINEQB, DL, VT, Filtered, CTTZMatrix,
29109 DAG.getTargetConstant(8, DL, MVT::i8));
29110 return LZCNT;
29111}
29112
29113static SDValue LowerCTLZ(SDValue Op, const X86Subtarget &Subtarget,
29114 SelectionDAG &DAG) {
29115 MVT VT = Op.getSimpleValueType();
29116 MVT OpVT = VT;
29117 unsigned NumBits = VT.getSizeInBits();
29118 SDLoc dl(Op);
29119 unsigned Opc = Op.getOpcode();
29120
29121 if (VT.isVector() && VT.getScalarType() == MVT::i8 && Subtarget.hasGFNI())
29122 return LowerVectorCTLZ_GFNI(Op, dl, DAG, Subtarget);
29123
29124 if (VT.isVector())
29125 return LowerVectorCTLZ(Op, dl, Subtarget, DAG);
29126
29127 Op = Op.getOperand(0);
29128 if (VT == MVT::i8) {
29129 // Zero extend to i32 since there is no i8 bsr.
29130 OpVT = MVT::i32;
29131 Op = DAG.getNode(ISD::ZERO_EXTEND, dl, OpVT, Op);
29132 }
29133
29134 // Check if we can safely pass a result through BSR for zero sources.
29135 SDValue PassThru = DAG.getUNDEF(OpVT);
29136 if (Opc == ISD::CTLZ && Subtarget.hasBitScanPassThrough() &&
29137 !DAG.isKnownNeverZero(Op))
29138 PassThru = DAG.getConstant(NumBits + NumBits - 1, dl, OpVT);
29139
29140 // Issue a bsr (scan bits in reverse) which also sets EFLAGS.
29141 SDVTList VTs = DAG.getVTList(OpVT, MVT::i32);
29142 Op = DAG.getNode(X86ISD::BSR, dl, VTs, PassThru, Op);
29143
29144 // Skip CMOV if we're using a pass through value.
29145 if (Opc == ISD::CTLZ && PassThru.isUndef()) {
29146 // If src is zero (i.e. bsr sets ZF), returns NumBits.
29147 SDValue Ops[] = {Op, DAG.getConstant(NumBits + NumBits - 1, dl, OpVT),
29148 DAG.getTargetConstant(X86::COND_E, dl, MVT::i8),
29149 Op.getValue(1)};
29150 Op = DAG.getNode(X86ISD::CMOV, dl, OpVT, Ops);
29151 }
29152
29153 // Finally xor with NumBits-1.
29154 Op = DAG.getNode(ISD::XOR, dl, OpVT, Op,
29155 DAG.getConstant(NumBits - 1, dl, OpVT));
29156
29157 if (VT == MVT::i8)
29158 Op = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, Op);
29159 return Op;
29160}
29161
29162static SDValue LowerCTTZ(SDValue Op, const X86Subtarget &Subtarget,
29163 SelectionDAG &DAG) {
29164 MVT VT = Op.getSimpleValueType();
29165 unsigned NumBits = VT.getScalarSizeInBits();
29166 SDValue N0 = Op.getOperand(0);
29167 SDLoc dl(Op);
29168 bool NonZeroSrc = DAG.isKnownNeverZero(N0);
29169
29170 assert(!VT.isVector() && Op.getOpcode() == ISD::CTTZ &&
29171 "Only scalar CTTZ requires custom lowering");
29172
29173 // Check if we can safely pass a result through BSF for zero sources.
29174 SDValue PassThru = DAG.getUNDEF(VT);
29175 if (!NonZeroSrc && Subtarget.hasBitScanPassThrough())
29176 PassThru = DAG.getConstant(NumBits, dl, VT);
29177
29178 // Issue a bsf (scan bits forward) which also sets EFLAGS.
29179 SDVTList VTs = DAG.getVTList(VT, MVT::i32);
29180 Op = DAG.getNode(X86ISD::BSF, dl, VTs, PassThru, N0);
29181
29182 // Skip CMOV if src is never zero or we're using a pass through value.
29183 if (NonZeroSrc || !PassThru.isUndef())
29184 return Op;
29185
29186 // If src is zero (i.e. bsf sets ZF), returns NumBits.
29187 SDValue Ops[] = {Op, DAG.getConstant(NumBits, dl, VT),
29188 DAG.getTargetConstant(X86::COND_E, dl, MVT::i8),
29189 Op.getValue(1)};
29190 return DAG.getNode(X86ISD::CMOV, dl, VT, Ops);
29191}
29192
29193 static SDValue lowerAddSub(SDValue Op, SelectionDAG &DAG,
29194 const X86Subtarget &Subtarget) {
29195 MVT VT = Op.getSimpleValueType();
29196 SDLoc DL(Op);
29197
29198 if (VT == MVT::i16 || VT == MVT::i32)
29199 return lowerAddSubToHorizontalOp(Op, DL, DAG, Subtarget);
29200
29201 if (VT == MVT::v32i16 || VT == MVT::v64i8)
29202 return splitVectorIntBinary(Op, DAG, DL);
29203
29204 assert(Op.getSimpleValueType().is256BitVector() &&
29205 Op.getSimpleValueType().isInteger() &&
29206 "Only handle AVX 256-bit vector integer operation");
29207 return splitVectorIntBinary(Op, DAG, DL);
29208}
29209
29210 static SDValue LowerADDSAT_SUBSAT(SDValue Op, SelectionDAG &DAG,
29211 const X86Subtarget &Subtarget) {
29212 MVT VT = Op.getSimpleValueType();
29213 SDValue X = Op.getOperand(0), Y = Op.getOperand(1);
29214 unsigned Opcode = Op.getOpcode();
29215 SDLoc DL(Op);
29216
29217 if (VT == MVT::v32i16 || VT == MVT::v64i8 ||
29218 (VT.is256BitVector() && !Subtarget.hasInt256())) {
29219 assert(Op.getSimpleValueType().isInteger() &&
29220 "Only handle AVX vector integer operation");
29221 return splitVectorIntBinary(Op, DAG, DL);
29222 }
29223
29224 // Avoid the generic expansion with min/max if we don't have pminu*/pmaxu*.
29225 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
29226 EVT SetCCResultType =
29227 TLI.getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT);
29228
29229 unsigned BitWidth = VT.getScalarSizeInBits();
29230 if (Opcode == ISD::USUBSAT) {
29231 if (!TLI.isOperationLegal(ISD::UMAX, VT) || useVPTERNLOG(Subtarget, VT)) {
29232 // Handle a special-case with a bit-hack instead of cmp+select:
29233 // usubsat X, SMIN --> (X ^ SMIN) & (X s>> BW-1)
29234 // If the target can use VPTERNLOG, DAGToDAG will match this as
29235 // "vpsra + vpternlog" which is better than "vpmax + vpsub" with a
29236 // "broadcast" constant load.
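      // For example, with i32 Y = SMIN = 0x80000000:
      //   X = 0x80000005: (X ^ SMIN) = 5 and (X s>> 31) = -1, giving 5 = X - SMIN
      //   X = 0x00000003: (X ^ SMIN) = 0x80000003 and (X s>> 31) = 0, giving 0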
29237 ConstantSDNode *C = isConstOrConstSplat(Y, true);
29238 if (C && C->getAPIntValue().isSignMask()) {
29239 SDValue SignMask = DAG.getConstant(C->getAPIntValue(), DL, VT);
29240 SDValue ShiftAmt = DAG.getConstant(BitWidth - 1, DL, VT);
29241 SDValue Xor = DAG.getNode(ISD::XOR, DL, VT, X, SignMask);
29242 SDValue Sra = DAG.getNode(ISD::SRA, DL, VT, X, ShiftAmt);
29243 return DAG.getNode(ISD::AND, DL, VT, Xor, Sra);
29244 }
29245 }
29246 if (!TLI.isOperationLegal(ISD::UMAX, VT)) {
29247 // usubsat X, Y --> (X >u Y) ? X - Y : 0
29248 SDValue Sub = DAG.getNode(ISD::SUB, DL, VT, X, Y);
29249 SDValue Cmp = DAG.getSetCC(DL, SetCCResultType, X, Y, ISD::SETUGT);
29250 // TODO: Move this to DAGCombiner?
29251 if (SetCCResultType == VT &&
29252 DAG.ComputeNumSignBits(Cmp) == VT.getScalarSizeInBits())
29253 return DAG.getNode(ISD::AND, DL, VT, Cmp, Sub);
29254 return DAG.getSelect(DL, VT, Cmp, Sub, DAG.getConstant(0, DL, VT));
29255 }
29256 }
29257
29258 if ((Opcode == ISD::SADDSAT || Opcode == ISD::SSUBSAT) &&
29259 (!VT.isVector() || VT == MVT::v2i64)) {
29260 APInt MinVal = APInt::getSignedMinValue(VT.getScalarSizeInBits());
29261 APInt MaxVal = APInt::getSignedMaxValue(VT.getScalarSizeInBits());
29262 SDValue Zero = DAG.getConstant(0, DL, VT);
29263 SDValue Result =
29264 DAG.getNode(Opcode == ISD::SADDSAT ? ISD::SADDO : ISD::SSUBO, DL,
29265 DAG.getVTList(VT, SetCCResultType), X, Y);
29266 SDValue SumDiff = Result.getValue(0);
29267 SDValue Overflow = Result.getValue(1);
29268 SDValue SatMin = DAG.getConstant(MinVal, DL, VT);
29269 SDValue SatMax = DAG.getConstant(MaxVal, DL, VT);
29270 SDValue SumNeg =
29271 DAG.getSetCC(DL, SetCCResultType, SumDiff, Zero, ISD::SETLT);
29272 Result = DAG.getSelect(DL, VT, SumNeg, SatMax, SatMin);
29273 return DAG.getSelect(DL, VT, Overflow, Result, SumDiff);
29274 }
29275
29276 // Use default expansion.
29277 return SDValue();
29278}
29279
29280static SDValue LowerABS(SDValue Op, const X86Subtarget &Subtarget,
29281 SelectionDAG &DAG) {
29282 MVT VT = Op.getSimpleValueType();
29283 SDLoc DL(Op);
29284
29285 if (VT == MVT::i16 || VT == MVT::i32 || VT == MVT::i64) {
29286 // Since X86 does not have CMOV for 8-bit integer, we don't convert
29287 // 8-bit integer abs to NEG and CMOV.
29288 SDValue N0 = Op.getOperand(0);
29289 SDValue Neg = DAG.getNode(X86ISD::SUB, DL, DAG.getVTList(VT, MVT::i32),
29290 DAG.getConstant(0, DL, VT), N0);
29291 SDValue Ops[] = {N0, Neg, DAG.getTargetConstant(X86::COND_NS, DL, MVT::i8),
29292 SDValue(Neg.getNode(), 1)};
29293 return DAG.getNode(X86ISD::CMOV, DL, VT, Ops);
29294 }
29295
29296 // ABS(vXi64 X) --> VPBLENDVPD(X, 0-X, X).
29297 if ((VT == MVT::v2i64 || VT == MVT::v4i64) && Subtarget.hasSSE41()) {
29298 SDValue Src = Op.getOperand(0);
29299 SDValue Neg = DAG.getNegative(Src, DL, VT);
29300 return DAG.getNode(X86ISD::BLENDV, DL, VT, Src, Neg, Src);
29301 }
29302
29303 if (VT.is256BitVector() && !Subtarget.hasInt256()) {
29304 assert(VT.isInteger() &&
29305 "Only handle AVX 256-bit vector integer operation");
29306 return splitVectorIntUnary(Op, DAG, DL);
29307 }
29308
29309 if ((VT == MVT::v32i16 || VT == MVT::v64i8) && !Subtarget.hasBWI())
29310 return splitVectorIntUnary(Op, DAG, DL);
29311
29312 // Default to expand.
29313 return SDValue();
29314}
29315
29316static SDValue LowerAVG(SDValue Op, const X86Subtarget &Subtarget,
29317 SelectionDAG &DAG) {
29318 MVT VT = Op.getSimpleValueType();
29319 SDLoc DL(Op);
29320
29321 // For AVX1 cases, split to use legal ops.
29322 if (VT.is256BitVector() && !Subtarget.hasInt256())
29323 return splitVectorIntBinary(Op, DAG, DL);
29324
29325 if (VT == MVT::v32i16 || VT == MVT::v64i8)
29326 return splitVectorIntBinary(Op, DAG, DL);
29327
29328 // Default to expand.
29329 return SDValue();
29330}
29331
29332static SDValue LowerMINMAX(SDValue Op, const X86Subtarget &Subtarget,
29333 SelectionDAG &DAG) {
29334 MVT VT = Op.getSimpleValueType();
29335 SDLoc DL(Op);
29336
29337 // For AVX1 cases, split to use legal ops.
29338 if (VT.is256BitVector() && !Subtarget.hasInt256())
29339 return splitVectorIntBinary(Op, DAG, DL);
29340
29341 if (VT == MVT::v32i16 || VT == MVT::v64i8)
29342 return splitVectorIntBinary(Op, DAG, DL);
29343
29344 // Default to expand.
29345 return SDValue();
29346}
29347
29349 SelectionDAG &DAG) {
29350 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
29351 EVT VT = Op.getValueType();
29352 SDValue X = Op.getOperand(0);
29353 SDValue Y = Op.getOperand(1);
29354 SDLoc DL(Op);
29355 bool IsMaxOp =
29356 Op.getOpcode() == ISD::FMAXIMUM || Op.getOpcode() == ISD::FMAXIMUMNUM;
29357 bool IsNum =
29358 Op.getOpcode() == ISD::FMINIMUMNUM || Op.getOpcode() == ISD::FMAXIMUMNUM;
29359 if (Subtarget.hasAVX10_2() && TLI.isTypeLegal(VT)) {
29360 unsigned Opc = 0;
29361 if (VT.isVector())
29362 Opc = X86ISD::VMINMAX;
29363 else if (VT == MVT::f16 || VT == MVT::f32 || VT == MVT::f64)
29364 Opc = X86ISD::VMINMAXS;
29365
29366 if (Opc) {
29367 SDValue Imm =
29368 DAG.getTargetConstant(IsMaxOp + (IsNum ? 16 : 0), DL, MVT::i32);
29369 return DAG.getNode(Opc, DL, VT, X, Y, Imm, Op->getFlags());
29370 }
29371 }
29372
29373 uint64_t SizeInBits = VT.getScalarSizeInBits();
29374 APInt PreferredZero = APInt::getZero(SizeInBits);
29375 APInt OppositeZero = PreferredZero;
29376 EVT IVT = VT.changeTypeToInteger();
29377 X86ISD::NodeType MinMaxOp;
29378 if (IsMaxOp) {
29379 MinMaxOp = X86ISD::FMAX;
29380 OppositeZero.setSignBit();
29381 } else {
29382 PreferredZero.setSignBit();
29383 MinMaxOp = X86ISD::FMIN;
29384 }
29385 EVT SetCCType =
29386 TLI.getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT);
29387
29388 // The tables below show the expected result of Max in cases of NaN and
29389 // signed zeros.
29390 //
29391 // Y Y
29392 // Num xNaN +0 -0
29393 // --------------- ---------------
29394 // Num | Max | Y | +0 | +0 | +0 |
29395 // X --------------- X ---------------
29396 // xNaN | X | X/Y | -0 | +0 | -0 |
29397 // --------------- ---------------
29398 //
29399 // It is achieved by means of FMAX/FMIN with preliminary checks and operand
29400 // reordering.
29401 //
29402 // We check if any of operands is NaN and return NaN. Then we check if any of
29403 // operands is zero or negative zero (for fmaximum and fminimum respectively)
29404 // to ensure the correct zero is returned.
29405 auto MatchesZero = [](SDValue Op, APInt Zero) {
29406 Op = peekThroughBitcasts(Op);
29407 if (auto *CstOp = dyn_cast<ConstantFPSDNode>(Op))
29408 return CstOp->getValueAPF().bitcastToAPInt() == Zero;
29409 if (auto *CstOp = dyn_cast<ConstantSDNode>(Op))
29410 return CstOp->getAPIntValue() == Zero;
29411 if (Op->getOpcode() == ISD::BUILD_VECTOR ||
29412 Op->getOpcode() == ISD::SPLAT_VECTOR) {
29413 for (const SDValue &OpVal : Op->op_values()) {
29414 if (OpVal.isUndef())
29415 continue;
29416 auto *CstOp = dyn_cast<ConstantFPSDNode>(OpVal);
29417 if (!CstOp)
29418 return false;
29419 if (!CstOp->getValueAPF().isZero())
29420 continue;
29421 if (CstOp->getValueAPF().bitcastToAPInt() != Zero)
29422 return false;
29423 }
29424 return true;
29425 }
29426 return false;
29427 };
29428
29429 bool IsXNeverNaN = DAG.isKnownNeverNaN(X);
29430 bool IsYNeverNaN = DAG.isKnownNeverNaN(Y);
29431 bool IgnoreSignedZero = DAG.getTarget().Options.NoSignedZerosFPMath ||
29432 Op->getFlags().hasNoSignedZeros() ||
29433 DAG.isKnownNeverZeroFloat(X) ||
29434 DAG.isKnownNeverZeroFloat(Y);
29435 SDValue NewX, NewY;
29436 if (IgnoreSignedZero || MatchesZero(Y, PreferredZero) ||
29437 MatchesZero(X, OppositeZero)) {
29438 // Operands are already in right order or order does not matter.
29439 NewX = X;
29440 NewY = Y;
29441 } else if (MatchesZero(X, PreferredZero) || MatchesZero(Y, OppositeZero)) {
29442 NewX = Y;
29443 NewY = X;
29444 } else if (!VT.isVector() && (VT == MVT::f16 || Subtarget.hasDQI()) &&
29445 (Op->getFlags().hasNoNaNs() || IsXNeverNaN || IsYNeverNaN)) {
29446 if (IsXNeverNaN)
29447 std::swap(X, Y);
29448 // VFPCLASSS consumes a vector type, so provide a minimal one corresponding
29449 // to an xmm register.
29450 MVT VectorType = MVT::getVectorVT(VT.getSimpleVT(), 128 / SizeInBits);
29451 SDValue VX = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VectorType, X);
29452 // Bits of classes:
29453 // Bits Imm8[0] Imm8[1] Imm8[2] Imm8[3] Imm8[4] Imm8[5] Imm8[6] Imm8[7]
29454 // Class QNAN PosZero NegZero PosINF NegINF Denormal Negative SNAN
29455 SDValue Imm = DAG.getTargetConstant(MinMaxOp == X86ISD::FMAX ? 0b11 : 0b101,
29456 DL, MVT::i32);
29457 SDValue IsNanZero = DAG.getNode(X86ISD::VFPCLASSS, DL, MVT::v1i1, VX, Imm);
29458 SDValue Ins = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, MVT::v8i1,
29459 DAG.getConstant(0, DL, MVT::v8i1), IsNanZero,
29460 DAG.getVectorIdxConstant(0, DL));
29461 SDValue NeedSwap = DAG.getBitcast(MVT::i8, Ins);
29462 NewX = DAG.getSelect(DL, VT, NeedSwap, Y, X);
29463 NewY = DAG.getSelect(DL, VT, NeedSwap, X, Y);
29464 return DAG.getNode(MinMaxOp, DL, VT, NewX, NewY, Op->getFlags());
29465 } else {
29466 SDValue IsXSigned;
29467 if (Subtarget.is64Bit() || VT != MVT::f64) {
29468 SDValue XInt = DAG.getNode(ISD::BITCAST, DL, IVT, X);
29469 SDValue ZeroCst = DAG.getConstant(0, DL, IVT);
29470 IsXSigned = DAG.getSetCC(DL, SetCCType, XInt, ZeroCst, ISD::SETLT);
29471 } else {
29472 assert(VT == MVT::f64);
29473 SDValue Ins = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, MVT::v2f64,
29474 DAG.getConstantFP(0, DL, MVT::v2f64), X,
29475 DAG.getVectorIdxConstant(0, DL));
29476 SDValue VX = DAG.getNode(ISD::BITCAST, DL, MVT::v4f32, Ins);
29477 SDValue Hi = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, VX,
29478 DAG.getVectorIdxConstant(1, DL));
29479 Hi = DAG.getBitcast(MVT::i32, Hi);
29480 SDValue ZeroCst = DAG.getConstant(0, DL, MVT::i32);
29481 EVT SetCCType = TLI.getSetCCResultType(DAG.getDataLayout(),
29482 *DAG.getContext(), MVT::i32);
29483 IsXSigned = DAG.getSetCC(DL, SetCCType, Hi, ZeroCst, ISD::SETLT);
29484 }
29485 if (MinMaxOp == X86ISD::FMAX) {
29486 NewX = DAG.getSelect(DL, VT, IsXSigned, X, Y);
29487 NewY = DAG.getSelect(DL, VT, IsXSigned, Y, X);
29488 } else {
29489 NewX = DAG.getSelect(DL, VT, IsXSigned, Y, X);
29490 NewY = DAG.getSelect(DL, VT, IsXSigned, X, Y);
29491 }
29492 }
29493
29494 bool IgnoreNaN = DAG.getTarget().Options.NoNaNsFPMath ||
29495 Op->getFlags().hasNoNaNs() || (IsXNeverNaN && IsYNeverNaN);
29496
29497 // If we did not reorder the operands for signed-zero handling, we need to
29498 // handle NaN, and we know that one of the operands is not NaN, then:
29499 // - For minimum/maximum, put it in the first operand,
29500 // - For minimumnum/maximumnum, put it in the second operand,
29501 // and we will not need to post-process NaN after the max/min.
29502 if (IgnoreSignedZero && !IgnoreNaN &&
29503 DAG.isKnownNeverNaN(IsNum ? NewX : NewY))
29504 std::swap(NewX, NewY);
29505
29506 SDValue MinMax = DAG.getNode(MinMaxOp, DL, VT, NewX, NewY, Op->getFlags());
29507
29508 if (IgnoreNaN || DAG.isKnownNeverNaN(IsNum ? NewY : NewX))
29509 return MinMax;
29510
29511 if (DAG.isKnownNeverNaN(NewX))
29512 NewX = NewY;
29513
29514 SDValue IsNaN =
29515 DAG.getSetCC(DL, SetCCType, NewX, NewX, IsNum ? ISD::SETO : ISD::SETUO);
29516
29517 return DAG.getSelect(DL, VT, IsNaN, NewX, MinMax);
29518}
29519
29520static SDValue LowerABD(SDValue Op, const X86Subtarget &Subtarget,
29521 SelectionDAG &DAG) {
29522 MVT VT = Op.getSimpleValueType();
29523 SDLoc dl(Op);
29524
29525 // For AVX1 cases, split to use legal ops.
29526 if (VT.is256BitVector() && !Subtarget.hasInt256())
29527 return splitVectorIntBinary(Op, DAG, dl);
29528
29529 if ((VT == MVT::v32i16 || VT == MVT::v64i8) && !Subtarget.useBWIRegs())
29530 return splitVectorIntBinary(Op, DAG, dl);
29531
29532 bool IsSigned = Op.getOpcode() == ISD::ABDS;
29533 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
29534
29535 if (Subtarget.canUseCMOV() && VT.isScalarInteger()) {
29536 X86::CondCode CC = IsSigned ? X86::COND_L : X86::COND_B;
29537 unsigned ExtOpc = IsSigned ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
29538
29539 // abds(lhs, rhs) -> select(slt(lhs,rhs),sub(rhs,lhs),sub(lhs,rhs))
29540 // abdu(lhs, rhs) -> select(ult(lhs,rhs),sub(rhs,lhs),sub(lhs,rhs))
29541 if (VT.bitsGE(MVT::i32)) {
29542 SDVTList VTs = DAG.getVTList(VT, MVT::i32);
29543 SDValue LHS = DAG.getFreeze(Op.getOperand(0));
29544 SDValue RHS = DAG.getFreeze(Op.getOperand(1));
29545 SDValue Diff0 = DAG.getNode(X86ISD::SUB, dl, VTs, LHS, RHS);
29546 SDValue Diff1 = DAG.getNode(X86ISD::SUB, dl, VTs, RHS, LHS);
29547 return DAG.getNode(X86ISD::CMOV, dl, VT, Diff1, Diff0,
29548 DAG.getTargetConstant(CC, dl, MVT::i8),
29549 Diff1.getValue(1));
29550 }
29551
29552 // abds(lhs, rhs) -> trunc(abs(sub(sext(lhs), sext(rhs))))
29553 // abdu(lhs, rhs) -> trunc(abs(sub(zext(lhs), zext(rhs))))
29554 unsigned WideBits = std::max<unsigned>(2 * VT.getScalarSizeInBits(), 32u);
29555 MVT WideVT = MVT::getIntegerVT(WideBits);
29556 if (TLI.isTypeLegal(WideVT)) {
29557 SDVTList WideVTs = DAG.getVTList(WideVT, MVT::i32);
29558 SDValue LHS = DAG.getNode(ExtOpc, dl, WideVT, Op.getOperand(0));
29559 SDValue RHS = DAG.getNode(ExtOpc, dl, WideVT, Op.getOperand(1));
29560 SDValue Diff0 = DAG.getNode(X86ISD::SUB, dl, WideVTs, LHS, RHS);
29561 SDValue Diff1 = DAG.getNode(X86ISD::SUB, dl, WideVTs, RHS, LHS);
29562 SDValue AbsDiff = DAG.getNode(X86ISD::CMOV, dl, WideVT, Diff1, Diff0,
29563 DAG.getTargetConstant(CC, dl, MVT::i8),
29564 Diff1.getValue(1));
29565 return DAG.getNode(ISD::TRUNCATE, dl, VT, AbsDiff);
29566 }
29567 }
29568
29569 // Default to expand.
29570 return SDValue();
29571}
29572
29573static SDValue LowerMUL(SDValue Op, const X86Subtarget &Subtarget,
29574 SelectionDAG &DAG) {
29575 SDLoc dl(Op);
29576 MVT VT = Op.getSimpleValueType();
29577
29578 // Decompose 256-bit ops into 128-bit ops.
29579 if (VT.is256BitVector() && !Subtarget.hasInt256())
29580 return splitVectorIntBinary(Op, DAG, dl);
29581
29582 if ((VT == MVT::v32i16 || VT == MVT::v64i8) && !Subtarget.hasBWI())
29583 return splitVectorIntBinary(Op, DAG, dl);
29584
29585 SDValue A = Op.getOperand(0);
29586 SDValue B = Op.getOperand(1);
29587
29588 // Lower v16i8/v32i8/v64i8 mul as sign-extension to v8i16/v16i16/v32i16
29589 // vector pairs, multiply and truncate.
29590 if (VT == MVT::v16i8 || VT == MVT::v32i8 || VT == MVT::v64i8) {
29591 unsigned NumElts = VT.getVectorNumElements();
29592 unsigned NumLanes = VT.getSizeInBits() / 128;
29593 unsigned NumEltsPerLane = NumElts / NumLanes;
29594
29595 if ((VT == MVT::v16i8 && Subtarget.hasInt256()) ||
29596 (VT == MVT::v32i8 && Subtarget.canExtendTo512BW())) {
29597 MVT ExVT = MVT::getVectorVT(MVT::i16, VT.getVectorNumElements());
29598 return DAG.getNode(
29599 ISD::TRUNCATE, dl, VT,
29600 DAG.getNode(ISD::MUL, dl, ExVT,
29601 DAG.getNode(ISD::ANY_EXTEND, dl, ExVT, A),
29602 DAG.getNode(ISD::ANY_EXTEND, dl, ExVT, B)));
29603 }
29604
29605 MVT ExVT = MVT::getVectorVT(MVT::i16, NumElts / 2);
29606
29607 // For vXi8 mul, try PMADDUBSW to avoid the need for extension.
29608 // Don't do this if we only need to unpack one half.
29609 if (Subtarget.hasSSSE3()) {
29610 bool BIsBuildVector = isa<BuildVectorSDNode>(B);
29611 bool IsLoLaneAllZeroOrUndef = BIsBuildVector;
29612 bool IsHiLaneAllZeroOrUndef = BIsBuildVector;
29613 if (BIsBuildVector) {
29614 for (auto [Idx, Val] : enumerate(B->ops())) {
29615 if ((Idx % NumEltsPerLane) >= (NumEltsPerLane / 2))
29616 IsHiLaneAllZeroOrUndef &= isNullConstantOrUndef(Val);
29617 else
29618 IsLoLaneAllZeroOrUndef &= isNullConstantOrUndef(Val);
29619 }
29620 }
29621 if (!(IsLoLaneAllZeroOrUndef || IsHiLaneAllZeroOrUndef)) {
29622 SDValue Mask = DAG.getBitcast(VT, DAG.getConstant(0x00FF, dl, ExVT));
29623 SDValue BLo = DAG.getNode(ISD::AND, dl, VT, Mask, B);
29624 SDValue BHi = DAG.getNode(X86ISD::ANDNP, dl, VT, Mask, B);
29625 SDValue RLo = DAG.getNode(X86ISD::VPMADDUBSW, dl, ExVT, A, BLo);
29626 SDValue RHi = DAG.getNode(X86ISD::VPMADDUBSW, dl, ExVT, A, BHi);
29627 RLo = DAG.getNode(ISD::AND, dl, VT, DAG.getBitcast(VT, RLo), Mask);
29628 RHi = DAG.getNode(X86ISD::VSHLI, dl, ExVT, RHi,
29629 DAG.getTargetConstant(8, dl, MVT::i8));
29630 return DAG.getNode(ISD::OR, dl, VT, RLo, DAG.getBitcast(VT, RHi));
29631 }
29632 }
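    // A note on the PMADDUBSW trick above: each 16-bit lane of BLo/BHi holds
    // one byte of B and one zero byte, so VPMADDUBSW's "multiply and add
    // adjacent pairs" degenerates to a single byte product per lane. Only the
    // low 8 bits of each product are kept (the mask for RLo, the shift for
    // RHi), and those bits do not depend on the signed/unsigned interpretation
    // of the inputs, so OR-ing the two halves reassembles the vXi8 multiply.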
29633
29634 // Extract the lo/hi parts to any extend to i16.
29635 // We're going to mask off the low byte of each result element of the
29636 // pmullw, so it doesn't matter what's in the high byte of each 16-bit
29637 // element.
29638 SDValue Undef = DAG.getUNDEF(VT);
29639 SDValue ALo = DAG.getBitcast(ExVT, getUnpackl(DAG, dl, VT, A, Undef));
29640 SDValue AHi = DAG.getBitcast(ExVT, getUnpackh(DAG, dl, VT, A, Undef));
29641
29642 SDValue BLo, BHi;
29643 if (ISD::isBuildVectorOfConstantSDNodes(B.getNode())) {
29644 // If the RHS is a constant, manually unpackl/unpackh.
29645 SmallVector<SDValue, 16> LoOps, HiOps;
29646 for (unsigned i = 0; i != NumElts; i += 16) {
29647 for (unsigned j = 0; j != 8; ++j) {
29648 LoOps.push_back(DAG.getAnyExtOrTrunc(B.getOperand(i + j), dl,
29649 MVT::i16));
29650 HiOps.push_back(DAG.getAnyExtOrTrunc(B.getOperand(i + j + 8), dl,
29651 MVT::i16));
29652 }
29653 }
29654
29655 BLo = DAG.getBuildVector(ExVT, dl, LoOps);
29656 BHi = DAG.getBuildVector(ExVT, dl, HiOps);
29657 } else {
29658 BLo = DAG.getBitcast(ExVT, getUnpackl(DAG, dl, VT, B, Undef));
29659 BHi = DAG.getBitcast(ExVT, getUnpackh(DAG, dl, VT, B, Undef));
29660 }
29661
29662 // Multiply, mask the lower 8bits of the lo/hi results and pack.
29663 SDValue RLo = DAG.getNode(ISD::MUL, dl, ExVT, ALo, BLo);
29664 SDValue RHi = DAG.getNode(ISD::MUL, dl, ExVT, AHi, BHi);
29665 return getPack(DAG, Subtarget, dl, VT, RLo, RHi);
29666 }
29667
29668 // Lower v4i32 mul as 2x shuffle, 2x pmuludq, 2x shuffle.
29669 if (VT == MVT::v4i32) {
29670 assert(Subtarget.hasSSE2() && !Subtarget.hasSSE41() &&
29671 "Should not custom lower when pmulld is available!");
29672
29673 // Extract the odd parts.
29674 static const int UnpackMask[] = {1, 1, 3, 3};
29675 SDValue Aodds = DAG.getVectorShuffle(VT, dl, A, A, UnpackMask);
29676 SDValue Bodds = DAG.getVectorShuffle(VT, dl, B, B, UnpackMask);
29677
29678 // Multiply the even parts.
29679 SDValue Evens = DAG.getNode(X86ISD::PMULUDQ, dl, MVT::v2i64,
29680 DAG.getBitcast(MVT::v2i64, A),
29681 DAG.getBitcast(MVT::v2i64, B));
29682 // Now multiply odd parts.
29683 SDValue Odds = DAG.getNode(X86ISD::PMULUDQ, dl, MVT::v2i64,
29684 DAG.getBitcast(MVT::v2i64, Aodds),
29685 DAG.getBitcast(MVT::v2i64, Bodds));
29686
29687 Evens = DAG.getBitcast(VT, Evens);
29688 Odds = DAG.getBitcast(VT, Odds);
29689
29690 // Merge the two vectors back together with a shuffle. This expands into 2
29691 // shuffles.
29692 static const int ShufMask[] = { 0, 4, 2, 6 };
29693 return DAG.getVectorShuffle(VT, dl, Evens, Odds, ShufMask);
29694 }
29695
29696 assert((VT == MVT::v2i64 || VT == MVT::v4i64 || VT == MVT::v8i64) &&
29697 "Only know how to lower V2I64/V4I64/V8I64 multiply");
29698 assert(!Subtarget.hasDQI() && "DQI should use MULLQ");
29699
29700 // Ahi = psrlqi(a, 32);
29701 // Bhi = psrlqi(b, 32);
29702 //
29703 // AloBlo = pmuludq(a, b);
29704 // AloBhi = pmuludq(a, Bhi);
29705 // AhiBlo = pmuludq(Ahi, b);
29706 //
29707 // Hi = psllqi(AloBhi + AhiBlo, 32);
29708 // return AloBlo + Hi;
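  // This follows from writing each 64-bit operand as Lo + 2^32 * Hi:
  //   a * b = aLo*bLo + 2^32 * (aLo*bHi + aHi*bLo) + 2^64 * aHi*bHi
  // The last term vanishes modulo 2^64, leaving the three PMULUDQ products
  // combined above.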
29709 KnownBits AKnown = DAG.computeKnownBits(A);
29710 KnownBits BKnown = DAG.computeKnownBits(B);
29711
29712 APInt LowerBitsMask = APInt::getLowBitsSet(64, 32);
29713 bool ALoIsZero = LowerBitsMask.isSubsetOf(AKnown.Zero);
29714 bool BLoIsZero = LowerBitsMask.isSubsetOf(BKnown.Zero);
29715
29716 APInt UpperBitsMask = APInt::getHighBitsSet(64, 32);
29717 bool AHiIsZero = UpperBitsMask.isSubsetOf(AKnown.Zero);
29718 bool BHiIsZero = UpperBitsMask.isSubsetOf(BKnown.Zero);
29719
29720 SDValue Zero = DAG.getConstant(0, dl, VT);
29721
29722 // Only multiply lo/hi halves that aren't known to be zero.
29723 SDValue AloBlo = Zero;
29724 if (!ALoIsZero && !BLoIsZero)
29725 AloBlo = DAG.getNode(X86ISD::PMULUDQ, dl, VT, A, B);
29726
29727 SDValue AloBhi = Zero;
29728 if (!ALoIsZero && !BHiIsZero) {
29729 SDValue Bhi = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, VT, B, 32, DAG);
29730 AloBhi = DAG.getNode(X86ISD::PMULUDQ, dl, VT, A, Bhi);
29731 }
29732
29733 SDValue AhiBlo = Zero;
29734 if (!AHiIsZero && !BLoIsZero) {
29735 SDValue Ahi = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, VT, A, 32, DAG);
29736 AhiBlo = DAG.getNode(X86ISD::PMULUDQ, dl, VT, Ahi, B);
29737 }
29738
29739 SDValue Hi = DAG.getNode(ISD::ADD, dl, VT, AloBhi, AhiBlo);
29740 Hi = getTargetVShiftByConstNode(X86ISD::VSHLI, dl, VT, Hi, 32, DAG);
29741
29742 return DAG.getNode(ISD::ADD, dl, VT, AloBlo, Hi);
29743}
29744
29745 static SDValue LowervXi8MulWithUNPCK(SDValue A, SDValue B, const SDLoc &dl,
29746 MVT VT, bool IsSigned,
29747 const X86Subtarget &Subtarget,
29748 SelectionDAG &DAG,
29749 SDValue *Low = nullptr) {
29750 unsigned NumElts = VT.getVectorNumElements();
29751
29752 // For vXi8 we will unpack the low and high half of each 128 bit lane to widen
29753 // to a vXi16 type. Do the multiplies, shift the results and pack the half
29754 // lane results back together.
29755
29756 // We'll take different approaches for signed and unsigned.
29757 // For unsigned we'll use punpcklbw/punpckhbw to zero extend the bytes
29758 // and use pmullw to calculate the full 16-bit product.
29759 // For signed we'll use punpcklbw/punpckhbw to extend the bytes to words and
29760 // shift them left into the upper byte of each word. This allows us to use
29761 // pmulhw to calculate the full 16-bit product. This trick means we don't
29762 // need to sign extend the bytes to use pmullw.
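  // Why the signed trick works: placing the bytes in the upper byte of each
  // word forms (a << 8) and (b << 8), and pmulhw returns the top 16 bits of
  // the 32-bit signed product, i.e. ((a << 8) * (b << 8)) >> 16 = a * b, the
  // exact 16-bit signed product of the two bytes.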
29763
29764 MVT ExVT = MVT::getVectorVT(MVT::i16, NumElts / 2);
29765 SDValue Zero = DAG.getConstant(0, dl, VT);
29766
29767 SDValue ALo, AHi;
29768 if (IsSigned) {
29769 ALo = DAG.getBitcast(ExVT, getUnpackl(DAG, dl, VT, Zero, A));
29770 AHi = DAG.getBitcast(ExVT, getUnpackh(DAG, dl, VT, Zero, A));
29771 } else {
29772 ALo = DAG.getBitcast(ExVT, getUnpackl(DAG, dl, VT, A, Zero));
29773 AHi = DAG.getBitcast(ExVT, getUnpackh(DAG, dl, VT, A, Zero));
29774 }
29775
29776 SDValue BLo, BHi;
29777 if (ISD::isBuildVectorOfConstantSDNodes(B.getNode())) {
29778 // If the RHS is a constant, manually unpackl/unpackh and extend.
29779 SmallVector<SDValue, 16> LoOps, HiOps;
29780 for (unsigned i = 0; i != NumElts; i += 16) {
29781 for (unsigned j = 0; j != 8; ++j) {
29782 SDValue LoOp = B.getOperand(i + j);
29783 SDValue HiOp = B.getOperand(i + j + 8);
29784
29785 if (IsSigned) {
29786 LoOp = DAG.getAnyExtOrTrunc(LoOp, dl, MVT::i16);
29787 HiOp = DAG.getAnyExtOrTrunc(HiOp, dl, MVT::i16);
29788 LoOp = DAG.getNode(ISD::SHL, dl, MVT::i16, LoOp,
29789 DAG.getConstant(8, dl, MVT::i16));
29790 HiOp = DAG.getNode(ISD::SHL, dl, MVT::i16, HiOp,
29791 DAG.getConstant(8, dl, MVT::i16));
29792 } else {
29793 LoOp = DAG.getZExtOrTrunc(LoOp, dl, MVT::i16);
29794 HiOp = DAG.getZExtOrTrunc(HiOp, dl, MVT::i16);
29795 }
29796
29797 LoOps.push_back(LoOp);
29798 HiOps.push_back(HiOp);
29799 }
29800 }
29801
29802 BLo = DAG.getBuildVector(ExVT, dl, LoOps);
29803 BHi = DAG.getBuildVector(ExVT, dl, HiOps);
29804 } else if (IsSigned) {
29805 BLo = DAG.getBitcast(ExVT, getUnpackl(DAG, dl, VT, Zero, B));
29806 BHi = DAG.getBitcast(ExVT, getUnpackh(DAG, dl, VT, Zero, B));
29807 } else {
29808 BLo = DAG.getBitcast(ExVT, getUnpackl(DAG, dl, VT, B, Zero));
29809 BHi = DAG.getBitcast(ExVT, getUnpackh(DAG, dl, VT, B, Zero));
29810 }
29811
29812 // Multiply, lshr the upper 8bits to the lower 8bits of the lo/hi results and
29813 // pack back to vXi8.
29814 unsigned MulOpc = IsSigned ? ISD::MULHS : ISD::MUL;
29815 SDValue RLo = DAG.getNode(MulOpc, dl, ExVT, ALo, BLo);
29816 SDValue RHi = DAG.getNode(MulOpc, dl, ExVT, AHi, BHi);
29817
29818 if (Low)
29819 *Low = getPack(DAG, Subtarget, dl, VT, RLo, RHi);
29820
29821 return getPack(DAG, Subtarget, dl, VT, RLo, RHi, /*PackHiHalf*/ true);
29822}
29823
29824static SDValue LowerMULH(SDValue Op, const X86Subtarget &Subtarget,
29825 SelectionDAG &DAG) {
29826 SDLoc dl(Op);
29827 MVT VT = Op.getSimpleValueType();
29828 bool IsSigned = Op->getOpcode() == ISD::MULHS;
29829 unsigned NumElts = VT.getVectorNumElements();
29830 SDValue A = Op.getOperand(0);
29831 SDValue B = Op.getOperand(1);
29832
29833 // Decompose 256-bit ops into 128-bit ops.
29834 if (VT.is256BitVector() && !Subtarget.hasInt256())
29835 return splitVectorIntBinary(Op, DAG, dl);
29836
29837 if ((VT == MVT::v32i16 || VT == MVT::v64i8) && !Subtarget.hasBWI())
29838 return splitVectorIntBinary(Op, DAG, dl);
29839
29840 if (VT == MVT::v4i32 || VT == MVT::v8i32 || VT == MVT::v16i32) {
29841 assert((VT == MVT::v4i32 && Subtarget.hasSSE2()) ||
29842 (VT == MVT::v8i32 && Subtarget.hasInt256()) ||
29843 (VT == MVT::v16i32 && Subtarget.hasAVX512()));
29844
29845 // PMULxD operations multiply each even value (starting at 0) of LHS with
29846 // the related value of RHS and produce a widened result.
29847 // E.g., PMULUDQ <4 x i32> <a|b|c|d>, <4 x i32> <e|f|g|h>
29848 // => <2 x i64> <ae|cg>
29849 //
29850 // In other words, to have all the results, we need to perform two PMULxD:
29851 // 1. one with the even values.
29852 // 2. one with the odd values.
29853 // To achieve #2, we need to place the odd values at an even position.
29854 //
29855 // Place the odd value at an even position (basically, shift all values 1
29856 // step to the left):
29857 const int Mask[] = {1, -1, 3, -1, 5, -1, 7, -1,
29858 9, -1, 11, -1, 13, -1, 15, -1};
29859 // <a|b|c|d> => <b|undef|d|undef>
29860 SDValue Odd0 =
29861 DAG.getVectorShuffle(VT, dl, A, A, ArrayRef(&Mask[0], NumElts));
29862 // <e|f|g|h> => <f|undef|h|undef>
29863 SDValue Odd1 =
29864 DAG.getVectorShuffle(VT, dl, B, B, ArrayRef(&Mask[0], NumElts));
29865
29866 // Emit two multiplies, one for the lower 2 ints and one for the higher 2
29867 // ints.
29868 MVT MulVT = MVT::getVectorVT(MVT::i64, NumElts / 2);
29869 unsigned Opcode =
29870 (IsSigned && Subtarget.hasSSE41()) ? X86ISD::PMULDQ : X86ISD::PMULUDQ;
29871 // PMULUDQ <4 x i32> <a|b|c|d>, <4 x i32> <e|f|g|h>
29872 // => <2 x i64> <ae|cg>
29873 SDValue Mul1 = DAG.getBitcast(VT, DAG.getNode(Opcode, dl, MulVT,
29874 DAG.getBitcast(MulVT, A),
29875 DAG.getBitcast(MulVT, B)));
29876 // PMULUDQ <4 x i32> <b|undef|d|undef>, <4 x i32> <f|undef|h|undef>
29877 // => <2 x i64> <bf|dh>
29878 SDValue Mul2 = DAG.getBitcast(VT, DAG.getNode(Opcode, dl, MulVT,
29879 DAG.getBitcast(MulVT, Odd0),
29880 DAG.getBitcast(MulVT, Odd1)));
29881
29882 // Shuffle it back into the right order.
29883 SmallVector<int, 16> ShufMask(NumElts);
29884 for (int i = 0; i != (int)NumElts; ++i)
29885 ShufMask[i] = (i / 2) * 2 + ((i % 2) * NumElts) + 1;
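    // For v4i32 (NumElts = 4), the bitcast PMULxD results hold
    // <lo(ae), hi(ae), lo(cg), hi(cg)> and <lo(bf), hi(bf), lo(dh), hi(dh)>,
    // and this mask evaluates to <1, 5, 3, 7>, selecting
    // <hi(ae), hi(bf), hi(cg), hi(dh)>, which is exactly the MULH result.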
29886
29887 SDValue Res = DAG.getVectorShuffle(VT, dl, Mul1, Mul2, ShufMask);
29888
29889 // If we have a signed multiply but no PMULDQ fix up the result of an
29890 // unsigned multiply.
29891 if (IsSigned && !Subtarget.hasSSE41()) {
29892 SDValue Zero = DAG.getConstant(0, dl, VT);
29893 SDValue T1 = DAG.getNode(ISD::AND, dl, VT,
29894 DAG.getSetCC(dl, VT, Zero, A, ISD::SETGT), B);
29895 SDValue T2 = DAG.getNode(ISD::AND, dl, VT,
29896 DAG.getSetCC(dl, VT, Zero, B, ISD::SETGT), A);
29897
29898 SDValue Fixup = DAG.getNode(ISD::ADD, dl, VT, T1, T2);
29899 Res = DAG.getNode(ISD::SUB, dl, VT, Res, Fixup);
29900 }
29901
29902 return Res;
29903 }
29904
29905 // Only i8 vectors should need custom lowering after this.
29906 assert((VT == MVT::v16i8 || (VT == MVT::v32i8 && Subtarget.hasInt256()) ||
29907 (VT == MVT::v64i8 && Subtarget.hasBWI())) &&
29908 "Unsupported vector type");
29909
29910 // Lower v16i8/v32i8 as extension to v8i16/v16i16 vector pairs, multiply,
29911 // logical shift down the upper half and pack back to i8.
29912
29913 // With SSE41 we can use sign/zero extend, but for pre-SSE41 we unpack
29914 // and then ashr/lshr the upper bits down to the lower bits before multiply.
29915
29916 if ((VT == MVT::v16i8 && Subtarget.hasInt256()) ||
29917 (VT == MVT::v32i8 && Subtarget.canExtendTo512BW())) {
29918 MVT ExVT = MVT::getVectorVT(MVT::i16, NumElts);
29919 unsigned ExAVX = IsSigned ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
29920 SDValue ExA = DAG.getNode(ExAVX, dl, ExVT, A);
29921 SDValue ExB = DAG.getNode(ExAVX, dl, ExVT, B);
29922 SDValue Mul = DAG.getNode(ISD::MUL, dl, ExVT, ExA, ExB);
29923 Mul = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, ExVT, Mul, 8, DAG);
29924 return DAG.getNode(ISD::TRUNCATE, dl, VT, Mul);
29925 }
29926
29927 return LowervXi8MulWithUNPCK(A, B, dl, VT, IsSigned, Subtarget, DAG);
29928}
29929
29930// Custom lowering for SMULO/UMULO.
29931static SDValue LowerMULO(SDValue Op, const X86Subtarget &Subtarget,
29932 SelectionDAG &DAG) {
29933 MVT VT = Op.getSimpleValueType();
29934
29935 // Scalars defer to LowerXALUO.
29936 if (!VT.isVector())
29937 return LowerXALUO(Op, DAG);
29938
29939 SDLoc dl(Op);
29940 bool IsSigned = Op->getOpcode() == ISD::SMULO;
29941 SDValue A = Op.getOperand(0);
29942 SDValue B = Op.getOperand(1);
29943 EVT OvfVT = Op->getValueType(1);
29944
29945 if ((VT == MVT::v32i8 && !Subtarget.hasInt256()) ||
29946 (VT == MVT::v64i8 && !Subtarget.hasBWI())) {
29947 // Extract the LHS Lo/Hi vectors
29948 SDValue LHSLo, LHSHi;
29949 std::tie(LHSLo, LHSHi) = splitVector(A, DAG, dl);
29950
29951 // Extract the RHS Lo/Hi vectors
29952 SDValue RHSLo, RHSHi;
29953 std::tie(RHSLo, RHSHi) = splitVector(B, DAG, dl);
29954
29955 EVT LoOvfVT, HiOvfVT;
29956 std::tie(LoOvfVT, HiOvfVT) = DAG.GetSplitDestVTs(OvfVT);
29957 SDVTList LoVTs = DAG.getVTList(LHSLo.getValueType(), LoOvfVT);
29958 SDVTList HiVTs = DAG.getVTList(LHSHi.getValueType(), HiOvfVT);
29959
29960 // Issue the split operations.
29961 SDValue Lo = DAG.getNode(Op.getOpcode(), dl, LoVTs, LHSLo, RHSLo);
29962 SDValue Hi = DAG.getNode(Op.getOpcode(), dl, HiVTs, LHSHi, RHSHi);
29963
29964 // Join the separate data results and the overflow results.
29965 SDValue Res = DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, Lo, Hi);
29966 SDValue Ovf = DAG.getNode(ISD::CONCAT_VECTORS, dl, OvfVT, Lo.getValue(1),
29967 Hi.getValue(1));
29968
29969 return DAG.getMergeValues({Res, Ovf}, dl);
29970 }
29971
29972 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
29973 EVT SetccVT =
29974 TLI.getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT);
29975
29976 if ((VT == MVT::v16i8 && Subtarget.hasInt256()) ||
29977 (VT == MVT::v32i8 && Subtarget.canExtendTo512BW())) {
29978 unsigned NumElts = VT.getVectorNumElements();
29979 MVT ExVT = MVT::getVectorVT(MVT::i16, NumElts);
29980 unsigned ExAVX = IsSigned ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
29981 SDValue ExA = DAG.getNode(ExAVX, dl, ExVT, A);
29982 SDValue ExB = DAG.getNode(ExAVX, dl, ExVT, B);
29983 SDValue Mul = DAG.getNode(ISD::MUL, dl, ExVT, ExA, ExB);
29984
29985 SDValue Low = DAG.getNode(ISD::TRUNCATE, dl, VT, Mul);
29986
29987 SDValue Ovf;
29988 if (IsSigned) {
29989 SDValue High, LowSign;
29990 if (OvfVT.getVectorElementType() == MVT::i1 &&
29991 (Subtarget.hasBWI() || Subtarget.canExtendTo512DQ())) {
29992 // Rather than truncating, try to do the compare on vXi16 or vXi32.
29993 // Shift the high down filling with sign bits.
29994 High = getTargetVShiftByConstNode(X86ISD::VSRAI, dl, ExVT, Mul, 8, DAG);
29995 // Fill all 16 bits with the sign bit from the low.
29996 LowSign =
29997 getTargetVShiftByConstNode(X86ISD::VSHLI, dl, ExVT, Mul, 8, DAG);
29998 LowSign = getTargetVShiftByConstNode(X86ISD::VSRAI, dl, ExVT, LowSign,
29999 15, DAG);
30000 SetccVT = OvfVT;
30001 if (!Subtarget.hasBWI()) {
30002 // We can't do a vXi16 compare so sign extend to v16i32.
30003 High = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v16i32, High);
30004 LowSign = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v16i32, LowSign);
30005 }
30006 } else {
30007 // Otherwise do the compare at vXi8.
30008 High = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, ExVT, Mul, 8, DAG);
30009 High = DAG.getNode(ISD::TRUNCATE, dl, VT, High);
30010 LowSign =
30011 DAG.getNode(ISD::SRA, dl, VT, Low, DAG.getConstant(7, dl, VT));
30012 }
30013
30014 Ovf = DAG.getSetCC(dl, SetccVT, LowSign, High, ISD::SETNE);
30015 } else {
30016 SDValue High =
30017 getTargetVShiftByConstNode(X86ISD::VSRLI, dl, ExVT, Mul, 8, DAG);
30018 if (OvfVT.getVectorElementType() == MVT::i1 &&
30019 (Subtarget.hasBWI() || Subtarget.canExtendTo512DQ())) {
30020 // Rather than truncating, try to do the compare on vXi16 or vXi32.
30021 SetccVT = OvfVT;
30022 if (!Subtarget.hasBWI()) {
30023 // We can't do a vXi16 compare so sign extend to v16i32.
30024 High = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::v16i32, High);
30025 }
30026 } else {
30027 // Otherwise do the compare at vXi8.
30028 High = DAG.getNode(ISD::TRUNCATE, dl, VT, High);
30029 }
30030
30031 Ovf =
30032 DAG.getSetCC(dl, SetccVT, High,
30033 DAG.getConstant(0, dl, High.getValueType()), ISD::SETNE);
30034 }
30035
30036 Ovf = DAG.getSExtOrTrunc(Ovf, dl, OvfVT);
30037
30038 return DAG.getMergeValues({Low, Ovf}, dl);
30039 }
30040
30041 SDValue Low;
30042 SDValue High =
30043 LowervXi8MulWithUNPCK(A, B, dl, VT, IsSigned, Subtarget, DAG, &Low);
30044
30045 SDValue Ovf;
30046 if (IsSigned) {
30047 // SMULO overflows if the high bits don't match the sign of the low.
30048 SDValue LowSign =
30049 DAG.getNode(ISD::SRA, dl, VT, Low, DAG.getConstant(7, dl, VT));
30050 Ovf = DAG.getSetCC(dl, SetccVT, LowSign, High, ISD::SETNE);
30051 } else {
30052 // UMULO overflows if the high bits are non-zero.
30053 Ovf =
30054 DAG.getSetCC(dl, SetccVT, High, DAG.getConstant(0, dl, VT), ISD::SETNE);
30055 }
30056
30057 Ovf = DAG.getSExtOrTrunc(Ovf, dl, OvfVT);
30058
30059 return DAG.getMergeValues({Low, Ovf}, dl);
30060}
30061
30062SDValue X86TargetLowering::LowerWin64_i128OP(SDValue Op, SelectionDAG &DAG) const {
30063 assert(Subtarget.isTargetWin64() && "Unexpected target");
30064 EVT VT = Op.getValueType();
30065 assert(VT.isInteger() && VT.getSizeInBits() == 128 &&
30066 "Unexpected return type for lowering");
30067
30068 if (isa<ConstantSDNode>(Op->getOperand(1))) {
30069 SmallVector<SDValue> Result;
30070 if (expandDIVREMByConstant(Op.getNode(), Result, MVT::i64, DAG))
30071 return DAG.getNode(ISD::BUILD_PAIR, SDLoc(Op), VT, Result[0], Result[1]);
30072 }
30073
30074 RTLIB::Libcall LC;
30075 bool isSigned;
30076 switch (Op->getOpcode()) {
30077 // clang-format off
30078 default: llvm_unreachable("Unexpected request for libcall!");
30079 case ISD::SDIV: isSigned = true; LC = RTLIB::SDIV_I128; break;
30080 case ISD::UDIV: isSigned = false; LC = RTLIB::UDIV_I128; break;
30081 case ISD::SREM: isSigned = true; LC = RTLIB::SREM_I128; break;
30082 case ISD::UREM: isSigned = false; LC = RTLIB::UREM_I128; break;
30083 // clang-format on
30084 }
30085
30086 SDLoc dl(Op);
30087 SDValue InChain = DAG.getEntryNode();
30088
30089 TargetLowering::ArgListTy Args;
30090 for (unsigned i = 0, e = Op->getNumOperands(); i != e; ++i) {
30091 EVT ArgVT = Op->getOperand(i).getValueType();
30092 assert(ArgVT.isInteger() && ArgVT.getSizeInBits() == 128 &&
30093 "Unexpected argument type for lowering");
30094 SDValue StackPtr = DAG.CreateStackTemporary(ArgVT, 16);
30095 int SPFI = cast<FrameIndexSDNode>(StackPtr.getNode())->getIndex();
30096 MachinePointerInfo MPI =
30097 MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SPFI);
30098 InChain =
30099 DAG.getStore(InChain, dl, Op->getOperand(i), StackPtr, MPI, Align(16));
30100 Args.emplace_back(StackPtr, PointerType::get(*DAG.getContext(), 0));
30101 }
30102
30103 SDValue Callee = DAG.getExternalSymbol(getLibcallName(LC),
30104 getPointerTy(DAG.getDataLayout()));
30105
30106 TargetLowering::CallLoweringInfo CLI(DAG);
30107 CLI.setDebugLoc(dl)
30108 .setChain(InChain)
30109 .setLibCallee(
30110 getLibcallCallingConv(LC),
30111 static_cast<EVT>(MVT::v2i64).getTypeForEVT(*DAG.getContext()), Callee,
30112 std::move(Args))
30113 .setInRegister()
30114 .setSExtResult(isSigned)
30115 .setZExtResult(!isSigned);
30116
30117 std::pair<SDValue, SDValue> CallInfo = LowerCallTo(CLI);
30118 return DAG.getBitcast(VT, CallInfo.first);
30119}
30120
30121SDValue X86TargetLowering::LowerWin64_FP_TO_INT128(SDValue Op,
30122 SelectionDAG &DAG,
30123 SDValue &Chain) const {
30124 assert(Subtarget.isTargetWin64() && "Unexpected target");
30125 EVT VT = Op.getValueType();
30126 bool IsStrict = Op->isStrictFPOpcode();
30127
30128 SDValue Arg = Op.getOperand(IsStrict ? 1 : 0);
30129 EVT ArgVT = Arg.getValueType();
30130
30131 assert(VT.isInteger() && VT.getSizeInBits() == 128 &&
30132 "Unexpected return type for lowering");
30133
30134 RTLIB::Libcall LC;
30135 if (Op->getOpcode() == ISD::FP_TO_SINT ||
30136 Op->getOpcode() == ISD::STRICT_FP_TO_SINT)
30137 LC = RTLIB::getFPTOSINT(ArgVT, VT);
30138 else
30139 LC = RTLIB::getFPTOUINT(ArgVT, VT);
30140 assert(LC != RTLIB::UNKNOWN_LIBCALL && "Unexpected request for libcall!");
30141
30142 SDLoc dl(Op);
30143 MakeLibCallOptions CallOptions;
30144 Chain = IsStrict ? Op.getOperand(0) : DAG.getEntryNode();
30145
30146 SDValue Result;
30147 // Expect the i128 argument returned as a v2i64 in xmm0, cast back to the
30148 // expected VT (i128).
30149 std::tie(Result, Chain) =
30150 makeLibCall(DAG, LC, MVT::v2i64, Arg, CallOptions, dl, Chain);
30151 Result = DAG.getBitcast(VT, Result);
30152 return Result;
30153}
30154
30155SDValue X86TargetLowering::LowerWin64_INT128_TO_FP(SDValue Op,
30156 SelectionDAG &DAG) const {
30157 assert(Subtarget.isTargetWin64() && "Unexpected target");
30158 EVT VT = Op.getValueType();
30159 bool IsStrict = Op->isStrictFPOpcode();
30160
30161 SDValue Arg = Op.getOperand(IsStrict ? 1 : 0);
30162 EVT ArgVT = Arg.getValueType();
30163
30164 assert(ArgVT.isInteger() && ArgVT.getSizeInBits() == 128 &&
30165 "Unexpected argument type for lowering");
30166
30167 RTLIB::Libcall LC;
30168 if (Op->getOpcode() == ISD::SINT_TO_FP ||
30169 Op->getOpcode() == ISD::STRICT_SINT_TO_FP)
30170 LC = RTLIB::getSINTTOFP(ArgVT, VT);
30171 else
30172 LC = RTLIB::getUINTTOFP(ArgVT, VT);
30173 assert(LC != RTLIB::UNKNOWN_LIBCALL && "Unexpected request for libcall!");
30174
30175 SDLoc dl(Op);
30176 MakeLibCallOptions CallOptions;
30177 SDValue Chain = IsStrict ? Op.getOperand(0) : DAG.getEntryNode();
30178
30179 // Pass the i128 argument as an indirect argument on the stack.
30180 SDValue StackPtr = DAG.CreateStackTemporary(ArgVT, 16);
30181 int SPFI = cast<FrameIndexSDNode>(StackPtr.getNode())->getIndex();
30182 MachinePointerInfo MPI =
30183 MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SPFI);
30184 Chain = DAG.getStore(Chain, dl, Arg, StackPtr, MPI, Align(16));
30185
30186 SDValue Result;
30187 std::tie(Result, Chain) =
30188 makeLibCall(DAG, LC, VT, StackPtr, CallOptions, dl, Chain);
30189 return IsStrict ? DAG.getMergeValues({Result, Chain}, dl) : Result;
30190}
30191
30192// Return true if the required (according to Opcode) shift-imm form is natively
30193// supported by the Subtarget
30194static bool supportedVectorShiftWithImm(EVT VT, const X86Subtarget &Subtarget,
30195 unsigned Opcode) {
30196 assert((Opcode == ISD::SHL || Opcode == ISD::SRA || Opcode == ISD::SRL) &&
30197 "Unexpected shift opcode");
30198
30199 if (!VT.isSimple())
30200 return false;
30201
30202 if (!(VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector()))
30203 return false;
30204
30205 if (VT.getScalarSizeInBits() < 16)
30206 return false;
30207
30208 if (VT.is512BitVector() && Subtarget.useAVX512Regs() &&
30209 (VT.getScalarSizeInBits() > 16 || Subtarget.hasBWI()))
30210 return true;
30211
30212 bool LShift = (VT.is128BitVector() && Subtarget.hasSSE2()) ||
30213 (VT.is256BitVector() && Subtarget.hasInt256());
30214
30215 bool AShift = LShift && (Subtarget.hasAVX512() ||
30216 (VT != MVT::v2i64 && VT != MVT::v4i64));
30217 return (Opcode == ISD::SRA) ? AShift : LShift;
30218}
30219
30220// The shift amount is a variable, but it is the same for all vector lanes.
30221// These instructions are defined together with shift-immediate.
30222 static
30223 bool supportedVectorShiftWithBaseAmnt(EVT VT, const X86Subtarget &Subtarget,
30224 unsigned Opcode) {
30225 return supportedVectorShiftWithImm(VT, Subtarget, Opcode);
30226}
30227
30228// Return true if the required (according to Opcode) variable-shift form is
30229// natively supported by the Subtarget
30230static bool supportedVectorVarShift(EVT VT, const X86Subtarget &Subtarget,
30231 unsigned Opcode) {
30232 assert((Opcode == ISD::SHL || Opcode == ISD::SRA || Opcode == ISD::SRL) &&
30233 "Unexpected shift opcode");
30234
30235 if (!VT.isSimple())
30236 return false;
30237
30238 if (!(VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector()))
30239 return false;
30240
30241 if (!Subtarget.hasInt256() || VT.getScalarSizeInBits() < 16)
30242 return false;
30243
30244 // vXi16 supported only on AVX-512, BWI
30245 if (VT.getScalarSizeInBits() == 16 && !Subtarget.hasBWI())
30246 return false;
30247
30248 if (Subtarget.hasAVX512() &&
30249 (Subtarget.useAVX512Regs() || !VT.is512BitVector()))
30250 return true;
30251
30252 bool LShift = VT.is128BitVector() || VT.is256BitVector();
30253 bool AShift = LShift && VT != MVT::v2i64 && VT != MVT::v4i64;
30254 return (Opcode == ISD::SRA) ? AShift : LShift;
30255}
30256
30257 static SDValue LowerShiftByScalarImmediate(SDValue Op, SelectionDAG &DAG,
30258 const X86Subtarget &Subtarget) {
30259 MVT VT = Op.getSimpleValueType();
30260 SDLoc dl(Op);
30261 SDValue R = Op.getOperand(0);
30262 SDValue Amt = Op.getOperand(1);
30263 unsigned X86Opc = getTargetVShiftUniformOpcode(Op.getOpcode(), false);
30264 unsigned EltSizeInBits = VT.getScalarSizeInBits();
30265
30266 auto ArithmeticShiftRight64 = [&](uint64_t ShiftAmt) {
30267 assert((VT == MVT::v2i64 || VT == MVT::v4i64) && "Unexpected SRA type");
30268 MVT ExVT = MVT::getVectorVT(MVT::i32, VT.getVectorNumElements() * 2);
30269 SDValue Ex = DAG.getBitcast(ExVT, R);
30270
30271 // ashr(R, 63) === cmp_slt(R, 0)
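// An arithmetic shift right by 63 replicates the sign bit into every bit,
// producing all-ones for negative elements and zero otherwise, which is
// exactly the mask PCMPGT(0, R) computes.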
30272 if (ShiftAmt == 63 && Subtarget.hasSSE42()) {
30273 assert((VT != MVT::v4i64 || Subtarget.hasInt256()) &&
30274 "Unsupported PCMPGT op");
30275 return DAG.getNode(X86ISD::PCMPGT, dl, VT, DAG.getConstant(0, dl, VT), R);
30276 }
30277
30278 if (ShiftAmt >= 32) {
30279 // Splat sign to upper i32 dst, and SRA upper i32 src to lower i32.
30280 SDValue Upper =
30281 getTargetVShiftByConstNode(X86ISD::VSRAI, dl, ExVT, Ex, 31, DAG);
30282 SDValue Lower = getTargetVShiftByConstNode(X86ISD::VSRAI, dl, ExVT, Ex,
30283 ShiftAmt - 32, DAG);
30284 if (VT == MVT::v2i64)
30285 Ex = DAG.getVectorShuffle(ExVT, dl, Upper, Lower, {5, 1, 7, 3});
30286 if (VT == MVT::v4i64)
30287 Ex = DAG.getVectorShuffle(ExVT, dl, Upper, Lower,
30288 {9, 1, 11, 3, 13, 5, 15, 7});
30289 } else {
30290 // SRA upper i32, SRL whole i64 and select lower i32.
30291 SDValue Upper = getTargetVShiftByConstNode(X86ISD::VSRAI, dl, ExVT, Ex,
30292 ShiftAmt, DAG);
30293 SDValue Lower =
30294 getTargetVShiftByConstNode(X86ISD::VSRLI, dl, VT, R, ShiftAmt, DAG);
30295 Lower = DAG.getBitcast(ExVT, Lower);
30296 if (VT == MVT::v2i64)
30297 Ex = DAG.getVectorShuffle(ExVT, dl, Upper, Lower, {4, 1, 6, 3});
30298 if (VT == MVT::v4i64)
30299 Ex = DAG.getVectorShuffle(ExVT, dl, Upper, Lower,
30300 {8, 1, 10, 3, 12, 5, 14, 7});
30301 }
30302 return DAG.getBitcast(VT, Ex);
30303 };
30304
30305 // Optimize shl/srl/sra with constant shift amount.
30306 APInt APIntShiftAmt;
30307 if (!X86::isConstantSplat(Amt, APIntShiftAmt))
30308 return SDValue();
30309
30310 // If the shift amount is out of range, return undef.
30311 if (APIntShiftAmt.uge(EltSizeInBits))
30312 return DAG.getUNDEF(VT);
30313
30314 uint64_t ShiftAmt = APIntShiftAmt.getZExtValue();
30315
30316 if (supportedVectorShiftWithImm(VT, Subtarget, Op.getOpcode()))
30317 return getTargetVShiftByConstNode(X86Opc, dl, VT, R, ShiftAmt, DAG);
30318
30319 // i64 SRA needs to be performed as partial shifts.
30320 if (((!Subtarget.hasXOP() && VT == MVT::v2i64) ||
30321 (Subtarget.hasInt256() && VT == MVT::v4i64)) &&
30322 Op.getOpcode() == ISD::SRA)
30323 return ArithmeticShiftRight64(ShiftAmt);
30324
30327 // If we're logical shifting an all-signbits value then we can just perform it
30328 // as a mask.
30327 if ((Op.getOpcode() == ISD::SHL || Op.getOpcode() == ISD::SRL) &&
30328 DAG.ComputeNumSignBits(R) == EltSizeInBits) {
30329 SDValue Mask = DAG.getAllOnesConstant(dl, VT);
30330 Mask = DAG.getNode(Op.getOpcode(), dl, VT, Mask, Amt);
30331 return DAG.getNode(ISD::AND, dl, VT, R, Mask);
30332 }
30333
30334 if (VT == MVT::v16i8 || (Subtarget.hasInt256() && VT == MVT::v32i8) ||
30335 (Subtarget.hasBWI() && VT == MVT::v64i8)) {
30336 unsigned NumElts = VT.getVectorNumElements();
30337 MVT ShiftVT = MVT::getVectorVT(MVT::i16, NumElts / 2);
30338
30339 // Simple i8 add case
30340 if (Op.getOpcode() == ISD::SHL && ShiftAmt == 1) {
30341 // R may be undef at run-time, but (shl R, 1) must be an even number (LSB
30342 // must be 0). (add undef, undef) however can be any value. To make this
30343 // safe, we must freeze R to ensure that register allocation uses the same
30344 // register for an undefined value. This ensures that the result will
30345 // still be even and preserves the original semantics.
30346 R = DAG.getFreeze(R);
30347 return DAG.getNode(ISD::ADD, dl, VT, R, R);
30348 }
30349
30350 // ashr(R, 7) === cmp_slt(R, 0)
30351 if (Op.getOpcode() == ISD::SRA && ShiftAmt == 7) {
30352 SDValue Zeros = DAG.getConstant(0, dl, VT);
30353 if (VT.is512BitVector()) {
30354 assert(VT == MVT::v64i8 && "Unexpected element type!");
30355 SDValue CMP = DAG.getSetCC(dl, MVT::v64i1, Zeros, R, ISD::SETGT);
30356 return DAG.getNode(ISD::SIGN_EXTEND, dl, VT, CMP);
30357 }
30358 return DAG.getNode(X86ISD::PCMPGT, dl, VT, Zeros, R);
30359 }
30360
30361 // XOP can shift v16i8 directly instead of as shift v8i16 + mask.
30362 if (VT == MVT::v16i8 && Subtarget.hasXOP())
30363 return SDValue();
30364
30365 if (Subtarget.hasGFNI()) {
30366 SDValue Mask = getGFNICtrlMask(Op.getOpcode(), DAG, dl, VT, ShiftAmt);
30367 return DAG.getNode(X86ISD::GF2P8AFFINEQB, dl, VT, R, Mask,
30368 DAG.getTargetConstant(0, dl, MVT::i8));
30369 }
30370
30371 if (Op.getOpcode() == ISD::SHL) {
30372 // Make a large shift.
30373 SDValue SHL = getTargetVShiftByConstNode(X86ISD::VSHLI, dl, ShiftVT, R,
30374 ShiftAmt, DAG);
30375 SHL = DAG.getBitcast(VT, SHL);
30376 // Zero out the rightmost bits.
30377 APInt Mask = APInt::getHighBitsSet(8, 8 - ShiftAmt);
30378 return DAG.getNode(ISD::AND, dl, VT, SHL, DAG.getConstant(Mask, dl, VT));
30379 }
30380 if (Op.getOpcode() == ISD::SRL) {
30381 // Make a large shift.
30382 SDValue SRL = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, ShiftVT, R,
30383 ShiftAmt, DAG);
30384 SRL = DAG.getBitcast(VT, SRL);
30385 // Zero out the leftmost bits.
30386 APInt Mask = APInt::getLowBitsSet(8, 8 - ShiftAmt);
30387 return DAG.getNode(ISD::AND, dl, VT, SRL, DAG.getConstant(Mask, dl, VT));
30388 }
30389 if (Op.getOpcode() == ISD::SRA) {
30390 // ashr(R, Amt) === sub(xor(lshr(R, Amt), Mask), Mask)
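// e.g. for i8 with ShiftAmt = 4 and R = 0xF0 (-16): lshr gives 0x0F,
// Mask = 0x08, the xor gives 0x07 and the subtract yields 0xFF (-1), which
// is -16 >> 4. For non-negative R the xor/sub of Mask cancel out, leaving
// the logical shift result unchanged.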
30391 SDValue Res = DAG.getNode(ISD::SRL, dl, VT, R, Amt);
30392
30393 SDValue Mask = DAG.getConstant(128 >> ShiftAmt, dl, VT);
30394 Res = DAG.getNode(ISD::XOR, dl, VT, Res, Mask);
30395 Res = DAG.getNode(ISD::SUB, dl, VT, Res, Mask);
30396 return Res;
30397 }
30398 llvm_unreachable("Unknown shift opcode.");
30399 }
30400
30401 return SDValue();
30402}
30403
30404 static SDValue LowerShiftByScalarVariable(SDValue Op, SelectionDAG &DAG,
30405 const X86Subtarget &Subtarget) {
30406 MVT VT = Op.getSimpleValueType();
30407 SDLoc dl(Op);
30408 SDValue R = Op.getOperand(0);
30409 SDValue Amt = Op.getOperand(1);
30410 unsigned Opcode = Op.getOpcode();
30411 unsigned X86OpcI = getTargetVShiftUniformOpcode(Opcode, false);
30412
30413 int BaseShAmtIdx = -1;
30414 if (SDValue BaseShAmt = DAG.getSplatSourceVector(Amt, BaseShAmtIdx)) {
30415 if (supportedVectorShiftWithBaseAmnt(VT, Subtarget, Opcode))
30416 return getTargetVShiftNode(X86OpcI, dl, VT, R, BaseShAmt, BaseShAmtIdx,
30417 Subtarget, DAG);
30418
30419 // vXi8 shifts - shift as v8i16 + mask result.
30420 if (((VT == MVT::v16i8 && !Subtarget.canExtendTo512DQ()) ||
30421 (VT == MVT::v32i8 && !Subtarget.canExtendTo512BW()) ||
30422 VT == MVT::v64i8) &&
30423 !Subtarget.hasXOP()) {
30424 unsigned NumElts = VT.getVectorNumElements();
30425 MVT ExtVT = MVT::getVectorVT(MVT::i16, NumElts / 2);
30426 if (supportedVectorShiftWithBaseAmnt(ExtVT, Subtarget, Opcode)) {
30427 unsigned LogicalOp = (Opcode == ISD::SHL ? ISD::SHL : ISD::SRL);
30428 unsigned LogicalX86Op = getTargetVShiftUniformOpcode(LogicalOp, false);
30429
30430 // Create the mask using vXi16 shifts. For shift-rights we need to move
30431 // the upper byte down before splatting the vXi8 mask.
30432 SDValue BitMask = DAG.getAllOnesConstant(dl, ExtVT);
30433 BitMask = getTargetVShiftNode(LogicalX86Op, dl, ExtVT, BitMask,
30434 BaseShAmt, BaseShAmtIdx, Subtarget, DAG);
30435 if (Opcode != ISD::SHL)
30436 BitMask = getTargetVShiftByConstNode(LogicalX86Op, dl, ExtVT, BitMask,
30437 8, DAG);
30438 BitMask = DAG.getBitcast(VT, BitMask);
30439 BitMask = DAG.getVectorShuffle(VT, dl, BitMask, BitMask,
30440 SmallVector<int, 64>(NumElts, 0));
30441
30442 SDValue Res = getTargetVShiftNode(LogicalX86Op, dl, ExtVT,
30443 DAG.getBitcast(ExtVT, R), BaseShAmt,
30444 BaseShAmtIdx, Subtarget, DAG);
30445 Res = DAG.getBitcast(VT, Res);
30446 Res = DAG.getNode(ISD::AND, dl, VT, Res, BitMask);
30447
30448 if (Opcode == ISD::SRA) {
30449 // ashr(R, Amt) === sub(xor(lshr(R, Amt), SignMask), SignMask)
30450 // SignMask = lshr(SignBit, Amt) - safe to do this with PSRLW.
30451 SDValue SignMask = DAG.getConstant(0x8080, dl, ExtVT);
30452 SignMask =
30453 getTargetVShiftNode(LogicalX86Op, dl, ExtVT, SignMask, BaseShAmt,
30454 BaseShAmtIdx, Subtarget, DAG);
30455 SignMask = DAG.getBitcast(VT, SignMask);
30456 Res = DAG.getNode(ISD::XOR, dl, VT, Res, SignMask);
30457 Res = DAG.getNode(ISD::SUB, dl, VT, Res, SignMask);
30458 }
30459 return Res;
30460 }
30461 }
30462 }
30463
30464 return SDValue();
30465}
30466
30467// Convert a shift/rotate left amount to a multiplication scale factor.
30468 static SDValue convertShiftLeftToScale(SDValue Amt, const SDLoc &dl,
30469 const X86Subtarget &Subtarget,
30470 SelectionDAG &DAG) {
30471 MVT VT = Amt.getSimpleValueType();
30472 if (!(VT == MVT::v8i16 || VT == MVT::v4i32 ||
30473 (Subtarget.hasInt256() && VT == MVT::v16i16) ||
30474 (Subtarget.hasAVX512() && VT == MVT::v32i16) ||
30475 (!Subtarget.hasAVX512() && VT == MVT::v16i8) ||
30476 (Subtarget.hasInt256() && VT == MVT::v32i8) ||
30477 (Subtarget.hasBWI() && VT == MVT::v64i8)))
30478 return SDValue();
30479
30480 MVT SVT = VT.getVectorElementType();
30481 unsigned SVTBits = SVT.getSizeInBits();
30482 unsigned NumElems = VT.getVectorNumElements();
30483
30484 APInt UndefElts;
30485 SmallVector<APInt> EltBits;
30486 if (getTargetConstantBitsFromNode(Amt, SVTBits, UndefElts, EltBits)) {
30487 APInt One(SVTBits, 1);
30488 SmallVector<SDValue> Elts(NumElems, DAG.getUNDEF(SVT));
30489 for (unsigned I = 0; I != NumElems; ++I) {
30490 if (UndefElts[I] || EltBits[I].uge(SVTBits))
30491 continue;
30492 uint64_t ShAmt = EltBits[I].getZExtValue();
30493 Elts[I] = DAG.getConstant(One.shl(ShAmt), dl, SVT);
30494 }
30495 return DAG.getBuildVector(VT, dl, Elts);
30496 }
30497
30498 // If the target doesn't support variable shifts, use either FP conversion
30499 // or integer multiplication to avoid shifting each element individually.
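// For v4i32 the scale 2^Amt is built directly as a float: shifting Amt into
// the exponent field (Amt << 23) and adding the bias pattern 0x3f800000
// (1.0f) produces the IEEE-754 value 2^Amt, which FP_TO_SINT then converts
// back to the integer scale factor.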
30500 if (VT == MVT::v4i32) {
30501 Amt = DAG.getNode(ISD::SHL, dl, VT, Amt, DAG.getConstant(23, dl, VT));
30502 Amt = DAG.getNode(ISD::ADD, dl, VT, Amt,
30503 DAG.getConstant(0x3f800000U, dl, VT));
30504 Amt = DAG.getBitcast(MVT::v4f32, Amt);
30505 return DAG.getNode(ISD::FP_TO_SINT, dl, VT, Amt);
30506 }
30507
30508 // AVX2 can more effectively perform this as a zext/trunc to/from v8i32.
30509 if (VT == MVT::v8i16 && !Subtarget.hasAVX2()) {
30510 SDValue Z = DAG.getConstant(0, dl, VT);
30511 SDValue Lo = DAG.getBitcast(MVT::v4i32, getUnpackl(DAG, dl, VT, Amt, Z));
30512 SDValue Hi = DAG.getBitcast(MVT::v4i32, getUnpackh(DAG, dl, VT, Amt, Z));
30513 Lo = convertShiftLeftToScale(Lo, dl, Subtarget, DAG);
30514 Hi = convertShiftLeftToScale(Hi, dl, Subtarget, DAG);
30515 if (Subtarget.hasSSE41())
30516 return DAG.getNode(X86ISD::PACKUS, dl, VT, Lo, Hi);
30517 return getPack(DAG, Subtarget, dl, VT, Lo, Hi);
30518 }
30519
30520 return SDValue();
30521}
30522
30523static SDValue LowerShift(SDValue Op, const X86Subtarget &Subtarget,
30524 SelectionDAG &DAG) {
30525 MVT VT = Op.getSimpleValueType();
30526 SDLoc dl(Op);
30527 SDValue R = Op.getOperand(0);
30528 SDValue Amt = Op.getOperand(1);
30529 unsigned NumElts = VT.getVectorNumElements();
30530 unsigned EltSizeInBits = VT.getScalarSizeInBits();
30531 bool ConstantAmt = ISD::isBuildVectorOfConstantSDNodes(Amt.getNode());
30532
30533 unsigned Opc = Op.getOpcode();
30534 unsigned X86OpcV = getTargetVShiftUniformOpcode(Opc, true);
30535 unsigned X86OpcI = getTargetVShiftUniformOpcode(Opc, false);
30536
30537 assert(VT.isVector() && "Custom lowering only for vector shifts!");
30538 assert(Subtarget.hasSSE2() && "Only custom lower when we have SSE2!");
30539
30540 if (SDValue V = LowerShiftByScalarImmediate(Op, DAG, Subtarget))
30541 return V;
30542
30543 if (SDValue V = LowerShiftByScalarVariable(Op, DAG, Subtarget))
30544 return V;
30545
30546 if (supportedVectorVarShift(VT, Subtarget, Opc))
30547 return Op;
30548
30549 // i64 vector arithmetic shift can be emulated with the transform:
30550 // M = lshr(SIGN_MASK, Amt)
30551 // ashr(R, Amt) === sub(xor(lshr(R, Amt), M), M)
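// e.g. for v2i64 with Amt = 1 and R = 0x8000000000000000 (INT64_MIN):
// M = 0x4000000000000000, lshr(R, 1) = 0x4000000000000000, the xor clears
// that bit and the subtract produces 0xC000000000000000 == INT64_MIN >> 1.
// In effect the xor/sub pair sign-extends the (64-Amt)-bit logical-shift
// result back to 64 bits.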
30552 if (((VT == MVT::v2i64 && !Subtarget.hasXOP()) ||
30553 (VT == MVT::v4i64 && Subtarget.hasInt256())) &&
30554 Opc == ISD::SRA) {
30555 SDValue S = DAG.getConstant(APInt::getSignMask(64), dl, VT);
30556 SDValue M = DAG.getNode(ISD::SRL, dl, VT, S, Amt);
30557 R = DAG.getNode(ISD::SRL, dl, VT, R, Amt);
30558 R = DAG.getNode(ISD::XOR, dl, VT, R, M);
30559 R = DAG.getNode(ISD::SUB, dl, VT, R, M);
30560 return R;
30561 }
30562
30563 // XOP has 128-bit variable logical/arithmetic shifts.
30564 // +ve/-ve Amt = shift left/right.
30565 if (Subtarget.hasXOP() && (VT == MVT::v2i64 || VT == MVT::v4i32 ||
30566 VT == MVT::v8i16 || VT == MVT::v16i8)) {
30567 if (Opc == ISD::SRL || Opc == ISD::SRA)
30568 Amt = DAG.getNegative(Amt, dl, VT);
30569 if (Opc == ISD::SHL || Opc == ISD::SRL)
30570 return DAG.getNode(X86ISD::VPSHL, dl, VT, R, Amt);
30571 if (Opc == ISD::SRA)
30572 return DAG.getNode(X86ISD::VPSHA, dl, VT, R, Amt);
30573 }
30574
30575 // v2i64 vector logical shifts can efficiently avoid scalarization - do the
30576 // shifts per-lane and then shuffle the partial results back together.
30577 if (VT == MVT::v2i64 && Opc != ISD::SRA) {
30578 // Splat the shift amounts so the scalar shifts above will catch it.
30579 SDValue Amt0 = DAG.getVectorShuffle(VT, dl, Amt, Amt, {0, 0});
30580 SDValue Amt1 = DAG.getVectorShuffle(VT, dl, Amt, Amt, {1, 1});
30581 SDValue R0 = DAG.getNode(Opc, dl, VT, R, Amt0);
30582 SDValue R1 = DAG.getNode(Opc, dl, VT, R, Amt1);
30583 return DAG.getVectorShuffle(VT, dl, R0, R1, {0, 3});
30584 }
30585
30586 // Build a map of in-range constant amounts with an element mask of where they occur.
30587 SmallDenseMap<unsigned, APInt, 8> UniqueCstAmt;
30588 if (ConstantAmt) {
30589 for (unsigned I = 0; I != NumElts; ++I) {
30590 SDValue A = Amt.getOperand(I);
30591 if (A.isUndef() || A->getAsAPIntVal().uge(EltSizeInBits))
30592 continue;
30593 unsigned CstAmt = A->getAsAPIntVal().getZExtValue();
30594 auto [It, Inserted] = UniqueCstAmt.try_emplace(CstAmt);
30595 if (!Inserted) {
30596 It->second.setBit(I);
30597 continue;
30598 }
30599 It->second = APInt::getOneBitSet(NumElts, I);
30600 }
30601 assert(!UniqueCstAmt.empty() && "Illegal constant shift amounts");
30602 }
30603
30604 // If possible, lower this shift as a sequence of two shifts by
30605 // constant plus a BLENDing shuffle instead of scalarizing it.
30606 // Example:
30607 // (v4i32 (srl A, (build_vector < X, Y, Y, Y>)))
30608 //
30609 // Could be rewritten as:
30610 // (v4i32 (MOVSS (srl A, <Y,Y,Y,Y>), (srl A, <X,X,X,X>)))
30611 //
30612 // The advantage is that the two shifts from the example would be
30613 // lowered as X86ISD::VSRLI nodes in parallel before blending.
30614 if (UniqueCstAmt.size() == 2 &&
30615 (VT == MVT::v8i16 || VT == MVT::v4i32 ||
30616 (VT == MVT::v16i16 && Subtarget.hasInt256()))) {
30617 unsigned AmtA = UniqueCstAmt.begin()->first;
30618 unsigned AmtB = std::next(UniqueCstAmt.begin())->first;
30619 const APInt &MaskA = UniqueCstAmt.begin()->second;
30620 const APInt &MaskB = std::next(UniqueCstAmt.begin())->second;
30621 SmallVector<int, 8> ShuffleMask(NumElts, SM_SentinelUndef);
30622 for (unsigned I = 0; I != NumElts; ++I) {
30623 if (MaskA[I])
30624 ShuffleMask[I] = I;
30625 if (MaskB[I])
30626 ShuffleMask[I] = I + NumElts;
30627 }
30628
30629 // Only perform this blend if we can perform it without loading a mask.
30630 if ((VT != MVT::v16i16 ||
30631 is128BitLaneRepeatedShuffleMask(VT, ShuffleMask)) &&
30632 (VT == MVT::v4i32 || Subtarget.hasSSE41() || Opc != ISD::SHL ||
30633 canWidenShuffleElements(ShuffleMask))) {
30634 SDValue Shift1 =
30635 DAG.getNode(Opc, dl, VT, R, DAG.getConstant(AmtA, dl, VT));
30636 SDValue Shift2 =
30637 DAG.getNode(Opc, dl, VT, R, DAG.getConstant(AmtB, dl, VT));
30638 return DAG.getVectorShuffle(VT, dl, Shift1, Shift2, ShuffleMask);
30639 }
30640 }
30641
30642 // Constant ISD::SRA/SRL/SHL can be performed efficiently on vXiN vectors by
30643 // using vYiM vector operations where X*N == Y*M and M > N.
30644 if (ConstantAmt &&
30645 (VT == MVT::v16i8 || VT == MVT::v32i8 || VT == MVT::v64i8 ||
30646 VT == MVT::v8i16 || VT == MVT::v16i16 || VT == MVT::v32i16) &&
30647 !Subtarget.hasXOP()) {
30648 MVT NarrowScalarVT = VT.getScalarType();
30649 // We can do this extra fast if each pair of narrow elements is shifted by
30650 // the same amount by doing this SWAR style: use a shift to move the valid
30651 // bits to the right position, then mask out any bits which crossed from one
30652 // element to the other.
30653 // This optimized lowering is only valid if the elements in a pair can
30654 // be treated identically.
30655 SmallVector<SDValue, 32> AmtWideElts(Amt->ops());
30656 SmallVector<SDValue, 32> TmpAmtWideElts;
30657 int WideEltSizeInBits = EltSizeInBits;
30658 while (WideEltSizeInBits < 32) {
30659 // AVX1 does not have psrlvd, etc. which makes interesting 32-bit shifts
30660 // unprofitable.
30661 if (WideEltSizeInBits >= 16 && !Subtarget.hasAVX2()) {
30662 break;
30663 }
30664 TmpAmtWideElts.resize(AmtWideElts.size() / 2);
30665 bool SameShifts = true;
30666 for (unsigned SrcI = 0, E = AmtWideElts.size(); SrcI != E; SrcI += 2) {
30667 unsigned DstI = SrcI / 2;
30668 // Both elements are undef? Make a note and keep going.
30669 if (AmtWideElts[SrcI].isUndef() && AmtWideElts[SrcI + 1].isUndef()) {
30670 TmpAmtWideElts[DstI] = AmtWideElts[SrcI];
30671 continue;
30672 }
30673 // Even element is undef? We will shift it by the same shift amount as
30674 // the odd element.
30675 if (AmtWideElts[SrcI].isUndef()) {
30676 TmpAmtWideElts[DstI] = AmtWideElts[SrcI + 1];
30677 continue;
30678 }
30679 // Odd element is undef? We will shift it by the same shift amount as
30680 // the even element.
30681 if (AmtWideElts[SrcI + 1].isUndef()) {
30682 TmpAmtWideElts[DstI] = AmtWideElts[SrcI];
30683 continue;
30684 }
30685 // Both elements are equal.
30686 if (AmtWideElts[SrcI].getNode()->getAsAPIntVal() ==
30687 AmtWideElts[SrcI + 1].getNode()->getAsAPIntVal()) {
30688 TmpAmtWideElts[DstI] = AmtWideElts[SrcI];
30689 continue;
30690 }
30691 // One of the provisional wide elements will not have the same shift
30692 // amount. Let's bail.
30693 SameShifts = false;
30694 break;
30695 }
30696 if (!SameShifts) {
30697 break;
30698 }
30699 WideEltSizeInBits *= 2;
30700 std::swap(TmpAmtWideElts, AmtWideElts);
30701 }
30702 APInt APIntShiftAmt;
30703 bool IsConstantSplat = X86::isConstantSplat(Amt, APIntShiftAmt);
30704 bool Profitable = WidenShift;
30705 // AVX512BW brings support for vpsllvw.
30706 if (WideEltSizeInBits * AmtWideElts.size() >= 512 &&
30707 WideEltSizeInBits < 32 && !Subtarget.hasBWI()) {
30708 Profitable = false;
30709 }
30710 // Leave AVX512 uniform arithmetic shifts alone, they can be implemented
30711 // fairly cheaply in other ways.
30712 if (WideEltSizeInBits * AmtWideElts.size() >= 512 && IsConstantSplat) {
30713 Profitable = false;
30714 }
30715 // Leave it up to GFNI if we have it around.
30716 // TODO: gf2p8affine is usually higher latency and more port restricted. It
30717 // is probably a win to use other strategies in some cases.
30718 if (EltSizeInBits == 8 && Subtarget.hasGFNI()) {
30719 Profitable = false;
30720 }
30721
30722 // AVX1 does not have vpand which makes our masking impractical. It does
30723 // have vandps but that is an FP instruction and crossing FP<->int typically
30724 // has some cost.
30725 if (WideEltSizeInBits * AmtWideElts.size() >= 256 &&
30726 (WideEltSizeInBits < 32 || IsConstantSplat) && !Subtarget.hasAVX2()) {
30727 Profitable = false;
30728 }
30729 unsigned WideNumElts = AmtWideElts.size();
30730 // We are only dealing with identical pairs.
30731 if (Profitable && WideNumElts != NumElts) {
30732 MVT WideScalarVT = MVT::getIntegerVT(WideEltSizeInBits);
30733 MVT WideVT = MVT::getVectorVT(WideScalarVT, WideNumElts);
30734 // Cast the operand to vXiM.
30735 SDValue RWide = DAG.getBitcast(WideVT, R);
30736 // Create our new vector of shift amounts.
30737 SDValue AmtWide = DAG.getBuildVector(
30738 MVT::getVectorVT(NarrowScalarVT, WideNumElts), dl, AmtWideElts);
30739 AmtWide = DAG.getZExtOrTrunc(AmtWide, dl, WideVT);
30740 // Perform the actual shift.
30741 unsigned LogicalOpc = Opc == ISD::SRA ? (unsigned)ISD::SRL : Opc;
30742 SDValue ShiftedR = DAG.getNode(LogicalOpc, dl, WideVT, RWide, AmtWide);
30743 // Now we need to construct a mask which will "drop" bits that get
30744 // shifted past the LSB/MSB. For a logical shift left, it will look
30745 // like:
30746 // FullMask = (1 << EltSizeInBits) - 1
30747 // Mask = FullMask << Amt
30748 //
30749 // This masking ensures that bits cannot migrate from one narrow lane to
30750 // another. The construction of this mask will be constant folded.
30751 // The mask for a logical right shift is nearly identical, the only
30752 // difference is that the all ones mask is shifted right instead of left.
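// e.g. for vXi8 widened to vYi16 with a left shift by 3: each i8 lane of the
// splatted all-ones mask folds to 0xFF << 3 = 0xF8, so ANDing the widened
// shift result clears the 3 bits that leaked from the low byte into the high
// byte of every i16 lane.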
30753 SDValue SplatFullMask = DAG.getAllOnesConstant(dl, VT);
30754 SDValue Mask = DAG.getNode(LogicalOpc, dl, VT, SplatFullMask, Amt);
30755 Mask = DAG.getBitcast(WideVT, Mask);
30756 // Finally, we mask the shifted vector with the SWAR mask.
30757 SDValue Masked = DAG.getNode(ISD::AND, dl, WideVT, ShiftedR, Mask);
30758 Masked = DAG.getBitcast(VT, Masked);
30759 if (Opc != ISD::SRA) {
30760 // Logical shifts are complete at this point.
30761 return Masked;
30762 }
30763 // At this point, we have done a *logical* shift right. We now need to
30764 // sign extend the result so that we get behavior equivalent to an
30765 // arithmetic shift right. Post-shifting by AmtWide, our narrow elements
30766 // are `EltSizeInBits-AmtWide` bits wide.
30767 //
30768 // To convert our `EltSizeInBits-AmtWide` bit unsigned numbers to signed
30769 // numbers as wide as `EltSizeInBits`, we need to replicate the bit at
30770 // position `EltSizeInBits-AmtWide` into the MSBs of each narrow lane. We
30771 // can use the following trick to accomplish this:
30772 // SignBitMask = 1 << (EltSizeInBits-AmtWide-1)
30773 // (Masked ^ SignBitMask) - SignBitMask
30774 //
30775 // When the sign bit is already clear, this will compute:
30776 // Masked + SignBitMask - SignBitMask
30777 //
30778 // This is equal to Masked which is what we want: the sign bit was clear
30779 // so sign extending should be a no-op.
30780 //
30781 // When the sign bit is set, this will compute:
30782 // Masked - SignBitmask - SignBitMask
30783 //
30784 // This is equal to Masked - 2*SignBitMask which will correctly sign
30785 // extend our result.
30786 SDValue SplatHighBit =
30787 DAG.getConstant(APInt::getSignMask(EltSizeInBits), dl, VT);
30788 // This does not induce recursion, all operands are constants.
30789 SDValue SignBitMask = DAG.getNode(LogicalOpc, dl, VT, SplatHighBit, Amt);
30790 SDValue FlippedSignBit =
30791 DAG.getNode(ISD::XOR, dl, VT, Masked, SignBitMask);
30792 SDValue Subtraction =
30793 DAG.getNode(ISD::SUB, dl, VT, FlippedSignBit, SignBitMask);
30794 return Subtraction;
30795 }
30796 }
30797
30798 // If possible, lower this packed shift into a vector multiply instead of
30799 // expanding it into a sequence of scalar shifts.
30800 // For v32i8 cases, it might be quicker to split/extend to vXi16 shifts.
30801 if (Opc == ISD::SHL && !(VT == MVT::v32i8 && (Subtarget.hasXOP() ||
30802 Subtarget.canExtendTo512BW())))
30803 if (SDValue Scale = convertShiftLeftToScale(Amt, dl, Subtarget, DAG))
30804 return DAG.getNode(ISD::MUL, dl, VT, R, Scale);
30805
30806 // Constant ISD::SRL can be performed efficiently on vXi16 vectors as we
30807 // can replace with ISD::MULHU, creating scale factor from (NumEltBits - Amt).
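// e.g. for vXi16 a logical right shift by 3 becomes MULHU with the scale
// 1 << (16 - 3) = 0x2000, since (x * 0x2000) >> 16 == x >> 3. Amt == 0 would
// need the unrepresentable scale 1 << 16, hence the zero-amount select below.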
30808 if (Opc == ISD::SRL && ConstantAmt &&
30809 (VT == MVT::v8i16 || (VT == MVT::v16i16 && Subtarget.hasInt256()))) {
30810 SDValue EltBits = DAG.getConstant(EltSizeInBits, dl, VT);
30811 SDValue RAmt = DAG.getNode(ISD::SUB, dl, VT, EltBits, Amt);
30812 if (SDValue Scale = convertShiftLeftToScale(RAmt, dl, Subtarget, DAG)) {
30813 SDValue Zero = DAG.getConstant(0, dl, VT);
30814 SDValue ZAmt = DAG.getSetCC(dl, VT, Amt, Zero, ISD::SETEQ);
30815 SDValue Res = DAG.getNode(ISD::MULHU, dl, VT, R, Scale);
30816 return DAG.getSelect(dl, VT, ZAmt, R, Res);
30817 }
30818 }
30819
30820 // Constant ISD::SRA can be performed efficiently on vXi16 vectors as we
30821 // can replace with ISD::MULHS, creating scale factor from (NumEltBits - Amt).
30822 // TODO: Special case handling for shift by 0/1, really we can afford either
30823 // of these cases in pre-SSE41/XOP/AVX512 but not both.
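// Amt == 0 would need the unrepresentable scale 1 << 16, and Amt == 1 needs
// 1 << 15 = 0x8000 which MULHS interprets as -2^15 (negating the result), so
// both amounts are patched up with the selects below.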
30824 if (Opc == ISD::SRA && ConstantAmt &&
30825 (VT == MVT::v8i16 || (VT == MVT::v16i16 && Subtarget.hasInt256())) &&
30826 ((Subtarget.hasSSE41() && !Subtarget.hasXOP() &&
30827 !Subtarget.hasAVX512()) ||
30828 DAG.isKnownNeverZero(Amt))) {
30829 SDValue EltBits = DAG.getConstant(EltSizeInBits, dl, VT);
30830 SDValue RAmt = DAG.getNode(ISD::SUB, dl, VT, EltBits, Amt);
30831 if (SDValue Scale = convertShiftLeftToScale(RAmt, dl, Subtarget, DAG)) {
30832 SDValue Amt0 =
30833 DAG.getSetCC(dl, VT, Amt, DAG.getConstant(0, dl, VT), ISD::SETEQ);
30834 SDValue Amt1 =
30835 DAG.getSetCC(dl, VT, Amt, DAG.getConstant(1, dl, VT), ISD::SETEQ);
30836 SDValue Sra1 =
30837 getTargetVShiftByConstNode(X86ISD::VSRAI, dl, VT, R, 1, DAG);
30838 SDValue Res = DAG.getNode(ISD::MULHS, dl, VT, R, Scale);
30839 Res = DAG.getSelect(dl, VT, Amt0, R, Res);
30840 return DAG.getSelect(dl, VT, Amt1, Sra1, Res);
30841 }
30842 }
30843
30844 // v4i32 Non Uniform Shifts.
30845 // If the shift amount is constant we can shift each lane using the SSE2
30846 // immediate shifts, else we need to zero-extend each lane to the lower i64
30847 // and shift using the SSE2 variable shifts.
30848 // The separate results can then be blended together.
30849 if (VT == MVT::v4i32) {
30850 SDValue Amt0, Amt1, Amt2, Amt3;
30851 if (ConstantAmt) {
30852 Amt0 = DAG.getVectorShuffle(VT, dl, Amt, DAG.getUNDEF(VT), {0, 0, 0, 0});
30853 Amt1 = DAG.getVectorShuffle(VT, dl, Amt, DAG.getUNDEF(VT), {1, 1, 1, 1});
30854 Amt2 = DAG.getVectorShuffle(VT, dl, Amt, DAG.getUNDEF(VT), {2, 2, 2, 2});
30855 Amt3 = DAG.getVectorShuffle(VT, dl, Amt, DAG.getUNDEF(VT), {3, 3, 3, 3});
30856 } else {
30857 // The SSE2 shifts use the lower i64 as the same shift amount for
30858 // all lanes and the upper i64 is ignored. On AVX we're better off
30859 // just zero-extending, but for SSE just duplicating the top 16-bits is
30860 // cheaper and has the same effect for out of range values.
30861 if (Subtarget.hasAVX()) {
30862 SDValue Z = DAG.getConstant(0, dl, VT);
30863 Amt0 = DAG.getVectorShuffle(VT, dl, Amt, Z, {0, 4, -1, -1});
30864 Amt1 = DAG.getVectorShuffle(VT, dl, Amt, Z, {1, 5, -1, -1});
30865 Amt2 = DAG.getVectorShuffle(VT, dl, Amt, Z, {2, 6, -1, -1});
30866 Amt3 = DAG.getVectorShuffle(VT, dl, Amt, Z, {3, 7, -1, -1});
30867 } else {
30868 SDValue Amt01 = DAG.getBitcast(MVT::v8i16, Amt);
30869 SDValue Amt23 = DAG.getVectorShuffle(MVT::v8i16, dl, Amt01, Amt01,
30870 {4, 5, 6, 7, -1, -1, -1, -1});
30871 SDValue Msk02 = getV4X86ShuffleImm8ForMask({0, 1, 1, 1}, dl, DAG);
30872 SDValue Msk13 = getV4X86ShuffleImm8ForMask({2, 3, 3, 3}, dl, DAG);
30873 Amt0 = DAG.getNode(X86ISD::PSHUFLW, dl, MVT::v8i16, Amt01, Msk02);
30874 Amt1 = DAG.getNode(X86ISD::PSHUFLW, dl, MVT::v8i16, Amt01, Msk13);
30875 Amt2 = DAG.getNode(X86ISD::PSHUFLW, dl, MVT::v8i16, Amt23, Msk02);
30876 Amt3 = DAG.getNode(X86ISD::PSHUFLW, dl, MVT::v8i16, Amt23, Msk13);
30877 }
30878 }
30879
30880 unsigned ShOpc = ConstantAmt ? Opc : X86OpcV;
30881 SDValue R0 = DAG.getNode(ShOpc, dl, VT, R, DAG.getBitcast(VT, Amt0));
30882 SDValue R1 = DAG.getNode(ShOpc, dl, VT, R, DAG.getBitcast(VT, Amt1));
30883 SDValue R2 = DAG.getNode(ShOpc, dl, VT, R, DAG.getBitcast(VT, Amt2));
30884 SDValue R3 = DAG.getNode(ShOpc, dl, VT, R, DAG.getBitcast(VT, Amt3));
30885
30886 // Merge the shifted lane results optimally with/without PBLENDW.
30887 // TODO - ideally shuffle combining would handle this.
30888 if (Subtarget.hasSSE41()) {
30889 SDValue R02 = DAG.getVectorShuffle(VT, dl, R0, R2, {0, -1, 6, -1});
30890 SDValue R13 = DAG.getVectorShuffle(VT, dl, R1, R3, {-1, 1, -1, 7});
30891 return DAG.getVectorShuffle(VT, dl, R02, R13, {0, 5, 2, 7});
30892 }
30893 SDValue R01 = DAG.getVectorShuffle(VT, dl, R0, R1, {0, -1, -1, 5});
30894 SDValue R23 = DAG.getVectorShuffle(VT, dl, R2, R3, {2, -1, -1, 7});
30895 return DAG.getVectorShuffle(VT, dl, R01, R23, {0, 3, 4, 7});
30896 }
30897
30898 // If we're shifting (per-lane) uniform vXi8 constants, we can use PSHUFB to
30899 // look up the pre-computed shift values.
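// Per 128-bit lane the low 8 table bytes hold the lane's splat constant
// pre-shifted by 0..7 (entries 8..15 are left zero/undef as such amounts are
// out of range), and X86ISD::PSHUFB then uses each byte of Amt as an index
// into that table, turning the per-element shift into a byte lookup.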
30900 if ((VT == MVT::v16i8 && Subtarget.hasSSSE3()) ||
30901 (VT == MVT::v32i8 && Subtarget.hasInt256()) ||
30902 (VT == MVT::v64i8 && Subtarget.hasBWI())) {
30903 unsigned NumLanes = VT.getSizeInBits() / 128u;
30904 unsigned NumEltsPerLane = NumElts / NumLanes;
30905 SmallVector<APInt, 64> LUT;
30906 for (unsigned Lane = 0; Lane != NumLanes; ++Lane) {
30907 unsigned LoElt = Lane * NumEltsPerLane;
30908 APInt EltMask = APInt::getBitsSet(NumElts, LoElt, LoElt + NumEltsPerLane);
30909 KnownBits KnownLane = DAG.computeKnownBits(R, EltMask);
30910 if (!KnownLane.isConstant())
30911 break;
30912 const APInt &LaneSplat = KnownLane.getConstant();
30913 for (unsigned I = 0; I != 8; ++I) {
30914 if (Opc == ISD::SHL)
30915 LUT.push_back(LaneSplat.shl(I));
30916 else if (Opc == ISD::SRL)
30917 LUT.push_back(LaneSplat.lshr(I));
30918 else if (Opc == ISD::SRA)
30919 LUT.push_back(LaneSplat.ashr(I));
30920 }
30921 LUT.append(8, APInt::getZero(8));
30922 }
30923 if (LUT.size() == NumElts) {
30924 APInt Undefs = APInt::getSplat(NumElts, APInt(16, 0xFF00));
30925 SDValue Mask = getConstVector(LUT, Undefs, VT, DAG, dl);
30926 return DAG.getNode(X86ISD::PSHUFB, dl, VT, Mask, Amt);
30927 }
30928 }
30929
30930 // It's worth extending once and using the vXi16/vXi32 shifts for smaller
30931 // types, but without AVX512 the extra overheads to get from vXi8 to vXi32
30932 // make the existing SSE solution better.
30933 // NOTE: We honor preferred vector width before promoting to 512-bits.
30934 if ((Subtarget.hasInt256() && VT == MVT::v8i16) ||
30935 (Subtarget.canExtendTo512DQ() && VT == MVT::v16i16) ||
30936 (Subtarget.canExtendTo512DQ() && VT == MVT::v16i8) ||
30937 (Subtarget.canExtendTo512BW() && VT == MVT::v32i8) ||
30938 (Subtarget.hasBWI() && Subtarget.hasVLX() && VT == MVT::v16i8)) {
30939 assert((!Subtarget.hasBWI() || VT == MVT::v32i8 || VT == MVT::v16i8) &&
30940 "Unexpected vector type");
30941 MVT EvtSVT = Subtarget.hasBWI() ? MVT::i16 : MVT::i32;
30942 MVT ExtVT = MVT::getVectorVT(EvtSVT, NumElts);
30943 unsigned ExtOpc = Opc == ISD::SRA ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
30944 R = DAG.getNode(ExtOpc, dl, ExtVT, R);
30945 Amt = DAG.getNode(ISD::ZERO_EXTEND, dl, ExtVT, Amt);
30946 return DAG.getNode(ISD::TRUNCATE, dl, VT,
30947 DAG.getNode(Opc, dl, ExtVT, R, Amt));
30948 }
30949
30950 // Constant ISD::SRA/SRL can be performed efficiently on vXi8 vectors as we
30951 // extend to vXi16 to perform a MUL scale effectively as a MUL_LOHI.
30952 if (ConstantAmt && (Opc == ISD::SRA || Opc == ISD::SRL) &&
30953 (VT == MVT::v16i8 || (VT == MVT::v32i8 && Subtarget.hasInt256()) ||
30954 (VT == MVT::v64i8 && Subtarget.hasBWI())) &&
30955 !Subtarget.hasXOP()) {
30956 MVT VT16 = MVT::getVectorVT(MVT::i16, NumElts / 2);
30957 SDValue Cst8 = DAG.getTargetConstant(8, dl, MVT::i8);
30958
30959 // Extend constant shift amount to vXi16 (it doesn't matter if the type
30960 // isn't legal).
30961 MVT ExVT = MVT::getVectorVT(MVT::i16, NumElts);
30962 Amt = DAG.getZExtOrTrunc(Amt, dl, ExVT);
30963 Amt = DAG.getNode(ISD::SUB, dl, ExVT, DAG.getConstant(8, dl, ExVT), Amt);
30964 Amt = DAG.getNode(ISD::SHL, dl, ExVT, DAG.getConstant(1, dl, ExVT), Amt);
30966 "Constant build vector expected");
30967
30968 if (VT == MVT::v16i8 && Subtarget.hasInt256()) {
30969 bool IsSigned = Opc == ISD::SRA;
30970 R = DAG.getExtOrTrunc(IsSigned, R, dl, ExVT);
30971 R = DAG.getNode(ISD::MUL, dl, ExVT, R, Amt);
30972 R = DAG.getNode(X86ISD::VSRLI, dl, ExVT, R, Cst8);
30973 return DAG.getZExtOrTrunc(R, dl, VT);
30974 }
30975
30976 SmallVector<SDValue, 16> LoAmt, HiAmt;
30977 for (unsigned i = 0; i != NumElts; i += 16) {
30978 for (int j = 0; j != 8; ++j) {
30979 LoAmt.push_back(Amt.getOperand(i + j));
30980 HiAmt.push_back(Amt.getOperand(i + j + 8));
30981 }
30982 }
30983
30984 SDValue LoA = DAG.getBuildVector(VT16, dl, LoAmt);
30985 SDValue HiA = DAG.getBuildVector(VT16, dl, HiAmt);
30986
30987 SDValue LoR = DAG.getBitcast(VT16, getUnpackl(DAG, dl, VT, R, R));
30988 SDValue HiR = DAG.getBitcast(VT16, getUnpackh(DAG, dl, VT, R, R));
30989 LoR = DAG.getNode(X86OpcI, dl, VT16, LoR, Cst8);
30990 HiR = DAG.getNode(X86OpcI, dl, VT16, HiR, Cst8);
30991 LoR = DAG.getNode(ISD::MUL, dl, VT16, LoR, LoA);
30992 HiR = DAG.getNode(ISD::MUL, dl, VT16, HiR, HiA);
30993 LoR = DAG.getNode(X86ISD::VSRLI, dl, VT16, LoR, Cst8);
30994 HiR = DAG.getNode(X86ISD::VSRLI, dl, VT16, HiR, Cst8);
30995 return DAG.getNode(X86ISD::PACKUS, dl, VT, LoR, HiR);
30996 }
30997
30998 if (VT == MVT::v16i8 ||
30999 (VT == MVT::v32i8 && Subtarget.hasInt256() && !Subtarget.hasXOP()) ||
31000 (VT == MVT::v64i8 && Subtarget.hasBWI())) {
31001 MVT ExtVT = MVT::getVectorVT(MVT::i16, NumElts / 2);
31002
31003 auto SignBitSelect = [&](MVT SelVT, SDValue Sel, SDValue V0, SDValue V1) {
31004 if (VT.is512BitVector()) {
31005 // On AVX512BW targets we make use of the fact that VSELECT lowers
31006 // to a masked blend which selects bytes based just on the sign bit
31007 // extracted to a mask.
31008 MVT MaskVT = MVT::getVectorVT(MVT::i1, NumElts);
31009 V0 = DAG.getBitcast(VT, V0);
31010 V1 = DAG.getBitcast(VT, V1);
31011 Sel = DAG.getBitcast(VT, Sel);
31012 Sel = DAG.getSetCC(dl, MaskVT, DAG.getConstant(0, dl, VT), Sel,
31013 ISD::SETGT);
31014 return DAG.getBitcast(SelVT, DAG.getSelect(dl, VT, Sel, V0, V1));
31015 } else if (Subtarget.hasSSE41()) {
31016 // On SSE41 targets we can use PBLENDVB which selects bytes based just
31017 // on the sign bit.
31018 V0 = DAG.getBitcast(VT, V0);
31019 V1 = DAG.getBitcast(VT, V1);
31020 Sel = DAG.getBitcast(VT, Sel);
31021 return DAG.getBitcast(SelVT,
31022 DAG.getNode(X86ISD::BLENDV, dl, VT, Sel, V0, V1));
31023 }
31024 // On pre-SSE41 targets we test for the sign bit by comparing to
31025 // zero - a negative value will set all bits of the lanes to true
31026 // and VSELECT uses that in its OR(AND(V0,C),AND(V1,~C)) lowering.
31027 SDValue Z = DAG.getConstant(0, dl, SelVT);
31028 SDValue C = DAG.getNode(X86ISD::PCMPGT, dl, SelVT, Z, Sel);
31029 return DAG.getSelect(dl, SelVT, C, V0, V1);
31030 };
31031
31032 // Turn 'a' into a mask suitable for VSELECT: a = a << 5;
31033 // We can safely do this using i16 shifts as we're only interested in
31034 // the 3 lower bits of each byte.
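// After the << 5 each byte's bit 2 of the shift amount sits in the byte's
// sign bit, so every SignBitSelect below conditionally applies a shift of 4,
// then 2, then 1, while 'a += a' moves the next lower amount bit into the
// sign position.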
31035 Amt = DAG.getBitcast(ExtVT, Amt);
31036 Amt = getTargetVShiftByConstNode(X86ISD::VSHLI, dl, ExtVT, Amt, 5, DAG);
31037 Amt = DAG.getBitcast(VT, Amt);
31038
31039 if (Opc == ISD::SHL || Opc == ISD::SRL) {
31040 // r = VSELECT(r, shift(r, 4), a);
31041 SDValue M = DAG.getNode(Opc, dl, VT, R, DAG.getConstant(4, dl, VT));
31042 R = SignBitSelect(VT, Amt, M, R);
31043
31044 // a += a
31045 Amt = DAG.getNode(ISD::ADD, dl, VT, Amt, Amt);
31046
31047 // r = VSELECT(r, shift(r, 2), a);
31048 M = DAG.getNode(Opc, dl, VT, R, DAG.getConstant(2, dl, VT));
31049 R = SignBitSelect(VT, Amt, M, R);
31050
31051 // a += a
31052 Amt = DAG.getNode(ISD::ADD, dl, VT, Amt, Amt);
31053
31054 // return VSELECT(r, shift(r, 1), a);
31055 M = DAG.getNode(Opc, dl, VT, R, DAG.getConstant(1, dl, VT));
31056 R = SignBitSelect(VT, Amt, M, R);
31057 return R;
31058 }
31059
31060 if (Opc == ISD::SRA) {
31061 // For SRA we need to unpack each byte to the higher byte of an i16 vector
31062 // so we can correctly sign extend. We don't care what happens to the
31063 // lower byte.
31064 SDValue ALo = getUnpackl(DAG, dl, VT, DAG.getUNDEF(VT), Amt);
31065 SDValue AHi = getUnpackh(DAG, dl, VT, DAG.getUNDEF(VT), Amt);
31066 SDValue RLo = getUnpackl(DAG, dl, VT, DAG.getUNDEF(VT), R);
31067 SDValue RHi = getUnpackh(DAG, dl, VT, DAG.getUNDEF(VT), R);
31068 ALo = DAG.getBitcast(ExtVT, ALo);
31069 AHi = DAG.getBitcast(ExtVT, AHi);
31070 RLo = DAG.getBitcast(ExtVT, RLo);
31071 RHi = DAG.getBitcast(ExtVT, RHi);
31072
31073 // r = VSELECT(r, shift(r, 4), a);
31074 SDValue MLo = getTargetVShiftByConstNode(X86OpcI, dl, ExtVT, RLo, 4, DAG);
31075 SDValue MHi = getTargetVShiftByConstNode(X86OpcI, dl, ExtVT, RHi, 4, DAG);
31076 RLo = SignBitSelect(ExtVT, ALo, MLo, RLo);
31077 RHi = SignBitSelect(ExtVT, AHi, MHi, RHi);
31078
31079 // a += a
31080 ALo = DAG.getNode(ISD::ADD, dl, ExtVT, ALo, ALo);
31081 AHi = DAG.getNode(ISD::ADD, dl, ExtVT, AHi, AHi);
31082
31083 // r = VSELECT(r, shift(r, 2), a);
31084 MLo = getTargetVShiftByConstNode(X86OpcI, dl, ExtVT, RLo, 2, DAG);
31085 MHi = getTargetVShiftByConstNode(X86OpcI, dl, ExtVT, RHi, 2, DAG);
31086 RLo = SignBitSelect(ExtVT, ALo, MLo, RLo);
31087 RHi = SignBitSelect(ExtVT, AHi, MHi, RHi);
31088
31089 // a += a
31090 ALo = DAG.getNode(ISD::ADD, dl, ExtVT, ALo, ALo);
31091 AHi = DAG.getNode(ISD::ADD, dl, ExtVT, AHi, AHi);
31092
31093 // r = VSELECT(r, shift(r, 1), a);
31094 MLo = getTargetVShiftByConstNode(X86OpcI, dl, ExtVT, RLo, 1, DAG);
31095 MHi = getTargetVShiftByConstNode(X86OpcI, dl, ExtVT, RHi, 1, DAG);
31096 RLo = SignBitSelect(ExtVT, ALo, MLo, RLo);
31097 RHi = SignBitSelect(ExtVT, AHi, MHi, RHi);
31098
31099 // Logical shift the result back to the lower byte, leaving a zero upper
31100 // byte meaning that we can safely pack with PACKUSWB.
31101 RLo = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, ExtVT, RLo, 8, DAG);
31102 RHi = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, ExtVT, RHi, 8, DAG);
31103 return DAG.getNode(X86ISD::PACKUS, dl, VT, RLo, RHi);
31104 }
31105 }
31106
31107 if (Subtarget.hasInt256() && !Subtarget.hasXOP() && VT == MVT::v16i16) {
31108 MVT ExtVT = MVT::v8i32;
31109 SDValue Z = DAG.getConstant(0, dl, VT);
31110 SDValue ALo = getUnpackl(DAG, dl, VT, Amt, Z);
31111 SDValue AHi = getUnpackh(DAG, dl, VT, Amt, Z);
31112 SDValue RLo = getUnpackl(DAG, dl, VT, Z, R);
31113 SDValue RHi = getUnpackh(DAG, dl, VT, Z, R);
31114 ALo = DAG.getBitcast(ExtVT, ALo);
31115 AHi = DAG.getBitcast(ExtVT, AHi);
31116 RLo = DAG.getBitcast(ExtVT, RLo);
31117 RHi = DAG.getBitcast(ExtVT, RHi);
31118 SDValue Lo = DAG.getNode(Opc, dl, ExtVT, RLo, ALo);
31119 SDValue Hi = DAG.getNode(Opc, dl, ExtVT, RHi, AHi);
31120 Lo = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, ExtVT, Lo, 16, DAG);
31121 Hi = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, ExtVT, Hi, 16, DAG);
31122 return DAG.getNode(X86ISD::PACKUS, dl, VT, Lo, Hi);
31123 }
31124
31125 if (VT == MVT::v8i16) {
31126 // If we have a constant shift amount, the non-SSE41 path is best as
31127 // avoiding bitcasts makes it easier to constant fold and reduce to PBLENDW.
31128 bool UseSSE41 = Subtarget.hasSSE41() &&
31129 !ISD::isBuildVectorOfConstantSDNodes(Amt.getNode());
31130
31131 auto SignBitSelect = [&](SDValue Sel, SDValue V0, SDValue V1) {
31132 // On SSE41 targets we can use PBLENDVB which selects bytes based just on
31133 // the sign bit.
31134 if (UseSSE41) {
31135 MVT ExtVT = MVT::getVectorVT(MVT::i8, NumElts * 2);
31136 V0 = DAG.getBitcast(ExtVT, V0);
31137 V1 = DAG.getBitcast(ExtVT, V1);
31138 Sel = DAG.getBitcast(ExtVT, Sel);
31139 return DAG.getBitcast(
31140 VT, DAG.getNode(X86ISD::BLENDV, dl, ExtVT, Sel, V0, V1));
31141 }
31142 // On pre-SSE41 targets we splat the sign bit - a negative value will
31143 // set all bits of the lanes to true and VSELECT uses that in
31144 // its OR(AND(V0,C),AND(V1,~C)) lowering.
31145 SDValue C =
31146 getTargetVShiftByConstNode(X86ISD::VSRAI, dl, VT, Sel, 15, DAG);
31147 return DAG.getSelect(dl, VT, C, V0, V1);
31148 };
31149
31150 // Turn 'a' into a mask suitable for VSELECT: a = a << 12;
31151 if (UseSSE41) {
31152 // On SSE41 targets we need to replicate the shift mask in both
31153 // bytes for PBLENDVB.
31154 Amt = DAG.getNode(
31155 ISD::OR, dl, VT,
31156 getTargetVShiftByConstNode(X86ISD::VSHLI, dl, VT, Amt, 4, DAG),
31157 getTargetVShiftByConstNode(X86ISD::VSHLI, dl, VT, Amt, 12, DAG));
31158 } else {
31159 Amt = getTargetVShiftByConstNode(X86ISD::VSHLI, dl, VT, Amt, 12, DAG);
31160 }
31161
31162 // r = VSELECT(r, shift(r, 8), a);
31163 SDValue M = getTargetVShiftByConstNode(X86OpcI, dl, VT, R, 8, DAG);
31164 R = SignBitSelect(Amt, M, R);
31165
31166 // a += a
31167 Amt = DAG.getNode(ISD::ADD, dl, VT, Amt, Amt);
31168
31169 // r = VSELECT(r, shift(r, 4), a);
31170 M = getTargetVShiftByConstNode(X86OpcI, dl, VT, R, 4, DAG);
31171 R = SignBitSelect(Amt, M, R);
31172
31173 // a += a
31174 Amt = DAG.getNode(ISD::ADD, dl, VT, Amt, Amt);
31175
31176 // r = VSELECT(r, shift(r, 2), a);
31177 M = getTargetVShiftByConstNode(X86OpcI, dl, VT, R, 2, DAG);
31178 R = SignBitSelect(Amt, M, R);
31179
31180 // a += a
31181 Amt = DAG.getNode(ISD::ADD, dl, VT, Amt, Amt);
31182
31183 // return VSELECT(r, shift(r, 1), a);
31184 M = getTargetVShiftByConstNode(X86OpcI, dl, VT, R, 1, DAG);
31185 R = SignBitSelect(Amt, M, R);
31186 return R;
31187 }
31188
31189 // Decompose 256-bit shifts into 128-bit shifts.
31190 if (VT.is256BitVector())
31191 return splitVectorIntBinary(Op, DAG, dl);
31192
31193 if (VT == MVT::v32i16 || VT == MVT::v64i8)
31194 return splitVectorIntBinary(Op, DAG, dl);
31195
31196 return SDValue();
31197}
31198
31199 static SDValue LowerFunnelShift(SDValue Op, const X86Subtarget &Subtarget,
31200 SelectionDAG &DAG) {
31201 MVT VT = Op.getSimpleValueType();
31202 assert((Op.getOpcode() == ISD::FSHL || Op.getOpcode() == ISD::FSHR) &&
31203 "Unexpected funnel shift opcode!");
31204
31205 SDLoc DL(Op);
31206 SDValue Op0 = Op.getOperand(0);
31207 SDValue Op1 = Op.getOperand(1);
31208 SDValue Amt = Op.getOperand(2);
31209 unsigned EltSizeInBits = VT.getScalarSizeInBits();
31210 bool IsFSHR = Op.getOpcode() == ISD::FSHR;
31211
31212 if (VT.isVector()) {
31213 APInt APIntShiftAmt;
31214 bool IsCstSplat = X86::isConstantSplat(Amt, APIntShiftAmt);
31215 unsigned NumElts = VT.getVectorNumElements();
31216
31217 if (Subtarget.hasVBMI2() && EltSizeInBits > 8) {
31218
31219 if (IsCstSplat) {
31220 if (IsFSHR)
31221 std::swap(Op0, Op1);
31222 uint64_t ShiftAmt = APIntShiftAmt.urem(EltSizeInBits);
31223 SDValue Imm = DAG.getTargetConstant(ShiftAmt, DL, MVT::i8);
31224 return getAVX512Node(IsFSHR ? X86ISD::VSHRD : X86ISD::VSHLD, DL, VT,
31225 {Op0, Op1, Imm}, DAG, Subtarget);
31226 }
31227 return getAVX512Node(IsFSHR ? ISD::FSHR : ISD::FSHL, DL, VT,
31228 {Op0, Op1, Amt}, DAG, Subtarget);
31229 }
31230 assert((VT == MVT::v16i8 || VT == MVT::v32i8 || VT == MVT::v64i8 ||
31231 VT == MVT::v8i16 || VT == MVT::v16i16 || VT == MVT::v32i16 ||
31232 VT == MVT::v4i32 || VT == MVT::v8i32 || VT == MVT::v16i32) &&
31233 "Unexpected funnel shift type!");
31234
31235 // fshl(x,y,z) -> (unpack(y,x) << (z & (bw-1))) >> bw.
31236 // fshr(x,y,z) -> unpack(y,x) >> (z & (bw-1)).
31237 if (IsCstSplat) {
31238 // TODO: Can't use generic expansion as UNDEF amt elements can be
31239 // converted to other values when folded to shift amounts, losing the
31240 // splat.
31241 uint64_t ShiftAmt = APIntShiftAmt.urem(EltSizeInBits);
31242 uint64_t ShXAmt = IsFSHR ? (EltSizeInBits - ShiftAmt) : ShiftAmt;
31243 uint64_t ShYAmt = IsFSHR ? ShiftAmt : (EltSizeInBits - ShiftAmt);
31244 assert((ShXAmt + ShYAmt) == EltSizeInBits && "Illegal funnel shift");
31245 MVT WideVT = MVT::getVectorVT(MVT::i16, NumElts / 2);
31246
31247 if (EltSizeInBits == 8 &&
31248 (Subtarget.hasXOP() ||
31249 (useVPTERNLOG(Subtarget, VT) &&
31250 supportedVectorShiftWithImm(WideVT, Subtarget, ISD::SHL)))) {
31251 // For vXi8 cases on Subtargets that can perform VPCMOV/VPTERNLOG
31252 // bit-select - lower using vXi16 shifts and then perform the bitmask at
31253 // the original vector width to handle cases where we split.
31254 APInt MaskX = APInt::getHighBitsSet(8, 8 - ShXAmt);
31255 APInt MaskY = APInt::getLowBitsSet(8, 8 - ShYAmt);
31256 SDValue ShX =
31257 DAG.getNode(ISD::SHL, DL, WideVT, DAG.getBitcast(WideVT, Op0),
31258 DAG.getShiftAmountConstant(ShXAmt, WideVT, DL));
31259 SDValue ShY =
31260 DAG.getNode(ISD::SRL, DL, WideVT, DAG.getBitcast(WideVT, Op1),
31261 DAG.getShiftAmountConstant(ShYAmt, WideVT, DL));
31262 ShX = DAG.getNode(ISD::AND, DL, VT, DAG.getBitcast(VT, ShX),
31263 DAG.getConstant(MaskX, DL, VT));
31264 ShY = DAG.getNode(ISD::AND, DL, VT, DAG.getBitcast(VT, ShY),
31265 DAG.getConstant(MaskY, DL, VT));
31266 return DAG.getNode(ISD::OR, DL, VT, ShX, ShY);
31267 }
31268
31269 SDValue ShX = DAG.getNode(ISD::SHL, DL, VT, Op0,
31270 DAG.getShiftAmountConstant(ShXAmt, VT, DL));
31271 SDValue ShY = DAG.getNode(ISD::SRL, DL, VT, Op1,
31272 DAG.getShiftAmountConstant(ShYAmt, VT, DL));
31273 return DAG.getNode(ISD::OR, DL, VT, ShX, ShY);
31274 }
31275
31276 SDValue AmtMask = DAG.getConstant(EltSizeInBits - 1, DL, VT);
31277 SDValue AmtMod = DAG.getNode(ISD::AND, DL, VT, Amt, AmtMask);
31278 bool IsCst = ISD::isBuildVectorOfConstantSDNodes(AmtMod.getNode());
31279
31280 // Constant vXi16 funnel shifts can be efficiently handled by default.
31281 if (IsCst && EltSizeInBits == 16)
31282 return SDValue();
31283
31284 unsigned ShiftOpc = IsFSHR ? ISD::SRL : ISD::SHL;
31285 MVT ExtSVT = MVT::getIntegerVT(2 * EltSizeInBits);
31286 MVT ExtVT = MVT::getVectorVT(ExtSVT, NumElts / 2);
31287
31288 // Split 256-bit integers on XOP/pre-AVX2 targets.
31289 // Split 512-bit integers on non 512-bit BWI targets.
31290 if ((VT.is256BitVector() && ((Subtarget.hasXOP() && EltSizeInBits < 16) ||
31291 !Subtarget.hasAVX2())) ||
31292 (VT.is512BitVector() && !Subtarget.useBWIRegs() &&
31293 EltSizeInBits < 32)) {
31294 // Pre-mask the amount modulo using the wider vector.
31295 Op = DAG.getNode(Op.getOpcode(), DL, VT, Op0, Op1, AmtMod);
31296 return splitVectorOp(Op, DAG, DL);
31297 }
31298
31299 // Attempt to fold scalar shift as unpack(y,x) << zext(splat(z))
31300 if (supportedVectorShiftWithBaseAmnt(ExtVT, Subtarget, ShiftOpc)) {
31301 int ScalarAmtIdx = -1;
31302 if (SDValue ScalarAmt = DAG.getSplatSourceVector(AmtMod, ScalarAmtIdx)) {
31303 // Uniform vXi16 funnel shifts can be efficiently handled by default.
31304 if (EltSizeInBits == 16)
31305 return SDValue();
31306
31307 SDValue Lo = DAG.getBitcast(ExtVT, getUnpackl(DAG, DL, VT, Op1, Op0));
31308 SDValue Hi = DAG.getBitcast(ExtVT, getUnpackh(DAG, DL, VT, Op1, Op0));
31309 Lo = getTargetVShiftNode(ShiftOpc, DL, ExtVT, Lo, ScalarAmt,
31310 ScalarAmtIdx, Subtarget, DAG);
31311 Hi = getTargetVShiftNode(ShiftOpc, DL, ExtVT, Hi, ScalarAmt,
31312 ScalarAmtIdx, Subtarget, DAG);
31313 return getPack(DAG, Subtarget, DL, VT, Lo, Hi, !IsFSHR);
31314 }
31315 }
31316
31317 MVT WideSVT = MVT::getIntegerVT(
31318 std::min<unsigned>(EltSizeInBits * 2, Subtarget.hasBWI() ? 16 : 32));
31319 MVT WideVT = MVT::getVectorVT(WideSVT, NumElts);
31320
31321 // If per-element shifts are legal, fallback to generic expansion.
31322 if (supportedVectorVarShift(VT, Subtarget, ShiftOpc) || Subtarget.hasXOP())
31323 return SDValue();
31324
31325 // Attempt to fold as:
31326 // fshl(x,y,z) -> (((aext(x) << bw) | zext(y)) << (z & (bw-1))) >> bw.
31327 // fshr(x,y,z) -> (((aext(x) << bw) | zext(y)) >> (z & (bw-1))).
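// i.e. concatenate x:y into a double-width element and shift the pair by the
// masked amount; the funnel-shifted result lands in the high half for FSHL
// (hence the final >> bw) or in the low half for FSHR, before truncating.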
31328 if (supportedVectorVarShift(WideVT, Subtarget, ShiftOpc) &&
31329 supportedVectorShiftWithImm(WideVT, Subtarget, ShiftOpc)) {
31330 Op0 = DAG.getNode(ISD::ANY_EXTEND, DL, WideVT, Op0);
31331 Op1 = DAG.getNode(ISD::ZERO_EXTEND, DL, WideVT, Op1);
31332 AmtMod = DAG.getNode(ISD::ZERO_EXTEND, DL, WideVT, AmtMod);
31333 Op0 = getTargetVShiftByConstNode(X86ISD::VSHLI, DL, WideVT, Op0,
31334 EltSizeInBits, DAG);
31335 SDValue Res = DAG.getNode(ISD::OR, DL, WideVT, Op0, Op1);
31336 Res = DAG.getNode(ShiftOpc, DL, WideVT, Res, AmtMod);
31337 if (!IsFSHR)
31338 Res = getTargetVShiftByConstNode(X86ISD::VSRLI, DL, WideVT, Res,
31339 EltSizeInBits, DAG);
31340 return DAG.getNode(ISD::TRUNCATE, DL, VT, Res);
31341 }
31342
31343 // Attempt to fold per-element (ExtVT) shift as unpack(y,x) << zext(z)
31344 if (((IsCst || !Subtarget.hasAVX512()) && !IsFSHR && EltSizeInBits <= 16) ||
31345 supportedVectorVarShift(ExtVT, Subtarget, ShiftOpc)) {
31346 SDValue Z = DAG.getConstant(0, DL, VT);
31347 SDValue RLo = DAG.getBitcast(ExtVT, getUnpackl(DAG, DL, VT, Op1, Op0));
31348 SDValue RHi = DAG.getBitcast(ExtVT, getUnpackh(DAG, DL, VT, Op1, Op0));
31349 SDValue ALo = DAG.getBitcast(ExtVT, getUnpackl(DAG, DL, VT, AmtMod, Z));
31350 SDValue AHi = DAG.getBitcast(ExtVT, getUnpackh(DAG, DL, VT, AmtMod, Z));
31351 SDValue Lo = DAG.getNode(ShiftOpc, DL, ExtVT, RLo, ALo);
31352 SDValue Hi = DAG.getNode(ShiftOpc, DL, ExtVT, RHi, AHi);
31353 return getPack(DAG, Subtarget, DL, VT, Lo, Hi, !IsFSHR);
31354 }
31355
31356 // Fallback to generic expansion.
31357 return SDValue();
31358 }
31359 assert(
31360 (VT == MVT::i8 || VT == MVT::i16 || VT == MVT::i32 || VT == MVT::i64) &&
31361 "Unexpected funnel shift type!");
31362
31363 // Expand slow SHLD/SHRD cases if we are not optimizing for size.
31364 bool OptForSize = DAG.shouldOptForSize();
31365 bool ExpandFunnel = !OptForSize && Subtarget.isSHLDSlow();
31366
31367 // fshl(x,y,z) -> (((aext(x) << bw) | zext(y)) << (z & (bw-1))) >> bw.
31368 // fshr(x,y,z) -> (((aext(x) << bw) | zext(y)) >> (z & (bw-1))).
31369 if ((VT == MVT::i8 || (ExpandFunnel && VT == MVT::i16)) &&
31370 !isa<ConstantSDNode>(Amt)) {
31371 SDValue Mask = DAG.getConstant(EltSizeInBits - 1, DL, Amt.getValueType());
31372 SDValue HiShift = DAG.getConstant(EltSizeInBits, DL, Amt.getValueType());
31373 Op0 = DAG.getAnyExtOrTrunc(Op0, DL, MVT::i32);
31374 Op1 = DAG.getZExtOrTrunc(Op1, DL, MVT::i32);
31375 Amt = DAG.getNode(ISD::AND, DL, Amt.getValueType(), Amt, Mask);
31376 SDValue Res = DAG.getNode(ISD::SHL, DL, MVT::i32, Op0, HiShift);
31377 Res = DAG.getNode(ISD::OR, DL, MVT::i32, Res, Op1);
31378 if (IsFSHR) {
31379 Res = DAG.getNode(ISD::SRL, DL, MVT::i32, Res, Amt);
31380 } else {
31381 Res = DAG.getNode(ISD::SHL, DL, MVT::i32, Res, Amt);
31382 Res = DAG.getNode(ISD::SRL, DL, MVT::i32, Res, HiShift);
31383 }
31384 return DAG.getZExtOrTrunc(Res, DL, VT);
31385 }
31386
31387 if (VT == MVT::i8 || ExpandFunnel)
31388 return SDValue();
31389
31390 // i16 needs to modulo the shift amount, but i32/i64 have implicit modulo.
31391 if (VT == MVT::i16) {
31392 Amt = DAG.getNode(ISD::AND, DL, Amt.getValueType(), Amt,
31393 DAG.getConstant(15, DL, Amt.getValueType()));
31394 unsigned FSHOp = (IsFSHR ? X86ISD::FSHR : X86ISD::FSHL);
31395 return DAG.getNode(FSHOp, DL, VT, Op0, Op1, Amt);
31396 }
31397
31398 return Op;
31399}
31400
31401static SDValue LowerRotate(SDValue Op, const X86Subtarget &Subtarget,
31402 SelectionDAG &DAG) {
31403 MVT VT = Op.getSimpleValueType();
31404 assert(VT.isVector() && "Custom lowering only for vector rotates!");
31405
31406 SDLoc DL(Op);
31407 SDValue R = Op.getOperand(0);
31408 SDValue Amt = Op.getOperand(1);
31409 unsigned Opcode = Op.getOpcode();
31410 unsigned EltSizeInBits = VT.getScalarSizeInBits();
31411 int NumElts = VT.getVectorNumElements();
31412 bool IsROTL = Opcode == ISD::ROTL;
31413
31414 // Check for constant splat rotation amount.
31415 APInt CstSplatValue;
31416 bool IsCstSplat = X86::isConstantSplat(Amt, CstSplatValue);
31417
31418 // Check for splat rotate by zero.
31419 if (IsCstSplat && CstSplatValue.urem(EltSizeInBits) == 0)
31420 return R;
31421
31422 // AVX512 implicitly uses modulo rotation amounts.
31423 if ((Subtarget.hasVLX() || Subtarget.hasAVX512()) && 32 <= EltSizeInBits) {
31424 // Attempt to rotate by immediate.
31425 if (IsCstSplat) {
31426 unsigned RotOpc = IsROTL ? X86ISD::VROTLI : X86ISD::VROTRI;
31427 uint64_t RotAmt = CstSplatValue.urem(EltSizeInBits);
31428 return DAG.getNode(RotOpc, DL, VT, R,
31429 DAG.getTargetConstant(RotAmt, DL, MVT::i8));
31430 }
31431
31432 // Else, fall-back on VPROLV/VPRORV.
31433 return Op;
31434 }
31435
31436 // AVX512 VBMI2 vXi16 - lower to funnel shifts.
31437 if (Subtarget.hasVBMI2() && 16 == EltSizeInBits) {
31438 unsigned FunnelOpc = IsROTL ? ISD::FSHL : ISD::FSHR;
31439 return DAG.getNode(FunnelOpc, DL, VT, R, R, Amt);
31440 }
31441
31442 SDValue Z = DAG.getConstant(0, DL, VT);
31443
31444 if (!IsROTL) {
31445 // If the ISD::ROTR amount is constant, we're always better off converting to
31446 // ISD::ROTL.
31447 if (SDValue NegAmt = DAG.FoldConstantArithmetic(ISD::SUB, DL, VT, {Z, Amt}))
31448 return DAG.getNode(ISD::ROTL, DL, VT, R, NegAmt);
31449
31450 // XOP targets always prefer ISD::ROTL.
31451 if (Subtarget.hasXOP())
31452 return DAG.getNode(ISD::ROTL, DL, VT, R,
31453 DAG.getNode(ISD::SUB, DL, VT, Z, Amt));
31454 }
31455
31456 // Attempt to use GFNI gf2p8affine to rotate vXi8 by a uniform constant.
31457 if (IsCstSplat && Subtarget.hasGFNI() && VT.getScalarType() == MVT::i8 &&
31458 DAG.getTargetLoweringInfo().isTypeLegal(VT)) {
31459 uint64_t RotAmt = CstSplatValue.urem(EltSizeInBits);
31460 SDValue Mask = getGFNICtrlMask(Opcode, DAG, DL, VT, RotAmt);
31461 return DAG.getNode(X86ISD::GF2P8AFFINEQB, DL, VT, R, Mask,
31462 DAG.getTargetConstant(0, DL, MVT::i8));
31463 }
31464
31465 // Split 256-bit integers on XOP/pre-AVX2 targets.
31466 if (VT.is256BitVector() && (Subtarget.hasXOP() || !Subtarget.hasAVX2()))
31467 return splitVectorIntBinary(Op, DAG, DL);
31468
31469 // XOP has 128-bit vector variable + immediate rotates.
31470 // +ve/-ve Amt = rotate left/right - just need to handle ISD::ROTL.
31471 // XOP implicitly uses modulo rotation amounts.
31472 if (Subtarget.hasXOP()) {
31473 assert(IsROTL && "Only ROTL expected");
31474 assert(VT.is128BitVector() && "Only rotate 128-bit vectors!");
31475
31476 // Attempt to rotate by immediate.
31477 if (IsCstSplat) {
31478 uint64_t RotAmt = CstSplatValue.urem(EltSizeInBits);
31479 return DAG.getNode(X86ISD::VROTLI, DL, VT, R,
31480 DAG.getTargetConstant(RotAmt, DL, MVT::i8));
31481 }
31482
31483 // Use general rotate by variable (per-element).
31484 return Op;
31485 }
31486
31487 // Rotate by a uniform constant - expand back to shifts.
31488 // TODO: Can't use generic expansion as UNDEF amt elements can be converted
31489 // to other values when folded to shift amounts, losing the splat.
31490 if (IsCstSplat) {
31491 uint64_t RotAmt = CstSplatValue.urem(EltSizeInBits);
31492 uint64_t ShlAmt = IsROTL ? RotAmt : (EltSizeInBits - RotAmt);
31493 uint64_t SrlAmt = IsROTL ? (EltSizeInBits - RotAmt) : RotAmt;
31494 SDValue Shl = DAG.getNode(ISD::SHL, DL, VT, R,
31495 DAG.getShiftAmountConstant(ShlAmt, VT, DL));
31496 SDValue Srl = DAG.getNode(ISD::SRL, DL, VT, R,
31497 DAG.getShiftAmountConstant(SrlAmt, VT, DL));
31498 return DAG.getNode(ISD::OR, DL, VT, Shl, Srl);
31499 }
31500
31501 // Split 512-bit integers on non 512-bit BWI targets.
31502 if (VT.is512BitVector() && !Subtarget.useBWIRegs())
31503 return splitVectorIntBinary(Op, DAG, DL);
31504
31505 assert(
31506 (VT == MVT::v4i32 || VT == MVT::v8i16 || VT == MVT::v16i8 ||
31507 ((VT == MVT::v8i32 || VT == MVT::v16i16 || VT == MVT::v32i8) &&
31508 Subtarget.hasAVX2()) ||
31509 ((VT == MVT::v32i16 || VT == MVT::v64i8) && Subtarget.useBWIRegs())) &&
31510 "Only vXi32/vXi16/vXi8 vector rotates supported");
31511
31512 MVT ExtSVT = MVT::getIntegerVT(2 * EltSizeInBits);
31513 MVT ExtVT = MVT::getVectorVT(ExtSVT, NumElts / 2);
31514
31515 SDValue AmtMask = DAG.getConstant(EltSizeInBits - 1, DL, VT);
31516 SDValue AmtMod = DAG.getNode(ISD::AND, DL, VT, Amt, AmtMask);
31517
31518 // Attempt to fold as unpack(x,x) << zext(splat(y)):
31519 // rotl(x,y) -> (unpack(x,x) << (y & (bw-1))) >> bw.
31520 // rotr(x,y) -> (unpack(x,x) >> (y & (bw-1))).
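// Note: unpack(x,x) duplicates each element into both halves of a 2*bw lane,
// so a single wide shift wraps the rotated-out bits into the adjacent copy;
// getPack then re-extracts the rotated element from the high (ROTL) or low
// (ROTR) half.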
31521 if (EltSizeInBits == 8 || EltSizeInBits == 16 || EltSizeInBits == 32) {
31522 int BaseRotAmtIdx = -1;
31523 if (SDValue BaseRotAmt = DAG.getSplatSourceVector(AmtMod, BaseRotAmtIdx)) {
31524 if (EltSizeInBits == 16 && Subtarget.hasSSE41()) {
31525 unsigned FunnelOpc = IsROTL ? ISD::FSHL : ISD::FSHR;
31526 return DAG.getNode(FunnelOpc, DL, VT, R, R, Amt);
31527 }
31528 unsigned ShiftX86Opc = IsROTL ? X86ISD::VSHLI : X86ISD::VSRLI;
31529 SDValue Lo = DAG.getBitcast(ExtVT, getUnpackl(DAG, DL, VT, R, R));
31530 SDValue Hi = DAG.getBitcast(ExtVT, getUnpackh(DAG, DL, VT, R, R));
31531 Lo = getTargetVShiftNode(ShiftX86Opc, DL, ExtVT, Lo, BaseRotAmt,
31532 BaseRotAmtIdx, Subtarget, DAG);
31533 Hi = getTargetVShiftNode(ShiftX86Opc, DL, ExtVT, Hi, BaseRotAmt,
31534 BaseRotAmtIdx, Subtarget, DAG);
31535 return getPack(DAG, Subtarget, DL, VT, Lo, Hi, IsROTL);
31536 }
31537 }
31538
31539 bool ConstantAmt = ISD::isBuildVectorOfConstantSDNodes(Amt.getNode());
31540 unsigned ShiftOpc = IsROTL ? ISD::SHL : ISD::SRL;
31541
31542 // Attempt to fold as unpack(x,x) << zext(y):
31543 // rotl(x,y) -> (unpack(x,x) << (y & (bw-1))) >> bw.
31544 // rotr(x,y) -> (unpack(x,x) >> (y & (bw-1))).
31545 // Const vXi16/vXi32 are excluded in favor of MUL-based lowering.
31546 if (!(ConstantAmt && EltSizeInBits != 8) &&
31547 !supportedVectorVarShift(VT, Subtarget, ShiftOpc) &&
31548 (ConstantAmt || supportedVectorVarShift(ExtVT, Subtarget, ShiftOpc))) {
31549 SDValue RLo = DAG.getBitcast(ExtVT, getUnpackl(DAG, DL, VT, R, R));
31550 SDValue RHi = DAG.getBitcast(ExtVT, getUnpackh(DAG, DL, VT, R, R));
31551 SDValue ALo = DAG.getBitcast(ExtVT, getUnpackl(DAG, DL, VT, AmtMod, Z));
31552 SDValue AHi = DAG.getBitcast(ExtVT, getUnpackh(DAG, DL, VT, AmtMod, Z));
31553 SDValue Lo = DAG.getNode(ShiftOpc, DL, ExtVT, RLo, ALo);
31554 SDValue Hi = DAG.getNode(ShiftOpc, DL, ExtVT, RHi, AHi);
31555 return getPack(DAG, Subtarget, DL, VT, Lo, Hi, IsROTL);
31556 }
31557
31558 // v16i8/v32i8/v64i8: Split rotation into rot4/rot2/rot1 stages and select by
31559 // the amount bit.
31560 // TODO: We're doing nothing here that we couldn't do for funnel shifts.
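// e.g. a rotate by 5 (0b101) applies the rot-by-4 and rot-by-1 stages; each
// stage is selected on one bit of the amount, moved into the byte sign bit by
// the initial << 5 and the a += a doublings below.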
31561 if (EltSizeInBits == 8) {
31562 MVT WideVT =
31563 MVT::getVectorVT(Subtarget.hasBWI() ? MVT::i16 : MVT::i32, NumElts);
31564
31565 // Attempt to fold as:
31566 // rotl(x,y) -> (((aext(x) << bw) | zext(x)) << (y & (bw-1))) >> bw.
31567 // rotr(x,y) -> (((aext(x) << bw) | zext(x)) >> (y & (bw-1))).
31568 if (supportedVectorVarShift(WideVT, Subtarget, ShiftOpc) &&
31569 supportedVectorShiftWithImm(WideVT, Subtarget, ShiftOpc)) {
31570 // If we're rotating by constant, just use default promotion.
31571 if (ConstantAmt)
31572 return SDValue();
31573 // See if we can perform this by widening to vXi16 or vXi32.
31574 R = DAG.getNode(ISD::ZERO_EXTEND, DL, WideVT, R);
31575 R = DAG.getNode(
31576 ISD::OR, DL, WideVT, R,
31577 getTargetVShiftByConstNode(X86ISD::VSHLI, DL, WideVT, R, 8, DAG));
31578 Amt = DAG.getNode(ISD::ZERO_EXTEND, DL, WideVT, AmtMod);
31579 R = DAG.getNode(ShiftOpc, DL, WideVT, R, Amt);
31580 if (IsROTL)
31581 R = getTargetVShiftByConstNode(X86ISD::VSRLI, DL, WideVT, R, 8, DAG);
31582 return DAG.getNode(ISD::TRUNCATE, DL, VT, R);
31583 }
31584
31585 // We don't need ModuloAmt here as we just peek at individual bits.
31586 auto SignBitSelect = [&](MVT SelVT, SDValue Sel, SDValue V0, SDValue V1) {
31587 if (Subtarget.hasSSE41()) {
31588 // On SSE41 targets we can use PBLENDVB which selects bytes based just
31589 // on the sign bit.
31590 V0 = DAG.getBitcast(VT, V0);
31591 V1 = DAG.getBitcast(VT, V1);
31592 Sel = DAG.getBitcast(VT, Sel);
31593 return DAG.getBitcast(SelVT,
31594 DAG.getNode(X86ISD::BLENDV, DL, VT, Sel, V0, V1));
31595 }
31596 // On pre-SSE41 targets we test for the sign bit by comparing to
31597 // zero - a negative value will set all bits of the lanes to true
31598 // and VSELECT uses that in its OR(AND(V0,C),AND(V1,~C)) lowering.
31599 SDValue Z = DAG.getConstant(0, DL, SelVT);
31600 SDValue C = DAG.getNode(X86ISD::PCMPGT, DL, SelVT, Z, Sel);
31601 return DAG.getSelect(DL, SelVT, C, V0, V1);
31602 };
31603
31604 // ISD::ROTR is currently only profitable on AVX512 targets with VPTERNLOG.
31605 if (!IsROTL && !useVPTERNLOG(Subtarget, VT)) {
31606 Amt = DAG.getNode(ISD::SUB, DL, VT, Z, Amt);
31607 IsROTL = true;
31608 }
31609
31610 unsigned ShiftLHS = IsROTL ? ISD::SHL : ISD::SRL;
31611 unsigned ShiftRHS = IsROTL ? ISD::SRL : ISD::SHL;
31612
31613 // Turn 'a' into a mask suitable for VSELECT: a = a << 5;
31614 // We can safely do this using i16 shifts as we're only interested in
31615 // the 3 lower bits of each byte.
31616 Amt = DAG.getBitcast(ExtVT, Amt);
31617 Amt = DAG.getNode(ISD::SHL, DL, ExtVT, Amt, DAG.getConstant(5, DL, ExtVT));
31618 Amt = DAG.getBitcast(VT, Amt);
31619
31620 // r = VSELECT(r, rot(r, 4), a);
31621 SDValue M;
31622 M = DAG.getNode(
31623 ISD::OR, DL, VT,
31624 DAG.getNode(ShiftLHS, DL, VT, R, DAG.getConstant(4, DL, VT)),
31625 DAG.getNode(ShiftRHS, DL, VT, R, DAG.getConstant(4, DL, VT)));
31626 R = SignBitSelect(VT, Amt, M, R);
31627
31628 // a += a
31629 Amt = DAG.getNode(ISD::ADD, DL, VT, Amt, Amt);
31630
31631 // r = VSELECT(r, rot(r, 2), a);
31632 M = DAG.getNode(
31633 ISD::OR, DL, VT,
31634 DAG.getNode(ShiftLHS, DL, VT, R, DAG.getConstant(2, DL, VT)),
31635 DAG.getNode(ShiftRHS, DL, VT, R, DAG.getConstant(6, DL, VT)));
31636 R = SignBitSelect(VT, Amt, M, R);
31637
31638 // a += a
31639 Amt = DAG.getNode(ISD::ADD, DL, VT, Amt, Amt);
31640
31641 // return VSELECT(r, rot(r, 1), a);
31642 M = DAG.getNode(
31643 ISD::OR, DL, VT,
31644 DAG.getNode(ShiftLHS, DL, VT, R, DAG.getConstant(1, DL, VT)),
31645 DAG.getNode(ShiftRHS, DL, VT, R, DAG.getConstant(7, DL, VT)));
31646 return SignBitSelect(VT, Amt, M, R);
31647 }
31648
31649 bool IsSplatAmt = DAG.isSplatValue(Amt);
31650 bool LegalVarShifts = supportedVectorVarShift(VT, Subtarget, ISD::SHL) &&
31651 supportedVectorVarShift(VT, Subtarget, ISD::SRL);
31652
31653 // Fallback for splats + all supported variable shifts.
31654 // Fallback for non-constant AVX2 vXi16 as well.
31655 if (IsSplatAmt || LegalVarShifts || (Subtarget.hasAVX2() && !ConstantAmt)) {
31656 Amt = DAG.getNode(ISD::AND, DL, VT, Amt, AmtMask);
31657 SDValue AmtR = DAG.getConstant(EltSizeInBits, DL, VT);
31658 AmtR = DAG.getNode(ISD::SUB, DL, VT, AmtR, Amt);
31659 SDValue SHL = DAG.getNode(IsROTL ? ISD::SHL : ISD::SRL, DL, VT, R, Amt);
31660 SDValue SRL = DAG.getNode(IsROTL ? ISD::SRL : ISD::SHL, DL, VT, R, AmtR);
31661 return DAG.getNode(ISD::OR, DL, VT, SHL, SRL);
31662 }
31663
31664 // Everything below assumes ISD::ROTL.
31665 if (!IsROTL) {
31666 Amt = DAG.getNode(ISD::SUB, DL, VT, Z, Amt);
31667 IsROTL = true;
31668 }
31669
31670 // ISD::ROT* uses modulo rotate amounts.
31671 Amt = DAG.getNode(ISD::AND, DL, VT, Amt, AmtMask);
31672
31673 assert(IsROTL && "Only ROTL supported");
31674
31675 // As with shifts, attempt to convert the rotation amount to a multiplication
31676 // factor, falling back to general expansion otherwise.
31677 SDValue Scale = convertShiftLeftToScale(Amt, DL, Subtarget, DAG);
31678 if (!Scale)
31679 return SDValue();
31680
31681 // v8i16/v16i16: perform unsigned multiply hi/lo and OR the results.
31682 if (EltSizeInBits == 16) {
31683 SDValue Lo = DAG.getNode(ISD::MUL, DL, VT, R, Scale);
31684 SDValue Hi = DAG.getNode(ISD::MULHU, DL, VT, R, Scale);
31685 return DAG.getNode(ISD::OR, DL, VT, Lo, Hi);
31686 }
31687
31688 // v4i32: make use of the PMULUDQ instruction to multiply 2 lanes of v4i32
31689 // to v2i64 results at a time. The upper 32-bits contain the wrapped bits
31690 // that can then be OR'd with the lower 32-bits.
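// Note: Res02/Res13 below hold {lo,hi} 32-bit pairs for the even/odd lanes;
// the two shuffles gather all low halves (the << part) and all high halves
// (the wrapped-around >> part) back into v4i32 before the final OR.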
31691 assert(VT == MVT::v4i32 && "Only v4i32 vector rotate expected");
31692 static const int OddMask[] = {1, 1, 3, 3};
31693 SDValue R13 = DAG.getVectorShuffle(VT, DL, R, R, OddMask);
31694 SDValue Scale13 = DAG.getVectorShuffle(VT, DL, Scale, Scale, OddMask);
31695
31696 SDValue Res02 = DAG.getNode(X86ISD::PMULUDQ, DL, MVT::v2i64,
31697 DAG.getBitcast(MVT::v2i64, R),
31698 DAG.getBitcast(MVT::v2i64, Scale));
31699 SDValue Res13 = DAG.getNode(X86ISD::PMULUDQ, DL, MVT::v2i64,
31700 DAG.getBitcast(MVT::v2i64, R13),
31701 DAG.getBitcast(MVT::v2i64, Scale13));
31702 Res02 = DAG.getBitcast(VT, Res02);
31703 Res13 = DAG.getBitcast(VT, Res13);
31704
31705 return DAG.getNode(ISD::OR, DL, VT,
31706 DAG.getVectorShuffle(VT, DL, Res02, Res13, {0, 4, 2, 6}),
31707 DAG.getVectorShuffle(VT, DL, Res02, Res13, {1, 5, 3, 7}));
31708}
31709
31710/// Returns true if the operand type is exactly twice the native width, and
31711/// the corresponding cmpxchg8b or cmpxchg16b instruction is available.
31712/// Used to know whether to use cmpxchg8/16b when expanding atomic operations
31713/// (otherwise we leave them alone to become __sync_fetch_and_... calls).
31714bool X86TargetLowering::needsCmpXchgNb(Type *MemType) const {
31715 unsigned OpWidth = MemType->getPrimitiveSizeInBits();
31716
31717 if (OpWidth == 64)
31718 return Subtarget.canUseCMPXCHG8B() && !Subtarget.is64Bit();
31719 if (OpWidth == 128)
31720 return Subtarget.canUseCMPXCHG16B();
31721
31722 return false;
31723}
31724
31725TargetLowering::AtomicExpansionKind
31726X86TargetLowering::shouldExpandAtomicStoreInIR(StoreInst *SI) const {
31727 Type *MemType = SI->getValueOperand()->getType();
31728
31729 if (!SI->getFunction()->hasFnAttribute(Attribute::NoImplicitFloat) &&
31730 !Subtarget.useSoftFloat()) {
31731 if (MemType->getPrimitiveSizeInBits() == 64 && !Subtarget.is64Bit() &&
31732 (Subtarget.hasSSE1() || Subtarget.hasX87()))
31733 return AtomicExpansionKind::None;
31734
31735 if (MemType->getPrimitiveSizeInBits() == 128 && Subtarget.is64Bit() &&
31736 Subtarget.hasAVX())
31737 return AtomicExpansionKind::None;
31738 }
31739
31740 return needsCmpXchgNb(MemType) ? AtomicExpansionKind::Expand
31741 : AtomicExpansionKind::None;
31742}
31743
31744// Note: this turns large loads into lock cmpxchg8b/16b.
31745TargetLowering::AtomicExpansionKind
31746X86TargetLowering::shouldExpandAtomicLoadInIR(LoadInst *LI) const {
31747 Type *MemType = LI->getType();
31748
31749 if (!LI->getFunction()->hasFnAttribute(Attribute::NoImplicitFloat) &&
31750 !Subtarget.useSoftFloat()) {
31751 // If this is a 64-bit atomic load on a 32-bit target and SSE2 is enabled, we
31752 // can use movq to do the load. If we have X87 we can load into an 80-bit
31753 // X87 register and store it to a stack temporary.
31754 if (MemType->getPrimitiveSizeInBits() == 64 && !Subtarget.is64Bit() &&
31755 (Subtarget.hasSSE1() || Subtarget.hasX87()))
31756 return AtomicExpansionKind::None;
31757
31758 // If this is a 128-bit load with AVX, 128-bit SSE loads/stores are atomic.
31759 if (MemType->getPrimitiveSizeInBits() == 128 && Subtarget.is64Bit() &&
31760 Subtarget.hasAVX())
31761 return AtomicExpansionKind::None;
31762 }
31763
31764 return needsCmpXchgNb(MemType) ? AtomicExpansionKind::CmpXChg
31765 : AtomicExpansionKind::None;
31766}
31767
31768enum BitTestKind : unsigned {
31769 UndefBit,
31770 ConstantBit,
31771 NotConstantBit,
31772 ShiftBit,
31773 NotShiftBit
31774};
31775
31776static std::pair<Value *, BitTestKind> FindSingleBitChange(Value *V) {
31777 using namespace llvm::PatternMatch;
31778 BitTestKind BTK = UndefBit;
31779 if (auto *C = dyn_cast<ConstantInt>(V)) {
31780 // Check if V is a power of 2 or NOT power of 2.
31781 if (isPowerOf2_64(C->getZExtValue()))
31782 BTK = ConstantBit;
31783 else if (isPowerOf2_64((~C->getValue()).getZExtValue()))
31784 BTK = NotConstantBit;
31785 return {V, BTK};
31786 }
31787
31788 // Check if V is some power of 2 pattern known to be non-zero
31789 if (auto *I = dyn_cast<Instruction>(V)) {
31790 bool Not = false;
31791 // Check if we have a NOT
31792 Value *PeekI;
31793 if (match(I, m_Not(m_Value(PeekI))) ||
31794 match(I, m_Sub(m_AllOnes(), m_Value(PeekI)))) {
31795 Not = true;
31796 I = dyn_cast<Instruction>(PeekI);
31797
31798 // If I is constant, it will fold and we can evaluate later. If it's an
31799 // argument or something of that nature, we can't analyze.
31800 if (I == nullptr)
31801 return {nullptr, UndefBit};
31802 }
31803 // We can only use 1 << X without more sophisticated analysis. C << X where
31804 // C is a power of 2 but not 1 can result in zero which cannot be translated
31805 // to bittest. Likewise any C >> X (either arith or logical) can be zero.
31806 if (I->getOpcode() == Instruction::Shl) {
31807 // Todo(1): The cmpxchg case is pretty costly so matching `BLSI(X)`, `X &
31808 // -X` and some other provable power of 2 patterns that we can use CTZ on
31809 // may be profitable.
31810 // Todo(2): It may be possible in some cases to prove that Shl(C, X) is
31811 // non-zero even where C != 1. Likewise LShr(C, X) and AShr(C, X) may also
31812 // be provably a non-zero power of 2.
31813 // Todo(3): ROTL and ROTR patterns on a power of 2 C should also be
31814 // transformable to bittest.
31815 auto *ShiftVal = dyn_cast<ConstantInt>(I->getOperand(0));
31816 if (!ShiftVal)
31817 return {nullptr, UndefBit};
31818 if (ShiftVal->equalsInt(1))
31819 BTK = Not ? NotShiftBit : ShiftBit;
31820
31821 if (BTK == UndefBit)
31822 return {nullptr, UndefBit};
31823
31824 Value *BitV = I->getOperand(1);
31825
31826 // Read past a shiftmask instruction to find count
31827 Value *AndOp;
31828 uint64_t ShiftMask = I->getType()->getPrimitiveSizeInBits() - 1;
31829 if (match(BitV, m_c_And(m_Value(AndOp), m_SpecificInt(ShiftMask))))
31830 BitV = AndOp;
31831
31832 return {BitV, BTK};
31833 }
31834 }
31835 return {nullptr, UndefBit};
31836}
31837
31838TargetLowering::AtomicExpansionKind
31839X86TargetLowering::shouldExpandLogicAtomicRMWInIR(AtomicRMWInst *AI) const {
31840 using namespace llvm::PatternMatch;
31841 // If the atomicrmw's result isn't actually used, we can just add a "lock"
31842 // prefix to a normal instruction for these operations.
31843 if (AI->use_empty())
31844 return AtomicExpansionKind::None;
31845
31846 if (AI->getOperation() == AtomicRMWInst::Xor) {
31847 // A ^ SignBit -> A + SignBit. This allows us to use `xadd` which is
31848 // preferable to both `cmpxchg` and `btc`.
31849 if (match(AI->getOperand(1), m_SignMask()))
31850 return AtomicExpansionKind::Expand;
31851 }
31852
31853 // If the atomicrmw's result is used by a single bit AND, we may use
31854 // bts/btr/btc instruction for these operations.
31855 // Note: InstCombinePass can cause a de-optimization here. It replaces the
31856 // SETCC(And(AtomicRMW(P, power_of_2), power_of_2)) with LShr and Xor
31857 // (depending on CC). This pattern can only use bts/btr/btc but we don't
31858 // detect it.
31859 Instruction *I = AI->user_back();
31860 auto BitChange = FindSingleBitChange(AI->getValOperand());
31861 if (BitChange.second == UndefBit || !AI->hasOneUse() ||
31862 I->getOpcode() != Instruction::And ||
31863 AI->getType()->getPrimitiveSizeInBits() == 8 ||
31864 AI->getParent() != I->getParent())
31865 return AtomicExpansionKind::CmpXChg;
31866
31867 unsigned OtherIdx = I->getOperand(0) == AI ? 1 : 0;
31868
31869 // This is a redundant AND, it should get cleaned up elsewhere.
31870 if (AI == I->getOperand(OtherIdx))
31871 return AtomicExpansionKind::CmpXChg;
31872
31873 // The following instruction must be an AND with a single bit.
31874 if (BitChange.second == ConstantBit || BitChange.second == NotConstantBit) {
31875 auto *C1 = cast<ConstantInt>(AI->getValOperand());
31876 auto *C2 = dyn_cast<ConstantInt>(I->getOperand(OtherIdx));
31877 if (!C2 || !isPowerOf2_64(C2->getZExtValue())) {
31878 return AtomicExpansionKind::CmpXChg;
31879 }
31880 if (AI->getOperation() == AtomicRMWInst::And) {
31881 return ~C1->getValue() == C2->getValue()
31882 ? AtomicExpansionKind::BitTestIntrinsic
31883 : AtomicExpansionKind::CmpXChg;
31884 }
31885 return C1 == C2 ? AtomicExpansionKind::BitTestIntrinsic
31886 : AtomicExpansionKind::CmpXChg;
31887 }
31888
31889 assert(BitChange.second == ShiftBit || BitChange.second == NotShiftBit);
31890
31891 auto BitTested = FindSingleBitChange(I->getOperand(OtherIdx));
31892 if (BitTested.second != ShiftBit && BitTested.second != NotShiftBit)
31893 return AtomicExpansionKind::CmpXChg;
31894
31895 assert(BitChange.first != nullptr && BitTested.first != nullptr);
31896
31897 // If shift amounts are not the same we can't use BitTestIntrinsic.
31898 if (BitChange.first != BitTested.first)
31899 return AtomicExpansionKind::CmpXChg;
31900
31901 // For an atomic AND, the operand must clear exactly one bit (i.e. be a
31902 // ~(1 << X) mask) and the following AND must test that same, cleared bit.
31903 if (AI->getOperation() == AtomicRMWInst::And)
31904 return (BitChange.second == NotShiftBit && BitTested.second == ShiftBit)
31905 ? AtomicExpansionKind::BitTestIntrinsic
31906 : AtomicExpansionKind::CmpXChg;
31907
31908 // For an atomic XOR/OR, we must be setting and testing the same bit.
31909 return (BitChange.second == ShiftBit && BitTested.second == ShiftBit)
31910 ? AtomicExpansionKind::BitTestIntrinsic
31911 : AtomicExpansionKind::CmpXChg;
31912}
31913
31914void X86TargetLowering::emitBitTestAtomicRMWIntrinsic(AtomicRMWInst *AI) const {
31915 IRBuilder<> Builder(AI);
31916 Builder.CollectMetadataToCopy(AI, {LLVMContext::MD_pcsections});
31917 Intrinsic::ID IID_C;
31918 Intrinsic::ID IID_I;
31919 switch (AI->getOperation()) {
31920 default:
31921 llvm_unreachable("Unknown atomic operation");
31922 case AtomicRMWInst::Or:
31923 IID_C = Intrinsic::x86_atomic_bts;
31924 IID_I = Intrinsic::x86_atomic_bts_rm;
31925 break;
31926 case AtomicRMWInst::Xor:
31927 IID_C = Intrinsic::x86_atomic_btc;
31928 IID_I = Intrinsic::x86_atomic_btc_rm;
31929 break;
31930 case AtomicRMWInst::And:
31931 IID_C = Intrinsic::x86_atomic_btr;
31932 IID_I = Intrinsic::x86_atomic_btr_rm;
31933 break;
31934 }
31935 Instruction *I = AI->user_back();
31936 LLVMContext &Ctx = AI->getContext();
31937 Value *Addr = Builder.CreatePointerCast(AI->getPointerOperand(),
31938 PointerType::getUnqual(Ctx));
31939 Value *Result = nullptr;
31940 auto BitTested = FindSingleBitChange(AI->getValOperand());
31941 assert(BitTested.first != nullptr);
31942
31943 if (BitTested.second == ConstantBit || BitTested.second == NotConstantBit) {
31944 auto *C = cast<ConstantInt>(I->getOperand(I->getOperand(0) == AI ? 1 : 0));
31945
31946 unsigned Imm = llvm::countr_zero(C->getZExtValue());
31947 Result = Builder.CreateIntrinsic(IID_C, AI->getType(),
31948 {Addr, Builder.getInt8(Imm)});
31949 } else {
31950 assert(BitTested.second == ShiftBit || BitTested.second == NotShiftBit);
31951
31952 Value *SI = BitTested.first;
31953 assert(SI != nullptr);
31954
31955 // BT{S|R|C} on a memory operand doesn't modulo the bit position, so we
31956 // need to mask it.
31957 unsigned ShiftBits = SI->getType()->getPrimitiveSizeInBits();
31958 Value *BitPos =
31959 Builder.CreateAnd(SI, Builder.getIntN(ShiftBits, ShiftBits - 1));
31960 // Todo(1): In many cases it may be provable that SI is less than
31961 // ShiftBits in which case this mask is unnecessary
31962 // Todo(2): In the fairly idiomatic case of P[X / sizeof_bits(X)] OP 1
31963 // << (X % sizeof_bits(X)) we can drop the shift mask and AGEN in
31964 // favor of just a raw BT{S|R|C}.
31965
31966 Result = Builder.CreateIntrinsic(IID_I, AI->getType(), {Addr, BitPos});
31967 Result = Builder.CreateZExtOrTrunc(Result, AI->getType());
31968
31969 // If the result is only used for zero/non-zero status then we don't need
31970 // to shift the value back. Otherwise do so.
31971 for (auto It = I->user_begin(); It != I->user_end(); ++It) {
31972 if (auto *ICmp = dyn_cast<ICmpInst>(*It)) {
31973 if (ICmp->isEquality()) {
31974 auto *C0 = dyn_cast<ConstantInt>(ICmp->getOperand(0));
31975 auto *C1 = dyn_cast<ConstantInt>(ICmp->getOperand(1));
31976 if (C0 || C1) {
31977 assert(C0 == nullptr || C1 == nullptr);
31978 if ((C0 ? C0 : C1)->isZero())
31979 continue;
31980 }
31981 }
31982 }
31983 Result = Builder.CreateShl(Result, BitPos);
31984 break;
31985 }
31986 }
31987
31988 I->replaceAllUsesWith(Result);
31989 I->eraseFromParent();
31990 AI->eraseFromParent();
31991}
31992
31993static bool shouldExpandCmpArithRMWInIR(AtomicRMWInst *AI) {
31994 using namespace llvm::PatternMatch;
31995 if (!AI->hasOneUse())
31996 return false;
31997
31998 Value *Op = AI->getOperand(1);
31999 CmpPredicate Pred;
32000 Instruction *I = AI->user_back();
32001 AtomicRMWInst::BinOp Opc = AI->getOperation();
32002 if (Opc == AtomicRMWInst::Add) {
32003 if (match(I, m_c_ICmp(Pred, m_Sub(m_ZeroInt(), m_Specific(Op)), m_Value())))
32004 return Pred == CmpInst::ICMP_EQ || Pred == CmpInst::ICMP_NE;
32005 if (match(I, m_OneUse(m_c_Add(m_Specific(Op), m_Value())))) {
32006 if (match(I->user_back(),
32008 return true;
32009 if (match(I->user_back(),
32011 return true;
32012 }
32013 return false;
32014 }
32015 if (Opc == AtomicRMWInst::Sub) {
32016 if (match(I, m_c_ICmp(Pred, m_Specific(Op), m_Value())))
32017 return Pred == CmpInst::ICMP_EQ || Pred == CmpInst::ICMP_NE;
32018 if (match(I, m_OneUse(m_Sub(m_Value(), m_Specific(Op))))) {
32019 if (match(I->user_back(),
32021 return true;
32022 if (match(I->user_back(),
32024 return true;
32025 }
32026 return false;
32027 }
32028 if ((Opc == AtomicRMWInst::Or &&
32029 match(I, m_OneUse(m_c_Or(m_Specific(Op), m_Value())))) ||
32030 (Opc == AtomicRMWInst::And &&
32031 match(I, m_OneUse(m_c_And(m_Specific(Op), m_Value()))))) {
32032 if (match(I->user_back(), m_ICmp(Pred, m_Value(), m_ZeroInt())))
32033 return Pred == CmpInst::ICMP_EQ || Pred == CmpInst::ICMP_NE ||
32034 Pred == CmpInst::ICMP_SLT;
32035 if (match(I->user_back(),
32037 return true;
32038 return false;
32039 }
32040 if (Opc == AtomicRMWInst::Xor) {
32041 if (match(I, m_c_ICmp(Pred, m_Specific(Op), m_Value())))
32042 return Pred == CmpInst::ICMP_EQ || Pred == CmpInst::ICMP_NE;
32043 if (match(I, m_OneUse(m_c_Xor(m_Specific(Op), m_Value())))) {
32044 if (match(I->user_back(),
32046 return true;
32047 if (match(I->user_back(),
32049 return true;
32050 }
32051 return false;
32052 }
32053
32054 return false;
32055}
32056
32057void X86TargetLowering::emitCmpArithAtomicRMWIntrinsic(
32058 AtomicRMWInst *AI) const {
32059 IRBuilder<> Builder(AI);
32060 Builder.CollectMetadataToCopy(AI, {LLVMContext::MD_pcsections});
32061 Instruction *TempI = nullptr;
32062 LLVMContext &Ctx = AI->getContext();
32063 ICmpInst *ICI = dyn_cast<ICmpInst>(AI->user_back());
32064 if (!ICI) {
32065 TempI = AI->user_back();
32066 assert(TempI->hasOneUse() && "Must have one use");
32067 ICI = cast<ICmpInst>(TempI->user_back());
32068 }
32069 X86::CondCode CC;
32070 ICmpInst::Predicate Pred = ICI->getPredicate();
32071 switch (Pred) {
32072 default:
32073 llvm_unreachable("Not supported Pred");
32074 case CmpInst::ICMP_EQ:
32075 CC = X86::COND_E;
32076 break;
32077 case CmpInst::ICMP_NE:
32078 CC = X86::COND_NE;
32079 break;
32080 case CmpInst::ICMP_SLT:
32081 CC = X86::COND_S;
32082 break;
32083 case CmpInst::ICMP_SGT:
32084 CC = X86::COND_NS;
32085 break;
32086 }
32087 Intrinsic::ID IID;
32088 switch (AI->getOperation()) {
32089 default:
32090 llvm_unreachable("Unknown atomic operation");
32091 case AtomicRMWInst::Add:
32092 IID = Intrinsic::x86_atomic_add_cc;
32093 break;
32094 case AtomicRMWInst::Sub:
32095 IID = Intrinsic::x86_atomic_sub_cc;
32096 break;
32097 case AtomicRMWInst::Or:
32098 IID = Intrinsic::x86_atomic_or_cc;
32099 break;
32100 case AtomicRMWInst::And:
32101 IID = Intrinsic::x86_atomic_and_cc;
32102 break;
32103 case AtomicRMWInst::Xor:
32104 IID = Intrinsic::x86_atomic_xor_cc;
32105 break;
32106 }
32107 Value *Addr = Builder.CreatePointerCast(AI->getPointerOperand(),
32108 PointerType::getUnqual(Ctx));
32109 Value *Call = Builder.CreateIntrinsic(
32110 IID, AI->getType(),
32111 {Addr, AI->getValOperand(), Builder.getInt32((unsigned)CC)});
32112 Value *Result = Builder.CreateTrunc(Call, Type::getInt1Ty(Ctx));
32113 ICI->replaceAllUsesWith(Result);
32114 ICI->eraseFromParent();
32115 if (TempI)
32116 TempI->eraseFromParent();
32117 AI->eraseFromParent();
32118}
32119
32120TargetLowering::AtomicExpansionKind
32121X86TargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *AI) const {
32122 unsigned NativeWidth = Subtarget.is64Bit() ? 64 : 32;
32123 Type *MemType = AI->getType();
32124
32125 // If the operand is too big, we must see if cmpxchg8/16b is available
32126 // and default to library calls otherwise.
32127 if (MemType->getPrimitiveSizeInBits() > NativeWidth) {
32128 return needsCmpXchgNb(MemType) ? AtomicExpansionKind::CmpXChg
32129 : AtomicExpansionKind::None;
32130 }
32131
32132 AtomicRMWInst::BinOp Op = AI->getOperation();
32133 switch (Op) {
32134 case AtomicRMWInst::Xchg:
32135 return AtomicExpansionKind::None;
32136 case AtomicRMWInst::Add:
32137 case AtomicRMWInst::Sub:
32138 if (shouldExpandCmpArithRMWInIR(AI))
32139 return AtomicExpansionKind::CmpArithIntrinsic;
32140 // It's better to use xadd, xsub or xchg for these in other cases.
32141 return AtomicExpansionKind::None;
32142 case AtomicRMWInst::Or:
32143 case AtomicRMWInst::And:
32144 case AtomicRMWInst::Xor:
32145 if (shouldExpandCmpArithRMWInIR(AI))
32146 return AtomicExpansionKind::CmpArithIntrinsic;
32147 return shouldExpandLogicAtomicRMWInIR(AI);
32148 case AtomicRMWInst::Nand:
32149 case AtomicRMWInst::Max:
32150 case AtomicRMWInst::Min:
32161 default:
32162 // These always require a non-trivial set of data operations on x86. We must
32163 // use a cmpxchg loop.
32164 return AtomicExpansionKind::CmpXChg;
32165 }
32166}
32167
32168LoadInst *
32169X86TargetLowering::lowerIdempotentRMWIntoFencedLoad(AtomicRMWInst *AI) const {
32170 unsigned NativeWidth = Subtarget.is64Bit() ? 64 : 32;
32171 Type *MemType = AI->getType();
32172 // Accesses larger than the native width are turned into cmpxchg/libcalls, so
32173 // there is no benefit in turning such RMWs into loads, and it is actually
32174 // harmful as it introduces an mfence.
32175 if (MemType->getPrimitiveSizeInBits() > NativeWidth)
32176 return nullptr;
32177
32178 // If this is a canonical idempotent atomicrmw w/no uses, we have a better
32179 // lowering available in lowerAtomicArith.
32180 // TODO: push more cases through this path.
32181 if (auto *C = dyn_cast<ConstantInt>(AI->getValOperand()))
32182 if (AI->getOperation() == AtomicRMWInst::Or && C->isZero() &&
32183 AI->use_empty())
32184 return nullptr;
32185
32186 IRBuilder<> Builder(AI);
32187 Builder.CollectMetadataToCopy(AI, {LLVMContext::MD_pcsections});
32188 auto SSID = AI->getSyncScopeID();
32189 // We must restrict the ordering to avoid generating loads with Release or
32190 // ReleaseAcquire orderings.
32191 auto Order = AtomicCmpXchgInst::getStrongestFailureOrdering(AI->getOrdering());
32192
32193 // Before the load we need a fence. Here is an example lifted from
32194 // http://www.hpl.hp.com/techreports/2012/HPL-2012-68.pdf showing why a fence
32195 // is required:
32196 // Thread 0:
32197 // x.store(1, relaxed);
32198 // r1 = y.fetch_add(0, release);
32199 // Thread 1:
32200 // y.fetch_add(42, acquire);
32201 // r2 = x.load(relaxed);
32202 // r1 = r2 = 0 is impossible, but becomes possible if the idempotent rmw is
32203 // lowered to just a load without a fence. A mfence flushes the store buffer,
32204 // making the optimization clearly correct.
32205 // FIXME: it is required if isReleaseOrStronger(Order) but it is not clear
32206 // otherwise, we might be able to be more aggressive on relaxed idempotent
32207 // rmw. In practice, they do not look useful, so we don't try to be
32208 // especially clever.
32209
32210 // Use `fence seq_cst` over `llvm.x86.sse2.mfence` here to get the correct
32211 // lowering for SSID == SyncScope::SingleThread and avoidMFence || !hasMFence
32212 Builder.CreateFence(AtomicOrdering::SequentiallyConsistent, SSID);
32213
32214 // Finally we can emit the atomic load.
32215 LoadInst *Loaded = Builder.CreateAlignedLoad(
32216 AI->getType(), AI->getPointerOperand(), AI->getAlign());
32217 Loaded->setAtomic(Order, SSID);
32218 AI->replaceAllUsesWith(Loaded);
32219 AI->eraseFromParent();
32220 return Loaded;
32221}
32222
32223/// Emit a locked operation on a stack location which does not change any
32224/// memory location, but does involve a lock prefix. Location is chosen to be
32225/// a) very likely accessed only by a single thread to minimize cache traffic,
32226/// and b) definitely dereferenceable. Returns the new Chain result.
32227static SDValue emitLockedStackOp(SelectionDAG &DAG,
32228 const X86Subtarget &Subtarget, SDValue Chain,
32229 const SDLoc &DL) {
32230 // Implementation notes:
32231 // 1) LOCK prefix creates a full read/write reordering barrier for memory
32232 // operations issued by the current processor. As such, the location
32233 // referenced is not relevant for the ordering properties of the instruction.
32234 // See: Intel® 64 and IA-32 Architectures Software Developer’s Manual,
32235 // 8.2.3.9 Loads and Stores Are Not Reordered with Locked Instructions
32236 // 2) Using an immediate operand appears to be the best encoding choice
32237 // here since it doesn't require an extra register.
32238 // 3) OR appears to be very slightly faster than ADD. (Though, the difference
32239 // is small enough it might just be measurement noise.)
32240 // 4) When choosing offsets, there are several contributing factors:
32241 // a) If there's no redzone, we default to TOS. (We could allocate a cache
32242 // line aligned stack object to improve this case.)
32243 // b) To minimize our chances of introducing a false dependence, we prefer
32244 // to offset the stack usage from TOS slightly.
32245 // c) To minimize concerns about cross thread stack usage - in particular,
32246 // the idiomatic MyThreadPool.run([&StackVars]() {...}) pattern which
32247 // captures state in the TOS frame and accesses it from many threads -
32248 // we want to use an offset such that the offset is in a distinct cache
32249 // line from the TOS frame.
32250 //
32251 // For a general discussion of the tradeoffs and benchmark results, see:
32252 // https://shipilev.net/blog/2014/on-the-fence-with-dependencies/
32253
32254 auto &MF = DAG.getMachineFunction();
32255 auto &TFL = *Subtarget.getFrameLowering();
32256 const unsigned SPOffset = TFL.has128ByteRedZone(MF) ? -64 : 0;
32257
32258 if (Subtarget.is64Bit()) {
32259 SDValue Zero = DAG.getTargetConstant(0, DL, MVT::i32);
32260 SDValue Ops[] = {
32261 DAG.getRegister(X86::RSP, MVT::i64), // Base
32262 DAG.getTargetConstant(1, DL, MVT::i8), // Scale
32263 DAG.getRegister(0, MVT::i64), // Index
32264 DAG.getTargetConstant(SPOffset, DL, MVT::i32), // Disp
32265 DAG.getRegister(0, MVT::i16), // Segment.
32266 Zero,
32267 Chain};
32268 SDNode *Res = DAG.getMachineNode(X86::OR32mi8Locked, DL, MVT::i32,
32269 MVT::Other, Ops);
32270 return SDValue(Res, 1);
32271 }
32272
32273 SDValue Zero = DAG.getTargetConstant(0, DL, MVT::i32);
32274 SDValue Ops[] = {
32275 DAG.getRegister(X86::ESP, MVT::i32), // Base
32276 DAG.getTargetConstant(1, DL, MVT::i8), // Scale
32277 DAG.getRegister(0, MVT::i32), // Index
32278 DAG.getTargetConstant(SPOffset, DL, MVT::i32), // Disp
32279 DAG.getRegister(0, MVT::i16), // Segment.
32280 Zero,
32281 Chain
32282 };
32283 SDNode *Res = DAG.getMachineNode(X86::OR32mi8Locked, DL, MVT::i32,
32284 MVT::Other, Ops);
32285 return SDValue(Res, 1);
32286}
32287
32288static SDValue LowerATOMIC_FENCE(SDValue Op, const X86Subtarget &Subtarget,
32289 SelectionDAG &DAG) {
32290 SDLoc dl(Op);
32291 AtomicOrdering FenceOrdering =
32292 static_cast<AtomicOrdering>(Op.getConstantOperandVal(1));
32293 SyncScope::ID FenceSSID =
32294 static_cast<SyncScope::ID>(Op.getConstantOperandVal(2));
32295
32296 // The only fence that needs an instruction is a sequentially-consistent
32297 // cross-thread fence.
32298 if (FenceOrdering == AtomicOrdering::SequentiallyConsistent &&
32299 FenceSSID == SyncScope::System) {
32300 if (!Subtarget.avoidMFence() && Subtarget.hasMFence())
32301 return DAG.getNode(X86ISD::MFENCE, dl, MVT::Other, Op.getOperand(0));
32302
32303 SDValue Chain = Op.getOperand(0);
32304 return emitLockedStackOp(DAG, Subtarget, Chain, dl);
32305 }
32306
32307 // MEMBARRIER is a compiler barrier; it codegens to a no-op.
32308 return DAG.getNode(ISD::MEMBARRIER, dl, MVT::Other, Op.getOperand(0));
32309}
32310
32311static SDValue LowerCMP_SWAP(SDValue Op, const X86Subtarget &Subtarget,
32312 SelectionDAG &DAG) {
32313 MVT T = Op.getSimpleValueType();
32314 SDLoc DL(Op);
32315 unsigned Reg = 0;
32316 unsigned size = 0;
32317 switch(T.SimpleTy) {
32318 default: llvm_unreachable("Invalid value type!");
32319 case MVT::i8: Reg = X86::AL; size = 1; break;
32320 case MVT::i16: Reg = X86::AX; size = 2; break;
32321 case MVT::i32: Reg = X86::EAX; size = 4; break;
32322 case MVT::i64:
32323 assert(Subtarget.is64Bit() && "Node not type legal!");
32324 Reg = X86::RAX; size = 8;
32325 break;
32326 }
32327 SDValue cpIn = DAG.getCopyToReg(Op.getOperand(0), DL, Reg,
32328 Op.getOperand(2), SDValue());
32329 SDValue Ops[] = { cpIn.getValue(0),
32330 Op.getOperand(1),
32331 Op.getOperand(3),
32332 DAG.getTargetConstant(size, DL, MVT::i8),
32333 cpIn.getValue(1) };
32334 SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Glue);
32335 MachineMemOperand *MMO = cast<AtomicSDNode>(Op)->getMemOperand();
32336 SDValue Result = DAG.getMemIntrinsicNode(X86ISD::LCMPXCHG_DAG, DL, Tys,
32337 Ops, T, MMO);
32338
32339 SDValue cpOut =
32340 DAG.getCopyFromReg(Result.getValue(0), DL, Reg, T, Result.getValue(1));
32341 SDValue EFLAGS = DAG.getCopyFromReg(cpOut.getValue(1), DL, X86::EFLAGS,
32342 MVT::i32, cpOut.getValue(2));
32343 SDValue Success = getSETCC(X86::COND_E, EFLAGS, DL, DAG);
32344
32345 return DAG.getNode(ISD::MERGE_VALUES, DL, Op->getVTList(),
32346 cpOut, Success, EFLAGS.getValue(1));
32347}
32348
32349// Create MOVMSKB, taking into account whether we need to split for AVX1.
32350static SDValue getPMOVMSKB(const SDLoc &DL, SDValue V, SelectionDAG &DAG,
32351 const X86Subtarget &Subtarget) {
32352 MVT InVT = V.getSimpleValueType();
32353
32354 if (InVT == MVT::v64i8) {
32355 SDValue Lo, Hi;
32356 std::tie(Lo, Hi) = DAG.SplitVector(V, DL);
32357 Lo = getPMOVMSKB(DL, Lo, DAG, Subtarget);
32358 Hi = getPMOVMSKB(DL, Hi, DAG, Subtarget);
32359 Lo = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i64, Lo);
32360 Hi = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i64, Hi);
32361 Hi = DAG.getNode(ISD::SHL, DL, MVT::i64, Hi,
32362 DAG.getConstant(32, DL, MVT::i8));
32363 return DAG.getNode(ISD::OR, DL, MVT::i64, Lo, Hi);
32364 }
32365 if (InVT == MVT::v32i8 && !Subtarget.hasInt256()) {
32366 SDValue Lo, Hi;
32367 std::tie(Lo, Hi) = DAG.SplitVector(V, DL);
32368 Lo = DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, Lo);
32369 Hi = DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, Hi);
32370 Hi = DAG.getNode(ISD::SHL, DL, MVT::i32, Hi,
32371 DAG.getConstant(16, DL, MVT::i8));
32372 return DAG.getNode(ISD::OR, DL, MVT::i32, Lo, Hi);
32373 }
32374
32375 return DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, V);
32376}
32377
32378static SDValue LowerBITCAST(SDValue Op, const X86Subtarget &Subtarget,
32379 SelectionDAG &DAG) {
32380 SDValue Src = Op.getOperand(0);
32381 MVT SrcVT = Src.getSimpleValueType();
32382 MVT DstVT = Op.getSimpleValueType();
32383
32384 // Legalize (v64i1 (bitcast i64 (X))) by splitting the i64, bitcasting each
32385 // half to v32i1 and concatenating the result.
32386 if (SrcVT == MVT::i64 && DstVT == MVT::v64i1) {
32387 assert(!Subtarget.is64Bit() && "Expected 32-bit mode");
32388 assert(Subtarget.hasBWI() && "Expected BWI target");
32389 SDLoc dl(Op);
32390 SDValue Lo, Hi;
32391 std::tie(Lo, Hi) = DAG.SplitScalar(Src, dl, MVT::i32, MVT::i32);
32392 Lo = DAG.getBitcast(MVT::v32i1, Lo);
32393 Hi = DAG.getBitcast(MVT::v32i1, Hi);
32394 return DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v64i1, Lo, Hi);
32395 }
32396
32397 // Use MOVMSK for vector to scalar conversion to prevent scalarization.
32398 if ((SrcVT == MVT::v16i1 || SrcVT == MVT::v32i1) && DstVT.isScalarInteger()) {
32399 assert(!Subtarget.hasAVX512() && "Should use K-registers with AVX512");
32400 MVT SExtVT = SrcVT == MVT::v16i1 ? MVT::v16i8 : MVT::v32i8;
32401 SDLoc DL(Op);
32402 SDValue V = DAG.getSExtOrTrunc(Src, DL, SExtVT);
32403 V = getPMOVMSKB(DL, V, DAG, Subtarget);
32404 return DAG.getZExtOrTrunc(V, DL, DstVT);
32405 }
32406
32407 assert((SrcVT == MVT::v2i32 || SrcVT == MVT::v4i16 || SrcVT == MVT::v8i8 ||
32408 SrcVT == MVT::i64) && "Unexpected VT!");
32409
32410 assert(Subtarget.hasSSE2() && "Requires at least SSE2!");
32411 if (!(DstVT == MVT::f64 && SrcVT == MVT::i64) &&
32412 !(DstVT == MVT::x86mmx && SrcVT.isVector()))
32413 // This conversion needs to be expanded.
32414 return SDValue();
32415
32416 SDLoc dl(Op);
32417 if (SrcVT.isVector()) {
32418 // Widen the input vector in the case of MVT::v2i32.
32419 // Example: from MVT::v2i32 to MVT::v4i32.
32420 MVT NewVT = MVT::getVectorVT(SrcVT.getVectorElementType(),
32421 SrcVT.getVectorNumElements() * 2);
32422 Src = DAG.getNode(ISD::CONCAT_VECTORS, dl, NewVT, Src,
32423 DAG.getUNDEF(SrcVT));
32424 } else {
32425 assert(SrcVT == MVT::i64 && !Subtarget.is64Bit() &&
32426 "Unexpected source type in LowerBITCAST");
32427 Src = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64, Src);
32428 }
32429
32430 MVT V2X64VT = DstVT == MVT::f64 ? MVT::v2f64 : MVT::v2i64;
32431 Src = DAG.getNode(ISD::BITCAST, dl, V2X64VT, Src);
32432
32433 if (DstVT == MVT::x86mmx)
32434 return DAG.getNode(X86ISD::MOVDQ2Q, dl, DstVT, Src);
32435
32436 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, DstVT, Src,
32437 DAG.getVectorIdxConstant(0, dl));
32438}
32439
32440/// Compute the horizontal sum of bytes in V for the elements of VT.
32441///
32442/// Requires V to be a byte vector and VT to be an integer vector type with
32443/// wider elements than V's type. The width of the elements of VT determines
32444/// how many bytes of V are summed horizontally to produce each element of the
32445/// result.
32446static SDValue LowerHorizontalByteSum(SDValue V, MVT VT,
32447 const X86Subtarget &Subtarget,
32448 SelectionDAG &DAG) {
32449 SDLoc DL(V);
32450 MVT ByteVecVT = V.getSimpleValueType();
32451 MVT EltVT = VT.getVectorElementType();
32452 assert(ByteVecVT.getVectorElementType() == MVT::i8 &&
32453 "Expected value to have byte element type.");
32454 assert(EltVT != MVT::i8 &&
32455 "Horizontal byte sum only makes sense for wider elements!");
32456 unsigned VecSize = VT.getSizeInBits();
32457 assert(ByteVecVT.getSizeInBits() == VecSize && "Cannot change vector size!");
32458
32459 // The PSADBW instruction horizontally adds all bytes and leaves the result
32460 // in i64 chunks, thus directly computing the pop count for v2i64 and v4i64.
32461 if (EltVT == MVT::i64) {
32462 SDValue Zeros = DAG.getConstant(0, DL, ByteVecVT);
32463 MVT SadVecVT = MVT::getVectorVT(MVT::i64, VecSize / 64);
32464 V = DAG.getNode(X86ISD::PSADBW, DL, SadVecVT, V, Zeros);
32465 return DAG.getBitcast(VT, V);
32466 }
32467
32468 if (EltVT == MVT::i32) {
32469 // We unpack the low half and high half into i32s interleaved with zeros so
32470 // that we can use PSADBW to horizontally sum them. The most useful part of
32471 // this is that it lines up the results of two PSADBW instructions to be
32472 // two v2i64 vectors which concatenated are the 4 population counts. We can
32473 // then use PACKUSWB to shrink and concatenate them into a v4i32 again.
32474 SDValue Zeros = DAG.getConstant(0, DL, VT);
32475 SDValue V32 = DAG.getBitcast(VT, V);
32476 SDValue Low = getUnpackl(DAG, DL, VT, V32, Zeros);
32477 SDValue High = getUnpackh(DAG, DL, VT, V32, Zeros);
32478
32479 // Do the horizontal sums into two v2i64s.
32480 Zeros = DAG.getConstant(0, DL, ByteVecVT);
32481 MVT SadVecVT = MVT::getVectorVT(MVT::i64, VecSize / 64);
32482 Low = DAG.getNode(X86ISD::PSADBW, DL, SadVecVT,
32483 DAG.getBitcast(ByteVecVT, Low), Zeros);
32484 High = DAG.getNode(X86ISD::PSADBW, DL, SadVecVT,
32485 DAG.getBitcast(ByteVecVT, High), Zeros);
32486
32487 // Merge them together.
32488 MVT ShortVecVT = MVT::getVectorVT(MVT::i16, VecSize / 16);
32489 V = DAG.getNode(X86ISD::PACKUS, DL, ByteVecVT,
32490 DAG.getBitcast(ShortVecVT, Low),
32491 DAG.getBitcast(ShortVecVT, High));
32492
32493 return DAG.getBitcast(VT, V);
32494 }
32495
32496 // The only element type left is i16.
32497 assert(EltVT == MVT::i16 && "Unknown how to handle type");
32498
32499 // To obtain pop count for each i16 element starting from the pop count for
32500 // i8 elements, shift the i16s left by 8, sum as i8s, and then shift as i16s
32501 // right by 8. It is important to shift as i16s as i8 vector shift isn't
32502 // directly supported.
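// Note: each i16 holds [cntHi:cntLo]; the i16 << 8 moves cntLo into the high
// byte, the i8 add forms cntLo + cntHi there, and the i16 >> 8 moves that sum
// back down into the low byte.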
32503 SDValue ShifterV = DAG.getConstant(8, DL, VT);
32504 SDValue Shl = DAG.getNode(ISD::SHL, DL, VT, DAG.getBitcast(VT, V), ShifterV);
32505 V = DAG.getNode(ISD::ADD, DL, ByteVecVT, DAG.getBitcast(ByteVecVT, Shl),
32506 DAG.getBitcast(ByteVecVT, V));
32507 return DAG.getNode(ISD::SRL, DL, VT, DAG.getBitcast(VT, V), ShifterV);
32508}
32509
32510static SDValue LowerVectorCTPOPInRegLUT(SDValue Op, const SDLoc &DL,
32511 const X86Subtarget &Subtarget,
32512 SelectionDAG &DAG) {
32513 MVT VT = Op.getSimpleValueType();
32514 MVT EltVT = VT.getVectorElementType();
32515 int NumElts = VT.getVectorNumElements();
32516 (void)EltVT;
32517 assert(EltVT == MVT::i8 && "Only vXi8 vector CTPOP lowering supported.");
32518
32519 // Implement a lookup table in register by using an algorithm based on:
32520 // http://wm.ite.pl/articles/sse-popcount.html
32521 //
32522 // The general idea is that every lower byte nibble in the input vector is an
32523 // index into an in-register pre-computed pop count table. We then split up the
32524 // input vector into two new ones: (1) a vector with only the shifted-right
32525 // higher nibbles for each byte and (2) a vector with the lower nibbles (and
32526 // masked out higher ones) for each byte. PSHUFB is used separately with both
32527 // to index the in-register table. Next, both are added and the result is an
32528 // i8 vector where each element contains the pop count for the input byte.
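// e.g. input byte 0xA7: LUT[0xA] + LUT[0x7] = 2 + 3 = 5 set bits.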
32529 const int LUT[16] = {/* 0 */ 0, /* 1 */ 1, /* 2 */ 1, /* 3 */ 2,
32530 /* 4 */ 1, /* 5 */ 2, /* 6 */ 2, /* 7 */ 3,
32531 /* 8 */ 1, /* 9 */ 2, /* a */ 2, /* b */ 3,
32532 /* c */ 2, /* d */ 3, /* e */ 3, /* f */ 4};
32533
32534 SmallVector<SDValue, 64> LUTVec;
32535 for (int i = 0; i < NumElts; ++i)
32536 LUTVec.push_back(DAG.getConstant(LUT[i % 16], DL, MVT::i8));
32537 SDValue InRegLUT = DAG.getBuildVector(VT, DL, LUTVec);
32538 SDValue M0F = DAG.getConstant(0x0F, DL, VT);
32539
32540 // High nibbles
32541 SDValue FourV = DAG.getConstant(4, DL, VT);
32542 SDValue HiNibbles = DAG.getNode(ISD::SRL, DL, VT, Op, FourV);
32543
32544 // Low nibbles
32545 SDValue LoNibbles = DAG.getNode(ISD::AND, DL, VT, Op, M0F);
32546
32547 // The input vector is used as the shuffle mask that indexes elements into the
32548 // LUT. After counting low and high nibbles, add the vector to obtain the
32549 // final pop count per i8 element.
32550 SDValue HiPopCnt = DAG.getNode(X86ISD::PSHUFB, DL, VT, InRegLUT, HiNibbles);
32551 SDValue LoPopCnt = DAG.getNode(X86ISD::PSHUFB, DL, VT, InRegLUT, LoNibbles);
32552 return DAG.getNode(ISD::ADD, DL, VT, HiPopCnt, LoPopCnt);
32553}
32554
32555// Please ensure that any codegen change from LowerVectorCTPOP is reflected in
32556// updated cost models in X86TTIImpl::getIntrinsicInstrCost.
32557static SDValue LowerVectorCTPOP(SDValue Op, const SDLoc &DL,
32558 const X86Subtarget &Subtarget,
32559 SelectionDAG &DAG) {
32560 MVT VT = Op.getSimpleValueType();
32561 assert((VT.is512BitVector() || VT.is256BitVector() || VT.is128BitVector()) &&
32562 "Unknown CTPOP type to handle");
32563 SDValue Op0 = Op.getOperand(0);
32564
32565 // TRUNC(CTPOP(ZEXT(X))) to make use of vXi32/vXi64 VPOPCNT instructions.
32566 if (Subtarget.hasVPOPCNTDQ()) {
32567 unsigned NumElems = VT.getVectorNumElements();
32568 assert((VT.getVectorElementType() == MVT::i8 ||
32569 VT.getVectorElementType() == MVT::i16) && "Unexpected type");
32570 if (NumElems < 16 || (NumElems == 16 && Subtarget.canExtendTo512DQ())) {
32571 MVT NewVT = MVT::getVectorVT(MVT::i32, NumElems);
32572 Op = DAG.getNode(ISD::ZERO_EXTEND, DL, NewVT, Op0);
32573 Op = DAG.getNode(ISD::CTPOP, DL, NewVT, Op);
32574 return DAG.getNode(ISD::TRUNCATE, DL, VT, Op);
32575 }
32576 }
32577
32578 // Decompose 256-bit ops into smaller 128-bit ops.
32579 if (VT.is256BitVector() && !Subtarget.hasInt256())
32580 return splitVectorIntUnary(Op, DAG, DL);
32581
32582 // Decompose 512-bit ops into smaller 256-bit ops.
32583 if (VT.is512BitVector() && !Subtarget.hasBWI())
32584 return splitVectorIntUnary(Op, DAG, DL);
32585
32586 // For element types greater than i8, do vXi8 pop counts and a bytesum.
32587 if (VT.getScalarType() != MVT::i8) {
32588 MVT ByteVT = MVT::getVectorVT(MVT::i8, VT.getSizeInBits() / 8);
32589 SDValue ByteOp = DAG.getBitcast(ByteVT, Op0);
32590 SDValue PopCnt8 = DAG.getNode(ISD::CTPOP, DL, ByteVT, ByteOp);
32591 return LowerHorizontalByteSum(PopCnt8, VT, Subtarget, DAG);
32592 }
32593
32594 // We can't use the fast LUT approach, so fall back on LegalizeDAG.
32595 if (!Subtarget.hasSSSE3())
32596 return SDValue();
32597
32598 return LowerVectorCTPOPInRegLUT(Op0, DL, Subtarget, DAG);
32599}
32600
32601static SDValue LowerCTPOP(SDValue N, const X86Subtarget &Subtarget,
32602 SelectionDAG &DAG) {
32603 MVT VT = N.getSimpleValueType();
32604 SDValue Op = N.getOperand(0);
32605 SDLoc DL(N);
32606
32607 if (VT.isScalarInteger()) {
32608 // Compute the lower/upper bounds of the active bits of the value,
32609 // allowing us to shift the active bits down if necessary to fit into the
32610 // special cases below.
32611 KnownBits Known = DAG.computeKnownBits(Op);
32612 if (Known.isConstant())
32613 return DAG.getConstant(Known.getConstant().popcount(), DL, VT);
32614 unsigned LZ = Known.countMinLeadingZeros();
32615 unsigned TZ = Known.countMinTrailingZeros();
32616 assert((LZ + TZ) < Known.getBitWidth() && "Illegal shifted mask");
32617 unsigned ActiveBits = Known.getBitWidth() - LZ;
32618 unsigned ShiftedActiveBits = Known.getBitWidth() - (LZ + TZ);
32619
32620 // i2 CTPOP - "ctpop(x) --> sub(x, (x >> 1))".
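// (x - (x >> 1)) maps 0,1,2,3 -> 0,1,1,2, i.e. the pop count of a 2-bit value.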
32621 if (ShiftedActiveBits <= 2) {
32622 if (ActiveBits > 2)
32623 Op = DAG.getNode(ISD::SRL, DL, VT, Op,
32624 DAG.getShiftAmountConstant(TZ, VT, DL));
32625 Op = DAG.getZExtOrTrunc(Op, DL, MVT::i32);
32626 Op = DAG.getNode(ISD::SUB, DL, MVT::i32, Op,
32627 DAG.getNode(ISD::SRL, DL, MVT::i32, Op,
32628 DAG.getShiftAmountConstant(1, VT, DL)));
32629 return DAG.getZExtOrTrunc(Op, DL, VT);
32630 }
32631
32632 // i3 CTPOP - perform LUT into i32 integer.
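// The constant packs popcount(i) for i = 0..7 into 2-bit fields at bit 2*i,
// e.g. for x = 6: (0b1110100110010100 >> 12) & 0x3 == 2.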
32633 if (ShiftedActiveBits <= 3) {
32634 if (ActiveBits > 3)
32635 Op = DAG.getNode(ISD::SRL, DL, VT, Op,
32636 DAG.getShiftAmountConstant(TZ, VT, DL));
32637 Op = DAG.getZExtOrTrunc(Op, DL, MVT::i32);
32638 Op = DAG.getNode(ISD::SHL, DL, MVT::i32, Op,
32639 DAG.getShiftAmountConstant(1, VT, DL));
32640 Op = DAG.getNode(ISD::SRL, DL, MVT::i32,
32641 DAG.getConstant(0b1110100110010100U, DL, MVT::i32), Op);
32642 Op = DAG.getNode(ISD::AND, DL, MVT::i32, Op,
32643 DAG.getConstant(0x3, DL, MVT::i32));
32644 return DAG.getZExtOrTrunc(Op, DL, VT);
32645 }
32646
32647 // i4 CTPOP - perform LUT into i64 integer.
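// Each nibble of the i64 LUT holds popcount(i) for index i = 0..15; the MUL
// by 4 selects the nibble and the AND with 0x7 extracts the 3-bit count.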
32648 if (ShiftedActiveBits <= 4 &&
32649 DAG.getTargetLoweringInfo().isTypeLegal(MVT::i64)) {
32650 SDValue LUT = DAG.getConstant(0x4332322132212110ULL, DL, MVT::i64);
32651 if (ActiveBits > 4)
32652 Op = DAG.getNode(ISD::SRL, DL, VT, Op,
32653 DAG.getShiftAmountConstant(TZ, VT, DL));
32654 Op = DAG.getZExtOrTrunc(Op, DL, MVT::i32);
32655 Op = DAG.getNode(ISD::MUL, DL, MVT::i32, Op,
32656 DAG.getConstant(4, DL, MVT::i32));
32657 Op = DAG.getNode(ISD::SRL, DL, MVT::i64, LUT,
32658 DAG.getShiftAmountOperand(MVT::i64, Op));
32659 Op = DAG.getNode(ISD::AND, DL, MVT::i64, Op,
32660 DAG.getConstant(0x7, DL, MVT::i64));
32661 return DAG.getZExtOrTrunc(Op, DL, VT);
32662 }
32663
32664 // i8 CTPOP - with efficient i32 MUL, then attempt multiply-mask-multiply.
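// The MUL by 0x08040201 replicates x at bit offsets 0/9/18/27; after >> 3 and
// masking with 0x11111111 each set bit of x occupies its own nibble, and the
// second MUL sums the nibbles into the top nibble, extracted by the >> 28.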
32665 if (ShiftedActiveBits <= 8) {
32666 SDValue Mask11 = DAG.getConstant(0x11111111U, DL, MVT::i32);
32667 if (ActiveBits > 8)
32668 Op = DAG.getNode(ISD::SRL, DL, VT, Op,
32669 DAG.getShiftAmountConstant(TZ, VT, DL));
32670 Op = DAG.getZExtOrTrunc(Op, DL, MVT::i32);
32671 Op = DAG.getNode(ISD::MUL, DL, MVT::i32, Op,
32672 DAG.getConstant(0x08040201U, DL, MVT::i32));
32673 Op = DAG.getNode(ISD::SRL, DL, MVT::i32, Op,
32674 DAG.getShiftAmountConstant(3, MVT::i32, DL));
32675 Op = DAG.getNode(ISD::AND, DL, MVT::i32, Op, Mask11);
32676 Op = DAG.getNode(ISD::MUL, DL, MVT::i32, Op, Mask11);
32677 Op = DAG.getNode(ISD::SRL, DL, MVT::i32, Op,
32678 DAG.getShiftAmountConstant(28, MVT::i32, DL));
32679 return DAG.getZExtOrTrunc(Op, DL, VT);
32680 }
32681
32682 return SDValue(); // fallback to generic expansion.
32683 }
32684
32685 assert(VT.isVector() &&
32686 "We only do custom lowering for vector population count.");
32687 return LowerVectorCTPOP(N, DL, Subtarget, DAG);
32688}
32689
32690static SDValue LowerBITREVERSE_XOP(SDValue Op, SelectionDAG &DAG) {
32691 MVT VT = Op.getSimpleValueType();
32692 SDValue In = Op.getOperand(0);
32693 SDLoc DL(Op);
32694
32695 // For scalars, it's still beneficial to transfer to/from the SIMD unit to
32696 // perform the BITREVERSE.
32697 if (!VT.isVector()) {
32698 MVT VecVT = MVT::getVectorVT(VT, 128 / VT.getSizeInBits());
32699 SDValue Res = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecVT, In);
32700 Res = DAG.getNode(ISD::BITREVERSE, DL, VecVT, Res);
32701 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, Res,
32702 DAG.getVectorIdxConstant(0, DL));
32703 }
32704
32705 int NumElts = VT.getVectorNumElements();
32706 int ScalarSizeInBytes = VT.getScalarSizeInBits() / 8;
32707
32708 // Decompose 256-bit ops into smaller 128-bit ops.
32709 if (VT.is256BitVector())
32710 return splitVectorIntUnary(Op, DAG, DL);
32711
32712 assert(VT.is128BitVector() &&
32713 "Only 128-bit vector bitreverse lowering supported.");
32714
32715 // VPPERM reverses the bits of a byte with the permute Op (2 << 5), and we
32716 // perform the BSWAP in the shuffle.
32717 // It's best to shuffle using the second operand as this will implicitly allow
32718 // memory folding for multiple vectors.
32719 SmallVector<SDValue, 16> MaskElts;
32720 for (int i = 0; i != NumElts; ++i) {
32721 for (int j = ScalarSizeInBytes - 1; j >= 0; --j) {
32722 int SourceByte = 16 + (i * ScalarSizeInBytes) + j;
32723 int PermuteByte = SourceByte | (2 << 5);
32724 MaskElts.push_back(DAG.getConstant(PermuteByte, DL, MVT::i8));
32725 }
32726 }
32727
32728 SDValue Mask = DAG.getBuildVector(MVT::v16i8, DL, MaskElts);
32729 SDValue Res = DAG.getBitcast(MVT::v16i8, In);
32730 Res = DAG.getNode(X86ISD::VPPERM, DL, MVT::v16i8, DAG.getUNDEF(MVT::v16i8),
32731 Res, Mask);
32732 return DAG.getBitcast(VT, Res);
32733}
32734
32735static SDValue LowerBITREVERSE(SDValue Op, const X86Subtarget &Subtarget,
32736 SelectionDAG &DAG) {
32737 MVT VT = Op.getSimpleValueType();
32738
32739 if (Subtarget.hasXOP() && !VT.is512BitVector())
32740 return LowerBITREVERSE_XOP(Op, DAG);
32741
32742 assert((Subtarget.hasSSSE3() || Subtarget.hasGFNI()) &&
32743 "SSSE3 or GFNI required for BITREVERSE");
32744
32745 SDValue In = Op.getOperand(0);
32746 SDLoc DL(Op);
32747
32748 // Split 512-bit ops without BWI so that we can still use the PSHUFB lowering.
32749 if (VT.is512BitVector() && !Subtarget.hasBWI())
32750 return splitVectorIntUnary(Op, DAG, DL);
32751
32752 // Decompose 256-bit ops into smaller 128-bit ops on pre-AVX2.
32753 if (VT.is256BitVector() && !Subtarget.hasInt256())
32754 return splitVectorIntUnary(Op, DAG, DL);
32755
32756 // Lower i8/i16/i32/i64 as vXi8 BITREVERSE + BSWAP
32757 if (!VT.isVector()) {
32758 assert(
32759 (VT == MVT::i32 || VT == MVT::i64 || VT == MVT::i16 || VT == MVT::i8) &&
32760 "Only tested for i8/i16/i32/i64");
32761 MVT VecVT = MVT::getVectorVT(VT, 128 / VT.getSizeInBits());
32762 SDValue Res = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecVT, In);
32763 Res = DAG.getNode(ISD::BITREVERSE, DL, MVT::v16i8,
32764 DAG.getBitcast(MVT::v16i8, Res));
32765 Res =
32766 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, DAG.getBitcast(VecVT, Res),
32767 DAG.getVectorIdxConstant(0, DL));
32768 return (VT == MVT::i8) ? Res : DAG.getNode(ISD::BSWAP, DL, VT, Res);
32769 }
32770
32771 assert(VT.isVector() && VT.getSizeInBits() >= 128);
32772
32773 // Lower vXi16/vXi32/vXi64 as BSWAP + vXi8 BITREVERSE.
32774 if (VT.getScalarType() != MVT::i8) {
32775 MVT ByteVT = MVT::getVectorVT(MVT::i8, VT.getSizeInBits() / 8);
32776 SDValue Res = DAG.getNode(ISD::BSWAP, DL, VT, In);
32777 Res = DAG.getBitcast(ByteVT, Res);
32778 Res = DAG.getNode(ISD::BITREVERSE, DL, ByteVT, Res);
32779 return DAG.getBitcast(VT, Res);
32780 }
32781 assert(VT.isVector() && VT.getScalarType() == MVT::i8 &&
32782 "Only byte vector BITREVERSE supported");
32783
32784 unsigned NumElts = VT.getVectorNumElements();
32785
32786 // If we have GFNI, we can use GF2P8AFFINEQB to reverse the bits.
32787 if (Subtarget.hasGFNI()) {
32788 SDValue Matrix = getGFNICtrlMask(ISD::BITREVERSE, DAG, DL, VT);
32789 return DAG.getNode(X86ISD::GF2P8AFFINEQB, DL, VT, In, Matrix,
32790 DAG.getTargetConstant(0, DL, MVT::i8));
32791 }
32792
32793 // Perform BITREVERSE using PSHUFB lookups. Each byte is split into
32794 // two nibbles and a PSHUFB lookup to find the bitreverse of each
32795 // 0-15 value (moved to the other nibble).
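// LoLUT[i] is bit-reversed i placed in the high nibble and HiLUT[i] is
// bit-reversed i placed in the low nibble, so OR'ing the two lookups yields
// the fully reversed byte.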
32796 SDValue NibbleMask = DAG.getConstant(0xF, DL, VT);
32797 SDValue Lo = DAG.getNode(ISD::AND, DL, VT, In, NibbleMask);
32798 SDValue Hi = DAG.getNode(ISD::SRL, DL, VT, In, DAG.getConstant(4, DL, VT));
32799
32800 const int LoLUT[16] = {
32801 /* 0 */ 0x00, /* 1 */ 0x80, /* 2 */ 0x40, /* 3 */ 0xC0,
32802 /* 4 */ 0x20, /* 5 */ 0xA0, /* 6 */ 0x60, /* 7 */ 0xE0,
32803 /* 8 */ 0x10, /* 9 */ 0x90, /* a */ 0x50, /* b */ 0xD0,
32804 /* c */ 0x30, /* d */ 0xB0, /* e */ 0x70, /* f */ 0xF0};
32805 const int HiLUT[16] = {
32806 /* 0 */ 0x00, /* 1 */ 0x08, /* 2 */ 0x04, /* 3 */ 0x0C,
32807 /* 4 */ 0x02, /* 5 */ 0x0A, /* 6 */ 0x06, /* 7 */ 0x0E,
32808 /* 8 */ 0x01, /* 9 */ 0x09, /* a */ 0x05, /* b */ 0x0D,
32809 /* c */ 0x03, /* d */ 0x0B, /* e */ 0x07, /* f */ 0x0F};
32810
32811 SmallVector<SDValue, 16> LoMaskElts, HiMaskElts;
32812 for (unsigned i = 0; i < NumElts; ++i) {
32813 LoMaskElts.push_back(DAG.getConstant(LoLUT[i % 16], DL, MVT::i8));
32814 HiMaskElts.push_back(DAG.getConstant(HiLUT[i % 16], DL, MVT::i8));
32815 }
32816
32817 SDValue LoMask = DAG.getBuildVector(VT, DL, LoMaskElts);
32818 SDValue HiMask = DAG.getBuildVector(VT, DL, HiMaskElts);
32819 Lo = DAG.getNode(X86ISD::PSHUFB, DL, VT, LoMask, Lo);
32820 Hi = DAG.getNode(X86ISD::PSHUFB, DL, VT, HiMask, Hi);
32821 return DAG.getNode(ISD::OR, DL, VT, Lo, Hi);
32822}
32823
32824static SDValue LowerPARITY(SDValue Op, const X86Subtarget &Subtarget,
32825 SelectionDAG &DAG) {
32826 SDLoc DL(Op);
32827 SDValue X = Op.getOperand(0);
32828 MVT VT = Op.getSimpleValueType();
32829
32830 // Special case. If the input fits in 8-bits we can use a single 8-bit TEST.
32831 if (VT == MVT::i8 ||
32832 DAG.MaskedValueIsZero(X, APInt::getBitsSetFrom(VT.getSizeInBits(), 8))) {
32833 X = DAG.getNode(ISD::TRUNCATE, DL, MVT::i8, X);
32834 SDValue Flags = DAG.getNode(X86ISD::CMP, DL, MVT::i32, X,
32835 DAG.getConstant(0, DL, MVT::i8));
32836 // Copy the inverse of the parity flag into a register with setcc.
32837 SDValue Setnp = getSETCC(X86::COND_NP, Flags, DL, DAG);
32838 // Extend to the original type.
32839 return DAG.getNode(ISD::ZERO_EXTEND, DL, VT, Setnp);
32840 }
32841
32842 // If we have POPCNT, use the default expansion.
32843 if (Subtarget.hasPOPCNT())
32844 return SDValue();
32845
32846 if (VT == MVT::i64) {
32847 // Xor the high and low 32-bit halves together using a 32-bit operation.
32848 SDValue Hi = DAG.getNode(ISD::TRUNCATE, DL, MVT::i32,
32849 DAG.getNode(ISD::SRL, DL, MVT::i64, X,
32850 DAG.getConstant(32, DL, MVT::i8)));
32851 SDValue Lo = DAG.getNode(ISD::TRUNCATE, DL, MVT::i32, X);
32852 X = DAG.getNode(ISD::XOR, DL, MVT::i32, Lo, Hi);
32853 }
32854
32855 if (VT != MVT::i16) {
32856 // Xor the high and low 16-bits together using a 32-bit operation.
32857 SDValue Hi16 = DAG.getNode(ISD::SRL, DL, MVT::i32, X,
32858 DAG.getConstant(16, DL, MVT::i8));
32859 X = DAG.getNode(ISD::XOR, DL, MVT::i32, X, Hi16);
32860 } else {
32861 // If the input is 16-bits, we need to extend to use an i32 shift below.
32862 X = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, X);
32863 }
32864
32865 // Finally xor the low 2 bytes together and use an 8-bit flag-setting xor.
32866 // This should allow an h-reg to be used to save a shift.
32867 SDValue Hi = DAG.getNode(
32868 ISD::TRUNCATE, DL, MVT::i8,
32869 DAG.getNode(ISD::SRL, DL, MVT::i32, X, DAG.getConstant(8, DL, MVT::i8)));
32870 SDValue Lo = DAG.getNode(ISD::TRUNCATE, DL, MVT::i8, X);
32871 SDVTList VTs = DAG.getVTList(MVT::i8, MVT::i32);
32872 SDValue Flags = DAG.getNode(X86ISD::XOR, DL, VTs, Lo, Hi).getValue(1);
32873
32874 // Copy the inverse of the parity flag into a register with setcc.
32875 SDValue Setnp = getSETCC(X86::COND_NP, Flags, DL, DAG);
32876 // Extend to the original type.
32877 return DAG.getNode(ISD::ZERO_EXTEND, DL, VT, Setnp);
32878}
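// [Editorial note] An illustrative scalar equivalent (not part of this file)
// of the PARITY expansion above: without POPCNT, the value is folded with XOR
// down to a single byte; in the actual lowering the hardware parity flag (PF)
// of the final 8-bit xor then supplies the answer directly.
#include <cstdint>
static inline unsigned parity32(uint32_t X) {
  X ^= X >> 16; // fold the high half into the low half
  X ^= X >> 8;  // fold the remaining two bytes together
  uint8_t B = (uint8_t)X;
  B ^= B >> 4;
  B ^= B >> 2;
  B ^= B >> 1;
  return B & 1; // 1 if an odd number of bits were set in X
}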
32879
32880 static SDValue lowerAtomicArithWithLOCK(SDValue N, SelectionDAG &DAG,
32881 const X86Subtarget &Subtarget) {
32882 unsigned NewOpc = 0;
32883 switch (N->getOpcode()) {
32884 case ISD::ATOMIC_LOAD_ADD:
32885 NewOpc = X86ISD::LADD;
32886 break;
32887 case ISD::ATOMIC_LOAD_SUB:
32888 NewOpc = X86ISD::LSUB;
32889 break;
32890 case ISD::ATOMIC_LOAD_OR:
32891 NewOpc = X86ISD::LOR;
32892 break;
32893 case ISD::ATOMIC_LOAD_XOR:
32894 NewOpc = X86ISD::LXOR;
32895 break;
32896 case ISD::ATOMIC_LOAD_AND:
32897 NewOpc = X86ISD::LAND;
32898 break;
32899 default:
32900 llvm_unreachable("Unknown ATOMIC_LOAD_ opcode");
32901 }
32902
32903 MachineMemOperand *MMO = cast<MemSDNode>(N)->getMemOperand();
32904
32905 return DAG.getMemIntrinsicNode(
32906 NewOpc, SDLoc(N), DAG.getVTList(MVT::i32, MVT::Other),
32907 {N->getOperand(0), N->getOperand(1), N->getOperand(2)},
32908 /*MemVT=*/N->getSimpleValueType(0), MMO);
32909}
32910
32911/// Lower atomic_load_ops into LOCK-prefixed operations.
32912 static SDValue lowerAtomicArith(SDValue N, SelectionDAG &DAG,
32913 const X86Subtarget &Subtarget) {
32914 AtomicSDNode *AN = cast<AtomicSDNode>(N.getNode());
32915 SDValue Chain = N->getOperand(0);
32916 SDValue LHS = N->getOperand(1);
32917 SDValue RHS = N->getOperand(2);
32918 unsigned Opc = N->getOpcode();
32919 MVT VT = N->getSimpleValueType(0);
32920 SDLoc DL(N);
32921
32922 // We can lower atomic_load_add into LXADD. However, any other atomicrmw op
32923 // can only be lowered when the result is unused. They should have already
32924 // been transformed into a cmpxchg loop in AtomicExpand.
32925 if (N->hasAnyUseOfValue(0)) {
32926 // Handle (atomic_load_sub p, v) as (atomic_load_add p, -v), to be able to
32927 // select LXADD if LOCK_SUB can't be selected.
32928 // Handle (atomic_load_xor p, SignBit) as (atomic_load_add p, SignBit) so we
32929 // can use LXADD as opposed to cmpxchg.
32930 if (Opc == ISD::ATOMIC_LOAD_SUB ||
32931 (Opc == ISD::ATOMIC_LOAD_XOR && isMinSignedConstant(RHS)))
32932 return DAG.getAtomic(ISD::ATOMIC_LOAD_ADD, DL, VT, Chain, LHS,
32933 DAG.getNegative(RHS, DL, VT), AN->getMemOperand());
32934
32935 assert(Opc == ISD::ATOMIC_LOAD_ADD &&
32936 "Used AtomicRMW ops other than Add should have been expanded!");
32937 return N;
32938 }
32939
32940 // Specialized lowering for the canonical form of an idempotent atomicrmw.
32941 // The core idea here is that since the memory location isn't actually
32942 // changing, all we need is a lowering for the *ordering* impacts of the
32943 // atomicrmw. As such, we can choose a different operation and memory
32944 // location to minimize impact on other code.
32945 // The above holds unless the node is marked volatile in which
32946 // case it needs to be preserved according to the langref.
32947 if (Opc == ISD::ATOMIC_LOAD_OR && isNullConstant(RHS) && !AN->isVolatile()) {
32948 // On X86, the only ordering which actually requires an instruction is
32949 // seq_cst which isn't SingleThread, everything just needs to be preserved
32950 // during codegen and then dropped. Note that we expect (but don't assume),
32951 // that orderings other than seq_cst and acq_rel have been canonicalized to
32952 // a store or load.
32953 if (AN->getSuccessOrdering() == AtomicOrdering::SequentiallyConsistent &&
32954 AN->getSyncScopeID() == SyncScope::System) {
32955 // Prefer a locked operation against a stack location to minimize cache
32956 // traffic. This assumes that stack locations are very likely to be
32957 // accessed only by the owning thread.
32958 SDValue NewChain = emitLockedStackOp(DAG, Subtarget, Chain, DL);
32959 assert(!N->hasAnyUseOfValue(0));
32960 // NOTE: The getUNDEF is needed to give something for the unused result 0.
32961 return DAG.getNode(ISD::MERGE_VALUES, DL, N->getVTList(),
32962 DAG.getUNDEF(VT), NewChain);
32963 }
32964 // MEMBARRIER is a compiler barrier; it codegens to a no-op.
32965 SDValue NewChain = DAG.getNode(ISD::MEMBARRIER, DL, MVT::Other, Chain);
32966 assert(!N->hasAnyUseOfValue(0));
32967 // NOTE: The getUNDEF is needed to give something for the unused result 0.
32968 return DAG.getNode(ISD::MERGE_VALUES, DL, N->getVTList(),
32969 DAG.getUNDEF(VT), NewChain);
32970 }
32971
32972 SDValue LockOp = lowerAtomicArithWithLOCK(N, DAG, Subtarget);
32973 // RAUW the chain, but don't worry about the result, as it's unused.
32974 assert(!N->hasAnyUseOfValue(0));
32975 // NOTE: The getUNDEF is needed to give something for the unused result 0.
32976 return DAG.getNode(ISD::MERGE_VALUES, DL, N->getVTList(),
32977 DAG.getUNDEF(VT), LockOp.getValue(1));
32978}
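// [Editorial note] Illustrative only (not part of this file): the rewrite of
// atomic_load_sub into atomic_load_add of the negated operand above relies on
// two's-complement wrap-around, so a single LOCK XADD can serve both
// operations when the result is still needed.
#include <atomic>
#include <cstdint>
static uint32_t fetchSubViaAdd(std::atomic<uint32_t> &A, uint32_t V) {
  // Equivalent to A.fetch_sub(V): add the two's-complement negation of V.
  return A.fetch_add(0u - V);
}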
32979
32980 static SDValue LowerATOMIC_STORE(SDValue Op, SelectionDAG &DAG,
32981 const X86Subtarget &Subtarget) {
32982 auto *Node = cast<AtomicSDNode>(Op.getNode());
32983 SDLoc dl(Node);
32984 EVT VT = Node->getMemoryVT();
32985
32986 bool IsSeqCst =
32987 Node->getSuccessOrdering() == AtomicOrdering::SequentiallyConsistent;
32988 bool IsTypeLegal = DAG.getTargetLoweringInfo().isTypeLegal(VT);
32989
32990 // If this store is not sequentially consistent and the type is legal
32991 // we can just keep it.
32992 if (!IsSeqCst && IsTypeLegal)
32993 return Op;
32994
32995 if (!IsTypeLegal && !Subtarget.useSoftFloat() &&
32996 !DAG.getMachineFunction().getFunction().hasFnAttribute(
32997 Attribute::NoImplicitFloat)) {
32998 SDValue Chain;
32999 // For illegal i128 atomic_store, when AVX is enabled, we can simply emit a
33000 // vector store.
33001 if (VT == MVT::i128 && Subtarget.is64Bit() && Subtarget.hasAVX()) {
33002 SDValue VecVal = DAG.getBitcast(MVT::v2i64, Node->getVal());
33003 Chain = DAG.getStore(Node->getChain(), dl, VecVal, Node->getBasePtr(),
33004 Node->getMemOperand());
33005 }
33006
33007 // For illegal i64 atomic_stores, we can try to use MOVQ or MOVLPS if SSE
33008 // is enabled.
33009 if (VT == MVT::i64) {
33010 if (Subtarget.hasSSE1()) {
33011 SDValue SclToVec =
33012 DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64, Node->getVal());
33013 MVT StVT = Subtarget.hasSSE2() ? MVT::v2i64 : MVT::v4f32;
33014 SclToVec = DAG.getBitcast(StVT, SclToVec);
33015 SDVTList Tys = DAG.getVTList(MVT::Other);
33016 SDValue Ops[] = {Node->getChain(), SclToVec, Node->getBasePtr()};
33017 Chain = DAG.getMemIntrinsicNode(X86ISD::VEXTRACT_STORE, dl, Tys, Ops,
33018 MVT::i64, Node->getMemOperand());
33019 } else if (Subtarget.hasX87()) {
33020 // First load this into an 80-bit X87 register using a stack temporary.
33021 // This will put the whole integer into the significand.
33022 SDValue StackPtr = DAG.CreateStackTemporary(MVT::i64);
33023 int SPFI = cast<FrameIndexSDNode>(StackPtr.getNode())->getIndex();
33024 MachinePointerInfo MPI =
33025 MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SPFI);
33026 Chain = DAG.getStore(Node->getChain(), dl, Node->getVal(), StackPtr,
33027 MPI, MaybeAlign(), MachineMemOperand::MOStore);
33028 SDVTList Tys = DAG.getVTList(MVT::f80, MVT::Other);
33029 SDValue LdOps[] = {Chain, StackPtr};
33030 SDValue Value = DAG.getMemIntrinsicNode(
33031 X86ISD::FILD, dl, Tys, LdOps, MVT::i64, MPI,
33032 /*Align*/ std::nullopt, MachineMemOperand::MOLoad);
33033 Chain = Value.getValue(1);
33034
33035 // Now use an FIST to do the atomic store.
33036 SDValue StoreOps[] = {Chain, Value, Node->getBasePtr()};
33037 Chain =
33038 DAG.getMemIntrinsicNode(X86ISD::FIST, dl, DAG.getVTList(MVT::Other),
33039 StoreOps, MVT::i64, Node->getMemOperand());
33040 }
33041 }
33042
33043 if (Chain) {
33044 // If this is a sequentially consistent store, also emit an appropriate
33045 // barrier.
33046 if (IsSeqCst)
33047 Chain = emitLockedStackOp(DAG, Subtarget, Chain, dl);
33048
33049 return Chain;
33050 }
33051 }
33052
33053 // Convert seq_cst store -> xchg
33054 // Convert wide store -> swap (-> cmpxchg8b/cmpxchg16b)
33055 // FIXME: 16-byte ATOMIC_SWAP isn't actually hooked up at the moment.
33056 SDValue Swap = DAG.getAtomic(ISD::ATOMIC_SWAP, dl, Node->getMemoryVT(),
33057 Node->getOperand(0), Node->getOperand(2),
33058 Node->getOperand(1), Node->getMemOperand());
33059 return Swap.getValue(1);
33060}
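// [Editorial note] Illustrative only (not part of this file): the
// "seq_cst store -> xchg" conversion above corresponds to implementing a
// sequentially consistent store as an exchange whose result is discarded,
// which provides the required full barrier.
#include <atomic>
static void seqCstStoreViaExchange(std::atomic<int> &A, int V) {
  (void)A.exchange(V, std::memory_order_seq_cst); // result intentionally unused
}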
33061
33062 static SDValue LowerADDSUBO_CARRY(SDValue Op, SelectionDAG &DAG) {
33063 SDNode *N = Op.getNode();
33064 MVT VT = N->getSimpleValueType(0);
33065 unsigned Opc = Op.getOpcode();
33066
33067 // Let legalize expand this if it isn't a legal type yet.
33068 if (!DAG.getTargetLoweringInfo().isTypeLegal(VT))
33069 return SDValue();
33070
33071 SDVTList VTs = DAG.getVTList(VT, MVT::i32);
33072 SDLoc DL(N);
33073
33074 // Set the carry flag.
33075 SDValue Carry = Op.getOperand(2);
33076 EVT CarryVT = Carry.getValueType();
33077 Carry = DAG.getNode(X86ISD::ADD, DL, DAG.getVTList(CarryVT, MVT::i32),
33078 Carry, DAG.getAllOnesConstant(DL, CarryVT));
33079
33080 bool IsAdd = Opc == ISD::UADDO_CARRY || Opc == ISD::SADDO_CARRY;
33081 SDValue Sum = DAG.getNode(IsAdd ? X86ISD::ADC : X86ISD::SBB, DL, VTs,
33082 Op.getOperand(0), Op.getOperand(1),
33083 Carry.getValue(1));
33084
33085 bool IsSigned = Opc == ISD::SADDO_CARRY || Opc == ISD::SSUBO_CARRY;
33086 SDValue SetCC = getSETCC(IsSigned ? X86::COND_O : X86::COND_B,
33087 Sum.getValue(1), DL, DAG);
33088 if (N->getValueType(1) == MVT::i1)
33089 SetCC = DAG.getNode(ISD::TRUNCATE, DL, MVT::i1, SetCC);
33090
33091 return DAG.getNode(ISD::MERGE_VALUES, DL, N->getVTList(), Sum, SetCC);
33092}
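// [Editorial note] Illustrative only (not part of this file): the
// UADDO_CARRY/USUBO_CARRY lowering above chains the CPU carry flag through
// ADC/SBB. The same per-limb computation expressed portably:
#include <cstdint>
static uint64_t addWithCarry(uint64_t A, uint64_t B, unsigned CarryIn,
                             unsigned &CarryOut) {
  uint64_t Sum = A + B;
  unsigned C1 = Sum < A;   // carry out of A + B
  uint64_t Res = Sum + CarryIn;
  unsigned C2 = Res < Sum; // carry out of adding the incoming carry
  CarryOut = C1 | C2;      // at most one of the two carries can occur
  return Res;
}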
33093
33094static SDValue LowerFSINCOS(SDValue Op, const X86Subtarget &Subtarget,
33095 SelectionDAG &DAG) {
33096 assert(Subtarget.isTargetDarwin() && Subtarget.is64Bit());
33097
33098 // For MacOSX, we want to call an alternative entry point: __sincos_stret,
33099 // which returns the values as { float, float } (in XMM0) or
33100 // { double, double } (which is returned in XMM0, XMM1).
33101 SDLoc dl(Op);
33102 SDValue Arg = Op.getOperand(0);
33103 EVT ArgVT = Arg.getValueType();
33104 Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext());
33105
33106 TargetLowering::ArgListTy Args;
33107 Args.emplace_back(Arg, ArgTy);
33108
33109 bool isF64 = ArgVT == MVT::f64;
33110 // Only optimize x86_64 for now. i386 is a bit messy. For f32,
33111 // the small struct {f32, f32} is returned in (eax, edx). For f64,
33112 // the results are returned via SRet in memory.
33113 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
33114 RTLIB::Libcall LC = isF64 ? RTLIB::SINCOS_STRET_F64 : RTLIB::SINCOS_STRET_F32;
33115 const char *LibcallName = TLI.getLibcallName(LC);
33116 SDValue Callee =
33117 DAG.getExternalSymbol(LibcallName, TLI.getPointerTy(DAG.getDataLayout()));
33118
33119 Type *RetTy = isF64 ? (Type *)StructType::get(ArgTy, ArgTy)
33120 : (Type *)FixedVectorType::get(ArgTy, 4);
33121
33122 TargetLowering::CallLoweringInfo CLI(DAG);
33123 CLI.setDebugLoc(dl)
33124 .setChain(DAG.getEntryNode())
33125 .setLibCallee(CallingConv::C, RetTy, Callee, std::move(Args));
33126
33127 std::pair<SDValue, SDValue> CallResult = TLI.LowerCallTo(CLI);
33128
33129 if (isF64)
33130 // Returned in xmm0 and xmm1.
33131 return CallResult.first;
33132
33133 // Returned in bits 0:31 and 32:64 xmm0.
33134 SDValue SinVal =
33135 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, ArgVT, CallResult.first,
33136 DAG.getVectorIdxConstant(0, dl));
33137 SDValue CosVal =
33138 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, ArgVT, CallResult.first,
33139 DAG.getVectorIdxConstant(1, dl));
33140 SDVTList Tys = DAG.getVTList(ArgVT, ArgVT);
33141 return DAG.getNode(ISD::MERGE_VALUES, dl, Tys, SinVal, CosVal);
33142}
33143
33144/// Widen a vector input to a vector of NVT. The
33145/// input vector must have the same element type as NVT.
33146 static SDValue ExtendToType(SDValue InOp, MVT NVT, SelectionDAG &DAG,
33147 bool FillWithZeroes = false) {
33148 // Check if InOp already has the right width.
33149 MVT InVT = InOp.getSimpleValueType();
33150 if (InVT == NVT)
33151 return InOp;
33152
33153 if (InOp.isUndef())
33154 return DAG.getUNDEF(NVT);
33155
33156 assert(InVT.getVectorElementType() == NVT.getVectorElementType() &&
33157 "input and widen element type must match");
33158
33159 unsigned InNumElts = InVT.getVectorNumElements();
33160 unsigned WidenNumElts = NVT.getVectorNumElements();
33161 assert(WidenNumElts > InNumElts && WidenNumElts % InNumElts == 0 &&
33162 "Unexpected request for vector widening");
33163
33164 SDLoc dl(InOp);
33165 if (InOp.getOpcode() == ISD::CONCAT_VECTORS && InOp.getNumOperands() == 2) {
33166 SDValue N1 = InOp.getOperand(1);
33167 if ((ISD::isBuildVectorAllZeros(N1.getNode()) && FillWithZeroes) ||
33168 N1.isUndef()) {
33169 InOp = InOp.getOperand(0);
33170 InVT = InOp.getSimpleValueType();
33171 InNumElts = InVT.getVectorNumElements();
33172 }
33173 }
33174 if (ISD::isBuildVectorOfConstantSDNodes(InOp.getNode()) ||
33175 ISD::isBuildVectorOfConstantFPSDNodes(InOp.getNode())) {
33176 EVT EltVT = InOp.getOperand(0).getValueType();
33177 SDValue FillVal =
33178 FillWithZeroes ? DAG.getConstant(0, dl, EltVT) : DAG.getUNDEF(EltVT);
33179 SmallVector<SDValue, 16> Ops(InOp->op_begin(), InOp->op_end());
33180 Ops.append(WidenNumElts - InNumElts, FillVal);
33181 return DAG.getBuildVector(NVT, dl, Ops);
33182 }
33183 SDValue FillVal =
33184 FillWithZeroes ? DAG.getConstant(0, dl, NVT) : DAG.getUNDEF(NVT);
33185 return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, NVT, FillVal, InOp,
33186 DAG.getVectorIdxConstant(0, dl));
33187}
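// [Editorial note] Illustrative only (not part of this file): ExtendToType
// widens a vector value to a larger vector type with the same element type,
// filling the new lanes with zeros (or leaving them unspecified). A scalar
// model of the zero-filling case:
#include <vector>
static std::vector<int> widenWithZeros(const std::vector<int> &In,
                                       size_t WidenNumElts) {
  std::vector<int> Out(WidenNumElts, 0); // new lanes are zero-filled
  for (size_t I = 0; I < In.size() && I < WidenNumElts; ++I)
    Out[I] = In[I];                      // original lanes keep their values
  return Out;
}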
33188
33189 static SDValue LowerMSCATTER(SDValue Op, const X86Subtarget &Subtarget,
33190 SelectionDAG &DAG) {
33191 assert(Subtarget.hasAVX512() &&
33192 "MGATHER/MSCATTER are supported on AVX-512 arch only");
33193
33194 MaskedScatterSDNode *N = cast<MaskedScatterSDNode>(Op.getNode());
33195 SDValue Src = N->getValue();
33196 MVT VT = Src.getSimpleValueType();
33197 assert(VT.getScalarSizeInBits() >= 32 && "Unsupported scatter op");
33198 SDLoc dl(Op);
33199
33200 SDValue Scale = N->getScale();
33201 SDValue Index = N->getIndex();
33202 SDValue Mask = N->getMask();
33203 SDValue Chain = N->getChain();
33204 SDValue BasePtr = N->getBasePtr();
33205
33206 if (VT == MVT::v2f32 || VT == MVT::v2i32) {
33207 assert(Mask.getValueType() == MVT::v2i1 && "Unexpected mask type");
33208 // If the index is v2i64 and we have VLX we can use xmm for data and index.
33209 if (Index.getValueType() == MVT::v2i64 && Subtarget.hasVLX()) {
33210 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
33211 EVT WideVT = TLI.getTypeToTransformTo(*DAG.getContext(), VT);
33212 Src = DAG.getNode(ISD::CONCAT_VECTORS, dl, WideVT, Src, DAG.getUNDEF(VT));
33213 SDVTList VTs = DAG.getVTList(MVT::Other);
33214 SDValue Ops[] = {Chain, Src, Mask, BasePtr, Index, Scale};
33215 return DAG.getMemIntrinsicNode(X86ISD::MSCATTER, dl, VTs, Ops,
33216 N->getMemoryVT(), N->getMemOperand());
33217 }
33218 return SDValue();
33219 }
33220
33221 MVT IndexVT = Index.getSimpleValueType();
33222
33223 // If the index is v2i32, we're being called by type legalization and we
33224 // should just let the default handling take care of it.
33225 if (IndexVT == MVT::v2i32)
33226 return SDValue();
33227
33228 // If we don't have VLX and neither the passthru nor the index is 512-bits, we
33229 // need to widen until one is.
33230 if (!Subtarget.hasVLX() && !VT.is512BitVector() &&
33231 !Index.getSimpleValueType().is512BitVector()) {
33232 // Determine how much we need to widen by to get a 512-bit type.
33233 unsigned Factor = std::min(512/VT.getSizeInBits(),
33234 512/IndexVT.getSizeInBits());
33235 unsigned NumElts = VT.getVectorNumElements() * Factor;
33236
33237 VT = MVT::getVectorVT(VT.getVectorElementType(), NumElts);
33238 IndexVT = MVT::getVectorVT(IndexVT.getVectorElementType(), NumElts);
33239 MVT MaskVT = MVT::getVectorVT(MVT::i1, NumElts);
33240
33241 Src = ExtendToType(Src, VT, DAG);
33242 Index = ExtendToType(Index, IndexVT, DAG);
33243 Mask = ExtendToType(Mask, MaskVT, DAG, true);
33244 }
33245
33246 SDVTList VTs = DAG.getVTList(MVT::Other);
33247 SDValue Ops[] = {Chain, Src, Mask, BasePtr, Index, Scale};
33248 return DAG.getMemIntrinsicNode(X86ISD::MSCATTER, dl, VTs, Ops,
33249 N->getMemoryVT(), N->getMemOperand());
33250}
33251
33252static SDValue LowerMLOAD(SDValue Op, const X86Subtarget &Subtarget,
33253 SelectionDAG &DAG) {
33254
33255 MaskedLoadSDNode *N = cast<MaskedLoadSDNode>(Op.getNode());
33256 MVT VT = Op.getSimpleValueType();
33257 MVT ScalarVT = VT.getScalarType();
33258 SDValue Mask = N->getMask();
33259 MVT MaskVT = Mask.getSimpleValueType();
33260 SDValue PassThru = N->getPassThru();
33261 SDLoc dl(Op);
33262
33263 // Handle AVX masked loads which don't support passthru other than 0.
33264 if (MaskVT.getVectorElementType() != MVT::i1) {
33265 // We also allow undef in the isel pattern.
33266 if (PassThru.isUndef() || ISD::isBuildVectorAllZeros(PassThru.getNode()))
33267 return Op;
33268
33269 SDValue NewLoad = DAG.getMaskedLoad(
33270 VT, dl, N->getChain(), N->getBasePtr(), N->getOffset(), Mask,
33271 getZeroVector(VT, Subtarget, DAG, dl), N->getMemoryVT(),
33272 N->getMemOperand(), N->getAddressingMode(), N->getExtensionType(),
33273 N->isExpandingLoad());
33274 // Emit a blend.
33275 SDValue Select = DAG.getNode(ISD::VSELECT, dl, VT, Mask, NewLoad, PassThru);
33276 return DAG.getMergeValues({ Select, NewLoad.getValue(1) }, dl);
33277 }
33278
33279 assert((!N->isExpandingLoad() || Subtarget.hasAVX512()) &&
33280 "Expanding masked load is supported on AVX-512 target only!");
33281
33282 assert((!N->isExpandingLoad() || ScalarVT.getSizeInBits() >= 32) &&
33283 "Expanding masked load is supported for 32 and 64-bit types only!");
33284
33285 assert(Subtarget.hasAVX512() && !Subtarget.hasVLX() && !VT.is512BitVector() &&
33286 "Cannot lower masked load op.");
33287
33288 assert((ScalarVT.getSizeInBits() >= 32 ||
33289 (Subtarget.hasBWI() && (ScalarVT == MVT::i8 || ScalarVT == MVT::i16 ||
33290 ScalarVT == MVT::f16))) &&
33291 "Unsupported masked load op.");
33292
33293 // This operation is legal for targets with VLX, but without
33294 // VLX the vector should be widened to 512 bit
33295 unsigned NumEltsInWideVec = 512 / VT.getScalarSizeInBits();
33296 MVT WideDataVT = MVT::getVectorVT(ScalarVT, NumEltsInWideVec);
33297 PassThru = ExtendToType(PassThru, WideDataVT, DAG);
33298
33299 // Mask element has to be i1.
33300 assert(Mask.getSimpleValueType().getScalarType() == MVT::i1 &&
33301 "Unexpected mask type");
33302
33303 MVT WideMaskVT = MVT::getVectorVT(MVT::i1, NumEltsInWideVec);
33304
33305 Mask = ExtendToType(Mask, WideMaskVT, DAG, true);
33306 SDValue NewLoad = DAG.getMaskedLoad(
33307 WideDataVT, dl, N->getChain(), N->getBasePtr(), N->getOffset(), Mask,
33308 PassThru, N->getMemoryVT(), N->getMemOperand(), N->getAddressingMode(),
33309 N->getExtensionType(), N->isExpandingLoad());
33310
33311 SDValue Extract =
33312 DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, NewLoad.getValue(0),
33313 DAG.getVectorIdxConstant(0, dl));
33314 SDValue RetOps[] = {Extract, NewLoad.getValue(1)};
33315 return DAG.getMergeValues(RetOps, dl);
33316}
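// [Editorial note] Illustrative only (not part of this file): when the mask is
// not a k-register (the AVX/AVX2 path above), a masked load with a non-zero
// passthru is emitted as a zero-passthru masked load followed by a blend.
// Element-wise, the final result is:
#include <cstddef>
static void maskedLoadWithBlend(const int *Ptr, const bool *Mask,
                                const int *PassThru, int *Out, size_t N) {
  for (size_t I = 0; I < N; ++I)
    Out[I] = Mask[I] ? Ptr[I] : PassThru[I]; // blend of loaded data and passthru
}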
33317
33318static SDValue LowerMSTORE(SDValue Op, const X86Subtarget &Subtarget,
33319 SelectionDAG &DAG) {
33320 MaskedStoreSDNode *N = cast<MaskedStoreSDNode>(Op.getNode());
33321 SDValue DataToStore = N->getValue();
33322 MVT VT = DataToStore.getSimpleValueType();
33323 MVT ScalarVT = VT.getScalarType();
33324 SDValue Mask = N->getMask();
33325 SDLoc dl(Op);
33326
33327 assert((!N->isCompressingStore() || Subtarget.hasAVX512()) &&
33328 "Compressing masked store is supported on AVX-512 target only!");
33329
33330 assert((!N->isCompressingStore() || ScalarVT.getSizeInBits() >= 32) &&
33331 "Compressing masked store is supported for 32 and 64-bit types only!");
33332
33333 assert(Subtarget.hasAVX512() && !Subtarget.hasVLX() && !VT.is512BitVector() &&
33334 "Cannot lower masked store op.");
33335
33336 assert((ScalarVT.getSizeInBits() >= 32 ||
33337 (Subtarget.hasBWI() && (ScalarVT == MVT::i8 || ScalarVT == MVT::i16 ||
33338 ScalarVT == MVT::f16))) &&
33339 "Unsupported masked store op.");
33340
33341 // This operation is legal for targets with VLX, but without
33342 // VLX the vector should be widened to 512 bit
33343 unsigned NumEltsInWideVec = 512/VT.getScalarSizeInBits();
33344 MVT WideDataVT = MVT::getVectorVT(ScalarVT, NumEltsInWideVec);
33345
33346 // Mask element has to be i1.
33347 assert(Mask.getSimpleValueType().getScalarType() == MVT::i1 &&
33348 "Unexpected mask type");
33349
33350 MVT WideMaskVT = MVT::getVectorVT(MVT::i1, NumEltsInWideVec);
33351
33352 DataToStore = ExtendToType(DataToStore, WideDataVT, DAG);
33353 Mask = ExtendToType(Mask, WideMaskVT, DAG, true);
33354 return DAG.getMaskedStore(N->getChain(), dl, DataToStore, N->getBasePtr(),
33355 N->getOffset(), Mask, N->getMemoryVT(),
33356 N->getMemOperand(), N->getAddressingMode(),
33357 N->isTruncatingStore(), N->isCompressingStore());
33358}
33359
33360static SDValue LowerMGATHER(SDValue Op, const X86Subtarget &Subtarget,
33361 SelectionDAG &DAG) {
33362 assert(Subtarget.hasAVX2() &&
33363 "MGATHER/MSCATTER are supported on AVX-512/AVX-2 arch only");
33364
33365 MaskedGatherSDNode *N = cast<MaskedGatherSDNode>(Op.getNode());
33366 SDLoc dl(Op);
33367 MVT VT = Op.getSimpleValueType();
33368 SDValue Index = N->getIndex();
33369 SDValue Mask = N->getMask();
33370 SDValue PassThru = N->getPassThru();
33371 MVT IndexVT = Index.getSimpleValueType();
33372
33373 assert(VT.getScalarSizeInBits() >= 32 && "Unsupported gather op");
33374
33375 // If the index is v2i32, we're being called by type legalization.
33376 if (IndexVT == MVT::v2i32)
33377 return SDValue();
33378
33379 // If we don't have VLX and neither the passthru nor the index is 512-bits, we
33380 // need to widen until one is.
33381 MVT OrigVT = VT;
33382 if (Subtarget.hasAVX512() && !Subtarget.hasVLX() && !VT.is512BitVector() &&
33383 !IndexVT.is512BitVector()) {
33384 // Determine how much we need to widen by to get a 512-bit type.
33385 unsigned Factor = std::min(512/VT.getSizeInBits(),
33386 512/IndexVT.getSizeInBits());
33387
33388 unsigned NumElts = VT.getVectorNumElements() * Factor;
33389
33390 VT = MVT::getVectorVT(VT.getVectorElementType(), NumElts);
33391 IndexVT = MVT::getVectorVT(IndexVT.getVectorElementType(), NumElts);
33392 MVT MaskVT = MVT::getVectorVT(MVT::i1, NumElts);
33393
33394 PassThru = ExtendToType(PassThru, VT, DAG);
33395 Index = ExtendToType(Index, IndexVT, DAG);
33396 Mask = ExtendToType(Mask, MaskVT, DAG, true);
33397 }
33398
33399 // Break dependency on the data register.
33400 if (PassThru.isUndef())
33401 PassThru = getZeroVector(VT, Subtarget, DAG, dl);
33402
33403 SDValue Ops[] = { N->getChain(), PassThru, Mask, N->getBasePtr(), Index,
33404 N->getScale() };
33405 SDValue NewGather = DAG.getMemIntrinsicNode(
33406 X86ISD::MGATHER, dl, DAG.getVTList(VT, MVT::Other), Ops, N->getMemoryVT(),
33407 N->getMemOperand());
33408 SDValue Extract = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, OrigVT, NewGather,
33409 DAG.getVectorIdxConstant(0, dl));
33410 return DAG.getMergeValues({Extract, NewGather.getValue(1)}, dl);
33411}
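// [Editorial note] Illustrative only (not part of this file): the element-wise
// semantics of the gather being lowered above, with BasePtr, per-lane Index,
// Scale, Mask and PassThru as in the node's operands (i32 data shown):
#include <cstddef>
#include <cstdint>
static void gatherRef(const char *BasePtr, const int64_t *Index, int Scale,
                      const bool *Mask, const int32_t *PassThru, int32_t *Out,
                      size_t NumElts) {
  for (size_t I = 0; I < NumElts; ++I)
    Out[I] = Mask[I]
                 ? *reinterpret_cast<const int32_t *>(BasePtr + Index[I] * Scale)
                 : PassThru[I]; // masked-off lanes keep the passthru value
}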
33412
33413 static SDValue LowerADDRSPACECAST(SDValue Op, SelectionDAG &DAG) {
33414 SDLoc dl(Op);
33415 SDValue Src = Op.getOperand(0);
33416 MVT DstVT = Op.getSimpleValueType();
33417
33418 AddrSpaceCastSDNode *N = cast<AddrSpaceCastSDNode>(Op.getNode());
33419 unsigned SrcAS = N->getSrcAddressSpace();
33420
33421 assert(SrcAS != N->getDestAddressSpace() &&
33422 "addrspacecast must be between different address spaces");
33423
33424 if (SrcAS == X86AS::PTR32_UPTR && DstVT == MVT::i64) {
33425 Op = DAG.getNode(ISD::ZERO_EXTEND, dl, DstVT, Src);
33426 } else if (DstVT == MVT::i64) {
33427 Op = DAG.getNode(ISD::SIGN_EXTEND, dl, DstVT, Src);
33428 } else if (DstVT == MVT::i32) {
33429 Op = DAG.getNode(ISD::TRUNCATE, dl, DstVT, Src);
33430 } else {
33431 report_fatal_error("Bad address space in addrspacecast");
33432 }
33433 return Op;
33434}
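// [Editorial note] Illustrative only (not part of this file): the
// address-space cast above extends a 32-bit pointer value to 64 bits either
// with zero extension (the ptr32_uptr case checked above) or with sign
// extension (a signed 32-bit pointer space), and truncates for the
// 64-bit-to-32-bit direction.
#include <cstdint>
static uint64_t extendPtr32(uint32_t P32, bool IsUnsignedPtr) {
  return IsUnsignedPtr ? (uint64_t)P32                    // zero extend
                       : (uint64_t)(int64_t)(int32_t)P32; // sign extend
}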
33435
33436SDValue X86TargetLowering::LowerGC_TRANSITION(SDValue Op,
33437 SelectionDAG &DAG) const {
33438 // TODO: Eventually, the lowering of these nodes should be informed by or
33439 // deferred to the GC strategy for the function in which they appear. For
33440 // now, however, they must be lowered to something. Since they are logically
33441 // no-ops in the case of a null GC strategy (or a GC strategy which does not
33442 // require special handling for these nodes), lower them as literal NOOPs for
33443 // the time being.
33444 SmallVector<SDValue, 2> Ops;
33445 Ops.push_back(Op.getOperand(0));
33446 if (Op->getGluedNode())
33447 Ops.push_back(Op->getOperand(Op->getNumOperands() - 1));
33448
33449 SDVTList VTs = DAG.getVTList(MVT::Other, MVT::Glue);
33450 return SDValue(DAG.getMachineNode(X86::NOOP, SDLoc(Op), VTs, Ops), 0);
33451}
33452
33453// Custom split CVTPS2PH with wide types.
33454 static SDValue LowerCVTPS2PH(SDValue Op, SelectionDAG &DAG) {
33455 SDLoc dl(Op);
33456 EVT VT = Op.getValueType();
33457 SDValue Lo, Hi;
33458 std::tie(Lo, Hi) = DAG.SplitVectorOperand(Op.getNode(), 0);
33459 EVT LoVT, HiVT;
33460 std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(VT);
33461 SDValue RC = Op.getOperand(1);
33462 Lo = DAG.getNode(X86ISD::CVTPS2PH, dl, LoVT, Lo, RC);
33463 Hi = DAG.getNode(X86ISD::CVTPS2PH, dl, HiVT, Hi, RC);
33464 return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, Lo, Hi);
33465}
33466
33467 static SDValue LowerPREFETCH(SDValue Op, const X86Subtarget &Subtarget,
33468 SelectionDAG &DAG) {
33469 unsigned IsData = Op.getConstantOperandVal(4);
33470
33471 // We don't support non-data prefetch without PREFETCHI.
33472 // Just preserve the chain.
33473 if (!IsData && !Subtarget.hasPREFETCHI())
33474 return Op.getOperand(0);
33475
33476 return Op;
33477}
33478
33479 static SDValue LowerFCanonicalize(SDValue Op, SelectionDAG &DAG) {
33480 SDNode *N = Op.getNode();
33481 SDValue Operand = N->getOperand(0);
33482 EVT VT = Operand.getValueType();
33483 SDLoc dl(N);
33484
33485 SDValue One = DAG.getConstantFP(1.0, dl, VT);
33486
33487 // TODO: Fix Crash for bf16 when generating strict_fmul as it
33488 // leads to an error: SoftPromoteHalfResult #0: t11: bf16,ch = strict_fmul t0,
33489 // ConstantFP:bf16<APFloat(16256)>, t5 LLVM ERROR: Do not know how to soft
33490 // promote this operator's result!
33491 SDValue Chain = DAG.getEntryNode();
33492 SDValue StrictFmul = DAG.getNode(ISD::STRICT_FMUL, dl, {VT, MVT::Other},
33493 {Chain, Operand, One});
33494 return StrictFmul;
33495}
33496
33497 static StringRef getInstrStrFromOpNo(const SmallVectorImpl<StringRef> &AsmStrs,
33498 unsigned OpNo) {
33499 const APInt Operand(32, OpNo);
33500 std::string OpNoStr = llvm::toString(Operand, 10, false);
33501 std::string Str(" $");
33502
33503 std::string OpNoStr1(Str + OpNoStr); // e.g. " $1" (OpNo=1)
33504 std::string OpNoStr2(Str + "{" + OpNoStr + ":"); // With modifier, e.g. ${1:P}
33505
33506 auto I = StringRef::npos;
33507 for (auto &AsmStr : AsmStrs) {
33508 // Match the OpNo string. We must match the full token to avoid matching a
33509 // sub-string, e.g. "$12" contains "$1".
33510 if (AsmStr.ends_with(OpNoStr1))
33511 I = AsmStr.size() - OpNoStr1.size();
33512
33513 // Get the index of operand in AsmStr.
33514 if (I == StringRef::npos)
33515 I = AsmStr.find(OpNoStr1 + ",");
33516 if (I == StringRef::npos)
33517 I = AsmStr.find(OpNoStr2);
33518
33519 if (I == StringRef::npos)
33520 continue;
33521
33522 assert(I > 0 && "Unexpected inline asm string!");
33523 // Remove the operand string and label (if it exists).
33524 // For example:
33525 // ".L__MSASMLABEL_.${:uid}__l:call dword ptr ${0:P}"
33526 // ==>
33527 // ".L__MSASMLABEL_.${:uid}__l:call dword ptr "
33528 // ==>
33529 // "call dword ptr "
33530 auto TmpStr = AsmStr.substr(0, I);
33531 I = TmpStr.rfind(':');
33532 if (I != StringRef::npos)
33533 TmpStr = TmpStr.substr(I + 1);
33534 return TmpStr.take_while(llvm::isAlpha);
33535 }
33536
33537 return StringRef();
33538}
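// [Editorial note] Illustrative only (not part of this file): the helper above
// searches the inline-asm fragments for the " $<OpNo>" (or "${<OpNo>:...}")
// operand token, drops any leading MS-asm "label:" prefix, and keeps the
// leading alphabetic run, i.e. the mnemonic. A rough standalone sketch of the
// same string logic; names and the simplified matching are assumptions:
#include <cctype>
#include <string>
static std::string mnemonicBeforeOperand(const std::string &AsmStr,
                                         const std::string &OpToken) {
  std::string::size_type I = AsmStr.find(OpToken); // e.g. OpToken == " $1"
  if (I == std::string::npos)
    return std::string();
  std::string Head = AsmStr.substr(0, I);
  std::string::size_type Colon = Head.rfind(':');  // strip "label:" prefix
  if (Colon != std::string::npos)
    Head = Head.substr(Colon + 1);
  std::string Mnemonic;
  for (char C : Head) {
    if (!std::isalpha(static_cast<unsigned char>(C)))
      break;
    Mnemonic += C;
  }
  return Mnemonic; // e.g. "call" for "label:call dword ptr $1"
}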
33539
33540 bool X86TargetLowering::isInlineAsmTargetBranch(
33541 const SmallVectorImpl<StringRef> &AsmStrs, unsigned OpNo) const {
33542 // In a __asm block, __asm inst foo where inst is CALL or JMP should be
33543 // changed from indirect TargetLowering::C_Memory to direct
33544 // TargetLowering::C_Address.
33545 // We don't need to special case LOOP* and Jcc, which cannot target a memory
33546 // location.
33547 StringRef Inst = getInstrStrFromOpNo(AsmStrs, OpNo);
33548 return Inst.equals_insensitive("call") || Inst.equals_insensitive("jmp");
33549}
33550
33551 static SDValue getFlagsOfCmpZeroFori1(SelectionDAG &DAG, const SDLoc &DL,
33552 SDValue Mask) {
33553 EVT Ty = MVT::i8;
33554 auto V = DAG.getBitcast(MVT::i1, Mask);
33555 auto VE = DAG.getZExtOrTrunc(V, DL, Ty);
33556 auto Zero = DAG.getConstant(0, DL, Ty);
33557 SDVTList X86SubVTs = DAG.getVTList(Ty, MVT::i32);
33558 auto CmpZero = DAG.getNode(X86ISD::SUB, DL, X86SubVTs, Zero, VE);
33559 return SDValue(CmpZero.getNode(), 1);
33560}
33561
33562 SDValue X86TargetLowering::visitMaskedLoad(
33563 SelectionDAG &DAG, const SDLoc &DL, SDValue Chain, MachineMemOperand *MMO,
33564 SDValue &NewLoad, SDValue Ptr, SDValue PassThru, SDValue Mask) const {
33565 // @llvm.masked.load.v1*(ptr, alignment, mask, passthru)
33566 // ->
33567 // _, flags = SUB 0, mask
33568 // res, chain = CLOAD inchain, ptr, (bit_cast_to_scalar passthru), cond, flags
33569 // bit_cast_to_vector<res>
33570 EVT VTy = PassThru.getValueType();
33571 EVT Ty = VTy.getVectorElementType();
33572 SDVTList Tys = DAG.getVTList(Ty, MVT::Other);
33573 auto ScalarPassThru = PassThru.isUndef() ? DAG.getConstant(0, DL, Ty)
33574 : DAG.getBitcast(Ty, PassThru);
33575 auto Flags = getFlagsOfCmpZeroFori1(DAG, DL, Mask);
33576 auto COND_NE = DAG.getTargetConstant(X86::COND_NE, DL, MVT::i8);
33577 SDValue Ops[] = {Chain, Ptr, ScalarPassThru, COND_NE, Flags};
33578 NewLoad = DAG.getMemIntrinsicNode(X86ISD::CLOAD, DL, Tys, Ops, Ty, MMO);
33579 return DAG.getBitcast(VTy, NewLoad);
33580}
33581
33582 SDValue X86TargetLowering::visitMaskedStore(SelectionDAG &DAG, const SDLoc &DL,
33583 SDValue Chain,
33584 MachineMemOperand *MMO, SDValue Ptr,
33585 SDValue Val, SDValue Mask) const {
33586 // llvm.masked.store.v1*(Src0, Ptr, alignment, Mask)
33587 // ->
33588 // _, flags = SUB 0, mask
33589 // chain = CSTORE inchain, (bit_cast_to_scalar val), ptr, cond, flags
33590 EVT Ty = Val.getValueType().getVectorElementType();
33591 SDVTList Tys = DAG.getVTList(MVT::Other);
33592 auto ScalarVal = DAG.getBitcast(Ty, Val);
33593 auto Flags = getFlagsOfCmpZeroFori1(DAG, DL, Mask);
33594 auto COND_NE = DAG.getTargetConstant(X86::COND_NE, DL, MVT::i8);
33595 SDValue Ops[] = {Chain, ScalarVal, Ptr, COND_NE, Flags};
33596 return DAG.getMemIntrinsicNode(X86ISD::CSTORE, DL, Tys, Ops, Ty, MMO);
33597}
33598
33599/// Provide custom lowering hooks for some operations.
33600 SDValue X86TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
33601 switch (Op.getOpcode()) {
33602 // clang-format off
33603 default: llvm_unreachable("Should not custom lower this!");
33604 case ISD::ATOMIC_FENCE: return LowerATOMIC_FENCE(Op, Subtarget, DAG);
33605 case ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS:
33606 return LowerCMP_SWAP(Op, Subtarget, DAG);
33607 case ISD::CTPOP: return LowerCTPOP(Op, Subtarget, DAG);
33608 case ISD::ATOMIC_LOAD_ADD:
33609 case ISD::ATOMIC_LOAD_SUB:
33610 case ISD::ATOMIC_LOAD_OR:
33611 case ISD::ATOMIC_LOAD_XOR:
33612 case ISD::ATOMIC_LOAD_AND: return lowerAtomicArith(Op, DAG, Subtarget);
33613 case ISD::ATOMIC_STORE: return LowerATOMIC_STORE(Op, DAG, Subtarget);
33614 case ISD::BITREVERSE: return LowerBITREVERSE(Op, Subtarget, DAG);
33615 case ISD::PARITY: return LowerPARITY(Op, Subtarget, DAG);
33616 case ISD::BUILD_VECTOR: return LowerBUILD_VECTOR(Op, DAG);
33617 case ISD::CONCAT_VECTORS: return LowerCONCAT_VECTORS(Op, Subtarget, DAG);
33618 case ISD::VECTOR_SHUFFLE: return lowerVECTOR_SHUFFLE(Op, Subtarget, DAG);
33619 case ISD::VECTOR_COMPRESS: return lowerVECTOR_COMPRESS(Op, Subtarget, DAG);
33620 case ISD::VSELECT: return LowerVSELECT(Op, DAG);
33621 case ISD::EXTRACT_VECTOR_ELT: return LowerEXTRACT_VECTOR_ELT(Op, DAG);
33622 case ISD::INSERT_VECTOR_ELT: return LowerINSERT_VECTOR_ELT(Op, DAG);
33623 case ISD::INSERT_SUBVECTOR: return LowerINSERT_SUBVECTOR(Op, Subtarget,DAG);
33624 case ISD::EXTRACT_SUBVECTOR: return LowerEXTRACT_SUBVECTOR(Op,Subtarget,DAG);
33625 case ISD::SCALAR_TO_VECTOR: return LowerSCALAR_TO_VECTOR(Op, Subtarget,DAG);
33626 case ISD::ConstantPool: return LowerConstantPool(Op, DAG);
33627 case ISD::GlobalAddress: return LowerGlobalAddress(Op, DAG);
33628 case ISD::GlobalTLSAddress: return LowerGlobalTLSAddress(Op, DAG);
33629 case ISD::ExternalSymbol: return LowerExternalSymbol(Op, DAG);
33630 case ISD::BlockAddress: return LowerBlockAddress(Op, DAG);
33631 case ISD::SHL_PARTS:
33632 case ISD::SRA_PARTS:
33633 case ISD::SRL_PARTS: return LowerShiftParts(Op, DAG);
33634 case ISD::FSHL:
33635 case ISD::FSHR: return LowerFunnelShift(Op, Subtarget, DAG);
33636 case ISD::FCANONICALIZE: return LowerFCanonicalize(Op, DAG);
33637 case ISD::STRICT_SINT_TO_FP:
33638 case ISD::SINT_TO_FP: return LowerSINT_TO_FP(Op, DAG);
33639 case ISD::STRICT_UINT_TO_FP:
33640 case ISD::UINT_TO_FP: return LowerUINT_TO_FP(Op, DAG);
33641 case ISD::TRUNCATE: return LowerTRUNCATE(Op, DAG);
33642 case ISD::ZERO_EXTEND: return LowerZERO_EXTEND(Op, Subtarget, DAG);
33643 case ISD::SIGN_EXTEND: return LowerSIGN_EXTEND(Op, Subtarget, DAG);
33644 case ISD::ANY_EXTEND: return LowerANY_EXTEND(Op, Subtarget, DAG);
33645 case ISD::ZERO_EXTEND_VECTOR_INREG:
33646 case ISD::SIGN_EXTEND_VECTOR_INREG:
33647 return LowerEXTEND_VECTOR_INREG(Op, Subtarget, DAG);
33648 case ISD::FP_TO_SINT:
33649 case ISD::STRICT_FP_TO_SINT:
33650 case ISD::FP_TO_UINT:
33651 case ISD::STRICT_FP_TO_UINT: return LowerFP_TO_INT(Op, DAG);
33652 case ISD::FP_TO_SINT_SAT:
33653 case ISD::FP_TO_UINT_SAT: return LowerFP_TO_INT_SAT(Op, DAG);
33654 case ISD::FP_EXTEND:
33655 case ISD::STRICT_FP_EXTEND: return LowerFP_EXTEND(Op, DAG);
33656 case ISD::FP_ROUND:
33657 case ISD::STRICT_FP_ROUND: return LowerFP_ROUND(Op, DAG);
33658 case ISD::FP16_TO_FP:
33659 case ISD::STRICT_FP16_TO_FP: return LowerFP16_TO_FP(Op, DAG);
33660 case ISD::FP_TO_FP16:
33661 case ISD::STRICT_FP_TO_FP16: return LowerFP_TO_FP16(Op, DAG);
33662 case ISD::FP_TO_BF16: return LowerFP_TO_BF16(Op, DAG);
33663 case ISD::LOAD: return LowerLoad(Op, Subtarget, DAG);
33664 case ISD::STORE: return LowerStore(Op, Subtarget, DAG);
33665 case ISD::FADD:
33666 case ISD::FSUB: return lowerFaddFsub(Op, DAG);
33667 case ISD::FROUND: return LowerFROUND(Op, DAG);
33668 case ISD::FABS:
33669 case ISD::FNEG: return LowerFABSorFNEG(Op, DAG);
33670 case ISD::FCOPYSIGN: return LowerFCOPYSIGN(Op, DAG);
33671 case ISD::FGETSIGN: return LowerFGETSIGN(Op, DAG);
33672 case ISD::LRINT:
33673 case ISD::LLRINT: return LowerLRINT_LLRINT(Op, DAG);
33674 case ISD::SETCC:
33675 case ISD::STRICT_FSETCC:
33676 case ISD::STRICT_FSETCCS: return LowerSETCC(Op, DAG);
33677 case ISD::SETCCCARRY: return LowerSETCCCARRY(Op, DAG);
33678 case ISD::SELECT: return LowerSELECT(Op, DAG);
33679 case ISD::BRCOND: return LowerBRCOND(Op, DAG);
33680 case ISD::JumpTable: return LowerJumpTable(Op, DAG);
33681 case ISD::VASTART: return LowerVASTART(Op, DAG);
33682 case ISD::VAARG: return LowerVAARG(Op, DAG);
33683 case ISD::VACOPY: return LowerVACOPY(Op, Subtarget, DAG);
33684 case ISD::INTRINSIC_WO_CHAIN: return LowerINTRINSIC_WO_CHAIN(Op, DAG);
33685 case ISD::INTRINSIC_VOID:
33686 case ISD::INTRINSIC_W_CHAIN: return LowerINTRINSIC_W_CHAIN(Op, Subtarget, DAG);
33687 case ISD::RETURNADDR: return LowerRETURNADDR(Op, DAG);
33688 case ISD::ADDROFRETURNADDR: return LowerADDROFRETURNADDR(Op, DAG);
33689 case ISD::FRAMEADDR: return LowerFRAMEADDR(Op, DAG);
33690 case ISD::FRAME_TO_ARGS_OFFSET:
33691 return LowerFRAME_TO_ARGS_OFFSET(Op, DAG);
33692 case ISD::DYNAMIC_STACKALLOC: return LowerDYNAMIC_STACKALLOC(Op, DAG);
33693 case ISD::EH_RETURN: return LowerEH_RETURN(Op, DAG);
33694 case ISD::EH_SJLJ_SETJMP: return lowerEH_SJLJ_SETJMP(Op, DAG);
33695 case ISD::EH_SJLJ_LONGJMP: return lowerEH_SJLJ_LONGJMP(Op, DAG);
33696 case ISD::EH_SJLJ_SETUP_DISPATCH:
33697 return lowerEH_SJLJ_SETUP_DISPATCH(Op, DAG);
33698 case ISD::INIT_TRAMPOLINE: return LowerINIT_TRAMPOLINE(Op, DAG);
33699 case ISD::ADJUST_TRAMPOLINE: return LowerADJUST_TRAMPOLINE(Op, DAG);
33700 case ISD::GET_ROUNDING: return LowerGET_ROUNDING(Op, DAG);
33701 case ISD::SET_ROUNDING: return LowerSET_ROUNDING(Op, DAG);
33702 case ISD::GET_FPENV_MEM: return LowerGET_FPENV_MEM(Op, DAG);
33703 case ISD::SET_FPENV_MEM: return LowerSET_FPENV_MEM(Op, DAG);
33704 case ISD::RESET_FPENV: return LowerRESET_FPENV(Op, DAG);
33705 case ISD::CTLZ:
33706 case ISD::CTLZ_ZERO_UNDEF: return LowerCTLZ(Op, Subtarget, DAG);
33707 case ISD::CTTZ:
33708 case ISD::CTTZ_ZERO_UNDEF: return LowerCTTZ(Op, Subtarget, DAG);
33709 case ISD::MUL: return LowerMUL(Op, Subtarget, DAG);
33710 case ISD::MULHS:
33711 case ISD::MULHU: return LowerMULH(Op, Subtarget, DAG);
33712 case ISD::ROTL:
33713 case ISD::ROTR: return LowerRotate(Op, Subtarget, DAG);
33714 case ISD::SRA:
33715 case ISD::SRL:
33716 case ISD::SHL: return LowerShift(Op, Subtarget, DAG);
33717 case ISD::SADDO:
33718 case ISD::UADDO:
33719 case ISD::SSUBO:
33720 case ISD::USUBO: return LowerXALUO(Op, DAG);
33721 case ISD::SMULO:
33722 case ISD::UMULO: return LowerMULO(Op, Subtarget, DAG);
33723 case ISD::READCYCLECOUNTER: return LowerREADCYCLECOUNTER(Op, Subtarget,DAG);
33724 case ISD::BITCAST: return LowerBITCAST(Op, Subtarget, DAG);
33725 case ISD::SADDO_CARRY:
33726 case ISD::SSUBO_CARRY:
33727 case ISD::UADDO_CARRY:
33728 case ISD::USUBO_CARRY: return LowerADDSUBO_CARRY(Op, DAG);
33729 case ISD::ADD:
33730 case ISD::SUB: return lowerAddSub(Op, DAG, Subtarget);
33731 case ISD::UADDSAT:
33732 case ISD::SADDSAT:
33733 case ISD::USUBSAT:
33734 case ISD::SSUBSAT: return LowerADDSAT_SUBSAT(Op, DAG, Subtarget);
33735 case ISD::SMAX:
33736 case ISD::SMIN:
33737 case ISD::UMAX:
33738 case ISD::UMIN: return LowerMINMAX(Op, Subtarget, DAG);
33739 case ISD::FMINIMUM:
33740 case ISD::FMAXIMUM:
33741 case ISD::FMINIMUMNUM:
33742 case ISD::FMAXIMUMNUM:
33743 return LowerFMINIMUM_FMAXIMUM(Op, Subtarget, DAG);
33744 case ISD::ABS: return LowerABS(Op, Subtarget, DAG);
33745 case ISD::ABDS:
33746 case ISD::ABDU: return LowerABD(Op, Subtarget, DAG);
33747 case ISD::AVGCEILU: return LowerAVG(Op, Subtarget, DAG);
33748 case ISD::FSINCOS: return LowerFSINCOS(Op, Subtarget, DAG);
33749 case ISD::MLOAD: return LowerMLOAD(Op, Subtarget, DAG);
33750 case ISD::MSTORE: return LowerMSTORE(Op, Subtarget, DAG);
33751 case ISD::MGATHER: return LowerMGATHER(Op, Subtarget, DAG);
33752 case ISD::MSCATTER: return LowerMSCATTER(Op, Subtarget, DAG);
33753 case ISD::GC_TRANSITION_START:
33754 case ISD::GC_TRANSITION_END: return LowerGC_TRANSITION(Op, DAG);
33755 case ISD::ADDRSPACECAST: return LowerADDRSPACECAST(Op, DAG);
33756 case X86ISD::CVTPS2PH: return LowerCVTPS2PH(Op, DAG);
33757 case ISD::PREFETCH: return LowerPREFETCH(Op, Subtarget, DAG);
33758 // clang-format on
33759 }
33760}
33761
33762/// Replace a node with an illegal result type with a new node built out of
33763/// custom code.
33764 void X86TargetLowering::ReplaceNodeResults(SDNode *N,
33765 SmallVectorImpl<SDValue> &Results,
33766 SelectionDAG &DAG) const {
33767 SDLoc dl(N);
33768 unsigned Opc = N->getOpcode();
33769 switch (Opc) {
33770 default:
33771#ifndef NDEBUG
33772 dbgs() << "ReplaceNodeResults: ";
33773 N->dump(&DAG);
33774#endif
33775 llvm_unreachable("Do not know how to custom type legalize this operation!");
33776 case X86ISD::CVTPH2PS: {
33777 EVT VT = N->getValueType(0);
33778 SDValue Lo, Hi;
33779 std::tie(Lo, Hi) = DAG.SplitVectorOperand(N, 0);
33780 EVT LoVT, HiVT;
33781 std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(VT);
33782 Lo = DAG.getNode(X86ISD::CVTPH2PS, dl, LoVT, Lo);
33783 Hi = DAG.getNode(X86ISD::CVTPH2PS, dl, HiVT, Hi);
33784 SDValue Res = DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, Lo, Hi);
33785 Results.push_back(Res);
33786 return;
33787 }
33788 case X86ISD::STRICT_CVTPH2PS: {
33789 EVT VT = N->getValueType(0);
33790 SDValue Lo, Hi;
33791 std::tie(Lo, Hi) = DAG.SplitVectorOperand(N, 1);
33792 EVT LoVT, HiVT;
33793 std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(VT);
33794 Lo = DAG.getNode(X86ISD::STRICT_CVTPH2PS, dl, {LoVT, MVT::Other},
33795 {N->getOperand(0), Lo});
33796 Hi = DAG.getNode(X86ISD::STRICT_CVTPH2PS, dl, {HiVT, MVT::Other},
33797 {N->getOperand(0), Hi});
33798 SDValue Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
33799 Lo.getValue(1), Hi.getValue(1));
33800 SDValue Res = DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, Lo, Hi);
33801 Results.push_back(Res);
33802 Results.push_back(Chain);
33803 return;
33804 }
33805 case X86ISD::CVTPS2PH:
33806 Results.push_back(LowerCVTPS2PH(SDValue(N, 0), DAG));
33807 return;
33808 case ISD::CTPOP: {
33809 assert(N->getValueType(0) == MVT::i64 && "Unexpected VT!");
33810 // If we have at most 32 active bits, then perform as i32 CTPOP.
33811 // TODO: Perform this in generic legalizer?
33812 KnownBits Known = DAG.computeKnownBits(N->getOperand(0));
33813 unsigned LZ = Known.countMinLeadingZeros();
33814 unsigned TZ = Known.countMinTrailingZeros();
33815 if ((LZ + TZ) >= 32) {
33816 SDValue Op = DAG.getNode(ISD::SRL, dl, MVT::i64, N->getOperand(0),
33817 DAG.getShiftAmountConstant(TZ, MVT::i64, dl));
33818 Op = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, Op);
33819 Op = DAG.getNode(ISD::CTPOP, dl, MVT::i32, Op);
33820 Op = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i64, Op);
33821 Results.push_back(Op);
33822 return;
33823 }
33824 // Use a v2i64 if possible.
33825 bool NoImplicitFloatOps =
33826 DAG.getMachineFunction().getFunction().hasFnAttribute(
33827 Attribute::NoImplicitFloat);
33828 if (isTypeLegal(MVT::v2i64) && !NoImplicitFloatOps) {
33829 SDValue Wide =
33830 DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64, N->getOperand(0));
33831 Wide = DAG.getNode(ISD::CTPOP, dl, MVT::v2i64, Wide);
33832 // Bit count should fit in 32-bits, extract it as that and then zero
33833 // extend to i64. Otherwise we end up extracting bits 63:32 separately.
33834 Wide = DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, Wide);
33835 Wide = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32, Wide,
33836 DAG.getVectorIdxConstant(0, dl));
33837 Wide = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i64, Wide);
33838 Results.push_back(Wide);
33839 }
33840 return;
33841 }
33842 case ISD::MUL: {
33843 EVT VT = N->getValueType(0);
33844 assert(getTypeAction(*DAG.getContext(), VT) == TypeWidenVector &&
33845 VT.getVectorElementType() == MVT::i8 && "Unexpected VT!");
33846 // Pre-promote these to vXi16 to avoid op legalization thinking all 16
33847 // elements are needed.
33848 MVT MulVT = MVT::getVectorVT(MVT::i16, VT.getVectorNumElements());
33849 SDValue Op0 = DAG.getNode(ISD::ANY_EXTEND, dl, MulVT, N->getOperand(0));
33850 SDValue Op1 = DAG.getNode(ISD::ANY_EXTEND, dl, MulVT, N->getOperand(1));
33851 SDValue Res = DAG.getNode(ISD::MUL, dl, MulVT, Op0, Op1);
33852 Res = DAG.getNode(ISD::TRUNCATE, dl, VT, Res);
33853 unsigned NumConcats = 16 / VT.getVectorNumElements();
33854 SmallVector<SDValue, 8> ConcatOps(NumConcats, DAG.getUNDEF(VT));
33855 ConcatOps[0] = Res;
33856 Res = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v16i8, ConcatOps);
33857 Results.push_back(Res);
33858 return;
33859 }
33860 case ISD::SMULO:
33861 case ISD::UMULO: {
33862 EVT VT = N->getValueType(0);
33863 assert(getTypeAction(*DAG.getContext(), VT) == TypeWidenVector &&
33864 VT == MVT::v2i32 && "Unexpected VT!");
33865 bool IsSigned = Opc == ISD::SMULO;
33866 unsigned ExtOpc = IsSigned ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
33867 SDValue Op0 = DAG.getNode(ExtOpc, dl, MVT::v2i64, N->getOperand(0));
33868 SDValue Op1 = DAG.getNode(ExtOpc, dl, MVT::v2i64, N->getOperand(1));
33869 SDValue Res = DAG.getNode(ISD::MUL, dl, MVT::v2i64, Op0, Op1);
33870 // Extract the high 32 bits from each result using PSHUFD.
33871 // TODO: Could use SRL+TRUNCATE but that doesn't become a PSHUFD.
33872 SDValue Hi = DAG.getBitcast(MVT::v4i32, Res);
33873 Hi = DAG.getVectorShuffle(MVT::v4i32, dl, Hi, Hi, {1, 3, -1, -1});
33874 Hi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, Hi,
33875 DAG.getVectorIdxConstant(0, dl));
33876
33877 // Truncate the low bits of the result. This will become PSHUFD.
33878 Res = DAG.getNode(ISD::TRUNCATE, dl, VT, Res);
33879
33880 SDValue HiCmp;
33881 if (IsSigned) {
33882 // SMULO overflows if the high bits don't match the sign of the low.
33883 HiCmp = DAG.getNode(ISD::SRA, dl, VT, Res, DAG.getConstant(31, dl, VT));
33884 } else {
33885 // UMULO overflows if the high bits are non-zero.
33886 HiCmp = DAG.getConstant(0, dl, VT);
33887 }
33888 SDValue Ovf = DAG.getSetCC(dl, N->getValueType(1), Hi, HiCmp, ISD::SETNE);
33889
33890 // Widen the result by padding with undef.
33891 Res = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4i32, Res,
33892 DAG.getUNDEF(VT));
33893 Results.push_back(Res);
33894 Results.push_back(Ovf);
33895 return;
33896 }
33897 case X86ISD::VPMADDWD: {
33898 // Legalize types for X86ISD::VPMADDWD by widening.
33899 assert(Subtarget.hasSSE2() && "Requires at least SSE2!");
33900
33901 EVT VT = N->getValueType(0);
33902 EVT InVT = N->getOperand(0).getValueType();
33903 assert(VT.getSizeInBits() < 128 && 128 % VT.getSizeInBits() == 0 &&
33904 "Expected a VT that divides into 128 bits.");
33905 assert(getTypeAction(*DAG.getContext(), InVT) == TypeWidenVector &&
33906 "Unexpected type action!");
33907 unsigned NumConcat = 128 / InVT.getSizeInBits();
33908
33909 EVT InWideVT = EVT::getVectorVT(*DAG.getContext(),
33910 InVT.getVectorElementType(),
33911 NumConcat * InVT.getVectorNumElements());
33912 EVT WideVT = EVT::getVectorVT(*DAG.getContext(),
33913 VT.getVectorElementType(),
33914 NumConcat * VT.getVectorNumElements());
33915
33916 SmallVector<SDValue, 16> Ops(NumConcat, DAG.getUNDEF(InVT));
33917 Ops[0] = N->getOperand(0);
33918 SDValue InVec0 = DAG.getNode(ISD::CONCAT_VECTORS, dl, InWideVT, Ops);
33919 Ops[0] = N->getOperand(1);
33920 SDValue InVec1 = DAG.getNode(ISD::CONCAT_VECTORS, dl, InWideVT, Ops);
33921
33922 SDValue Res = DAG.getNode(Opc, dl, WideVT, InVec0, InVec1);
33923 Results.push_back(Res);
33924 return;
33925 }
33926 // We might have generated v2f32 FMIN/FMAX operations. Widen them to v4f32.
33927 case X86ISD::FMINC:
33928 case X86ISD::FMIN:
33929 case X86ISD::FMAXC:
33930 case X86ISD::FMAX:
33931 case X86ISD::STRICT_FMIN:
33932 case X86ISD::STRICT_FMAX: {
33933 EVT VT = N->getValueType(0);
33934 assert(VT == MVT::v2f32 && "Unexpected type (!= v2f32) on FMIN/FMAX.");
33935 bool IsStrict = Opc == X86ISD::STRICT_FMIN || Opc == X86ISD::STRICT_FMAX;
33936 SDValue UNDEF = DAG.getUNDEF(VT);
33937 SDValue LHS = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f32,
33938 N->getOperand(IsStrict ? 1 : 0), UNDEF);
33939 SDValue RHS = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f32,
33940 N->getOperand(IsStrict ? 2 : 1), UNDEF);
33941 SDValue Res;
33942 if (IsStrict)
33943 Res = DAG.getNode(Opc, dl, {MVT::v4f32, MVT::Other},
33944 {N->getOperand(0), LHS, RHS});
33945 else
33946 Res = DAG.getNode(Opc, dl, MVT::v4f32, LHS, RHS);
33947 Results.push_back(Res);
33948 if (IsStrict)
33949 Results.push_back(Res.getValue(1));
33950 return;
33951 }
33952 case ISD::SDIV:
33953 case ISD::UDIV:
33954 case ISD::SREM:
33955 case ISD::UREM: {
33956 EVT VT = N->getValueType(0);
33957 if (VT.isVector()) {
33958 assert(getTypeAction(*DAG.getContext(), VT) == TypeWidenVector &&
33959 "Unexpected type action!");
33960 // If this RHS is a constant splat vector we can widen this and let
33961 // division/remainder by constant optimize it.
33962 // TODO: Can we do something for non-splat?
33963 APInt SplatVal;
33964 if (ISD::isConstantSplatVector(N->getOperand(1).getNode(), SplatVal)) {
33965 unsigned NumConcats = 128 / VT.getSizeInBits();
33966 SmallVector<SDValue, 8> Ops0(NumConcats, DAG.getUNDEF(VT));
33967 Ops0[0] = N->getOperand(0);
33968 EVT ResVT = getTypeToTransformTo(*DAG.getContext(), VT);
33969 SDValue N0 = DAG.getNode(ISD::CONCAT_VECTORS, dl, ResVT, Ops0);
33970 SDValue N1 = DAG.getConstant(SplatVal, dl, ResVT);
33971 SDValue Res = DAG.getNode(Opc, dl, ResVT, N0, N1);
33972 Results.push_back(Res);
33973 }
33974 return;
33975 }
33976
33977 SDValue V = LowerWin64_i128OP(SDValue(N,0), DAG);
33978 Results.push_back(V);
33979 return;
33980 }
33981 case ISD::TRUNCATE: {
33982 MVT VT = N->getSimpleValueType(0);
33983 if (getTypeAction(*DAG.getContext(), VT) != TypeWidenVector)
33984 return;
33985
33986 // The generic legalizer will try to widen the input type to the same
33987 // number of elements as the widened result type. But this isn't always
33988 // the best thing so do some custom legalization to avoid some cases.
33989 MVT WidenVT = getTypeToTransformTo(*DAG.getContext(), VT).getSimpleVT();
33990 SDValue In = N->getOperand(0);
33991 EVT InVT = In.getValueType();
33992 EVT InEltVT = InVT.getVectorElementType();
33993 EVT EltVT = VT.getVectorElementType();
33994 unsigned MinElts = VT.getVectorNumElements();
33995 unsigned WidenNumElts = WidenVT.getVectorNumElements();
33996 unsigned InBits = InVT.getSizeInBits();
33997
33998 // See if there are sufficient leading bits to perform a PACKUS/PACKSS.
33999 unsigned PackOpcode;
34000 if (SDValue Src = matchTruncateWithPACK(PackOpcode, VT, In, dl, DAG,
34001 Subtarget, N->getFlags())) {
34002 if (SDValue Res =
34003 truncateVectorWithPACK(PackOpcode, VT, Src, dl, DAG, Subtarget)) {
34004 Res = widenSubVector(WidenVT, Res, false, Subtarget, DAG, dl);
34005 Results.push_back(Res);
34006 return;
34007 }
34008 }
34009
34010 if ((128 % InBits) == 0 && WidenVT.is128BitVector()) {
34011 // 128 bit and smaller inputs should avoid truncation altogether and
34012 // use a shuffle.
34013 if ((InEltVT.getSizeInBits() % EltVT.getSizeInBits()) == 0) {
34014 int Scale = InEltVT.getSizeInBits() / EltVT.getSizeInBits();
34015 SmallVector<int, 16> TruncMask(WidenNumElts, -1);
34016 for (unsigned I = 0; I < MinElts; ++I)
34017 TruncMask[I] = Scale * I;
34018 SDValue WidenIn = widenSubVector(In, false, Subtarget, DAG, dl, 128);
34019 assert(isTypeLegal(WidenVT) && isTypeLegal(WidenIn.getValueType()) &&
34020 "Illegal vector type in truncation");
34021 WidenIn = DAG.getBitcast(WidenVT, WidenIn);
34022 Results.push_back(
34023 DAG.getVectorShuffle(WidenVT, dl, WidenIn, WidenIn, TruncMask));
34024 return;
34025 }
34026 }
34027
34028 // With AVX512 there are some cases that can use a target specific
34029 // truncate node to go from 256/512 to less than 128 with zeros in the
34030 // upper elements of the 128 bit result.
34031 if (Subtarget.hasAVX512() && isTypeLegal(InVT)) {
34032 // We can use VTRUNC directly if for 256 bits with VLX or for any 512.
34033 if ((InBits == 256 && Subtarget.hasVLX()) || InBits == 512) {
34034 Results.push_back(DAG.getNode(X86ISD::VTRUNC, dl, WidenVT, In));
34035 return;
34036 }
34037 // There's one case we can widen to 512 bits and use VTRUNC.
34038 if (InVT == MVT::v4i64 && VT == MVT::v4i8 && isTypeLegal(MVT::v8i64)) {
34039 In = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v8i64, In,
34040 DAG.getUNDEF(MVT::v4i64));
34041 Results.push_back(DAG.getNode(X86ISD::VTRUNC, dl, WidenVT, In));
34042 return;
34043 }
34044 }
34045 if (Subtarget.hasVLX() && InVT == MVT::v8i64 && VT == MVT::v8i8 &&
34046 getTypeAction(*DAG.getContext(), InVT) == TypeSplitVector &&
34047 isTypeLegal(MVT::v4i64)) {
34048 // Input needs to be split and output needs to be widened. Let's use two
34049 // VTRUNCs, and shuffle their results together into the wider type.
34050 SDValue Lo, Hi;
34051 std::tie(Lo, Hi) = DAG.SplitVector(In, dl);
34052
34053 Lo = DAG.getNode(X86ISD::VTRUNC, dl, MVT::v16i8, Lo);
34054 Hi = DAG.getNode(X86ISD::VTRUNC, dl, MVT::v16i8, Hi);
34055 SDValue Res = DAG.getVectorShuffle(MVT::v16i8, dl, Lo, Hi,
34056 { 0, 1, 2, 3, 16, 17, 18, 19,
34057 -1, -1, -1, -1, -1, -1, -1, -1 });
34058 Results.push_back(Res);
34059 return;
34060 }
34061
34062 // Attempt to widen the truncation input vector to let LowerTRUNCATE handle
34063 // this via type legalization.
34064 if ((InEltVT == MVT::i16 || InEltVT == MVT::i32 || InEltVT == MVT::i64) &&
34065 (EltVT == MVT::i8 || EltVT == MVT::i16 || EltVT == MVT::i32) &&
34066 (!Subtarget.hasSSSE3() ||
34067 (!isTypeLegal(InVT) &&
34068 !(MinElts <= 4 && InEltVT == MVT::i64 && EltVT == MVT::i8)))) {
34069 SDValue WidenIn = widenSubVector(In, false, Subtarget, DAG, dl,
34070 InEltVT.getSizeInBits() * WidenNumElts);
34071 Results.push_back(DAG.getNode(ISD::TRUNCATE, dl, WidenVT, WidenIn));
34072 return;
34073 }
34074
34075 return;
34076 }
34077 case ISD::ANY_EXTEND:
34078 // Right now, only MVT::v8i8 has Custom action for an illegal type.
34079 // It's intended to custom handle the input type.
34080 assert(N->getValueType(0) == MVT::v8i8 &&
34081 "Do not know how to legalize this Node");
34082 return;
34083 case ISD::SIGN_EXTEND:
34084 case ISD::ZERO_EXTEND: {
34085 EVT VT = N->getValueType(0);
34086 SDValue In = N->getOperand(0);
34087 EVT InVT = In.getValueType();
34088 if (!Subtarget.hasSSE41() && VT == MVT::v4i64 &&
34089 (InVT == MVT::v4i16 || InVT == MVT::v4i8)){
34090 assert(getTypeAction(*DAG.getContext(), InVT) == TypeWidenVector &&
34091 "Unexpected type action!");
34092 assert(Opc == ISD::SIGN_EXTEND && "Unexpected opcode");
34093 // Custom split this so we can extend i8/i16->i32 invec. This is better
34094 // since sign_extend_inreg i8/i16->i64 requires an extend to i32 using
34095 // sra. Then extending from i32 to i64 using pcmpgt. By custom splitting
34096 // we allow the sra from the extend to i32 to be shared by the split.
34097 In = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v4i32, In);
34098
34099 // Fill a vector with sign bits for each element.
34100 SDValue Zero = DAG.getConstant(0, dl, MVT::v4i32);
34101 SDValue SignBits = DAG.getSetCC(dl, MVT::v4i32, Zero, In, ISD::SETGT);
34102
34103 // Create an unpackl and unpackh to interleave the sign bits then bitcast
34104 // to v2i64.
34105 SDValue Lo = DAG.getVectorShuffle(MVT::v4i32, dl, In, SignBits,
34106 {0, 4, 1, 5});
34107 Lo = DAG.getNode(ISD::BITCAST, dl, MVT::v2i64, Lo);
34108 SDValue Hi = DAG.getVectorShuffle(MVT::v4i32, dl, In, SignBits,
34109 {2, 6, 3, 7});
34110 Hi = DAG.getNode(ISD::BITCAST, dl, MVT::v2i64, Hi);
34111
34112 SDValue Res = DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, Lo, Hi);
34113 Results.push_back(Res);
34114 return;
34115 }
34116
34117 if (VT == MVT::v16i32 || VT == MVT::v8i64) {
34118 if (!InVT.is128BitVector()) {
34119 // Not a 128 bit vector, but maybe type legalization will promote
34120 // it to 128 bits.
34121 if (getTypeAction(*DAG.getContext(), InVT) != TypePromoteInteger)
34122 return;
34123 InVT = getTypeToTransformTo(*DAG.getContext(), InVT);
34124 if (!InVT.is128BitVector())
34125 return;
34126
34127 // Promote the input to 128 bits. Type legalization will turn this into
34128 // zext_inreg/sext_inreg.
34129 In = DAG.getNode(Opc, dl, InVT, In);
34130 }
34131
34132 // Perform custom splitting instead of the two stage extend we would get
34133 // by default.
34134 EVT LoVT, HiVT;
34135 std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(N->getValueType(0));
34136 assert(isTypeLegal(LoVT) && "Split VT not legal?");
34137
34138 SDValue Lo = getEXTEND_VECTOR_INREG(Opc, dl, LoVT, In, DAG);
34139
34140 // We need to shift the input over by half the number of elements.
34141 unsigned NumElts = InVT.getVectorNumElements();
34142 unsigned HalfNumElts = NumElts / 2;
34143 SmallVector<int, 16> ShufMask(NumElts, SM_SentinelUndef);
34144 for (unsigned i = 0; i != HalfNumElts; ++i)
34145 ShufMask[i] = i + HalfNumElts;
34146
34147 SDValue Hi = DAG.getVectorShuffle(InVT, dl, In, In, ShufMask);
34148 Hi = getEXTEND_VECTOR_INREG(Opc, dl, HiVT, Hi, DAG);
34149
34150 SDValue Res = DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, Lo, Hi);
34151 Results.push_back(Res);
34152 }
34153 return;
34154 }
34155 case ISD::FP_TO_SINT_SAT:
34156 case ISD::FP_TO_UINT_SAT: {
34157 if (!Subtarget.hasAVX10_2())
34158 return;
34159
34160 bool IsSigned = Opc == ISD::FP_TO_SINT_SAT;
34161 EVT VT = N->getValueType(0);
34162 SDValue Op = N->getOperand(0);
34163 EVT OpVT = Op.getValueType();
34164 SDValue Res;
34165
34166 if (VT == MVT::v2i32 && OpVT == MVT::v2f64) {
34167 if (IsSigned)
34168 Res = DAG.getNode(X86ISD::FP_TO_SINT_SAT, dl, MVT::v4i32, Op);
34169 else
34170 Res = DAG.getNode(X86ISD::FP_TO_UINT_SAT, dl, MVT::v4i32, Op);
34171 Results.push_back(Res);
34172 }
34173 return;
34174 }
34175 case ISD::FP_TO_SINT:
34176 case ISD::STRICT_FP_TO_SINT:
34177 case ISD::FP_TO_UINT:
34178 case ISD::STRICT_FP_TO_UINT: {
34179 bool IsStrict = N->isStrictFPOpcode();
34180 bool IsSigned = Opc == ISD::FP_TO_SINT || Opc == ISD::STRICT_FP_TO_SINT;
34181 EVT VT = N->getValueType(0);
34182 SDValue Src = N->getOperand(IsStrict ? 1 : 0);
34183 SDValue Chain = IsStrict ? N->getOperand(0) : SDValue();
34184 EVT SrcVT = Src.getValueType();
34185
34186 SDValue Res;
34187 if (isSoftF16(SrcVT, Subtarget)) {
34188 EVT NVT = VT.isVector() ? VT.changeVectorElementType(MVT::f32) : MVT::f32;
34189 if (IsStrict) {
34190 Res =
34191 DAG.getNode(Opc, dl, {VT, MVT::Other},
34192 {Chain, DAG.getNode(ISD::STRICT_FP_EXTEND, dl,
34193 {NVT, MVT::Other}, {Chain, Src})});
34194 Chain = Res.getValue(1);
34195 } else {
34196 Res =
34197 DAG.getNode(Opc, dl, VT, DAG.getNode(ISD::FP_EXTEND, dl, NVT, Src));
34198 }
34199 Results.push_back(Res);
34200 if (IsStrict)
34201 Results.push_back(Chain);
34202
34203 return;
34204 }
34205
34206 if (VT.isVector() && Subtarget.hasFP16() && Subtarget.hasVLX() &&
34207 SrcVT.getVectorElementType() == MVT::f16) {
34208 EVT EleVT = VT.getVectorElementType();
34209 EVT ResVT = EleVT == MVT::i32 ? MVT::v4i32 : MVT::v8i16;
34210
34211 if (SrcVT != MVT::v8f16) {
34212 SDValue Tmp =
34213 IsStrict ? DAG.getConstantFP(0.0, dl, SrcVT) : DAG.getUNDEF(SrcVT);
34214 SmallVector<SDValue, 4> Ops(SrcVT == MVT::v2f16 ? 4 : 2, Tmp);
34215 Ops[0] = Src;
34216 Src = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v8f16, Ops);
34217 }
34218
34219 if (IsStrict) {
34220 Opc = IsSigned ? X86ISD::STRICT_CVTTP2SI : X86ISD::STRICT_CVTTP2UI;
34221 Res =
34222 DAG.getNode(Opc, dl, {ResVT, MVT::Other}, {N->getOperand(0), Src});
34223 Chain = Res.getValue(1);
34224 } else {
34225 Opc = IsSigned ? X86ISD::CVTTP2SI : X86ISD::CVTTP2UI;
34226 Res = DAG.getNode(Opc, dl, ResVT, Src);
34227 }
34228
34229 // TODO: Need to add exception check code for strict FP.
34230 if (EleVT.getSizeInBits() < 16) {
34231 MVT TmpVT = MVT::getVectorVT(EleVT.getSimpleVT(), 8);
34232 Res = DAG.getNode(ISD::TRUNCATE, dl, TmpVT, Res);
34233
34234 // Now widen to 128 bits.
34235 unsigned NumConcats = 128 / TmpVT.getSizeInBits();
34236 MVT ConcatVT = MVT::getVectorVT(EleVT.getSimpleVT(), 8 * NumConcats);
34237 SmallVector<SDValue, 8> ConcatOps(NumConcats, DAG.getUNDEF(TmpVT));
34238 ConcatOps[0] = Res;
34239 Res = DAG.getNode(ISD::CONCAT_VECTORS, dl, ConcatVT, ConcatOps);
34240 }
34241
34242 Results.push_back(Res);
34243 if (IsStrict)
34244 Results.push_back(Chain);
34245
34246 return;
34247 }
34248
34249 if (VT.isVector() && VT.getScalarSizeInBits() < 32) {
34251 "Unexpected type action!");
34252
34253 // Try to create a 128 bit vector, but don't exceed a 32 bit element.
34254 unsigned NewEltWidth = std::min(128 / VT.getVectorNumElements(), 32U);
34255 MVT PromoteVT = MVT::getVectorVT(MVT::getIntegerVT(NewEltWidth),
34256 VT.getVectorNumElements());
34257 SDValue Res;
34258 SDValue Chain;
34259 if (IsStrict) {
34260 Res = DAG.getNode(ISD::STRICT_FP_TO_SINT, dl, {PromoteVT, MVT::Other},
34261 {N->getOperand(0), Src});
34262 Chain = Res.getValue(1);
34263 } else
34264 Res = DAG.getNode(ISD::FP_TO_SINT, dl, PromoteVT, Src);
34265
34266 // Preserve what we know about the size of the original result. If the
34267 // result is v2i32, we have to manually widen the assert.
34268 if (PromoteVT == MVT::v2i32)
34269 Res = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4i32, Res,
34270 DAG.getUNDEF(MVT::v2i32));
34271
34272 Res = DAG.getNode(!IsSigned ? ISD::AssertZext : ISD::AssertSext, dl,
34273 Res.getValueType(), Res,
34274 DAG.getValueType(VT.getVectorElementType()));
34275
34276 if (PromoteVT == MVT::v2i32)
34277 Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v2i32, Res,
34278 DAG.getVectorIdxConstant(0, dl));
34279
34280 // Truncate back to the original width.
34281 Res = DAG.getNode(ISD::TRUNCATE, dl, VT, Res);
34282
34283 // Now widen to 128 bits.
34284 unsigned NumConcats = 128 / VT.getSizeInBits();
34285 MVT ConcatVT = MVT::getVectorVT(VT.getSimpleVT().getVectorElementType(),
34286 VT.getVectorNumElements() * NumConcats);
34287 SmallVector<SDValue, 8> ConcatOps(NumConcats, DAG.getUNDEF(VT));
34288 ConcatOps[0] = Res;
34289 Res = DAG.getNode(ISD::CONCAT_VECTORS, dl, ConcatVT, ConcatOps);
34290 Results.push_back(Res);
34291 if (IsStrict)
34292 Results.push_back(Chain);
34293 return;
34294 }
34295
34296
34297 if (VT == MVT::v2i32) {
34298 assert((!IsStrict || IsSigned || Subtarget.hasAVX512()) &&
34299 "Strict unsigned conversion requires AVX512");
34300 assert(Subtarget.hasSSE2() && "Requires at least SSE2!");
34302 "Unexpected type action!");
34303 if (Src.getValueType() == MVT::v2f64) {
34304 if (!IsSigned && !Subtarget.hasAVX512()) {
34305 SDValue Res =
34306 expandFP_TO_UINT_SSE(MVT::v4i32, Src, dl, DAG, Subtarget);
34307 Results.push_back(Res);
34308 return;
34309 }
34310
34311 if (IsStrict)
34312 Opc = IsSigned ? X86ISD::STRICT_CVTTP2SI : X86ISD::STRICT_CVTTP2UI;
34313 else
34314 Opc = IsSigned ? X86ISD::CVTTP2SI : X86ISD::CVTTP2UI;
34315
34316 // If we have VLX we can emit a target specific FP_TO_UINT node.
34317 if (!IsSigned && !Subtarget.hasVLX()) {
34318 // Otherwise we can defer to the generic legalizer which will widen
34319 // the input as well. This will be further widened during op
34320 // legalization to v8i32<-v8f64.
34321 // For strict nodes we'll need to widen ourselves.
34322 // FIXME: Fix the type legalizer to safely widen strict nodes?
34323 if (!IsStrict)
34324 return;
34325 Src = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f64, Src,
34326 DAG.getConstantFP(0.0, dl, MVT::v2f64));
34327 Opc = N->getOpcode();
34328 }
34329 SDValue Res;
34330 SDValue Chain;
34331 if (IsStrict) {
34332 Res = DAG.getNode(Opc, dl, {MVT::v4i32, MVT::Other},
34333 {N->getOperand(0), Src});
34334 Chain = Res.getValue(1);
34335 } else {
34336 Res = DAG.getNode(Opc, dl, MVT::v4i32, Src);
34337 }
34338 Results.push_back(Res);
34339 if (IsStrict)
34340 Results.push_back(Chain);
34341 return;
34342 }
34343
34344 // Custom widen strict v2f32->v2i32 by padding with zeros.
34345 // FIXME: Should generic type legalizer do this?
34346 if (Src.getValueType() == MVT::v2f32 && IsStrict) {
34347 Src = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f32, Src,
34348 DAG.getConstantFP(0.0, dl, MVT::v2f32));
34349 SDValue Res = DAG.getNode(Opc, dl, {MVT::v4i32, MVT::Other},
34350 {N->getOperand(0), Src});
34351 Results.push_back(Res);
34352 Results.push_back(Res.getValue(1));
34353 return;
34354 }
34355
34356 // The FP_TO_INTHelper below only handles f32/f64/f80 scalar inputs,
34357 // so early out here.
34358 return;
34359 }
34360
34361 assert(!VT.isVector() && "Vectors should have been handled above!");
34362
34363 if ((Subtarget.hasDQI() && VT == MVT::i64 &&
34364 (SrcVT == MVT::f32 || SrcVT == MVT::f64)) ||
34365 (Subtarget.hasFP16() && SrcVT == MVT::f16)) {
34366 assert(!Subtarget.is64Bit() && "i64 should be legal");
34367 unsigned NumElts = Subtarget.hasVLX() ? 2 : 8;
34368 // If we use a 128-bit result we might need to use a target specific node.
34369 unsigned SrcElts =
34370 std::max(NumElts, 128U / (unsigned)SrcVT.getSizeInBits());
34371 MVT VecVT = MVT::getVectorVT(MVT::i64, NumElts);
34372 MVT VecInVT = MVT::getVectorVT(SrcVT.getSimpleVT(), SrcElts);
34373 if (NumElts != SrcElts) {
34374 if (IsStrict)
34375 Opc = IsSigned ? X86ISD::STRICT_CVTTP2SI : X86ISD::STRICT_CVTTP2UI;
34376 else
34377 Opc = IsSigned ? X86ISD::CVTTP2SI : X86ISD::CVTTP2UI;
34378 }
34379
34380 SDValue ZeroIdx = DAG.getVectorIdxConstant(0, dl);
34381 SDValue Res = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VecInVT,
34382 DAG.getConstantFP(0.0, dl, VecInVT), Src,
34383 ZeroIdx);
34384 SDValue Chain;
34385 if (IsStrict) {
34386 SDVTList Tys = DAG.getVTList(VecVT, MVT::Other);
34387 Res = DAG.getNode(Opc, SDLoc(N), Tys, N->getOperand(0), Res);
34388 Chain = Res.getValue(1);
34389 } else
34390 Res = DAG.getNode(Opc, SDLoc(N), VecVT, Res);
34391 Res = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, Res, ZeroIdx);
34392 Results.push_back(Res);
34393 if (IsStrict)
34394 Results.push_back(Chain);
34395 return;
34396 }
34397
34398 if (VT == MVT::i128 && Subtarget.isTargetWin64()) {
34399 SDValue Chain;
34400 SDValue V = LowerWin64_FP_TO_INT128(SDValue(N, 0), DAG, Chain);
34401 Results.push_back(V);
34402 if (IsStrict)
34403 Results.push_back(Chain);
34404 return;
34405 }
34406
34407 if (SDValue V = FP_TO_INTHelper(SDValue(N, 0), DAG, IsSigned, Chain)) {
34408 Results.push_back(V);
34409 if (IsStrict)
34410 Results.push_back(Chain);
34411 }
34412 return;
34413 }
34414 case ISD::LRINT:
34415 if (N->getValueType(0) == MVT::v2i32) {
34416 SDValue Src = N->getOperand(0);
34417 if (Subtarget.hasFP16() && Src.getValueType() == MVT::v2f16) {
34418 Src = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f16, Src,
34419 DAG.getUNDEF(MVT::v2f16));
34420 Src = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v8f16, Src,
34421 DAG.getUNDEF(MVT::v4f16));
34422 } else if (Src.getValueType() != MVT::v2f64) {
34423 return;
34424 }
34425 Results.push_back(DAG.getNode(X86ISD::CVTP2SI, dl, MVT::v4i32, Src));
34426 return;
34427 }
34428 [[fallthrough]];
34429 case ISD::LLRINT: {
34430 if (SDValue V = LRINT_LLRINTHelper(N, DAG))
34431 Results.push_back(V);
34432 return;
34433 }
34434
34435 case ISD::SINT_TO_FP:
34436 case ISD::STRICT_SINT_TO_FP:
34437 case ISD::UINT_TO_FP:
34438 case ISD::STRICT_UINT_TO_FP: {
34439 bool IsStrict = N->isStrictFPOpcode();
34440 bool IsSigned = Opc == ISD::SINT_TO_FP || Opc == ISD::STRICT_SINT_TO_FP;
34441 EVT VT = N->getValueType(0);
34442 SDValue Src = N->getOperand(IsStrict ? 1 : 0);
34443 if (VT.getVectorElementType() == MVT::f16 && Subtarget.hasFP16() &&
34444 Subtarget.hasVLX()) {
34445 if (Src.getValueType().getVectorElementType() == MVT::i16)
34446 return;
34447
34448 if (VT == MVT::v2f16 && Src.getValueType() == MVT::v2i32)
34449 Src = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4i32, Src,
34450 IsStrict ? DAG.getConstant(0, dl, MVT::v2i32)
34451 : DAG.getUNDEF(MVT::v2i32));
34452 if (IsStrict) {
34453 unsigned Opc =
34454 IsSigned ? X86ISD::STRICT_CVTSI2P : X86ISD::STRICT_CVTUI2P;
34455 SDValue Res = DAG.getNode(Opc, dl, {MVT::v8f16, MVT::Other},
34456 {N->getOperand(0), Src});
34457 Results.push_back(Res);
34458 Results.push_back(Res.getValue(1));
34459 } else {
34460 unsigned Opc = IsSigned ? X86ISD::CVTSI2P : X86ISD::CVTUI2P;
34461 Results.push_back(DAG.getNode(Opc, dl, MVT::v8f16, Src));
34462 }
34463 return;
34464 }
34465 if (VT != MVT::v2f32)
34466 return;
34467 EVT SrcVT = Src.getValueType();
34468 if (Subtarget.hasDQI() && Subtarget.hasVLX() && SrcVT == MVT::v2i64) {
34469 if (IsStrict) {
34470 unsigned Opc = IsSigned ? X86ISD::STRICT_CVTSI2P
34471 : X86ISD::STRICT_CVTUI2P;
34472 SDValue Res = DAG.getNode(Opc, dl, {MVT::v4f32, MVT::Other},
34473 {N->getOperand(0), Src});
34474 Results.push_back(Res);
34475 Results.push_back(Res.getValue(1));
34476 } else {
34477 unsigned Opc = IsSigned ? X86ISD::CVTSI2P : X86ISD::CVTUI2P;
34478 Results.push_back(DAG.getNode(Opc, dl, MVT::v4f32, Src));
34479 }
34480 return;
34481 }
34482 if (SrcVT == MVT::v2i64 && !IsSigned && Subtarget.is64Bit() &&
34483 Subtarget.hasSSE41() && !Subtarget.hasAVX512()) {
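// Unsigned v2i64 -> v2f32 without AVX512: lanes whose sign bit is set
// cannot be fed to the signed converter directly, so those lanes are
// converted as ((Src >> 1) | (Src & 1)) and the result is doubled with an
// FADD; the final vector select keeps the plain signed conversion for
// lanes that already fit in a signed i64.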
34484 SDValue Zero = DAG.getConstant(0, dl, SrcVT);
34485 SDValue One = DAG.getConstant(1, dl, SrcVT);
34486 SDValue Sign = DAG.getNode(ISD::OR, dl, SrcVT,
34487 DAG.getNode(ISD::SRL, dl, SrcVT, Src, One),
34488 DAG.getNode(ISD::AND, dl, SrcVT, Src, One));
34489 SDValue IsNeg = DAG.getSetCC(dl, MVT::v2i64, Src, Zero, ISD::SETLT);
34490 SDValue SignSrc = DAG.getSelect(dl, SrcVT, IsNeg, Sign, Src);
34491 SmallVector<SDValue, 4> SignCvts(4, DAG.getConstantFP(0.0, dl, MVT::f32));
34492 for (int i = 0; i != 2; ++i) {
34493 SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i64,
34494 SignSrc, DAG.getVectorIdxConstant(i, dl));
34495 if (IsStrict)
34496 SignCvts[i] =
34497 DAG.getNode(ISD::STRICT_SINT_TO_FP, dl, {MVT::f32, MVT::Other},
34498 {N->getOperand(0), Elt});
34499 else
34500 SignCvts[i] = DAG.getNode(ISD::SINT_TO_FP, dl, MVT::f32, Elt);
34501 };
34502 SDValue SignCvt = DAG.getBuildVector(MVT::v4f32, dl, SignCvts);
34503 SDValue Slow, Chain;
34504 if (IsStrict) {
34505 Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
34506 SignCvts[0].getValue(1), SignCvts[1].getValue(1));
34507 Slow = DAG.getNode(ISD::STRICT_FADD, dl, {MVT::v4f32, MVT::Other},
34508 {Chain, SignCvt, SignCvt});
34509 Chain = Slow.getValue(1);
34510 } else {
34511 Slow = DAG.getNode(ISD::FADD, dl, MVT::v4f32, SignCvt, SignCvt);
34512 }
34513 IsNeg = DAG.getBitcast(MVT::v4i32, IsNeg);
34514 IsNeg =
34515 DAG.getVectorShuffle(MVT::v4i32, dl, IsNeg, IsNeg, {1, 3, -1, -1});
34516 SDValue Cvt = DAG.getSelect(dl, MVT::v4f32, IsNeg, Slow, SignCvt);
34517 Results.push_back(Cvt);
34518 if (IsStrict)
34519 Results.push_back(Chain);
34520 return;
34521 }
34522
34523 if (SrcVT != MVT::v2i32)
34524 return;
34525
34526 if (IsSigned || Subtarget.hasAVX512()) {
34527 if (!IsStrict)
34528 return;
34529
34530 // Custom widen strict v2i32->v2f32 to avoid scalarization.
34531 // FIXME: Should generic type legalizer do this?
34532 Src = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4i32, Src,
34533 DAG.getConstant(0, dl, MVT::v2i32));
34534 SDValue Res = DAG.getNode(Opc, dl, {MVT::v4f32, MVT::Other},
34535 {N->getOperand(0), Src});
34536 Results.push_back(Res);
34537 Results.push_back(Res.getValue(1));
34538 return;
34539 }
34540
34541 assert(Subtarget.hasSSE2() && "Requires at least SSE2!");
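// Classic SSE2 bias trick: 0x4330000000000000 is the bit pattern of 2^52,
// so OR-ing the zero-extended 32-bit value into the low mantissa bits of
// 2^52 and subtracting 2^52 reconstructs the exact unsigned value, which
// is then rounded from v2f64 down to v2f32.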
34542 SDValue ZExtIn = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::v2i64, Src);
34543 SDValue VBias = DAG.getConstantFP(
34544 llvm::bit_cast<double>(0x4330000000000000ULL), dl, MVT::v2f64);
34545 SDValue Or = DAG.getNode(ISD::OR, dl, MVT::v2i64, ZExtIn,
34546 DAG.getBitcast(MVT::v2i64, VBias));
34547 Or = DAG.getBitcast(MVT::v2f64, Or);
34548 if (IsStrict) {
34549 SDValue Sub = DAG.getNode(ISD::STRICT_FSUB, dl, {MVT::v2f64, MVT::Other},
34550 {N->getOperand(0), Or, VBias});
34551 SDValue Res = DAG.getNode(X86ISD::STRICT_VFPROUND, dl,
34552 {MVT::v4f32, MVT::Other},
34553 {Sub.getValue(1), Sub});
34554 Results.push_back(Res);
34555 Results.push_back(Res.getValue(1));
34556 } else {
34557 // TODO: Are there any fast-math-flags to propagate here?
34558 SDValue Sub = DAG.getNode(ISD::FSUB, dl, MVT::v2f64, Or, VBias);
34559 Results.push_back(DAG.getNode(X86ISD::VFPROUND, dl, MVT::v4f32, Sub));
34560 }
34561 return;
34562 }
34563 case ISD::STRICT_FP_ROUND:
34564 case ISD::FP_ROUND: {
34565 bool IsStrict = N->isStrictFPOpcode();
34566 SDValue Chain = IsStrict ? N->getOperand(0) : SDValue();
34567 SDValue Src = N->getOperand(IsStrict ? 1 : 0);
34568 SDValue Rnd = N->getOperand(IsStrict ? 2 : 1);
34569 EVT SrcVT = Src.getValueType();
34570 EVT VT = N->getValueType(0);
34571 SDValue V;
34572 if (VT == MVT::v2f16 && Src.getValueType() == MVT::v2f32) {
34573 SDValue Ext = IsStrict ? DAG.getConstantFP(0.0, dl, MVT::v2f32)
34574 : DAG.getUNDEF(MVT::v2f32);
34575 Src = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f32, Src, Ext);
34576 }
34577 if (!Subtarget.hasFP16() && VT.getVectorElementType() == MVT::f16) {
34578 assert(Subtarget.hasF16C() && "Cannot widen f16 without F16C");
34579 if (SrcVT.getVectorElementType() != MVT::f32)
34580 return;
34581
34582 if (IsStrict)
34583 V = DAG.getNode(X86ISD::STRICT_CVTPS2PH, dl, {MVT::v8i16, MVT::Other},
34584 {Chain, Src, Rnd});
34585 else
34586 V = DAG.getNode(X86ISD::CVTPS2PH, dl, MVT::v8i16, Src, Rnd);
34587
34588 Results.push_back(DAG.getBitcast(MVT::v8f16, V));
34589 if (IsStrict)
34590 Results.push_back(V.getValue(1));
34591 return;
34592 }
34593 if (!isTypeLegal(Src.getValueType()))
34594 return;
34595 EVT NewVT = VT.getVectorElementType() == MVT::f16 ? MVT::v8f16 : MVT::v4f32;
34596 if (IsStrict)
34597 V = DAG.getNode(X86ISD::STRICT_VFPROUND, dl, {NewVT, MVT::Other},
34598 {Chain, Src});
34599 else
34600 V = DAG.getNode(X86ISD::VFPROUND, dl, NewVT, Src);
34601 Results.push_back(V);
34602 if (IsStrict)
34603 Results.push_back(V.getValue(1));
34604 return;
34605 }
34606 case ISD::FP_EXTEND:
34607 case ISD::STRICT_FP_EXTEND: {
34608 // Right now, only MVT::v2f32 has OperationAction for FP_EXTEND.
34609 // No other ValueType for FP_EXTEND should reach this point.
34610 assert(N->getValueType(0) == MVT::v2f32 &&
34611 "Do not know how to legalize this Node");
34612 if (!Subtarget.hasFP16() || !Subtarget.hasVLX())
34613 return;
34614 bool IsStrict = N->isStrictFPOpcode();
34615 SDValue Src = N->getOperand(IsStrict ? 1 : 0);
34616 if (Src.getValueType().getVectorElementType() != MVT::f16)
34617 return;
34618 SDValue Ext = IsStrict ? DAG.getConstantFP(0.0, dl, MVT::v2f16)
34619 : DAG.getUNDEF(MVT::v2f16);
34620 SDValue V = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f16, Src, Ext);
34621 if (IsStrict)
34622 V = DAG.getNode(ISD::STRICT_FP_EXTEND, dl, {MVT::v4f32, MVT::Other},
34623 {N->getOperand(0), V});
34624 else
34625 V = DAG.getNode(ISD::FP_EXTEND, dl, MVT::v4f32, V);
34626 Results.push_back(V);
34627 if (IsStrict)
34628 Results.push_back(V.getValue(1));
34629 return;
34630 }
34631 case ISD::INTRINSIC_W_CHAIN: {
34632 unsigned IntNo = N->getConstantOperandVal(1);
34633 switch (IntNo) {
34634 default : llvm_unreachable("Do not know how to custom type "
34635 "legalize this intrinsic operation!");
34636 case Intrinsic::x86_rdtsc:
34637 return getReadTimeStampCounter(N, dl, X86::RDTSC, DAG, Subtarget,
34638 Results);
34639 case Intrinsic::x86_rdtscp:
34640 return getReadTimeStampCounter(N, dl, X86::RDTSCP, DAG, Subtarget,
34641 Results);
34642 case Intrinsic::x86_rdpmc:
34643 expandIntrinsicWChainHelper(N, dl, DAG, X86::RDPMC, X86::ECX, Subtarget,
34644 Results);
34645 return;
34646 case Intrinsic::x86_rdpru:
34647 expandIntrinsicWChainHelper(N, dl, DAG, X86::RDPRU, X86::ECX, Subtarget,
34648 Results);
34649 return;
34650 case Intrinsic::x86_xgetbv:
34651 expandIntrinsicWChainHelper(N, dl, DAG, X86::XGETBV, X86::ECX, Subtarget,
34652 Results);
34653 return;
34654 }
34655 }
34656 case ISD::READCYCLECOUNTER: {
34657 return getReadTimeStampCounter(N, dl, X86::RDTSC, DAG, Subtarget, Results);
34658 }
34659 case ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS: {
34660 EVT T = N->getValueType(0);
34661 assert((T == MVT::i64 || T == MVT::i128) && "can only expand cmpxchg pair");
34662 bool Regs64bit = T == MVT::i128;
34663 assert((!Regs64bit || Subtarget.canUseCMPXCHG16B()) &&
34664 "64-bit ATOMIC_CMP_SWAP_WITH_SUCCESS requires CMPXCHG16B");
34665 MVT HalfT = Regs64bit ? MVT::i64 : MVT::i32;
34666 SDValue cpInL, cpInH;
34667 std::tie(cpInL, cpInH) =
34668 DAG.SplitScalar(N->getOperand(2), dl, HalfT, HalfT);
34669 cpInL = DAG.getCopyToReg(N->getOperand(0), dl,
34670 Regs64bit ? X86::RAX : X86::EAX, cpInL, SDValue());
34671 cpInH =
34672 DAG.getCopyToReg(cpInL.getValue(0), dl, Regs64bit ? X86::RDX : X86::EDX,
34673 cpInH, cpInL.getValue(1));
34674 SDValue swapInL, swapInH;
34675 std::tie(swapInL, swapInH) =
34676 DAG.SplitScalar(N->getOperand(3), dl, HalfT, HalfT);
34677 swapInH =
34678 DAG.getCopyToReg(cpInH.getValue(0), dl, Regs64bit ? X86::RCX : X86::ECX,
34679 swapInH, cpInH.getValue(1));
34680
34681 // In 64-bit mode we might need the base pointer in RBX, but we can't know
34682 // until later. So we keep the RBX input in a vreg and use a custom
34683 // inserter.
34684 // Since RBX will be a reserved register the register allocator will not
34685 // make sure its value will be properly saved and restored around this
34686 // live-range.
34687 SDValue Result;
34688 SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Glue);
34689 MachineMemOperand *MMO = cast<AtomicSDNode>(N)->getMemOperand();
34690 if (Regs64bit) {
34691 SDValue Ops[] = {swapInH.getValue(0), N->getOperand(1), swapInL,
34692 swapInH.getValue(1)};
34693 Result =
34694 DAG.getMemIntrinsicNode(X86ISD::LCMPXCHG16_DAG, dl, Tys, Ops, T, MMO);
34695 } else {
34696 swapInL = DAG.getCopyToReg(swapInH.getValue(0), dl, X86::EBX, swapInL,
34697 swapInH.getValue(1));
34698 SDValue Ops[] = {swapInL.getValue(0), N->getOperand(1),
34699 swapInL.getValue(1)};
34700 Result =
34701 DAG.getMemIntrinsicNode(X86ISD::LCMPXCHG8_DAG, dl, Tys, Ops, T, MMO);
34702 }
34703
34704 SDValue cpOutL = DAG.getCopyFromReg(Result.getValue(0), dl,
34705 Regs64bit ? X86::RAX : X86::EAX,
34706 HalfT, Result.getValue(1));
34707 SDValue cpOutH = DAG.getCopyFromReg(cpOutL.getValue(1), dl,
34708 Regs64bit ? X86::RDX : X86::EDX,
34709 HalfT, cpOutL.getValue(2));
34710 SDValue OpsF[] = { cpOutL.getValue(0), cpOutH.getValue(0)};
34711
34712 SDValue EFLAGS = DAG.getCopyFromReg(cpOutH.getValue(1), dl, X86::EFLAGS,
34713 MVT::i32, cpOutH.getValue(2));
34714 SDValue Success = getSETCC(X86::COND_E, EFLAGS, dl, DAG);
34715 Success = DAG.getZExtOrTrunc(Success, dl, N->getValueType(1));
34716
34717 Results.push_back(DAG.getNode(ISD::BUILD_PAIR, dl, T, OpsF));
34718 Results.push_back(Success);
34719 Results.push_back(EFLAGS.getValue(1));
34720 return;
34721 }
34722 case ISD::ATOMIC_LOAD: {
34723 assert(
34724 (N->getValueType(0) == MVT::i64 || N->getValueType(0) == MVT::i128) &&
34725 "Unexpected VT!");
34726 bool NoImplicitFloatOps =
34727 DAG.getMachineFunction().getFunction().hasFnAttribute(
34728 Attribute::NoImplicitFloat);
34729 if (!Subtarget.useSoftFloat() && !NoImplicitFloatOps) {
34730 auto *Node = cast<AtomicSDNode>(N);
34731
34732 if (N->getValueType(0) == MVT::i128) {
34733 if (Subtarget.is64Bit() && Subtarget.hasAVX()) {
34734 SDValue Ld = DAG.getLoad(MVT::v2i64, dl, Node->getChain(),
34735 Node->getBasePtr(), Node->getMemOperand());
34736 SDValue ResL = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i64, Ld,
34737 DAG.getVectorIdxConstant(0, dl));
34738 SDValue ResH = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i64, Ld,
34739 DAG.getVectorIdxConstant(1, dl));
34740 Results.push_back(DAG.getNode(ISD::BUILD_PAIR, dl, N->getValueType(0),
34741 {ResL, ResH}));
34742 Results.push_back(Ld.getValue(1));
34743 return;
34744 }
34745 break;
34746 }
34747 if (Subtarget.hasSSE1()) {
34748 // Use a VZEXT_LOAD which will be selected as MOVQ or XORPS+MOVLPS.
34749 // Then extract the lower 64-bits.
34750 MVT LdVT = Subtarget.hasSSE2() ? MVT::v2i64 : MVT::v4f32;
34751 SDVTList Tys = DAG.getVTList(LdVT, MVT::Other);
34752 SDValue Ops[] = { Node->getChain(), Node->getBasePtr() };
34753 SDValue Ld = DAG.getMemIntrinsicNode(X86ISD::VZEXT_LOAD, dl, Tys, Ops,
34754 MVT::i64, Node->getMemOperand());
34755 if (Subtarget.hasSSE2()) {
34756 SDValue Res = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i64, Ld,
34757 DAG.getVectorIdxConstant(0, dl));
34758 Results.push_back(Res);
34759 Results.push_back(Ld.getValue(1));
34760 return;
34761 }
34762 // We use an alternative sequence for SSE1 that extracts as v2f32 and
34763 // then casts to i64. This avoids a 128-bit stack temporary being
34764 // created by type legalization if we were to cast v4f32->v2i64.
34765 SDValue Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v2f32, Ld,
34766 DAG.getVectorIdxConstant(0, dl));
34767 Res = DAG.getBitcast(MVT::i64, Res);
34768 Results.push_back(Res);
34769 Results.push_back(Ld.getValue(1));
34770 return;
34771 }
34772 if (Subtarget.hasX87()) {
34773 // First load this into an 80-bit X87 register. This will put the whole
34774 // integer into the significand.
34775 SDVTList Tys = DAG.getVTList(MVT::f80, MVT::Other);
34776 SDValue Ops[] = { Node->getChain(), Node->getBasePtr() };
34777 SDValue Result = DAG.getMemIntrinsicNode(X86ISD::FILD,
34778 dl, Tys, Ops, MVT::i64,
34779 Node->getMemOperand());
34780 SDValue Chain = Result.getValue(1);
34781
34782 // Now store the X87 register to a stack temporary and convert to i64.
34783 // This store is not atomic and doesn't need to be.
34784 // FIXME: We don't need a stack temporary if the result of the load
34785 // is already being stored. We could just directly store there.
34786 SDValue StackPtr = DAG.CreateStackTemporary(MVT::i64);
34787 int SPFI = cast<FrameIndexSDNode>(StackPtr.getNode())->getIndex();
34788 MachinePointerInfo MPI =
34789 MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SPFI);
34790 SDValue StoreOps[] = { Chain, Result, StackPtr };
34791 Chain = DAG.getMemIntrinsicNode(
34792 X86ISD::FIST, dl, DAG.getVTList(MVT::Other), StoreOps, MVT::i64,
34793 MPI, std::nullopt /*Align*/, MachineMemOperand::MOStore);
34794
34795 // Finally load the value back from the stack temporary and return it.
34796 // This load is not atomic and doesn't need to be.
34797 // This load will be further type legalized.
34798 Result = DAG.getLoad(MVT::i64, dl, Chain, StackPtr, MPI);
34799 Results.push_back(Result);
34800 Results.push_back(Result.getValue(1));
34801 return;
34802 }
34803 }
34804 // TODO: Use MOVLPS when SSE1 is available?
34805 // Delegate to generic TypeLegalization. Situations we can really handle
34806 // should have already been dealt with by AtomicExpandPass.cpp.
34807 break;
34808 }
34809 case ISD::ATOMIC_SWAP:
34810 case ISD::ATOMIC_LOAD_ADD:
34811 case ISD::ATOMIC_LOAD_SUB:
34812 case ISD::ATOMIC_LOAD_AND:
34813 case ISD::ATOMIC_LOAD_OR:
34814 case ISD::ATOMIC_LOAD_XOR:
34815 case ISD::ATOMIC_LOAD_NAND:
34816 case ISD::ATOMIC_LOAD_MIN:
34817 case ISD::ATOMIC_LOAD_MAX:
34818 case ISD::ATOMIC_LOAD_UMIN:
34819 case ISD::ATOMIC_LOAD_UMAX:
34820 // Delegate to generic TypeLegalization. Situations we can really handle
34821 // should have already been dealt with by AtomicExpandPass.cpp.
34822 break;
34823
34824 case ISD::BITCAST: {
34825 assert(Subtarget.hasSSE2() && "Requires at least SSE2!");
34826 EVT DstVT = N->getValueType(0);
34827 EVT SrcVT = N->getOperand(0).getValueType();
34828
34829 // If this is a bitcast from a v64i1 k-register to a i64 on a 32-bit target
34830 // we can split using the k-register rather than memory.
34831 if (SrcVT == MVT::v64i1 && DstVT == MVT::i64 && Subtarget.hasBWI()) {
34832 assert(!Subtarget.is64Bit() && "Expected 32-bit mode");
34833 SDValue Lo, Hi;
34834 std::tie(Lo, Hi) = DAG.SplitVectorOperand(N, 0);
34835 Lo = DAG.getBitcast(MVT::i32, Lo);
34836 Hi = DAG.getBitcast(MVT::i32, Hi);
34837 SDValue Res = DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64, Lo, Hi);
34838 Results.push_back(Res);
34839 return;
34840 }
34841
34842 if (DstVT.isVector() && SrcVT == MVT::x86mmx) {
34843 // FIXME: Use v4f32 for SSE1?
34844 assert(Subtarget.hasSSE2() && "Requires SSE2");
34845 assert(getTypeAction(*DAG.getContext(), DstVT) == TypeWidenVector &&
34846 "Unexpected type action!");
34847 EVT WideVT = getTypeToTransformTo(*DAG.getContext(), DstVT);
34848 SDValue Res = DAG.getNode(X86ISD::MOVQ2DQ, dl, MVT::v2i64,
34849 N->getOperand(0));
34850 Res = DAG.getBitcast(WideVT, Res);
34851 Results.push_back(Res);
34852 return;
34853 }
34854
34855 return;
34856 }
34857 case ISD::MGATHER: {
34858 EVT VT = N->getValueType(0);
34859 if ((VT == MVT::v2f32 || VT == MVT::v2i32) &&
34860 (Subtarget.hasVLX() || !Subtarget.hasAVX512())) {
34861 auto *Gather = cast<MaskedGatherSDNode>(N);
34862 SDValue Index = Gather->getIndex();
34863 if (Index.getValueType() != MVT::v2i64)
34864 return;
34866 "Unexpected type action!");
34867 EVT WideVT = getTypeToTransformTo(*DAG.getContext(), VT);
34868 SDValue Mask = Gather->getMask();
34869 assert(Mask.getValueType() == MVT::v2i1 && "Unexpected mask type");
34870 SDValue PassThru = DAG.getNode(ISD::CONCAT_VECTORS, dl, WideVT,
34871 Gather->getPassThru(),
34872 DAG.getUNDEF(VT));
34873 if (!Subtarget.hasVLX()) {
34874 // We need to widen the mask, but the instruction will only use 2
34875 // of its elements. So we can use undef.
34876 Mask = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4i1, Mask,
34877 DAG.getUNDEF(MVT::v2i1));
34878 Mask = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v4i32, Mask);
34879 }
34880 SDValue Ops[] = { Gather->getChain(), PassThru, Mask,
34881 Gather->getBasePtr(), Index, Gather->getScale() };
34882 SDValue Res = DAG.getMemIntrinsicNode(
34883 X86ISD::MGATHER, dl, DAG.getVTList(WideVT, MVT::Other), Ops,
34884 Gather->getMemoryVT(), Gather->getMemOperand());
34885 Results.push_back(Res);
34886 Results.push_back(Res.getValue(1));
34887 return;
34888 }
34889 return;
34890 }
34891 case ISD::LOAD: {
34892 // Use an f64/i64 load and a scalar_to_vector for v2f32/v2i32 loads. This
34893 // avoids scalarizing in 32-bit mode. In 64-bit mode this avoids an int->fp
34894 // cast since type legalization will try to use an i64 load.
34895 MVT VT = N->getSimpleValueType(0);
34896 assert(VT.isVector() && VT.getSizeInBits() == 64 && "Unexpected VT");
34898 "Unexpected type action!");
34899 if (!ISD::isNON_EXTLoad(N))
34900 return;
34901 auto *Ld = cast<LoadSDNode>(N);
34902 if (Subtarget.hasSSE2()) {
34903 MVT LdVT = Subtarget.is64Bit() && VT.isInteger() ? MVT::i64 : MVT::f64;
34904 SDValue Res = DAG.getLoad(LdVT, dl, Ld->getChain(), Ld->getBasePtr(),
34905 Ld->getPointerInfo(), Ld->getBaseAlign(),
34906 Ld->getMemOperand()->getFlags());
34907 SDValue Chain = Res.getValue(1);
34908 MVT VecVT = MVT::getVectorVT(LdVT, 2);
34909 Res = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VecVT, Res);
34910 EVT WideVT = getTypeToTransformTo(*DAG.getContext(), VT);
34911 Res = DAG.getBitcast(WideVT, Res);
34912 Results.push_back(Res);
34913 Results.push_back(Chain);
34914 return;
34915 }
34916 assert(Subtarget.hasSSE1() && "Expected SSE");
34917 SDVTList Tys = DAG.getVTList(MVT::v4f32, MVT::Other);
34918 SDValue Ops[] = {Ld->getChain(), Ld->getBasePtr()};
34919 SDValue Res = DAG.getMemIntrinsicNode(X86ISD::VZEXT_LOAD, dl, Tys, Ops,
34920 MVT::i64, Ld->getMemOperand());
34921 Results.push_back(Res);
34922 Results.push_back(Res.getValue(1));
34923 return;
34924 }
34925 case ISD::ADDRSPACECAST: {
34926 SDValue V = LowerADDRSPACECAST(SDValue(N,0), DAG);
34927 Results.push_back(V);
34928 return;
34929 }
34930 case ISD::BITREVERSE: {
34931 assert(N->getValueType(0) == MVT::i64 && "Unexpected VT!");
34932 assert((Subtarget.hasXOP() || Subtarget.hasGFNI()) && "Expected XOP/GFNI");
34933 // We can use VPPERM/GF2P8AFFINEQB by copying to a vector register and back.
34934 // We'll need to move the scalar in two i32 pieces.
34935 Results.push_back(LowerBITREVERSE(SDValue(N, 0), Subtarget, DAG));
34936 return;
34937 }
34938 case ISD::EXTRACT_VECTOR_ELT: {
34939 // f16 = extract vXf16 %vec, i64 %idx
34940 assert(N->getSimpleValueType(0) == MVT::f16 &&
34941 "Unexpected Value type of EXTRACT_VECTOR_ELT!");
34942 assert(Subtarget.hasFP16() && "Expected FP16");
34943 SDValue VecOp = N->getOperand(0);
34944 EVT ExtVT = VecOp.getValueType().changeVectorElementTypeToInteger();
34945 SDValue Split = DAG.getBitcast(ExtVT, N->getOperand(0));
34946 Split = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i16, Split,
34947 N->getOperand(1));
34948 Split = DAG.getBitcast(MVT::f16, Split);
34949 Results.push_back(Split);
34950 return;
34951 }
34952 }
34953}
34954
34955const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const {
34956 switch ((X86ISD::NodeType)Opcode) {
34957 case X86ISD::FIRST_NUMBER: break;
34958#define NODE_NAME_CASE(NODE) case X86ISD::NODE: return "X86ISD::" #NODE;
34959 NODE_NAME_CASE(BSF)
34960 NODE_NAME_CASE(BSR)
34961 NODE_NAME_CASE(FSHL)
34962 NODE_NAME_CASE(FSHR)
34963 NODE_NAME_CASE(FAND)
34964 NODE_NAME_CASE(FANDN)
34965 NODE_NAME_CASE(FOR)
34966 NODE_NAME_CASE(FXOR)
34967 NODE_NAME_CASE(FILD)
34968 NODE_NAME_CASE(FIST)
34969 NODE_NAME_CASE(FP_TO_INT_IN_MEM)
34970 NODE_NAME_CASE(FLD)
34971 NODE_NAME_CASE(FST)
34972 NODE_NAME_CASE(CALL)
34973 NODE_NAME_CASE(CALL_RVMARKER)
34974 NODE_NAME_CASE(IMP_CALL)
34975 NODE_NAME_CASE(BT)
34976 NODE_NAME_CASE(CMP)
34977 NODE_NAME_CASE(FCMP)
34978 NODE_NAME_CASE(STRICT_FCMP)
34979 NODE_NAME_CASE(STRICT_FCMPS)
34980 NODE_NAME_CASE(COMI)
34981 NODE_NAME_CASE(UCOMI)
34982 NODE_NAME_CASE(COMX)
34983 NODE_NAME_CASE(UCOMX)
34984 NODE_NAME_CASE(CMPM)
34985 NODE_NAME_CASE(CMPMM)
34986 NODE_NAME_CASE(STRICT_CMPM)
34987 NODE_NAME_CASE(CMPMM_SAE)
34988 NODE_NAME_CASE(SETCC)
34989 NODE_NAME_CASE(SETCC_CARRY)
34990 NODE_NAME_CASE(FSETCC)
34991 NODE_NAME_CASE(FSETCCM)
34992 NODE_NAME_CASE(FSETCCM_SAE)
34993 NODE_NAME_CASE(CMOV)
34994 NODE_NAME_CASE(BRCOND)
34995 NODE_NAME_CASE(RET_GLUE)
34996 NODE_NAME_CASE(IRET)
34997 NODE_NAME_CASE(REP_STOS)
34998 NODE_NAME_CASE(REP_MOVS)
34999 NODE_NAME_CASE(GlobalBaseReg)
35000 NODE_NAME_CASE(Wrapper)
35001 NODE_NAME_CASE(WrapperRIP)
35002 NODE_NAME_CASE(MOVQ2DQ)
35003 NODE_NAME_CASE(MOVDQ2Q)
35004 NODE_NAME_CASE(MMX_MOVD2W)
35005 NODE_NAME_CASE(MMX_MOVW2D)
35006 NODE_NAME_CASE(PEXTRB)
35007 NODE_NAME_CASE(PEXTRW)
35008 NODE_NAME_CASE(INSERTPS)
35009 NODE_NAME_CASE(PINSRB)
35010 NODE_NAME_CASE(PINSRW)
35011 NODE_NAME_CASE(PSHUFB)
35012 NODE_NAME_CASE(ANDNP)
35013 NODE_NAME_CASE(BLENDI)
35014 NODE_NAME_CASE(BLENDV)
35015 NODE_NAME_CASE(HADD)
35016 NODE_NAME_CASE(HSUB)
35017 NODE_NAME_CASE(FHADD)
35018 NODE_NAME_CASE(FHSUB)
35019 NODE_NAME_CASE(CONFLICT)
35020 NODE_NAME_CASE(FMAX)
35021 NODE_NAME_CASE(FMAXS)
35022 NODE_NAME_CASE(FMAX_SAE)
35023 NODE_NAME_CASE(FMAXS_SAE)
35024 NODE_NAME_CASE(STRICT_FMAX)
35025 NODE_NAME_CASE(FMIN)
35026 NODE_NAME_CASE(FMINS)
35027 NODE_NAME_CASE(FMIN_SAE)
35028 NODE_NAME_CASE(FMINS_SAE)
35029 NODE_NAME_CASE(STRICT_FMIN)
35030 NODE_NAME_CASE(FMAXC)
35031 NODE_NAME_CASE(FMINC)
35032 NODE_NAME_CASE(FRSQRT)
35033 NODE_NAME_CASE(FRCP)
35034 NODE_NAME_CASE(EXTRQI)
35035 NODE_NAME_CASE(INSERTQI)
35036 NODE_NAME_CASE(TLSADDR)
35037 NODE_NAME_CASE(TLSBASEADDR)
35038 NODE_NAME_CASE(TLSCALL)
35039 NODE_NAME_CASE(TLSDESC)
35040 NODE_NAME_CASE(EH_SJLJ_SETJMP)
35041 NODE_NAME_CASE(EH_SJLJ_LONGJMP)
35042 NODE_NAME_CASE(EH_SJLJ_SETUP_DISPATCH)
35043 NODE_NAME_CASE(EH_RETURN)
35044 NODE_NAME_CASE(TC_RETURN)
35045 NODE_NAME_CASE(FNSTCW16m)
35046 NODE_NAME_CASE(FLDCW16m)
35047 NODE_NAME_CASE(FNSTENVm)
35048 NODE_NAME_CASE(FLDENVm)
35049 NODE_NAME_CASE(LCMPXCHG_DAG)
35050 NODE_NAME_CASE(LCMPXCHG8_DAG)
35051 NODE_NAME_CASE(LCMPXCHG16_DAG)
35052 NODE_NAME_CASE(LCMPXCHG16_SAVE_RBX_DAG)
35053 NODE_NAME_CASE(LADD)
35054 NODE_NAME_CASE(LSUB)
35055 NODE_NAME_CASE(LOR)
35056 NODE_NAME_CASE(LXOR)
35057 NODE_NAME_CASE(LAND)
35058 NODE_NAME_CASE(LBTS)
35059 NODE_NAME_CASE(LBTC)
35060 NODE_NAME_CASE(LBTR)
35061 NODE_NAME_CASE(LBTS_RM)
35062 NODE_NAME_CASE(LBTC_RM)
35063 NODE_NAME_CASE(LBTR_RM)
35064 NODE_NAME_CASE(AADD)
35065 NODE_NAME_CASE(AOR)
35066 NODE_NAME_CASE(AXOR)
35067 NODE_NAME_CASE(AAND)
35068 NODE_NAME_CASE(VZEXT_MOVL)
35069 NODE_NAME_CASE(VZEXT_LOAD)
35070 NODE_NAME_CASE(VEXTRACT_STORE)
35071 NODE_NAME_CASE(VTRUNC)
35072 NODE_NAME_CASE(VTRUNCS)
35073 NODE_NAME_CASE(VTRUNCUS)
35074 NODE_NAME_CASE(VMTRUNC)
35075 NODE_NAME_CASE(VMTRUNCS)
35076 NODE_NAME_CASE(VMTRUNCUS)
35077 NODE_NAME_CASE(VTRUNCSTORES)
35078 NODE_NAME_CASE(VTRUNCSTOREUS)
35079 NODE_NAME_CASE(VMTRUNCSTORES)
35080 NODE_NAME_CASE(VMTRUNCSTOREUS)
35081 NODE_NAME_CASE(VFPEXT)
35082 NODE_NAME_CASE(STRICT_VFPEXT)
35083 NODE_NAME_CASE(VFPEXT_SAE)
35084 NODE_NAME_CASE(VFPEXTS)
35085 NODE_NAME_CASE(VFPEXTS_SAE)
35086 NODE_NAME_CASE(VFPROUND)
35087 NODE_NAME_CASE(VFPROUND2)
35088 NODE_NAME_CASE(VFPROUND2_RND)
35089 NODE_NAME_CASE(STRICT_VFPROUND)
35090 NODE_NAME_CASE(VMFPROUND)
35091 NODE_NAME_CASE(VFPROUND_RND)
35092 NODE_NAME_CASE(VFPROUNDS)
35093 NODE_NAME_CASE(VFPROUNDS_RND)
35094 NODE_NAME_CASE(VSHLDQ)
35095 NODE_NAME_CASE(VSRLDQ)
35096 NODE_NAME_CASE(VSHL)
35097 NODE_NAME_CASE(VSRL)
35098 NODE_NAME_CASE(VSRA)
35099 NODE_NAME_CASE(VSHLI)
35100 NODE_NAME_CASE(VSRLI)
35101 NODE_NAME_CASE(VSRAI)
35102 NODE_NAME_CASE(VSHLV)
35103 NODE_NAME_CASE(VSRLV)
35104 NODE_NAME_CASE(VSRAV)
35105 NODE_NAME_CASE(VROTLI)
35106 NODE_NAME_CASE(VROTRI)
35107 NODE_NAME_CASE(VPPERM)
35108 NODE_NAME_CASE(CMPP)
35109 NODE_NAME_CASE(STRICT_CMPP)
35110 NODE_NAME_CASE(PCMPEQ)
35111 NODE_NAME_CASE(PCMPGT)
35112 NODE_NAME_CASE(PHMINPOS)
35113 NODE_NAME_CASE(ADD)
35114 NODE_NAME_CASE(SUB)
35115 NODE_NAME_CASE(ADC)
35116 NODE_NAME_CASE(SBB)
35117 NODE_NAME_CASE(SMUL)
35118 NODE_NAME_CASE(UMUL)
35119 NODE_NAME_CASE(OR)
35120 NODE_NAME_CASE(XOR)
35121 NODE_NAME_CASE(AND)
35122 NODE_NAME_CASE(BEXTR)
35123 NODE_NAME_CASE(BEXTRI)
35124 NODE_NAME_CASE(BZHI)
35125 NODE_NAME_CASE(PDEP)
35126 NODE_NAME_CASE(PEXT)
35127 NODE_NAME_CASE(MUL_IMM)
35128 NODE_NAME_CASE(MOVMSK)
35129 NODE_NAME_CASE(PTEST)
35130 NODE_NAME_CASE(TESTP)
35131 NODE_NAME_CASE(KORTEST)
35132 NODE_NAME_CASE(KTEST)
35133 NODE_NAME_CASE(KADD)
35134 NODE_NAME_CASE(KSHIFTL)
35135 NODE_NAME_CASE(KSHIFTR)
35136 NODE_NAME_CASE(PACKSS)
35137 NODE_NAME_CASE(PACKUS)
35138 NODE_NAME_CASE(PALIGNR)
35139 NODE_NAME_CASE(VALIGN)
35140 NODE_NAME_CASE(VSHLD)
35141 NODE_NAME_CASE(VSHRD)
35142 NODE_NAME_CASE(PSHUFD)
35143 NODE_NAME_CASE(PSHUFHW)
35144 NODE_NAME_CASE(PSHUFLW)
35145 NODE_NAME_CASE(SHUFP)
35146 NODE_NAME_CASE(SHUF128)
35147 NODE_NAME_CASE(MOVLHPS)
35148 NODE_NAME_CASE(MOVHLPS)
35149 NODE_NAME_CASE(MOVDDUP)
35150 NODE_NAME_CASE(MOVSHDUP)
35151 NODE_NAME_CASE(MOVSLDUP)
35152 NODE_NAME_CASE(MOVSD)
35153 NODE_NAME_CASE(MOVSS)
35154 NODE_NAME_CASE(MOVSH)
35155 NODE_NAME_CASE(UNPCKL)
35156 NODE_NAME_CASE(UNPCKH)
35157 NODE_NAME_CASE(VBROADCAST)
35158 NODE_NAME_CASE(VBROADCAST_LOAD)
35159 NODE_NAME_CASE(VBROADCASTM)
35160 NODE_NAME_CASE(SUBV_BROADCAST_LOAD)
35161 NODE_NAME_CASE(VPERMILPV)
35162 NODE_NAME_CASE(VPERMILPI)
35163 NODE_NAME_CASE(VPERM2X128)
35164 NODE_NAME_CASE(VPERMV)
35165 NODE_NAME_CASE(VPERMV3)
35166 NODE_NAME_CASE(VPERMI)
35167 NODE_NAME_CASE(VPTERNLOG)
35168 NODE_NAME_CASE(FP_TO_SINT_SAT)
35169 NODE_NAME_CASE(FP_TO_UINT_SAT)
35170 NODE_NAME_CASE(VFIXUPIMM)
35171 NODE_NAME_CASE(VFIXUPIMM_SAE)
35172 NODE_NAME_CASE(VFIXUPIMMS)
35173 NODE_NAME_CASE(VFIXUPIMMS_SAE)
35174 NODE_NAME_CASE(VRANGE)
35175 NODE_NAME_CASE(VRANGE_SAE)
35176 NODE_NAME_CASE(VRANGES)
35177 NODE_NAME_CASE(VRANGES_SAE)
35178 NODE_NAME_CASE(PMULUDQ)
35179 NODE_NAME_CASE(PMULDQ)
35180 NODE_NAME_CASE(PSADBW)
35181 NODE_NAME_CASE(DBPSADBW)
35182 NODE_NAME_CASE(VASTART_SAVE_XMM_REGS)
35183 NODE_NAME_CASE(VAARG_64)
35184 NODE_NAME_CASE(VAARG_X32)
35185 NODE_NAME_CASE(DYN_ALLOCA)
35186 NODE_NAME_CASE(MFENCE)
35187 NODE_NAME_CASE(SEG_ALLOCA)
35188 NODE_NAME_CASE(PROBED_ALLOCA)
35189 NODE_NAME_CASE(RDRAND)
35190 NODE_NAME_CASE(RDSEED)
35191 NODE_NAME_CASE(RDPKRU)
35192 NODE_NAME_CASE(WRPKRU)
35193 NODE_NAME_CASE(VPMADDUBSW)
35194 NODE_NAME_CASE(VPMADDWD)
35195 NODE_NAME_CASE(VPSHA)
35196 NODE_NAME_CASE(VPSHL)
35197 NODE_NAME_CASE(VPCOM)
35198 NODE_NAME_CASE(VPCOMU)
35199 NODE_NAME_CASE(VPERMIL2)
35200 NODE_NAME_CASE(FMSUB)
35201 NODE_NAME_CASE(STRICT_FMSUB)
35202 NODE_NAME_CASE(FNMADD)
35203 NODE_NAME_CASE(STRICT_FNMADD)
35204 NODE_NAME_CASE(FNMSUB)
35205 NODE_NAME_CASE(STRICT_FNMSUB)
35206 NODE_NAME_CASE(FMADDSUB)
35207 NODE_NAME_CASE(FMSUBADD)
35208 NODE_NAME_CASE(FMADD_RND)
35209 NODE_NAME_CASE(FNMADD_RND)
35210 NODE_NAME_CASE(FMSUB_RND)
35211 NODE_NAME_CASE(FNMSUB_RND)
35212 NODE_NAME_CASE(FMADDSUB_RND)
35213 NODE_NAME_CASE(FMSUBADD_RND)
35214 NODE_NAME_CASE(VFMADDC)
35215 NODE_NAME_CASE(VFMADDC_RND)
35216 NODE_NAME_CASE(VFCMADDC)
35217 NODE_NAME_CASE(VFCMADDC_RND)
35218 NODE_NAME_CASE(VFMULC)
35219 NODE_NAME_CASE(VFMULC_RND)
35220 NODE_NAME_CASE(VFCMULC)
35221 NODE_NAME_CASE(VFCMULC_RND)
35222 NODE_NAME_CASE(VFMULCSH)
35223 NODE_NAME_CASE(VFMULCSH_RND)
35224 NODE_NAME_CASE(VFCMULCSH)
35225 NODE_NAME_CASE(VFCMULCSH_RND)
35226 NODE_NAME_CASE(VFMADDCSH)
35227 NODE_NAME_CASE(VFMADDCSH_RND)
35228 NODE_NAME_CASE(VFCMADDCSH)
35229 NODE_NAME_CASE(VFCMADDCSH_RND)
35230 NODE_NAME_CASE(VPMADD52H)
35231 NODE_NAME_CASE(VPMADD52L)
35232 NODE_NAME_CASE(VRNDSCALE)
35233 NODE_NAME_CASE(STRICT_VRNDSCALE)
35234 NODE_NAME_CASE(VRNDSCALE_SAE)
35235 NODE_NAME_CASE(VRNDSCALES)
35236 NODE_NAME_CASE(VRNDSCALES_SAE)
35237 NODE_NAME_CASE(VREDUCE)
35238 NODE_NAME_CASE(VREDUCE_SAE)
35239 NODE_NAME_CASE(VREDUCES)
35240 NODE_NAME_CASE(VREDUCES_SAE)
35241 NODE_NAME_CASE(VGETMANT)
35242 NODE_NAME_CASE(VGETMANT_SAE)
35243 NODE_NAME_CASE(VGETMANTS)
35244 NODE_NAME_CASE(VGETMANTS_SAE)
35245 NODE_NAME_CASE(PCMPESTR)
35246 NODE_NAME_CASE(PCMPISTR)
35247 NODE_NAME_CASE(XTEST)
35248 NODE_NAME_CASE(COMPRESS)
35249 NODE_NAME_CASE(EXPAND)
35250 NODE_NAME_CASE(SELECTS)
35251 NODE_NAME_CASE(ADDSUB)
35252 NODE_NAME_CASE(RCP14)
35253 NODE_NAME_CASE(RCP14S)
35254 NODE_NAME_CASE(RSQRT14)
35255 NODE_NAME_CASE(RSQRT14S)
35256 NODE_NAME_CASE(FADD_RND)
35257 NODE_NAME_CASE(FADDS)
35258 NODE_NAME_CASE(FADDS_RND)
35259 NODE_NAME_CASE(FSUB_RND)
35260 NODE_NAME_CASE(FSUBS)
35261 NODE_NAME_CASE(FSUBS_RND)
35262 NODE_NAME_CASE(FMUL_RND)
35263 NODE_NAME_CASE(FMULS)
35264 NODE_NAME_CASE(FMULS_RND)
35265 NODE_NAME_CASE(FDIV_RND)
35266 NODE_NAME_CASE(FDIVS)
35267 NODE_NAME_CASE(FDIVS_RND)
35268 NODE_NAME_CASE(FSQRT_RND)
35269 NODE_NAME_CASE(FSQRTS)
35270 NODE_NAME_CASE(FSQRTS_RND)
35271 NODE_NAME_CASE(FGETEXP)
35272 NODE_NAME_CASE(FGETEXP_SAE)
35273 NODE_NAME_CASE(FGETEXPS)
35274 NODE_NAME_CASE(FGETEXPS_SAE)
35275 NODE_NAME_CASE(SCALEF)
35276 NODE_NAME_CASE(SCALEF_RND)
35277 NODE_NAME_CASE(SCALEFS)
35278 NODE_NAME_CASE(SCALEFS_RND)
35279 NODE_NAME_CASE(MULHRS)
35280 NODE_NAME_CASE(SINT_TO_FP_RND)
35281 NODE_NAME_CASE(UINT_TO_FP_RND)
35282 NODE_NAME_CASE(CVTTP2SI)
35283 NODE_NAME_CASE(CVTTP2UI)
35284 NODE_NAME_CASE(STRICT_CVTTP2SI)
35285 NODE_NAME_CASE(STRICT_CVTTP2UI)
35286 NODE_NAME_CASE(MCVTTP2SI)
35287 NODE_NAME_CASE(MCVTTP2UI)
35288 NODE_NAME_CASE(CVTTP2SI_SAE)
35289 NODE_NAME_CASE(CVTTP2UI_SAE)
35290 NODE_NAME_CASE(CVTTS2SI)
35291 NODE_NAME_CASE(CVTTS2UI)
35292 NODE_NAME_CASE(CVTTS2SI_SAE)
35293 NODE_NAME_CASE(CVTTS2UI_SAE)
35294 NODE_NAME_CASE(CVTSI2P)
35295 NODE_NAME_CASE(CVTUI2P)
35296 NODE_NAME_CASE(STRICT_CVTSI2P)
35297 NODE_NAME_CASE(STRICT_CVTUI2P)
35298 NODE_NAME_CASE(MCVTSI2P)
35299 NODE_NAME_CASE(MCVTUI2P)
35300 NODE_NAME_CASE(VFPCLASS)
35301 NODE_NAME_CASE(VFPCLASSS)
35302 NODE_NAME_CASE(MULTISHIFT)
35303 NODE_NAME_CASE(SCALAR_SINT_TO_FP)
35304 NODE_NAME_CASE(SCALAR_SINT_TO_FP_RND)
35305 NODE_NAME_CASE(SCALAR_UINT_TO_FP)
35306 NODE_NAME_CASE(SCALAR_UINT_TO_FP_RND)
35307 NODE_NAME_CASE(CVTPS2PH)
35308 NODE_NAME_CASE(STRICT_CVTPS2PH)
35309 NODE_NAME_CASE(CVTPS2PH_SAE)
35310 NODE_NAME_CASE(MCVTPS2PH)
35311 NODE_NAME_CASE(MCVTPS2PH_SAE)
35312 NODE_NAME_CASE(CVTPH2PS)
35313 NODE_NAME_CASE(STRICT_CVTPH2PS)
35314 NODE_NAME_CASE(CVTPH2PS_SAE)
35315 NODE_NAME_CASE(CVTP2SI)
35316 NODE_NAME_CASE(CVTP2UI)
35317 NODE_NAME_CASE(MCVTP2SI)
35318 NODE_NAME_CASE(MCVTP2UI)
35319 NODE_NAME_CASE(CVTP2SI_RND)
35320 NODE_NAME_CASE(CVTP2UI_RND)
35321 NODE_NAME_CASE(CVTS2SI)
35322 NODE_NAME_CASE(CVTS2UI)
35323 NODE_NAME_CASE(CVTS2SI_RND)
35324 NODE_NAME_CASE(CVTS2UI_RND)
35325 NODE_NAME_CASE(CVTNEPS2BF16)
35326 NODE_NAME_CASE(MCVTNEPS2BF16)
35327 NODE_NAME_CASE(DPBF16PS)
35328 NODE_NAME_CASE(DPFP16PS)
35329 NODE_NAME_CASE(MPSADBW)
35330 NODE_NAME_CASE(LWPINS)
35331 NODE_NAME_CASE(MGATHER)
35332 NODE_NAME_CASE(MSCATTER)
35333 NODE_NAME_CASE(VPDPBUSD)
35334 NODE_NAME_CASE(VPDPBUSDS)
35335 NODE_NAME_CASE(VPDPWSSD)
35336 NODE_NAME_CASE(VPDPWSSDS)
35337 NODE_NAME_CASE(VPSHUFBITQMB)
35338 NODE_NAME_CASE(GF2P8MULB)
35339 NODE_NAME_CASE(GF2P8AFFINEQB)
35340 NODE_NAME_CASE(GF2P8AFFINEINVQB)
35341 NODE_NAME_CASE(NT_CALL)
35342 NODE_NAME_CASE(NT_BRIND)
35343 NODE_NAME_CASE(UMWAIT)
35344 NODE_NAME_CASE(TPAUSE)
35345 NODE_NAME_CASE(ENQCMD)
35346 NODE_NAME_CASE(ENQCMDS)
35347 NODE_NAME_CASE(VP2INTERSECT)
35348 NODE_NAME_CASE(VPDPBSUD)
35349 NODE_NAME_CASE(VPDPBSUDS)
35350 NODE_NAME_CASE(VPDPBUUD)
35351 NODE_NAME_CASE(VPDPBUUDS)
35352 NODE_NAME_CASE(VPDPBSSD)
35353 NODE_NAME_CASE(VPDPBSSDS)
35354 NODE_NAME_CASE(VPDPWSUD)
35355 NODE_NAME_CASE(VPDPWSUDS)
35356 NODE_NAME_CASE(VPDPWUSD)
35357 NODE_NAME_CASE(VPDPWUSDS)
35358 NODE_NAME_CASE(VPDPWUUD)
35359 NODE_NAME_CASE(VPDPWUUDS)
35360 NODE_NAME_CASE(VMINMAX)
35361 NODE_NAME_CASE(VMINMAX_SAE)
35362 NODE_NAME_CASE(VMINMAXS)
35363 NODE_NAME_CASE(VMINMAXS_SAE)
35364 NODE_NAME_CASE(CVTP2IBS)
35365 NODE_NAME_CASE(CVTP2IUBS)
35366 NODE_NAME_CASE(CVTP2IBS_RND)
35367 NODE_NAME_CASE(CVTP2IUBS_RND)
35368 NODE_NAME_CASE(CVTTP2IBS)
35369 NODE_NAME_CASE(CVTTP2IUBS)
35370 NODE_NAME_CASE(CVTTP2IBS_SAE)
35371 NODE_NAME_CASE(CVTTP2IUBS_SAE)
35372 NODE_NAME_CASE(VCVT2PH2BF8)
35373 NODE_NAME_CASE(VCVT2PH2BF8S)
35374 NODE_NAME_CASE(VCVT2PH2HF8)
35375 NODE_NAME_CASE(VCVT2PH2HF8S)
35376 NODE_NAME_CASE(VCVTBIASPH2BF8)
35377 NODE_NAME_CASE(VCVTBIASPH2BF8S)
35378 NODE_NAME_CASE(VCVTBIASPH2HF8)
35379 NODE_NAME_CASE(VCVTBIASPH2HF8S)
35380 NODE_NAME_CASE(VCVTPH2BF8)
35381 NODE_NAME_CASE(VCVTPH2BF8S)
35382 NODE_NAME_CASE(VCVTPH2HF8)
35383 NODE_NAME_CASE(VCVTPH2HF8S)
35384 NODE_NAME_CASE(VMCVTBIASPH2BF8)
35385 NODE_NAME_CASE(VMCVTBIASPH2BF8S)
35386 NODE_NAME_CASE(VMCVTBIASPH2HF8)
35387 NODE_NAME_CASE(VMCVTBIASPH2HF8S)
35388 NODE_NAME_CASE(VMCVTPH2BF8)
35389 NODE_NAME_CASE(VMCVTPH2BF8S)
35390 NODE_NAME_CASE(VMCVTPH2HF8)
35391 NODE_NAME_CASE(VMCVTPH2HF8S)
35392 NODE_NAME_CASE(VCVTHF82PH)
35393 NODE_NAME_CASE(AESENC128KL)
35394 NODE_NAME_CASE(AESDEC128KL)
35395 NODE_NAME_CASE(AESENC256KL)
35396 NODE_NAME_CASE(AESDEC256KL)
35397 NODE_NAME_CASE(AESENCWIDE128KL)
35398 NODE_NAME_CASE(AESDECWIDE128KL)
35399 NODE_NAME_CASE(AESENCWIDE256KL)
35400 NODE_NAME_CASE(AESDECWIDE256KL)
35401 NODE_NAME_CASE(CMPCCXADD)
35402 NODE_NAME_CASE(TESTUI)
35403 NODE_NAME_CASE(FP80_ADD)
35404 NODE_NAME_CASE(STRICT_FP80_ADD)
35405 NODE_NAME_CASE(CCMP)
35406 NODE_NAME_CASE(CTEST)
35407 NODE_NAME_CASE(CLOAD)
35408 NODE_NAME_CASE(CSTORE)
35409 NODE_NAME_CASE(CVTTS2SIS)
35410 NODE_NAME_CASE(CVTTS2UIS)
35411 NODE_NAME_CASE(CVTTS2SIS_SAE)
35412 NODE_NAME_CASE(CVTTS2UIS_SAE)
35413 NODE_NAME_CASE(CVTTP2SIS)
35414 NODE_NAME_CASE(MCVTTP2SIS)
35415 NODE_NAME_CASE(CVTTP2UIS_SAE)
35416 NODE_NAME_CASE(CVTTP2SIS_SAE)
35417 NODE_NAME_CASE(CVTTP2UIS)
35418 NODE_NAME_CASE(MCVTTP2UIS)
35419 NODE_NAME_CASE(POP_FROM_X87_REG)
35420 }
35421 return nullptr;
35422#undef NODE_NAME_CASE
35423}
35424
35425/// Return true if the addressing mode represented by AM is legal for this
35426/// target, for a load/store of the specified type.
35427bool X86TargetLowering::isLegalAddressingMode(const DataLayout &DL,
35428 const AddrMode &AM, Type *Ty,
35429 unsigned AS,
35430 Instruction *I) const {
35431 // X86 supports extremely general addressing modes.
35432 CodeModel::Model M = getTargetMachine().getCodeModel();
35433
35434 // X86 allows a sign-extended 32-bit immediate field as a displacement.
35435 if (!X86::isOffsetSuitableForCodeModel(AM.BaseOffs, M, AM.BaseGV != nullptr))
35436 return false;
35437
35438 if (AM.BaseGV) {
35439 unsigned GVFlags = Subtarget.classifyGlobalReference(AM.BaseGV);
35440
35441 // If a reference to this global requires an extra load, we can't fold it.
35442 if (isGlobalStubReference(GVFlags))
35443 return false;
35444
35445 // If BaseGV requires a register for the PIC base, we cannot also have a
35446 // BaseReg specified.
35447 if (AM.HasBaseReg && isGlobalRelativeToPICBase(GVFlags))
35448 return false;
35449
35450 // If lower 4G is not available, then we must use rip-relative addressing.
35451 if ((M != CodeModel::Small || isPositionIndependent()) &&
35452 Subtarget.is64Bit() && (AM.BaseOffs || AM.Scale > 1))
35453 return false;
35454 }
35455
35456 switch (AM.Scale) {
35457 case 0:
35458 case 1:
35459 case 2:
35460 case 4:
35461 case 8:
35462 // These scales always work.
35463 break;
35464 case 3:
35465 case 5:
35466 case 9:
35467 // These scales are formed with basereg+scalereg. Only accept if there is
35468 // no basereg yet.
35469 if (AM.HasBaseReg)
35470 return false;
35471 break;
35472 default: // Other stuff never works.
35473 return false;
35474 }
35475
35476 return true;
35477}
35478
35479bool X86TargetLowering::isBinOp(unsigned Opcode) const {
35480 switch (Opcode) {
35481 // These are non-commutative binops.
35482 // TODO: Add more X86ISD opcodes once we have test coverage.
35483 case X86ISD::ANDNP:
35484 case X86ISD::PCMPGT:
35485 case X86ISD::FMAX:
35486 case X86ISD::FMIN:
35487 case X86ISD::FANDN:
35488 case X86ISD::VPSHA:
35489 case X86ISD::VPSHL:
35490 case X86ISD::VSHLV:
35491 case X86ISD::VSRLV:
35492 case X86ISD::VSRAV:
35493 return true;
35494 }
35495
35496 return TargetLoweringBase::isBinOp(Opcode);
35497}
35498
35499bool X86TargetLowering::isCommutativeBinOp(unsigned Opcode) const {
35500 switch (Opcode) {
35501 // TODO: Add more X86ISD opcodes once we have test coverage.
35502 case X86ISD::PCMPEQ:
35503 case X86ISD::PMULDQ:
35504 case X86ISD::PMULUDQ:
35505 case X86ISD::FMAXC:
35506 case X86ISD::FMINC:
35507 case X86ISD::FAND:
35508 case X86ISD::FOR:
35509 case X86ISD::FXOR:
35510 return true;
35511 }
35512
35513 return TargetLoweringBase::isCommutativeBinOp(Opcode);
35514}
35515
35516bool X86TargetLowering::isTruncateFree(Type *Ty1, Type *Ty2) const {
35517 if (!Ty1->isIntegerTy() || !Ty2->isIntegerTy())
35518 return false;
35519 unsigned NumBits1 = Ty1->getPrimitiveSizeInBits();
35520 unsigned NumBits2 = Ty2->getPrimitiveSizeInBits();
35521 return NumBits1 > NumBits2;
35522}
35523
35524bool X86TargetLowering::allowTruncateForTailCall(Type *Ty1, Type *Ty2) const {
35525 if (!Ty1->isIntegerTy() || !Ty2->isIntegerTy())
35526 return false;
35527
35528 if (!isTypeLegal(EVT::getEVT(Ty1)))
35529 return false;
35530
35531 assert(Ty1->getPrimitiveSizeInBits() <= 64 && "i128 is probably not a noop");
35532
35533 // Assuming the caller doesn't have a zeroext or signext return parameter,
35534 // truncation all the way down to i1 is valid.
35535 return true;
35536}
35537
35538bool X86TargetLowering::isLegalICmpImmediate(int64_t Imm) const {
35539 return isInt<32>(Imm);
35540}
35541
35542bool X86TargetLowering::isLegalAddImmediate(int64_t Imm) const {
35543 // Can also use sub to handle negated immediates.
35544 return isInt<32>(Imm);
35545}
35546
35547bool X86TargetLowering::isLegalStoreImmediate(int64_t Imm) const {
35548 return isInt<32>(Imm);
35549}
35550
35551bool X86TargetLowering::isTruncateFree(EVT VT1, EVT VT2) const {
35552 if (!VT1.isScalarInteger() || !VT2.isScalarInteger())
35553 return false;
35554 unsigned NumBits1 = VT1.getSizeInBits();
35555 unsigned NumBits2 = VT2.getSizeInBits();
35556 return NumBits1 > NumBits2;
35557}
35558
35559bool X86TargetLowering::isZExtFree(Type *Ty1, Type *Ty2) const {
35560 // x86-64 implicitly zero-extends 32-bit results in 64-bit registers.
35561 return Ty1->isIntegerTy(32) && Ty2->isIntegerTy(64) && Subtarget.is64Bit();
35562}
35563
35564bool X86TargetLowering::isZExtFree(EVT VT1, EVT VT2) const {
35565 // x86-64 implicitly zero-extends 32-bit results in 64-bit registers.
35566 return VT1 == MVT::i32 && VT2 == MVT::i64 && Subtarget.is64Bit();
35567}
35568
35569bool X86TargetLowering::isZExtFree(SDValue Val, EVT VT2) const {
35570 EVT VT1 = Val.getValueType();
35571 if (isZExtFree(VT1, VT2))
35572 return true;
35573
35574 if (Val.getOpcode() != ISD::LOAD)
35575 return false;
35576
35577 if (!VT1.isSimple() || !VT1.isInteger() ||
35578 !VT2.isSimple() || !VT2.isInteger())
35579 return false;
35580
35581 switch (VT1.getSimpleVT().SimpleTy) {
35582 default: break;
35583 case MVT::i8:
35584 case MVT::i16:
35585 case MVT::i32:
35586 // X86 has 8, 16, and 32-bit zero-extending loads.
35587 return true;
35588 }
35589
35590 return false;
35591}
35592
35593bool X86TargetLowering::shouldConvertPhiType(Type *From, Type *To) const {
35594 if (!Subtarget.is64Bit())
35595 return false;
35596 return TargetLowering::shouldConvertPhiType(From, To);
35597}
35598
35599bool X86TargetLowering::isVectorLoadExtDesirable(SDValue ExtVal) const {
35600 if (isa<MaskedLoadSDNode>(ExtVal.getOperand(0)))
35601 return false;
35602
35603 EVT SrcVT = ExtVal.getOperand(0).getValueType();
35604
35605 // There is no extending load for vXi1.
35606 if (SrcVT.getScalarType() == MVT::i1)
35607 return false;
35608
35609 return true;
35610}
35611
35612bool X86TargetLowering::isFMAFasterThanFMulAndFAdd(const MachineFunction &MF,
35613 EVT VT) const {
35614 if (Subtarget.useSoftFloat())
35615 return false;
35616
35617 if (!Subtarget.hasAnyFMA())
35618 return false;
35619
35620 VT = VT.getScalarType();
35621
35622 if (!VT.isSimple())
35623 return false;
35624
35625 switch (VT.getSimpleVT().SimpleTy) {
35626 case MVT::f16:
35627 return Subtarget.hasFP16();
35628 case MVT::f32:
35629 case MVT::f64:
35630 return true;
35631 default:
35632 break;
35633 }
35634
35635 return false;
35636}
35637
35638bool X86TargetLowering::isNarrowingProfitable(SDNode *N, EVT SrcVT,
35639 EVT DestVT) const {
35640 // i16 instructions are longer (0x66 prefix) and potentially slower.
35641 return !(SrcVT == MVT::i32 && DestVT == MVT::i16);
35642}
35643
35644bool X86TargetLowering::shouldFoldSelectWithIdentityConstant(
35645 unsigned BinOpcode, EVT VT, unsigned SelectOpcode, SDValue X,
35646 SDValue Y) const {
35647 if (SelectOpcode == ISD::SELECT) {
35648 if (VT.isVector())
35649 return false;
35650 if (!Subtarget.hasBMI() || (VT != MVT::i32 && VT != MVT::i64))
35651 return false;
35652 using namespace llvm::SDPatternMatch;
35653 // BLSI
35654 if (BinOpcode == ISD::AND && (sd_match(Y, m_Neg(m_Specific(X))) ||
35655 sd_match(X, m_Neg(m_Specific(Y)))))
35656 return true;
35657 // BLSR
35658 if (BinOpcode == ISD::AND &&
35659 (sd_match(Y, m_Add(m_Specific(X), m_AllOnes())) ||
35660 sd_match(X, m_Add(m_Specific(Y), m_AllOnes()))))
35661 return true;
35662 // BLSMSK
35663 if (BinOpcode == ISD::XOR &&
35664 (sd_match(Y, m_Add(m_Specific(X), m_AllOnes())) ||
35665 sd_match(X, m_Add(m_Specific(Y), m_AllOnes()))))
35666 return true;
35667
35668 return false;
35669 }
35670 // TODO: This is too general. There are cases where pre-AVX512 codegen would
35671 // benefit. The transform may also be profitable for scalar code.
35672 if (!Subtarget.hasAVX512())
35673 return false;
35674 if (!Subtarget.hasVLX() && !VT.is512BitVector())
35675 return false;
35676 if (!VT.isVector() || VT.getScalarType() == MVT::i1)
35677 return false;
35678
35679 return true;
35680}
35681
35682/// Targets can use this to indicate that they only support *some*
35683/// VECTOR_SHUFFLE operations, those with specific masks.
35684/// By default, if a target supports the VECTOR_SHUFFLE node, all mask values
35685/// are assumed to be legal.
35686bool X86TargetLowering::isShuffleMaskLegal(ArrayRef<int> Mask, EVT VT) const {
35687 if (!VT.isSimple())
35688 return false;
35689
35690 // Not for i1 vectors
35691 if (VT.getSimpleVT().getScalarType() == MVT::i1)
35692 return false;
35693
35694 // Very little shuffling can be done for 64-bit vectors right now.
35695 if (VT.getSimpleVT().getSizeInBits() == 64)
35696 return false;
35697
35698 // We only care that the types being shuffled are legal. The lowering can
35699 // handle any possible shuffle mask that results.
35700 return isTypeLegal(VT.getSimpleVT());
35701}
35702
35703bool X86TargetLowering::isVectorClearMaskLegal(ArrayRef<int> Mask,
35704 EVT VT) const {
35705 // Don't convert an 'and' into a shuffle that we don't directly support.
35706 // vpblendw and vpshufb for 256-bit vectors are not available on AVX1.
35707 if (!Subtarget.hasAVX2())
35708 if (VT == MVT::v32i8 || VT == MVT::v16i16)
35709 return false;
35710
35711 // Just delegate to the generic legality, clear masks aren't special.
35712 return isShuffleMaskLegal(Mask, VT);
35713}
35714
35715bool X86TargetLowering::areJTsAllowed(const Function *Fn) const {
35716 // If the subtarget is using thunks, we need to not generate jump tables.
35717 if (Subtarget.useIndirectThunkBranches())
35718 return false;
35719
35720 // Otherwise, fallback on the generic logic.
35721 return TargetLowering::areJTsAllowed(Fn);
35722}
35723
35724MVT X86TargetLowering::getPreferredSwitchConditionType(LLVMContext &Context,
35725 EVT ConditionVT) const {
35726 // Avoid 8 and 16 bit types because they increase the chance for unnecessary
35727 // zero-extensions.
35728 if (ConditionVT.getSizeInBits() < 32)
35729 return MVT::i32;
35730 return TargetLoweringBase::getPreferredSwitchConditionType(Context,
35731 ConditionVT);
35732}
35733
35734//===----------------------------------------------------------------------===//
35735// X86 Scheduler Hooks
35736//===----------------------------------------------------------------------===//
35737
35738/// Utility function to emit xbegin specifying the start of an RTM region.
35739static MachineBasicBlock *emitXBegin(MachineInstr &MI, MachineBasicBlock *MBB,
35740 const TargetInstrInfo *TII) {
35741 const MIMetadata MIMD(MI);
35742
35743 const BasicBlock *BB = MBB->getBasicBlock();
35744 MachineFunction::iterator I = ++MBB->getIterator();
35745
35746 // For the v = xbegin(), we generate
35747 //
35748 // thisMBB:
35749 // xbegin sinkMBB
35750 //
35751 // mainMBB:
35752 // s0 = -1
35753 //
35754 // fallBB:
35755 // eax = # XABORT_DEF
35756 // s1 = eax
35757 //
35758 // sinkMBB:
35759 // v = phi(s0/mainBB, s1/fallBB)
35760
35761 MachineBasicBlock *thisMBB = MBB;
35762 MachineFunction *MF = MBB->getParent();
35763 MachineBasicBlock *mainMBB = MF->CreateMachineBasicBlock(BB);
35764 MachineBasicBlock *fallMBB = MF->CreateMachineBasicBlock(BB);
35765 MachineBasicBlock *sinkMBB = MF->CreateMachineBasicBlock(BB);
35766 MF->insert(I, mainMBB);
35767 MF->insert(I, fallMBB);
35768 MF->insert(I, sinkMBB);
35769
35770 if (isPhysRegUsedAfter(X86::EFLAGS, MI)) {
35771 mainMBB->addLiveIn(X86::EFLAGS);
35772 fallMBB->addLiveIn(X86::EFLAGS);
35773 sinkMBB->addLiveIn(X86::EFLAGS);
35774 }
35775
35776 // Transfer the remainder of BB and its successor edges to sinkMBB.
35777 sinkMBB->splice(sinkMBB->begin(), MBB,
35778 std::next(MachineBasicBlock::iterator(MI)), MBB->end());
35779 sinkMBB->transferSuccessorsAndUpdatePHIs(MBB);
35780
35781 MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo();
35782 Register DstReg = MI.getOperand(0).getReg();
35783 const TargetRegisterClass *RC = MRI.getRegClass(DstReg);
35784 Register mainDstReg = MRI.createVirtualRegister(RC);
35785 Register fallDstReg = MRI.createVirtualRegister(RC);
35786
35787 // thisMBB:
35788 // xbegin fallMBB
35789 // # fallthrough to mainMBB
35790 // # abort to fallMBB
35791 BuildMI(thisMBB, MIMD, TII->get(X86::XBEGIN_4)).addMBB(fallMBB);
35792 thisMBB->addSuccessor(mainMBB);
35793 thisMBB->addSuccessor(fallMBB);
35794
35795 // mainMBB:
35796 // mainDstReg := -1
35797 BuildMI(mainMBB, MIMD, TII->get(X86::MOV32ri), mainDstReg).addImm(-1);
35798 BuildMI(mainMBB, MIMD, TII->get(X86::JMP_1)).addMBB(sinkMBB);
35799 mainMBB->addSuccessor(sinkMBB);
35800
35801 // fallMBB:
35802 // ; pseudo instruction to model hardware's definition from XABORT
35803 // EAX := XABORT_DEF
35804 // fallDstReg := EAX
35805 BuildMI(fallMBB, MIMD, TII->get(X86::XABORT_DEF));
35806 BuildMI(fallMBB, MIMD, TII->get(TargetOpcode::COPY), fallDstReg)
35807 .addReg(X86::EAX);
35808 fallMBB->addSuccessor(sinkMBB);
35809
35810 // sinkMBB:
35811 // DstReg := phi(mainDstReg/mainBB, fallDstReg/fallBB)
35812 BuildMI(*sinkMBB, sinkMBB->begin(), MIMD, TII->get(X86::PHI), DstReg)
35813 .addReg(mainDstReg).addMBB(mainMBB)
35814 .addReg(fallDstReg).addMBB(fallMBB);
35815
35816 MI.eraseFromParent();
35817 return sinkMBB;
35818}
35819
35820MachineBasicBlock *
35821X86TargetLowering::EmitVAARGWithCustomInserter(MachineInstr &MI,
35822 MachineBasicBlock *MBB) const {
35823 // Emit va_arg instruction on X86-64.
35824
35825 // Operands to this pseudo-instruction:
35826 // 0 ) Output : destination address (reg)
35827 // 1-5) Input : va_list address (addr, i64mem)
35828 // 6 ) ArgSize : Size (in bytes) of vararg type
35829 // 7 ) ArgMode : 0=overflow only, 1=use gp_offset, 2=use fp_offset
35830 // 8 ) Align : Alignment of type
35831 // 9 ) EFLAGS (implicit-def)
35832
35833 assert(MI.getNumOperands() == 10 && "VAARG should have 10 operands!");
35834 static_assert(X86::AddrNumOperands == 5, "VAARG assumes 5 address operands");
35835
35836 Register DestReg = MI.getOperand(0).getReg();
35837 MachineOperand &Base = MI.getOperand(1);
35838 MachineOperand &Scale = MI.getOperand(2);
35839 MachineOperand &Index = MI.getOperand(3);
35840 MachineOperand &Disp = MI.getOperand(4);
35841 MachineOperand &Segment = MI.getOperand(5);
35842 unsigned ArgSize = MI.getOperand(6).getImm();
35843 unsigned ArgMode = MI.getOperand(7).getImm();
35844 Align Alignment = Align(MI.getOperand(8).getImm());
35845
35846 MachineFunction *MF = MBB->getParent();
35847
35848 // Memory Reference
35849 assert(MI.hasOneMemOperand() && "Expected VAARG to have one memoperand");
35850
35851 MachineMemOperand *OldMMO = MI.memoperands().front();
35852
35853 // Clone the MMO into two separate MMOs for loading and storing
35854 MachineMemOperand *LoadOnlyMMO = MF->getMachineMemOperand(
35855 OldMMO, OldMMO->getFlags() & ~MachineMemOperand::MOStore);
35856 MachineMemOperand *StoreOnlyMMO = MF->getMachineMemOperand(
35857 OldMMO, OldMMO->getFlags() & ~MachineMemOperand::MOLoad);
35858
35859 // Machine Information
35860 const TargetInstrInfo *TII = Subtarget.getInstrInfo();
35861 MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo();
35862 const TargetRegisterClass *AddrRegClass =
35863 getRegClassFor(getPointerTy(MBB->getParent()->getDataLayout()));
35864 const TargetRegisterClass *OffsetRegClass = getRegClassFor(MVT::i32);
35865 const MIMetadata MIMD(MI);
35866
35867 // struct va_list {
35868 // i32 gp_offset
35869 // i32 fp_offset
35870 // i64 overflow_area (address)
35871 // i64 reg_save_area (address)
35872 // }
35873 // sizeof(va_list) = 24
35874 // alignment(va_list) = 8
35875
35876 unsigned TotalNumIntRegs = 6;
35877 unsigned TotalNumXMMRegs = 8;
35878 bool UseGPOffset = (ArgMode == 1);
35879 bool UseFPOffset = (ArgMode == 2);
35880 unsigned MaxOffset = TotalNumIntRegs * 8 +
35881 (UseFPOffset ? TotalNumXMMRegs * 16 : 0);
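// In the SysV x86-64 register save area the 6 GP registers occupy the
// first 48 bytes (8 bytes each) and the 8 XMM registers the next 128
// bytes (16 bytes each); the compare emitted below checks whether the
// current gp_offset/fp_offset still leaves room for this argument before
// falling back to the overflow area.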
35882
35883 /* Align ArgSize to a multiple of 8 */
35884 unsigned ArgSizeA8 = (ArgSize + 7) & ~7;
35885 bool NeedsAlign = (Alignment > 8);
35886
35887 MachineBasicBlock *thisMBB = MBB;
35888 MachineBasicBlock *overflowMBB;
35889 MachineBasicBlock *offsetMBB;
35890 MachineBasicBlock *endMBB;
35891
35892 Register OffsetDestReg; // Argument address computed by offsetMBB
35893 Register OverflowDestReg; // Argument address computed by overflowMBB
35894 Register OffsetReg;
35895
35896 if (!UseGPOffset && !UseFPOffset) {
35897 // If we only pull from the overflow region, we don't create a branch.
35898 // We don't need to alter control flow.
35899 OffsetDestReg = Register(); // unused
35900 OverflowDestReg = DestReg;
35901
35902 offsetMBB = nullptr;
35903 overflowMBB = thisMBB;
35904 endMBB = thisMBB;
35905 } else {
35906 // First emit code to check if gp_offset (or fp_offset) is below the bound.
35907 // If so, pull the argument from reg_save_area. (branch to offsetMBB)
35908 // If not, pull from overflow_area. (branch to overflowMBB)
35909 //
35910 // thisMBB
35911 // | .
35912 // | .
35913 // offsetMBB overflowMBB
35914 // | .
35915 // | .
35916 // endMBB
35917
35918 // Registers for the PHI in endMBB
35919 OffsetDestReg = MRI.createVirtualRegister(AddrRegClass);
35920 OverflowDestReg = MRI.createVirtualRegister(AddrRegClass);
35921
35922 const BasicBlock *LLVM_BB = MBB->getBasicBlock();
35923 overflowMBB = MF->CreateMachineBasicBlock(LLVM_BB);
35924 offsetMBB = MF->CreateMachineBasicBlock(LLVM_BB);
35925 endMBB = MF->CreateMachineBasicBlock(LLVM_BB);
35926
35927 MachineFunction::iterator MBBIter = ++MBB->getIterator();
35928
35929 // Insert the new basic blocks
35930 MF->insert(MBBIter, offsetMBB);
35931 MF->insert(MBBIter, overflowMBB);
35932 MF->insert(MBBIter, endMBB);
35933
35934 // Transfer the remainder of MBB and its successor edges to endMBB.
35935 endMBB->splice(endMBB->begin(), thisMBB,
35936 std::next(MachineBasicBlock::iterator(MI)), thisMBB->end());
35937 endMBB->transferSuccessorsAndUpdatePHIs(thisMBB);
35938
35939 // Make offsetMBB and overflowMBB successors of thisMBB
35940 thisMBB->addSuccessor(offsetMBB);
35941 thisMBB->addSuccessor(overflowMBB);
35942
35943 // endMBB is a successor of both offsetMBB and overflowMBB
35944 offsetMBB->addSuccessor(endMBB);
35945 overflowMBB->addSuccessor(endMBB);
35946
35947 // Load the offset value into a register
35948 OffsetReg = MRI.createVirtualRegister(OffsetRegClass);
35949 BuildMI(thisMBB, MIMD, TII->get(X86::MOV32rm), OffsetReg)
35950 .add(Base)
35951 .add(Scale)
35952 .add(Index)
35953 .addDisp(Disp, UseFPOffset ? 4 : 0)
35954 .add(Segment)
35955 .setMemRefs(LoadOnlyMMO);
35956
35957 // Check if there is enough room left to pull this argument.
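// The (8-byte aligned) argument still fits in the save area when
// OffsetReg + ArgSizeA8 <= MaxOffset; since both sides are multiples of 8,
// that is the same as OffsetReg < MaxOffset + 8 - ArgSizeA8, which the
// CMP/JCC pair below tests.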
35958 BuildMI(thisMBB, MIMD, TII->get(X86::CMP32ri))
35959 .addReg(OffsetReg)
35960 .addImm(MaxOffset + 8 - ArgSizeA8);
35961
35962 // Branch to "overflowMBB" if offset >= max
35963 // Fall through to "offsetMBB" otherwise
35964 BuildMI(thisMBB, MIMD, TII->get(X86::JCC_1))
35965 .addMBB(overflowMBB).addImm(X86::COND_AE);
35966 }
35967
35968 // In offsetMBB, emit code to use the reg_save_area.
35969 if (offsetMBB) {
35970 assert(OffsetReg != 0);
35971
35972 // Read the reg_save_area address.
35973 Register RegSaveReg = MRI.createVirtualRegister(AddrRegClass);
35974 BuildMI(
35975 offsetMBB, MIMD,
35976 TII->get(Subtarget.isTarget64BitLP64() ? X86::MOV64rm : X86::MOV32rm),
35977 RegSaveReg)
35978 .add(Base)
35979 .add(Scale)
35980 .add(Index)
35981 .addDisp(Disp, Subtarget.isTarget64BitLP64() ? 16 : 12)
35982 .add(Segment)
35983 .setMemRefs(LoadOnlyMMO);
35984
35985 if (Subtarget.isTarget64BitLP64()) {
35986 // Zero-extend the offset
35987 Register OffsetReg64 = MRI.createVirtualRegister(AddrRegClass);
35988 BuildMI(offsetMBB, MIMD, TII->get(X86::SUBREG_TO_REG), OffsetReg64)
35989 .addImm(0)
35990 .addReg(OffsetReg)
35991 .addImm(X86::sub_32bit);
35992
35993 // Add the offset to the reg_save_area to get the final address.
35994 BuildMI(offsetMBB, MIMD, TII->get(X86::ADD64rr), OffsetDestReg)
35995 .addReg(OffsetReg64)
35996 .addReg(RegSaveReg);
35997 } else {
35998 // Add the offset to the reg_save_area to get the final address.
35999 BuildMI(offsetMBB, MIMD, TII->get(X86::ADD32rr), OffsetDestReg)
36000 .addReg(OffsetReg)
36001 .addReg(RegSaveReg);
36002 }
36003
36004 // Compute the offset for the next argument
36005 Register NextOffsetReg = MRI.createVirtualRegister(OffsetRegClass);
36006 BuildMI(offsetMBB, MIMD, TII->get(X86::ADD32ri), NextOffsetReg)
36007 .addReg(OffsetReg)
36008 .addImm(UseFPOffset ? 16 : 8);
36009
36010 // Store it back into the va_list.
36011 BuildMI(offsetMBB, MIMD, TII->get(X86::MOV32mr))
36012 .add(Base)
36013 .add(Scale)
36014 .add(Index)
36015 .addDisp(Disp, UseFPOffset ? 4 : 0)
36016 .add(Segment)
36017 .addReg(NextOffsetReg)
36018 .setMemRefs(StoreOnlyMMO);
36019
36020 // Jump to endMBB
36021 BuildMI(offsetMBB, MIMD, TII->get(X86::JMP_1))
36022 .addMBB(endMBB);
36023 }
36024
36025 //
36026 // Emit code to use overflow area
36027 //
36028
36029 // Load the overflow_area address into a register.
36030 Register OverflowAddrReg = MRI.createVirtualRegister(AddrRegClass);
36031 BuildMI(overflowMBB, MIMD,
36032 TII->get(Subtarget.isTarget64BitLP64() ? X86::MOV64rm : X86::MOV32rm),
36033 OverflowAddrReg)
36034 .add(Base)
36035 .add(Scale)
36036 .add(Index)
36037 .addDisp(Disp, 8)
36038 .add(Segment)
36039 .setMemRefs(LoadOnlyMMO);
36040
36041 // If we need to align it, do so. Otherwise, just copy the address
36042 // to OverflowDestReg.
36043 if (NeedsAlign) {
36044 // Align the overflow address
36045 Register TmpReg = MRI.createVirtualRegister(AddrRegClass);
36046
36047 // aligned_addr = (addr + (align-1)) & ~(align-1)
36048 BuildMI(
36049 overflowMBB, MIMD,
36050 TII->get(Subtarget.isTarget64BitLP64() ? X86::ADD64ri32 : X86::ADD32ri),
36051 TmpReg)
36052 .addReg(OverflowAddrReg)
36053 .addImm(Alignment.value() - 1);
36054
36055 BuildMI(
36056 overflowMBB, MIMD,
36057 TII->get(Subtarget.isTarget64BitLP64() ? X86::AND64ri32 : X86::AND32ri),
36058 OverflowDestReg)
36059 .addReg(TmpReg)
36060 .addImm(~(uint64_t)(Alignment.value() - 1));
36061 } else {
36062 BuildMI(overflowMBB, MIMD, TII->get(TargetOpcode::COPY), OverflowDestReg)
36063 .addReg(OverflowAddrReg);
36064 }
36065
36066 // Compute the next overflow address after this argument.
36067 // (the overflow address should be kept 8-byte aligned)
36068 Register NextAddrReg = MRI.createVirtualRegister(AddrRegClass);
36069 BuildMI(
36070 overflowMBB, MIMD,
36071 TII->get(Subtarget.isTarget64BitLP64() ? X86::ADD64ri32 : X86::ADD32ri),
36072 NextAddrReg)
36073 .addReg(OverflowDestReg)
36074 .addImm(ArgSizeA8);
36075
36076 // Store the new overflow address.
36077 BuildMI(overflowMBB, MIMD,
36078 TII->get(Subtarget.isTarget64BitLP64() ? X86::MOV64mr : X86::MOV32mr))
36079 .add(Base)
36080 .add(Scale)
36081 .add(Index)
36082 .addDisp(Disp, 8)
36083 .add(Segment)
36084 .addReg(NextAddrReg)
36085 .setMemRefs(StoreOnlyMMO);
36086
36087 // If we branched, emit the PHI to the front of endMBB.
36088 if (offsetMBB) {
36089 BuildMI(*endMBB, endMBB->begin(), MIMD,
36090 TII->get(X86::PHI), DestReg)
36091 .addReg(OffsetDestReg).addMBB(offsetMBB)
36092 .addReg(OverflowDestReg).addMBB(overflowMBB);
36093 }
36094
36095 // Erase the pseudo instruction
36096 MI.eraseFromParent();
36097
36098 return endMBB;
36099}
36100
36101// The EFLAGS operand of SelectItr might be missing a kill marker
36102// because there were multiple uses of EFLAGS, and ISel didn't know
36103// which to mark. Figure out whether SelectItr should have had a
36104// kill marker, and set it if it should. Returns the correct kill
36105// marker value.
36106static bool checkAndUpdateEFLAGSKill(MachineBasicBlock::iterator SelectItr,
36107 MachineBasicBlock* BB,
36108 const TargetRegisterInfo* TRI) {
36109 if (isPhysRegUsedAfter(X86::EFLAGS, SelectItr))
36110 return false;
36111
36112 // We found a def, or hit the end of the basic block and EFLAGS wasn't live
36113 // out. SelectMI should have a kill flag on EFLAGS.
36114 SelectItr->addRegisterKilled(X86::EFLAGS, TRI);
36115 return true;
36116}
36117
36118// Return true if it is OK for this CMOV pseudo-opcode to be cascaded
36119// together with other CMOV pseudo-opcodes into a single basic-block with
36120// conditional jump around it.
36121static bool isCMOVPseudo(MachineInstr &MI) {
36122 switch (MI.getOpcode()) {
36123 case X86::CMOV_FR16:
36124 case X86::CMOV_FR16X:
36125 case X86::CMOV_FR32:
36126 case X86::CMOV_FR32X:
36127 case X86::CMOV_FR64:
36128 case X86::CMOV_FR64X:
36129 case X86::CMOV_GR8:
36130 case X86::CMOV_GR16:
36131 case X86::CMOV_GR32:
36132 case X86::CMOV_RFP32:
36133 case X86::CMOV_RFP64:
36134 case X86::CMOV_RFP80:
36135 case X86::CMOV_VR64:
36136 case X86::CMOV_VR128:
36137 case X86::CMOV_VR128X:
36138 case X86::CMOV_VR256:
36139 case X86::CMOV_VR256X:
36140 case X86::CMOV_VR512:
36141 case X86::CMOV_VK1:
36142 case X86::CMOV_VK2:
36143 case X86::CMOV_VK4:
36144 case X86::CMOV_VK8:
36145 case X86::CMOV_VK16:
36146 case X86::CMOV_VK32:
36147 case X86::CMOV_VK64:
36148 return true;
36149
36150 default:
36151 return false;
36152 }
36153}
36154
36155// Helper function, which inserts PHI functions into SinkMBB:
36156// %Result(i) = phi [ %FalseValue(i), FalseMBB ], [ %TrueValue(i), TrueMBB ],
36157// where %FalseValue(i) and %TrueValue(i) are taken from the consequent CMOVs
36158// in [MIItBegin, MIItEnd) range. It returns the last MachineInstrBuilder for
36159// the last PHI function inserted.
36160static MachineInstrBuilder createPHIsForCMOVsInSinkBB(
36161 MachineBasicBlock::iterator MIItBegin, MachineBasicBlock::iterator MIItEnd,
36162 MachineBasicBlock *TrueMBB, MachineBasicBlock *FalseMBB,
36163 MachineBasicBlock *SinkMBB) {
36164 MachineFunction *MF = TrueMBB->getParent();
36165 const TargetInstrInfo *TII = MF->getSubtarget().getInstrInfo();
36166 const MIMetadata MIMD(*MIItBegin);
36167
36168 X86::CondCode CC = X86::CondCode(MIItBegin->getOperand(3).getImm());
36169 X86::CondCode OppCC = X86::GetOppositeBranchCondition(CC);
36170
36171 MachineBasicBlock::iterator SinkInsertionPoint = SinkMBB->begin();
36172
36173 // As we are creating the PHIs, we have to be careful if there is more than
36174 // one. Later CMOVs may reference the results of earlier CMOVs, but later
36175 // PHIs have to reference the individual true/false inputs from earlier PHIs.
36176 // That also means that PHI construction must work forward from earlier to
36177 // later, and that the code must maintain a mapping from earlier PHI's
36178 // destination registers, and the registers that went into the PHI.
36179 DenseMap<Register, std::pair<Register, Register>> RegRewriteTable;
36180 MachineInstrBuilder MIB;
36181
36182 for (MachineBasicBlock::iterator MIIt = MIItBegin; MIIt != MIItEnd; ++MIIt) {
36183 Register DestReg = MIIt->getOperand(0).getReg();
36184 Register Op1Reg = MIIt->getOperand(1).getReg();
36185 Register Op2Reg = MIIt->getOperand(2).getReg();
36186
36187 // If the CMOV we are generating has the opposite condition from
36188 // the jump we generated, then we have to swap the operands for the
36189 // PHI that is going to be generated.
36190 if (MIIt->getOperand(3).getImm() == OppCC)
36191 std::swap(Op1Reg, Op2Reg);
36192
36193 if (auto It = RegRewriteTable.find(Op1Reg); It != RegRewriteTable.end())
36194 Op1Reg = It->second.first;
36195
36196 if (auto It = RegRewriteTable.find(Op2Reg); It != RegRewriteTable.end())
36197 Op2Reg = It->second.second;
36198
36199 MIB =
36200 BuildMI(*SinkMBB, SinkInsertionPoint, MIMD, TII->get(X86::PHI), DestReg)
36201 .addReg(Op1Reg)
36202 .addMBB(FalseMBB)
36203 .addReg(Op2Reg)
36204 .addMBB(TrueMBB);
36205
36206 // Add this PHI to the rewrite table.
36207 RegRewriteTable[DestReg] = std::make_pair(Op1Reg, Op2Reg);
36208 }
36209
36210 return MIB;
36211}
36212
36213 // Lower cascaded selects in the form of (SecondCmov (FirstCMOV F, T, cc1), T, cc2).
36214MachineBasicBlock *
36215X86TargetLowering::EmitLoweredCascadedSelect(MachineInstr &FirstCMOV,
36216 MachineInstr &SecondCascadedCMOV,
36217 MachineBasicBlock *ThisMBB) const {
36218 const TargetInstrInfo *TII = Subtarget.getInstrInfo();
36219 const MIMetadata MIMD(FirstCMOV);
36220
36221 // We lower cascaded CMOVs such as
36222 //
36223 // (SecondCascadedCMOV (FirstCMOV F, T, cc1), T, cc2)
36224 //
36225 // to two successive branches.
36226 //
36227 // Without this, we would add a PHI between the two jumps, which ends up
36228 // creating a few copies all around. For instance, for
36229 //
36230 // (sitofp (zext (fcmp une)))
36231 //
36232 // we would generate:
36233 //
36234 // ucomiss %xmm1, %xmm0
36235 // movss <1.0f>, %xmm0
36236 // movaps %xmm0, %xmm1
36237 // jne .LBB5_2
36238 // xorps %xmm1, %xmm1
36239 // .LBB5_2:
36240 // jp .LBB5_4
36241 // movaps %xmm1, %xmm0
36242 // .LBB5_4:
36243 // retq
36244 //
36245 // because this custom-inserter would have generated:
36246 //
36247 // A
36248 // | \
36249 // | B
36250 // | /
36251 // C
36252 // | \
36253 // | D
36254 // | /
36255 // E
36256 //
36257 // A: X = ...; Y = ...
36258 // B: empty
36259 // C: Z = PHI [X, A], [Y, B]
36260 // D: empty
36261 // E: PHI [X, C], [Z, D]
36262 //
36263 // If we lower both CMOVs in a single step, we can instead generate:
36264 //
36265 // A
36266 // | \
36267 // | C
36268 // | /|
36269 // |/ |
36270 // | |
36271 // | D
36272 // | /
36273 // E
36274 //
36275 // A: X = ...; Y = ...
36276 // D: empty
36277 // E: PHI [X, A], [X, C], [Y, D]
36278 //
36279 // Which, in our sitofp/fcmp example, gives us something like:
36280 //
36281 // ucomiss %xmm1, %xmm0
36282 // movss <1.0f>, %xmm0
36283 // jne .LBB5_4
36284 // jp .LBB5_4
36285 // xorps %xmm0, %xmm0
36286 // .LBB5_4:
36287 // retq
36288 //
36289
36290 // We lower cascaded CMOV into two successive branches to the same block.
36291 // EFLAGS is used by both, so mark it as live in the second.
36292 const BasicBlock *LLVM_BB = ThisMBB->getBasicBlock();
36293 MachineFunction *F = ThisMBB->getParent();
36294 MachineBasicBlock *FirstInsertedMBB = F->CreateMachineBasicBlock(LLVM_BB);
36295 MachineBasicBlock *SecondInsertedMBB = F->CreateMachineBasicBlock(LLVM_BB);
36296 MachineBasicBlock *SinkMBB = F->CreateMachineBasicBlock(LLVM_BB);
36297
36298 MachineFunction::iterator It = ++ThisMBB->getIterator();
36299 F->insert(It, FirstInsertedMBB);
36300 F->insert(It, SecondInsertedMBB);
36301 F->insert(It, SinkMBB);
36302
36303 // For a cascaded CMOV, we lower it to two successive branches to
36304 // the same block (SinkMBB). EFLAGS is used by both, so mark it as live in
36305 // the FirstInsertedMBB.
36306 FirstInsertedMBB->addLiveIn(X86::EFLAGS);
36307
36308 // If the EFLAGS register isn't dead in the terminator, then claim that it's
36309 // live into the sink and copy blocks.
36310 const TargetRegisterInfo *TRI = Subtarget.getRegisterInfo();
36311 if (!SecondCascadedCMOV.killsRegister(X86::EFLAGS, /*TRI=*/nullptr) &&
36312 !checkAndUpdateEFLAGSKill(SecondCascadedCMOV, ThisMBB, TRI)) {
36313 SecondInsertedMBB->addLiveIn(X86::EFLAGS);
36314 SinkMBB->addLiveIn(X86::EFLAGS);
36315 }
36316
36317 // Transfer the remainder of ThisMBB and its successor edges to SinkMBB.
36318 SinkMBB->splice(SinkMBB->begin(), ThisMBB,
36319 std::next(MachineBasicBlock::iterator(FirstCMOV)),
36320 ThisMBB->end());
36321 SinkMBB->transferSuccessorsAndUpdatePHIs(ThisMBB);
36322
36323 // Fallthrough block for ThisMBB.
36324 ThisMBB->addSuccessor(FirstInsertedMBB);
36325 // The true block target of the first branch is always SinkMBB.
36326 ThisMBB->addSuccessor(SinkMBB);
36327 // Fallthrough block for FirstInsertedMBB.
36328 FirstInsertedMBB->addSuccessor(SecondInsertedMBB);
36329 // The true block for the branch of FirstInsertedMBB.
36330 FirstInsertedMBB->addSuccessor(SinkMBB);
36331 // This is fallthrough.
36332 SecondInsertedMBB->addSuccessor(SinkMBB);
36333
36334 // Create the conditional branch instructions.
36335 X86::CondCode FirstCC = X86::CondCode(FirstCMOV.getOperand(3).getImm());
36336 BuildMI(ThisMBB, MIMD, TII->get(X86::JCC_1)).addMBB(SinkMBB).addImm(FirstCC);
36337
36338 X86::CondCode SecondCC =
36339 X86::CondCode(SecondCascadedCMOV.getOperand(3).getImm());
36340 BuildMI(FirstInsertedMBB, MIMD, TII->get(X86::JCC_1))
36341 .addMBB(SinkMBB)
36342 .addImm(SecondCC);
36343
36344 // SinkMBB:
36345 // %Result = phi [ %FalseValue, SecondInsertedMBB ], [ %TrueValue, ThisMBB ]
36346 Register DestReg = SecondCascadedCMOV.getOperand(0).getReg();
36347 Register Op1Reg = FirstCMOV.getOperand(1).getReg();
36348 Register Op2Reg = FirstCMOV.getOperand(2).getReg();
36349 MachineInstrBuilder MIB =
36350 BuildMI(*SinkMBB, SinkMBB->begin(), MIMD, TII->get(X86::PHI), DestReg)
36351 .addReg(Op1Reg)
36352 .addMBB(SecondInsertedMBB)
36353 .addReg(Op2Reg)
36354 .addMBB(ThisMBB);
36355
36356 // The SecondInsertedMBB provides the same incoming value as the
36357 // FirstInsertedMBB (the True operand of the SELECT_CC/CMOV nodes).
36358 MIB.addReg(FirstCMOV.getOperand(2).getReg()).addMBB(FirstInsertedMBB);
36359
36360 // Now remove the CMOVs.
36361 FirstCMOV.eraseFromParent();
36362 SecondCascadedCMOV.eraseFromParent();
36363
36364 return SinkMBB;
36365}
36366
36367MachineBasicBlock *
36368X86TargetLowering::EmitLoweredSelect(MachineInstr &MI,
36369 MachineBasicBlock *ThisMBB) const {
36370 const TargetInstrInfo *TII = Subtarget.getInstrInfo();
36371 const MIMetadata MIMD(MI);
36372
36373 // To "insert" a SELECT_CC instruction, we actually have to insert the
36374 // diamond control-flow pattern. The incoming instruction knows the
36375 // destination vreg to set, the condition code register to branch on, the
36376 // true/false values to select between and a branch opcode to use.
36377
36378 // ThisMBB:
36379 // ...
36380 // TrueVal = ...
36381 // cmpTY ccX, r1, r2
36382 // bCC copy1MBB
36383 // fallthrough --> FalseMBB
36384
36385 // This code lowers all pseudo-CMOV instructions. Generally it lowers these
36386 // as described above, by inserting a BB, and then making a PHI at the join
36387 // point to select the true and false operands of the CMOV in the PHI.
36388 //
36389 // The code also handles two different cases of multiple CMOV opcodes
36390 // in a row.
36391 //
36392 // Case 1:
36393 // In this case, there are multiple CMOVs in a row, all which are based on
36394 // the same condition setting (or the exact opposite condition setting).
36395 // In this case we can lower all the CMOVs using a single inserted BB, and
36396 // then make a number of PHIs at the join point to model the CMOVs. The only
36397 // trickiness here is that in a case like:
36398 //
36399 // t2 = CMOV cond1 t1, f1
36400 // t3 = CMOV cond1 t2, f2
36401 //
36402 // when rewriting this into PHIs, we have to perform some renaming on the
36403 // temps since you cannot have a PHI operand refer to a PHI result earlier
36404 // in the same block. The "simple" but wrong lowering would be:
36405 //
36406 // t2 = PHI t1(BB1), f1(BB2)
36407 // t3 = PHI t2(BB1), f2(BB2)
36408 //
36409 // but clearly t2 is not defined in BB1, so that is incorrect. The proper
36410 // renaming is to note that on the path through BB1, t2 is really just a
36411 // copy of t1, and do that renaming, properly generating:
36412 //
36413 // t2 = PHI t1(BB1), f1(BB2)
36414 // t3 = PHI t1(BB1), f2(BB2)
36415 //
36416 // Case 2:
36417 // CMOV ((CMOV F, T, cc1), T, cc2) is checked here and handled by a separate
36418 // function - EmitLoweredCascadedSelect.
36419
36420 X86::CondCode CC = X86::CondCode(MI.getOperand(3).getImm());
36421 X86::CondCode OppCC = X86::GetOppositeBranchCondition(CC);
36422 MachineInstr *LastCMOV = &MI;
36423 MachineBasicBlock::iterator NextMIIt = next_nodbg(MachineBasicBlock::iterator(MI), ThisMBB->end());
36424
36425 // Check for case 1, where there are multiple CMOVs with the same condition
36426 // first. Of the two cases of multiple CMOV lowerings, case 1 reduces the
36427 // number of jumps the most.
36428
36429 if (isCMOVPseudo(MI)) {
36430 // See if we have a string of CMOVS with the same condition. Skip over
36431 // intervening debug insts.
36432 while (NextMIIt != ThisMBB->end() && isCMOVPseudo(*NextMIIt) &&
36433 (NextMIIt->getOperand(3).getImm() == CC ||
36434 NextMIIt->getOperand(3).getImm() == OppCC)) {
36435 LastCMOV = &*NextMIIt;
36436 NextMIIt = next_nodbg(NextMIIt, ThisMBB->end());
36437 }
36438 }
36439
36440 // Check for case 2, but only if we didn't already find case 1, as
36441 // indicated by LastCMOV == MI.
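// That is, the next CMOV must take MI's result as its first value operand
// (operand 1), share MI's second value operand (operand 2), and kill the
// intermediate result so it has no other uses.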
36442 if (LastCMOV == &MI && NextMIIt != ThisMBB->end() &&
36443 NextMIIt->getOpcode() == MI.getOpcode() &&
36444 NextMIIt->getOperand(2).getReg() == MI.getOperand(2).getReg() &&
36445 NextMIIt->getOperand(1).getReg() == MI.getOperand(0).getReg() &&
36446 NextMIIt->getOperand(1).isKill()) {
36447 return EmitLoweredCascadedSelect(MI, *NextMIIt, ThisMBB);
36448 }
36449
36450 const BasicBlock *LLVM_BB = ThisMBB->getBasicBlock();
36451 MachineFunction *F = ThisMBB->getParent();
36452 MachineBasicBlock *FalseMBB = F->CreateMachineBasicBlock(LLVM_BB);
36453 MachineBasicBlock *SinkMBB = F->CreateMachineBasicBlock(LLVM_BB);
36454
36455 MachineFunction::iterator It = ++ThisMBB->getIterator();
36456 F->insert(It, FalseMBB);
36457 F->insert(It, SinkMBB);
36458
36459 // Set the call frame size on entry to the new basic blocks.
36460 unsigned CallFrameSize = TII->getCallFrameSizeAt(MI);
36461 FalseMBB->setCallFrameSize(CallFrameSize);
36462 SinkMBB->setCallFrameSize(CallFrameSize);
36463
36464 // If the EFLAGS register isn't dead in the terminator, then claim that it's
36465 // live into the sink and copy blocks.
36466 const TargetRegisterInfo *TRI = Subtarget.getRegisterInfo();
36467 if (!LastCMOV->killsRegister(X86::EFLAGS, /*TRI=*/nullptr) &&
36468 !checkAndUpdateEFLAGSKill(LastCMOV, ThisMBB, TRI)) {
36469 FalseMBB->addLiveIn(X86::EFLAGS);
36470 SinkMBB->addLiveIn(X86::EFLAGS);
36471 }
36472
36473 // Transfer any debug instructions inside the CMOV sequence to the sunk block.
36474 auto DbgRange = llvm::make_range(MachineBasicBlock::iterator(MI),
36475 MachineBasicBlock::iterator(LastCMOV));
36476 for (MachineInstr &MI : llvm::make_early_inc_range(DbgRange))
36477 if (MI.isDebugInstr())
36478 SinkMBB->push_back(MI.removeFromParent());
36479
36480 // Transfer the remainder of ThisMBB and its successor edges to SinkMBB.
36481 SinkMBB->splice(SinkMBB->end(), ThisMBB,
36482 std::next(MachineBasicBlock::iterator(LastCMOV)),
36483 ThisMBB->end());
36484 SinkMBB->transferSuccessorsAndUpdatePHIs(ThisMBB);
36485
36486 // Fallthrough block for ThisMBB.
36487 ThisMBB->addSuccessor(FalseMBB);
36488 // The true block target of the first (or only) branch is always a SinkMBB.
36489 ThisMBB->addSuccessor(SinkMBB);
36490 // Fallthrough block for FalseMBB.
36491 FalseMBB->addSuccessor(SinkMBB);
36492
36493 // Create the conditional branch instruction.
36494 BuildMI(ThisMBB, MIMD, TII->get(X86::JCC_1)).addMBB(SinkMBB).addImm(CC);
36495
36496 // SinkMBB:
36497 // %Result = phi [ %FalseValue, FalseMBB ], [ %TrueValue, ThisMBB ]
36498 // ...
36499 MachineBasicBlock::iterator MIItBegin = MachineBasicBlock::iterator(MI);
36500 MachineBasicBlock::iterator MIItEnd =
36501 std::next(MachineBasicBlock::iterator(LastCMOV));
36502 createPHIsForCMOVsInSinkBB(MIItBegin, MIItEnd, ThisMBB, FalseMBB, SinkMBB);
36503
36504 // Now remove the CMOV(s).
36505 ThisMBB->erase(MIItBegin, MIItEnd);
36506
36507 return SinkMBB;
36508}
36509
36510static unsigned getSUBriOpcode(bool IsLP64) {
36511 if (IsLP64)
36512 return X86::SUB64ri32;
36513 else
36514 return X86::SUB32ri;
36515}
36516
36517MachineBasicBlock *
36518X86TargetLowering::EmitLoweredProbedAlloca(MachineInstr &MI,
36519 MachineBasicBlock *MBB) const {
36520 MachineFunction *MF = MBB->getParent();
36521 const TargetInstrInfo *TII = Subtarget.getInstrInfo();
36522 const X86FrameLowering &TFI = *Subtarget.getFrameLowering();
36523 const MIMetadata MIMD(MI);
36524 const BasicBlock *LLVM_BB = MBB->getBasicBlock();
36525
36526 const unsigned ProbeSize = getStackProbeSize(*MF);
36527
36528 MachineRegisterInfo &MRI = MF->getRegInfo();
36529 MachineBasicBlock *testMBB = MF->CreateMachineBasicBlock(LLVM_BB);
36530 MachineBasicBlock *tailMBB = MF->CreateMachineBasicBlock(LLVM_BB);
36531 MachineBasicBlock *blockMBB = MF->CreateMachineBasicBlock(LLVM_BB);
36532
36533 MachineFunction::iterator MBBIter = ++MBB->getIterator();
36534 MF->insert(MBBIter, testMBB);
36535 MF->insert(MBBIter, blockMBB);
36536 MF->insert(MBBIter, tailMBB);
36537
36538 Register sizeVReg = MI.getOperand(1).getReg();
36539
36540 Register physSPReg = TFI.Uses64BitFramePtr ? X86::RSP : X86::ESP;
36541
36542 Register TmpStackPtr = MRI.createVirtualRegister(
36543 TFI.Uses64BitFramePtr ? &X86::GR64RegClass : &X86::GR32RegClass);
36544 Register FinalStackPtr = MRI.createVirtualRegister(
36545 TFI.Uses64BitFramePtr ? &X86::GR64RegClass : &X86::GR32RegClass);
36546
36547 BuildMI(*MBB, {MI}, MIMD, TII->get(TargetOpcode::COPY), TmpStackPtr)
36548 .addReg(physSPReg);
36549 {
36550 const unsigned Opc = TFI.Uses64BitFramePtr ? X86::SUB64rr : X86::SUB32rr;
36551 BuildMI(*MBB, {MI}, MIMD, TII->get(Opc), FinalStackPtr)
36552 .addReg(TmpStackPtr)
36553 .addReg(sizeVReg);
36554 }
36555
36556 // Loop test: stop probing once the stack pointer reaches (or passes) its final value.
36557
36558 BuildMI(testMBB, MIMD,
36559 TII->get(TFI.Uses64BitFramePtr ? X86::CMP64rr : X86::CMP32rr))
36560 .addReg(FinalStackPtr)
36561 .addReg(physSPReg);
36562
36563 BuildMI(testMBB, MIMD, TII->get(X86::JCC_1))
36564 .addMBB(tailMBB)
36565 .addImm(X86::COND_GE);
36566 testMBB->addSuccessor(blockMBB);
36567 testMBB->addSuccessor(tailMBB);
36568
36569 // Touch the block then extend it. This is done on the opposite side of
36570 // static probe where we allocate then touch, to avoid the need of probing the
36571 // tail of the static alloca. Possible scenarios are:
36572 //
36573 // + ---- <- ------------ <- ------------- <- ------------ +
36574 // | |
36575 // [free probe] -> [page alloc] -> [alloc probe] -> [tail alloc] + -> [dyn probe] -> [page alloc] -> [dyn probe] -> [tail alloc] +
36576 // | |
36577 // + <- ----------- <- ------------ <- ----------- <- ------------ +
36578 //
36579 // The property we want to enforce is to never have more than [page alloc] between two probes.
36580
36581 const unsigned XORMIOpc =
36582 TFI.Uses64BitFramePtr ? X86::XOR64mi32 : X86::XOR32mi;
36583 addRegOffset(BuildMI(blockMBB, MIMD, TII->get(XORMIOpc)), physSPReg, false, 0)
36584 .addImm(0);
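// XORing 0 into the slot leaves the memory unchanged, but the
// read-modify-write forces the page to be touched, which is all a probe needs.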
36585
36586 BuildMI(blockMBB, MIMD, TII->get(getSUBriOpcode(TFI.Uses64BitFramePtr)),
36587 physSPReg)
36588 .addReg(physSPReg)
36589 .addImm(ProbeSize);
36590
36591 BuildMI(blockMBB, MIMD, TII->get(X86::JMP_1)).addMBB(testMBB);
36592 blockMBB->addSuccessor(testMBB);
36593
36594 // Replace original instruction by the expected stack ptr
36595 BuildMI(tailMBB, MIMD, TII->get(TargetOpcode::COPY),
36596 MI.getOperand(0).getReg())
36597 .addReg(FinalStackPtr);
36598
36599 tailMBB->splice(tailMBB->end(), MBB,
36600 std::next(MachineBasicBlock::iterator(MI)), MBB->end());
36601 tailMBB->transferSuccessorsAndUpdatePHIs(MBB);
36602 MBB->addSuccessor(testMBB);
36603
36604 // Delete the original pseudo instruction.
36605 MI.eraseFromParent();
36606
36607 // And we're done.
36608 return tailMBB;
36609}
36610
36611MachineBasicBlock *
36612X86TargetLowering::EmitLoweredSegAlloca(MachineInstr &MI,
36613 MachineBasicBlock *BB) const {
36614 MachineFunction *MF = BB->getParent();
36615 const TargetInstrInfo *TII = Subtarget.getInstrInfo();
36616 const MIMetadata MIMD(MI);
36617 const BasicBlock *LLVM_BB = BB->getBasicBlock();
36618
36619 assert(MF->shouldSplitStack());
36620
36621 const bool Is64Bit = Subtarget.is64Bit();
36622 const bool IsLP64 = Subtarget.isTarget64BitLP64();
36623
36624 const unsigned TlsReg = Is64Bit ? X86::FS : X86::GS;
36625 const unsigned TlsOffset = IsLP64 ? 0x70 : Is64Bit ? 0x40 : 0x30;
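// FS/GS-relative offset of the current stacklet's limit in the thread control
// block; these values mirror the split-stack layout also used by
// X86FrameLowering::adjustForSegmentedStacks.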
36626
36627 // BB:
36628 // ... [Till the alloca]
36629 // If stacklet is not large enough, jump to mallocMBB
36630 //
36631 // bumpMBB:
36632 // Allocate by subtracting from RSP
36633 // Jump to continueMBB
36634 //
36635 // mallocMBB:
36636 // Allocate by call to runtime
36637 //
36638 // continueMBB:
36639 // ...
36640 // [rest of original BB]
36641 //
36642
36643 MachineBasicBlock *mallocMBB = MF->CreateMachineBasicBlock(LLVM_BB);
36644 MachineBasicBlock *bumpMBB = MF->CreateMachineBasicBlock(LLVM_BB);
36645 MachineBasicBlock *continueMBB = MF->CreateMachineBasicBlock(LLVM_BB);
36646
36647 MachineRegisterInfo &MRI = MF->getRegInfo();
36648 const TargetRegisterClass *AddrRegClass =
36649 getRegClassFor(getPointerTy(MF->getDataLayout()));
36650
36651 Register mallocPtrVReg = MRI.createVirtualRegister(AddrRegClass),
36652 bumpSPPtrVReg = MRI.createVirtualRegister(AddrRegClass),
36653 tmpSPVReg = MRI.createVirtualRegister(AddrRegClass),
36654 SPLimitVReg = MRI.createVirtualRegister(AddrRegClass),
36655 sizeVReg = MI.getOperand(1).getReg(),
36656 physSPReg = IsLP64 ? X86::RSP : X86::ESP;
36657
36658 MachineFunction::iterator MBBIter = ++BB->getIterator();
36659
36660 MF->insert(MBBIter, bumpMBB);
36661 MF->insert(MBBIter, mallocMBB);
36662 MF->insert(MBBIter, continueMBB);
36663
36664 continueMBB->splice(continueMBB->begin(), BB,
36665 std::next(MachineBasicBlock::iterator(MI)), BB->end());
36666 continueMBB->transferSuccessorsAndUpdatePHIs(BB);
36667
36668 // Add code to the main basic block to check if the stack limit has been hit,
36669 // and if so, jump to mallocMBB otherwise to bumpMBB.
36670 BuildMI(BB, MIMD, TII->get(TargetOpcode::COPY), tmpSPVReg).addReg(physSPReg);
36671 BuildMI(BB, MIMD, TII->get(IsLP64 ? X86::SUB64rr:X86::SUB32rr), SPLimitVReg)
36672 .addReg(tmpSPVReg).addReg(sizeVReg);
36673 BuildMI(BB, MIMD, TII->get(IsLP64 ? X86::CMP64mr:X86::CMP32mr))
36674 .addReg(0).addImm(1).addReg(0).addImm(TlsOffset).addReg(TlsReg)
36675 .addReg(SPLimitVReg);
36676 BuildMI(BB, MIMD, TII->get(X86::JCC_1)).addMBB(mallocMBB).addImm(X86::COND_G);
36677
36678 // bumpMBB simply decreases the stack pointer, since we know the current
36679 // stacklet has enough space.
36680 BuildMI(bumpMBB, MIMD, TII->get(TargetOpcode::COPY), physSPReg)
36681 .addReg(SPLimitVReg);
36682 BuildMI(bumpMBB, MIMD, TII->get(TargetOpcode::COPY), bumpSPPtrVReg)
36683 .addReg(SPLimitVReg);
36684 BuildMI(bumpMBB, MIMD, TII->get(X86::JMP_1)).addMBB(continueMBB);
36685
36686 // Calls into a routine in libgcc to allocate more space from the heap.
36687 const uint32_t *RegMask =
36688 Subtarget.getRegisterInfo()->getCallPreservedMask(*MF, CallingConv::C);
36689 if (IsLP64) {
36690 BuildMI(mallocMBB, MIMD, TII->get(X86::MOV64rr), X86::RDI)
36691 .addReg(sizeVReg);
36692 BuildMI(mallocMBB, MIMD, TII->get(X86::CALL64pcrel32))
36693 .addExternalSymbol("__morestack_allocate_stack_space")
36694 .addRegMask(RegMask)
36695 .addReg(X86::RDI, RegState::Implicit)
36696 .addReg(X86::RAX, RegState::ImplicitDefine);
36697 } else if (Is64Bit) {
36698 BuildMI(mallocMBB, MIMD, TII->get(X86::MOV32rr), X86::EDI)
36699 .addReg(sizeVReg);
36700 BuildMI(mallocMBB, MIMD, TII->get(X86::CALL64pcrel32))
36701 .addExternalSymbol("__morestack_allocate_stack_space")
36702 .addRegMask(RegMask)
36703 .addReg(X86::EDI, RegState::Implicit)
36704 .addReg(X86::EAX, RegState::ImplicitDefine);
36705 } else {
36706 BuildMI(mallocMBB, MIMD, TII->get(X86::SUB32ri), physSPReg).addReg(physSPReg)
36707 .addImm(12);
36708 BuildMI(mallocMBB, MIMD, TII->get(X86::PUSH32r)).addReg(sizeVReg);
36709 BuildMI(mallocMBB, MIMD, TII->get(X86::CALLpcrel32))
36710 .addExternalSymbol("__morestack_allocate_stack_space")
36711 .addRegMask(RegMask)
36712 .addReg(X86::EAX, RegState::ImplicitDefine);
36713 }
36714
36715 if (!Is64Bit)
36716 BuildMI(mallocMBB, MIMD, TII->get(X86::ADD32ri), physSPReg).addReg(physSPReg)
36717 .addImm(16);
36718
36719 BuildMI(mallocMBB, MIMD, TII->get(TargetOpcode::COPY), mallocPtrVReg)
36720 .addReg(IsLP64 ? X86::RAX : X86::EAX);
36721 BuildMI(mallocMBB, MIMD, TII->get(X86::JMP_1)).addMBB(continueMBB);
36722
36723 // Set up the CFG correctly.
36724 BB->addSuccessor(bumpMBB);
36725 BB->addSuccessor(mallocMBB);
36726 mallocMBB->addSuccessor(continueMBB);
36727 bumpMBB->addSuccessor(continueMBB);
36728
36729 // Take care of the PHI nodes.
36730 BuildMI(*continueMBB, continueMBB->begin(), MIMD, TII->get(X86::PHI),
36731 MI.getOperand(0).getReg())
36732 .addReg(mallocPtrVReg)
36733 .addMBB(mallocMBB)
36734 .addReg(bumpSPPtrVReg)
36735 .addMBB(bumpMBB);
36736
36737 // Delete the original pseudo instruction.
36738 MI.eraseFromParent();
36739
36740 // And we're done.
36741 return continueMBB;
36742}
36743
36744MachineBasicBlock *
36745X86TargetLowering::EmitLoweredCatchRet(MachineInstr &MI,
36746 MachineBasicBlock *BB) const {
36747 MachineFunction *MF = BB->getParent();
36748 const TargetInstrInfo &TII = *Subtarget.getInstrInfo();
36749 MachineBasicBlock *TargetMBB = MI.getOperand(0).getMBB();
36750 const MIMetadata MIMD(MI);
36751
36752 assert(!isAsynchronousEHPersonality(
36753 classifyEHPersonality(MF->getFunction().getPersonalityFn())) &&
36754 "SEH does not use catchret!");
36755
36756 // Only 32-bit EH needs to worry about manually restoring stack pointers.
36757 if (!Subtarget.is32Bit())
36758 return BB;
36759
36760 // C++ EH creates a new target block to hold the restore code, and wires up
36761 // the new block to the return destination with a normal JMP_4.
36762 MachineBasicBlock *RestoreMBB =
36763 MF->CreateMachineBasicBlock(BB->getBasicBlock());
36764 assert(BB->succ_size() == 1);
36765 MF->insert(std::next(BB->getIterator()), RestoreMBB);
36766 RestoreMBB->transferSuccessorsAndUpdatePHIs(BB);
36767 BB->addSuccessor(RestoreMBB);
36768 MI.getOperand(0).setMBB(RestoreMBB);
36769
36770 // Marking this as an EH pad but not a funclet entry block causes PEI to
36771 // restore stack pointers in the block.
36772 RestoreMBB->setIsEHPad(true);
36773
36774 auto RestoreMBBI = RestoreMBB->begin();
36775 BuildMI(*RestoreMBB, RestoreMBBI, MIMD, TII.get(X86::JMP_4)).addMBB(TargetMBB);
36776 return BB;
36777}
36778
36779MachineBasicBlock *
36780X86TargetLowering::EmitLoweredTLSCall(MachineInstr &MI,
36781 MachineBasicBlock *BB) const {
36782 // This is pretty easy. We're taking the value that we received from
36783 // our load from the relocation, sticking it in either RDI (x86-64)
36784 // or EAX and doing an indirect call. The return value will then
36785 // be in the normal return register.
36786 MachineFunction *F = BB->getParent();
36787 const X86InstrInfo *TII = Subtarget.getInstrInfo();
36788 const MIMetadata MIMD(MI);
36789
36790 assert(Subtarget.isTargetDarwin() && "Darwin only instr emitted?");
36791 assert(MI.getOperand(3).isGlobal() && "This should be a global");
36792
36793 // Get a register mask for the lowered call.
36794 // FIXME: The 32-bit calls have non-standard calling conventions. Use a
36795 // proper register mask.
36796 const uint32_t *RegMask =
36797 Subtarget.is64Bit() ?
36798 Subtarget.getRegisterInfo()->getDarwinTLSCallPreservedMask() :
36799 Subtarget.getRegisterInfo()->getCallPreservedMask(*F, CallingConv::C);
36800 if (Subtarget.is64Bit()) {
36801 MachineInstrBuilder MIB =
36802 BuildMI(*BB, MI, MIMD, TII->get(X86::MOV64rm), X86::RDI)
36803 .addReg(X86::RIP)
36804 .addImm(0)
36805 .addReg(0)
36806 .addGlobalAddress(MI.getOperand(3).getGlobal(), 0,
36807 MI.getOperand(3).getTargetFlags())
36808 .addReg(0);
36809 MIB = BuildMI(*BB, MI, MIMD, TII->get(X86::CALL64m));
36810 addDirectMem(MIB, X86::RDI);
36811 MIB.addReg(X86::RAX, RegState::ImplicitDefine).addRegMask(RegMask);
36812 } else if (!isPositionIndependent()) {
36813 MachineInstrBuilder MIB =
36814 BuildMI(*BB, MI, MIMD, TII->get(X86::MOV32rm), X86::EAX)
36815 .addReg(0)
36816 .addImm(0)
36817 .addReg(0)
36818 .addGlobalAddress(MI.getOperand(3).getGlobal(), 0,
36819 MI.getOperand(3).getTargetFlags())
36820 .addReg(0);
36821 MIB = BuildMI(*BB, MI, MIMD, TII->get(X86::CALL32m));
36822 addDirectMem(MIB, X86::EAX);
36823 MIB.addReg(X86::EAX, RegState::ImplicitDefine).addRegMask(RegMask);
36824 } else {
36825 MachineInstrBuilder MIB =
36826 BuildMI(*BB, MI, MIMD, TII->get(X86::MOV32rm), X86::EAX)
36827 .addReg(TII->getGlobalBaseReg(F))
36828 .addImm(0)
36829 .addReg(0)
36830 .addGlobalAddress(MI.getOperand(3).getGlobal(), 0,
36831 MI.getOperand(3).getTargetFlags())
36832 .addReg(0);
36833 MIB = BuildMI(*BB, MI, MIMD, TII->get(X86::CALL32m));
36834 addDirectMem(MIB, X86::EAX);
36835 MIB.addReg(X86::EAX, RegState::ImplicitDefine).addRegMask(RegMask);
36836 }
36837
36838 MI.eraseFromParent(); // The pseudo instruction is gone now.
36839 return BB;
36840}
36841
36842static unsigned getOpcodeForIndirectThunk(unsigned RPOpc) {
36843 switch (RPOpc) {
36844 case X86::INDIRECT_THUNK_CALL32:
36845 return X86::CALLpcrel32;
36846 case X86::INDIRECT_THUNK_CALL64:
36847 return X86::CALL64pcrel32;
36848 case X86::INDIRECT_THUNK_TCRETURN32:
36849 return X86::TCRETURNdi;
36850 case X86::INDIRECT_THUNK_TCRETURN64:
36851 return X86::TCRETURNdi64;
36852 }
36853 llvm_unreachable("not indirect thunk opcode");
36854}
36855
36856static const char *getIndirectThunkSymbol(const X86Subtarget &Subtarget,
36857 Register Reg) {
36858 if (Subtarget.useRetpolineExternalThunk()) {
36859 // When using an external thunk for retpolines, we pick names that match the
36860 // names GCC happens to use as well. This helps simplify the implementation
36861 // of the thunks for kernels where they have no easy ability to create
36862 // aliases and are doing non-trivial configuration of the thunk's body. For
36863 // example, the Linux kernel will do boot-time hot patching of the thunk
36864 // bodies and cannot easily export aliases of these to loaded modules.
36865 //
36866 // Note that at any point in the future, we may need to change the semantics
36867 // of how we implement retpolines and at that time will likely change the
36868 // name of the called thunk. Essentially, there is no hard guarantee that
36869 // LLVM will generate calls to specific thunks, we merely make a best-effort
36870 // attempt to help out kernels and other systems where duplicating the
36871 // thunks is costly.
36872 switch (Reg.id()) {
36873 case X86::EAX:
36874 assert(!Subtarget.is64Bit() && "Should not be using a 32-bit thunk!");
36875 return "__x86_indirect_thunk_eax";
36876 case X86::ECX:
36877 assert(!Subtarget.is64Bit() && "Should not be using a 32-bit thunk!");
36878 return "__x86_indirect_thunk_ecx";
36879 case X86::EDX:
36880 assert(!Subtarget.is64Bit() && "Should not be using a 32-bit thunk!");
36881 return "__x86_indirect_thunk_edx";
36882 case X86::EDI:
36883 assert(!Subtarget.is64Bit() && "Should not be using a 32-bit thunk!");
36884 return "__x86_indirect_thunk_edi";
36885 case X86::R11:
36886 assert(Subtarget.is64Bit() && "Should not be using a 64-bit thunk!");
36887 return "__x86_indirect_thunk_r11";
36888 }
36889 llvm_unreachable("unexpected reg for external indirect thunk");
36890 }
36891
36892 if (Subtarget.useRetpolineIndirectCalls() ||
36893 Subtarget.useRetpolineIndirectBranches()) {
36894 // When targeting an internal COMDAT thunk use an LLVM-specific name.
36895 switch (Reg.id()) {
36896 case X86::EAX:
36897 assert(!Subtarget.is64Bit() && "Should not be using a 32-bit thunk!");
36898 return "__llvm_retpoline_eax";
36899 case X86::ECX:
36900 assert(!Subtarget.is64Bit() && "Should not be using a 32-bit thunk!");
36901 return "__llvm_retpoline_ecx";
36902 case X86::EDX:
36903 assert(!Subtarget.is64Bit() && "Should not be using a 32-bit thunk!");
36904 return "__llvm_retpoline_edx";
36905 case X86::EDI:
36906 assert(!Subtarget.is64Bit() && "Should not be using a 32-bit thunk!");
36907 return "__llvm_retpoline_edi";
36908 case X86::R11:
36909 assert(Subtarget.is64Bit() && "Should not be using a 64-bit thunk!");
36910 return "__llvm_retpoline_r11";
36911 }
36912 llvm_unreachable("unexpected reg for retpoline");
36913 }
36914
36915 if (Subtarget.useLVIControlFlowIntegrity()) {
36916 assert(Subtarget.is64Bit() && "Should not be using a 64-bit thunk!");
36917 return "__llvm_lvi_thunk_r11";
36918 }
36919 llvm_unreachable("getIndirectThunkSymbol() invoked without thunk feature");
36920}
36921
36922MachineBasicBlock *
36923X86TargetLowering::EmitLoweredIndirectThunk(MachineInstr &MI,
36924 MachineBasicBlock *BB) const {
36925 // Copy the virtual register into the R11 physical register and
36926 // call the retpoline thunk.
36927 const MIMetadata MIMD(MI);
36928 const X86InstrInfo *TII = Subtarget.getInstrInfo();
36929 Register CalleeVReg = MI.getOperand(0).getReg();
36930 unsigned Opc = getOpcodeForIndirectThunk(MI.getOpcode());
36931
36932 // Find an available scratch register to hold the callee. On 64-bit, we can
36933 // just use R11, but we scan for uses anyway to ensure we don't generate
36934 // incorrect code. On 32-bit, we use one of EAX, ECX, or EDX that isn't
36935 // already a register use operand to the call to hold the callee. If none
36936 // are available, use EDI instead. EDI is chosen because EBX is the PIC base
36937 // register and ESI is the base pointer to realigned stack frames with VLAs.
36938 SmallVector<Register, 3> AvailableRegs;
36939 if (Subtarget.is64Bit())
36940 AvailableRegs.push_back(X86::R11);
36941 else
36942 AvailableRegs.append({X86::EAX, X86::ECX, X86::EDX, X86::EDI});
36943
36944 // Zero out any registers that are already used.
36945 for (const auto &MO : MI.operands()) {
36946 if (MO.isReg() && MO.isUse())
36947 llvm::replace(AvailableRegs, MO.getReg(), Register());
36948 }
36949
36950 // Choose the first remaining non-zero available register.
36951 Register AvailableReg;
36952 for (Register MaybeReg : AvailableRegs) {
36953 if (MaybeReg) {
36954 AvailableReg = MaybeReg;
36955 break;
36956 }
36957 }
36958 if (!AvailableReg)
36959 report_fatal_error("calling convention incompatible with retpoline, no "
36960 "available registers");
36961
36962 const char *Symbol = getIndirectThunkSymbol(Subtarget, AvailableReg);
36963
36964 BuildMI(*BB, MI, MIMD, TII->get(TargetOpcode::COPY), AvailableReg)
36965 .addReg(CalleeVReg);
36966 MI.getOperand(0).ChangeToES(Symbol);
36967 MI.setDesc(TII->get(Opc));
36968 MachineInstrBuilder(*BB->getParent(), &MI)
36969 .addReg(AvailableReg, RegState::Implicit | RegState::Kill);
36970 return BB;
36971}
36972
36973/// SetJmp implies future control flow change upon calling the corresponding
36974/// LongJmp.
36975/// Instead of using the 'return' instruction, the long jump fixes the stack and
36976/// performs an indirect branch. To do so it uses the registers that were stored
36977/// in the jump buffer (when calling SetJmp).
36978/// In case the shadow stack is enabled we need to fix it as well, because some
36979/// return addresses will be skipped.
36980/// The function will save the SSP for future fixing in the function
36981/// emitLongJmpShadowStackFix.
36982/// \sa emitLongJmpShadowStackFix
36983/// \param [in] MI The temporary Machine Instruction for the builtin.
36984/// \param [in] MBB The Machine Basic Block that will be modified.
36985void X86TargetLowering::emitSetJmpShadowStackFix(MachineInstr &MI,
36986 MachineBasicBlock *MBB) const {
36987 const MIMetadata MIMD(MI);
36988 MachineFunction *MF = MBB->getParent();
36989 const TargetInstrInfo *TII = Subtarget.getInstrInfo();
36990 MachineRegisterInfo &MRI = MF->getRegInfo();
36991 MachineInstrBuilder MIB;
36992
36993 // Memory Reference.
36994 SmallVector<MachineMemOperand *, 2> MMOs(MI.memoperands());
36995
36996 // Initialize a register with zero.
36997 MVT PVT = getPointerTy(MF->getDataLayout());
36998 const TargetRegisterClass *PtrRC = getRegClassFor(PVT);
36999 Register ZReg = MRI.createVirtualRegister(PtrRC);
37000 unsigned XorRROpc = (PVT == MVT::i64) ? X86::XOR64rr : X86::XOR32rr;
37001 BuildMI(*MBB, MI, MIMD, TII->get(XorRROpc))
37002 .addDef(ZReg)
37003 .addReg(ZReg, RegState::Undef)
37004 .addReg(ZReg, RegState::Undef);
37005
37006 // Read the current SSP Register value to the zeroed register.
37007 Register SSPCopyReg = MRI.createVirtualRegister(PtrRC);
37008 unsigned RdsspOpc = (PVT == MVT::i64) ? X86::RDSSPQ : X86::RDSSPD;
37009 BuildMI(*MBB, MI, MIMD, TII->get(RdsspOpc), SSPCopyReg).addReg(ZReg);
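// If shadow stacks are not enabled, RDSSP is a no-op and SSPCopyReg keeps the
// zero written above; emitLongJmpShadowStackFix relies on that to detect an
// unsupported shadow stack.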
37010
37011 // Write the SSP register value to offset 3 in input memory buffer.
37012 unsigned PtrStoreOpc = (PVT == MVT::i64) ? X86::MOV64mr : X86::MOV32mr;
37013 MIB = BuildMI(*MBB, MI, MIMD, TII->get(PtrStoreOpc));
37014 const int64_t SSPOffset = 3 * PVT.getStoreSize();
37015 const unsigned MemOpndSlot = 1;
37016 for (unsigned i = 0; i < X86::AddrNumOperands; ++i) {
37017 if (i == X86::AddrDisp)
37018 MIB.addDisp(MI.getOperand(MemOpndSlot + i), SSPOffset);
37019 else
37020 MIB.add(MI.getOperand(MemOpndSlot + i));
37021 }
37022 MIB.addReg(SSPCopyReg);
37023 MIB.setMemRefs(MMOs);
37024}
37025
37026MachineBasicBlock *
37027X86TargetLowering::emitEHSjLjSetJmp(MachineInstr &MI,
37028 MachineBasicBlock *MBB) const {
37029 const MIMetadata MIMD(MI);
37030 MachineFunction *MF = MBB->getParent();
37031 const TargetInstrInfo *TII = Subtarget.getInstrInfo();
37032 const TargetRegisterInfo *TRI = Subtarget.getRegisterInfo();
37033 MachineRegisterInfo &MRI = MF->getRegInfo();
37034
37035 const BasicBlock *BB = MBB->getBasicBlock();
37036 MachineFunction::iterator I = ++MBB->getIterator();
37037
37038 // Memory Reference
37039 SmallVector<MachineMemOperand *, 2> MMOs(MI.memoperands());
37040
37041 unsigned MemOpndSlot = 0;
37042
37043 unsigned CurOp = 0;
37044
37045 Register DstReg = MI.getOperand(CurOp++).getReg();
37046 const TargetRegisterClass *RC = MRI.getRegClass(DstReg);
37047 assert(TRI->isTypeLegalForClass(*RC, MVT::i32) && "Invalid destination!");
37048 (void)TRI;
37049 Register mainDstReg = MRI.createVirtualRegister(RC);
37050 Register restoreDstReg = MRI.createVirtualRegister(RC);
37051
37052 MemOpndSlot = CurOp;
37053
37054 MVT PVT = getPointerTy(MF->getDataLayout());
37055 assert((PVT == MVT::i64 || PVT == MVT::i32) &&
37056 "Invalid Pointer Size!");
37057
37058 // For v = setjmp(buf), we generate
37059 //
37060 // thisMBB:
37061 // buf[LabelOffset] = restoreMBB <-- takes address of restoreMBB
37062 // SjLjSetup restoreMBB
37063 //
37064 // mainMBB:
37065 // v_main = 0
37066 //
37067 // sinkMBB:
37068 // v = phi(main, restore)
37069 //
37070 // restoreMBB:
37071 // if base pointer being used, load it from frame
37072 // v_restore = 1
37073
37074 MachineBasicBlock *thisMBB = MBB;
37075 MachineBasicBlock *mainMBB = MF->CreateMachineBasicBlock(BB);
37076 MachineBasicBlock *sinkMBB = MF->CreateMachineBasicBlock(BB);
37077 MachineBasicBlock *restoreMBB = MF->CreateMachineBasicBlock(BB);
37078 MF->insert(I, mainMBB);
37079 MF->insert(I, sinkMBB);
37080 MF->push_back(restoreMBB);
37081 restoreMBB->setMachineBlockAddressTaken();
37082
37083 MachineInstrBuilder MIB;
37084
37085 // Transfer the remainder of BB and its successor edges to sinkMBB.
37086 sinkMBB->splice(sinkMBB->begin(), MBB,
37087 std::next(MachineBasicBlock::iterator(MI)), MBB->end());
37088 sinkMBB->transferSuccessorsAndUpdatePHIs(MBB);
37089
37090 // thisMBB:
37091 unsigned PtrStoreOpc = 0;
37092 Register LabelReg;
37093 const int64_t LabelOffset = 1 * PVT.getStoreSize();
37094 bool UseImmLabel = (MF->getTarget().getCodeModel() == CodeModel::Small) &&
37095 !isPositionIndependent();
37096
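// With the small code model and no PIC, the address of restoreMBB fits in a
// 32-bit immediate and can be stored directly; otherwise it is materialized
// into LabelReg with a LEA first.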
37097 // Prepare IP either in reg or imm.
37098 if (!UseImmLabel) {
37099 PtrStoreOpc = (PVT == MVT::i64) ? X86::MOV64mr : X86::MOV32mr;
37100 const TargetRegisterClass *PtrRC = getRegClassFor(PVT);
37101 LabelReg = MRI.createVirtualRegister(PtrRC);
37102 if (Subtarget.is64Bit()) {
37103 MIB = BuildMI(*thisMBB, MI, MIMD, TII->get(X86::LEA64r), LabelReg)
37104 .addReg(X86::RIP)
37105 .addImm(0)
37106 .addReg(0)
37107 .addMBB(restoreMBB)
37108 .addReg(0);
37109 } else {
37110 const X86InstrInfo *XII = static_cast<const X86InstrInfo*>(TII);
37111 MIB = BuildMI(*thisMBB, MI, MIMD, TII->get(X86::LEA32r), LabelReg)
37112 .addReg(XII->getGlobalBaseReg(MF))
37113 .addImm(0)
37114 .addReg(0)
37115 .addMBB(restoreMBB, Subtarget.classifyBlockAddressReference())
37116 .addReg(0);
37117 }
37118 } else
37119 PtrStoreOpc = (PVT == MVT::i64) ? X86::MOV64mi32 : X86::MOV32mi;
37120 // Store IP
37121 MIB = BuildMI(*thisMBB, MI, MIMD, TII->get(PtrStoreOpc));
37122 for (unsigned i = 0; i < X86::AddrNumOperands; ++i) {
37123 if (i == X86::AddrDisp)
37124 MIB.addDisp(MI.getOperand(MemOpndSlot + i), LabelOffset);
37125 else
37126 MIB.add(MI.getOperand(MemOpndSlot + i));
37127 }
37128 if (!UseImmLabel)
37129 MIB.addReg(LabelReg);
37130 else
37131 MIB.addMBB(restoreMBB);
37132 MIB.setMemRefs(MMOs);
37133
37134 if (MF->getFunction().getParent()->getModuleFlag("cf-protection-return")) {
37135 emitSetJmpShadowStackFix(MI, thisMBB);
37136 }
37137
37138 // Setup
37139 MIB = BuildMI(*thisMBB, MI, MIMD, TII->get(X86::EH_SjLj_Setup))
37140 .addMBB(restoreMBB);
37141
37142 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
37143 MIB.addRegMask(RegInfo->getNoPreservedMask());
37144 thisMBB->addSuccessor(mainMBB);
37145 thisMBB->addSuccessor(restoreMBB);
37146
37147 // mainMBB:
37148 // EAX = 0
37149 BuildMI(mainMBB, MIMD, TII->get(X86::MOV32r0), mainDstReg);
37150 mainMBB->addSuccessor(sinkMBB);
37151
37152 // sinkMBB:
37153 BuildMI(*sinkMBB, sinkMBB->begin(), MIMD, TII->get(X86::PHI), DstReg)
37154 .addReg(mainDstReg)
37155 .addMBB(mainMBB)
37156 .addReg(restoreDstReg)
37157 .addMBB(restoreMBB);
37158
37159 // restoreMBB:
37160 if (RegInfo->hasBasePointer(*MF)) {
37161 const bool Uses64BitFramePtr = Subtarget.isTarget64BitLP64();
37162 X86MachineFunctionInfo *X86FI = MF->getInfo<X86MachineFunctionInfo>();
37163 X86FI->setRestoreBasePointer(MF);
37164 Register FramePtr = RegInfo->getFrameRegister(*MF);
37165 Register BasePtr = RegInfo->getBaseRegister();
37166 unsigned Opm = Uses64BitFramePtr ? X86::MOV64rm : X86::MOV32rm;
37167 addRegOffset(BuildMI(restoreMBB, MIMD, TII->get(Opm), BasePtr),
37168 FramePtr, true, X86FI->getRestoreBasePointerOffset())
37169 .setMIFlag(MachineInstr::FrameDestroy);
37170 }
37171 BuildMI(restoreMBB, MIMD, TII->get(X86::MOV32ri), restoreDstReg).addImm(1);
37172 BuildMI(restoreMBB, MIMD, TII->get(X86::JMP_1)).addMBB(sinkMBB);
37173 restoreMBB->addSuccessor(sinkMBB);
37174
37175 MI.eraseFromParent();
37176 return sinkMBB;
37177}
37178
37179/// Fix the shadow stack using the previously saved SSP pointer.
37180/// \sa emitSetJmpShadowStackFix
37181/// \param [in] MI The temporary Machine Instruction for the builtin.
37182/// \param [in] MBB The Machine Basic Block that will be modified.
37183/// \return The sink MBB that will perform the future indirect branch.
37184MachineBasicBlock *
37185X86TargetLowering::emitLongJmpShadowStackFix(MachineInstr &MI,
37186 MachineBasicBlock *MBB) const {
37187 const MIMetadata MIMD(MI);
37188 MachineFunction *MF = MBB->getParent();
37189 const TargetInstrInfo *TII = Subtarget.getInstrInfo();
37190 MachineRegisterInfo &MRI = MF->getRegInfo();
37191
37192 // Memory Reference
37193 SmallVector<MachineMemOperand *, 2> MMOs(MI.memoperands());
37194
37195 MVT PVT = getPointerTy(MF->getDataLayout());
37196 const TargetRegisterClass *PtrRC = getRegClassFor(PVT);
37197
37198 // checkSspMBB:
37199 // xor vreg1, vreg1
37200 // rdssp vreg1
37201 // test vreg1, vreg1
37202 // je sinkMBB # Jump if Shadow Stack is not supported
37203 // fallMBB:
37204 // mov buf+24/12(%rip), vreg2
37205 // sub vreg1, vreg2
37206 // jbe sinkMBB # No need to fix the Shadow Stack
37207 // fixShadowMBB:
37208 // shr 3/2, vreg2
37209 // incssp vreg2 # fix the SSP according to the lower 8 bits
37210 // shr 8, vreg2
37211 // je sinkMBB
37212 // fixShadowLoopPrepareMBB:
37213 // shl vreg2
37214 // mov 128, vreg3
37215 // fixShadowLoopMBB:
37216 // incssp vreg3
37217 // dec vreg2
37218 // jne fixShadowLoopMBB # Iterate until you finish fixing
37219 // # the Shadow Stack
37220 // sinkMBB:
37221
37222 MachineFunction::iterator I = ++MBB->getIterator();
37223 const BasicBlock *BB = MBB->getBasicBlock();
37224
37225 MachineBasicBlock *checkSspMBB = MF->CreateMachineBasicBlock(BB);
37226 MachineBasicBlock *fallMBB = MF->CreateMachineBasicBlock(BB);
37227 MachineBasicBlock *fixShadowMBB = MF->CreateMachineBasicBlock(BB);
37228 MachineBasicBlock *fixShadowLoopPrepareMBB = MF->CreateMachineBasicBlock(BB);
37229 MachineBasicBlock *fixShadowLoopMBB = MF->CreateMachineBasicBlock(BB);
37230 MachineBasicBlock *sinkMBB = MF->CreateMachineBasicBlock(BB);
37231 MF->insert(I, checkSspMBB);
37232 MF->insert(I, fallMBB);
37233 MF->insert(I, fixShadowMBB);
37234 MF->insert(I, fixShadowLoopPrepareMBB);
37235 MF->insert(I, fixShadowLoopMBB);
37236 MF->insert(I, sinkMBB);
37237
37238 // Transfer the remainder of BB and its successor edges to sinkMBB.
37239 sinkMBB->splice(sinkMBB->begin(), MBB, MachineBasicBlock::iterator(MI),
37240 MBB->end());
37241 sinkMBB->transferSuccessorsAndUpdatePHIs(MBB);
37242
37243 MBB->addSuccessor(checkSspMBB);
37244
37245 // Initialize a register with zero.
37246 Register ZReg = MRI.createVirtualRegister(&X86::GR32RegClass);
37247 BuildMI(checkSspMBB, MIMD, TII->get(X86::MOV32r0), ZReg);
37248
37249 if (PVT == MVT::i64) {
37250 Register TmpZReg = MRI.createVirtualRegister(PtrRC);
37251 BuildMI(checkSspMBB, MIMD, TII->get(X86::SUBREG_TO_REG), TmpZReg)
37252 .addImm(0)
37253 .addReg(ZReg)
37254 .addImm(X86::sub_32bit);
37255 ZReg = TmpZReg;
37256 }
37257
37258 // Read the current SSP Register value to the zeroed register.
37259 Register SSPCopyReg = MRI.createVirtualRegister(PtrRC);
37260 unsigned RdsspOpc = (PVT == MVT::i64) ? X86::RDSSPQ : X86::RDSSPD;
37261 BuildMI(checkSspMBB, MIMD, TII->get(RdsspOpc), SSPCopyReg).addReg(ZReg);
37262
37263 // Check whether the result of the SSP register is zero and jump directly
37264 // to the sink.
37265 unsigned TestRROpc = (PVT == MVT::i64) ? X86::TEST64rr : X86::TEST32rr;
37266 BuildMI(checkSspMBB, MIMD, TII->get(TestRROpc))
37267 .addReg(SSPCopyReg)
37268 .addReg(SSPCopyReg);
37269 BuildMI(checkSspMBB, MIMD, TII->get(X86::JCC_1))
37270 .addMBB(sinkMBB)
37271 .addImm(X86::COND_E);
37272 checkSspMBB->addSuccessor(sinkMBB);
37273 checkSspMBB->addSuccessor(fallMBB);
37274
37275 // Reload the previously saved SSP register value.
37276 Register PrevSSPReg = MRI.createVirtualRegister(PtrRC);
37277 unsigned PtrLoadOpc = (PVT == MVT::i64) ? X86::MOV64rm : X86::MOV32rm;
37278 const int64_t SPPOffset = 3 * PVT.getStoreSize();
37279 MachineInstrBuilder MIB =
37280 BuildMI(fallMBB, MIMD, TII->get(PtrLoadOpc), PrevSSPReg);
37281 for (unsigned i = 0; i < X86::AddrNumOperands; ++i) {
37282 const MachineOperand &MO = MI.getOperand(i);
37283 if (i == X86::AddrDisp)
37284 MIB.addDisp(MO, SPPOffset);
37285 else if (MO.isReg()) // Don't add the whole operand, we don't want to
37286 // preserve kill flags.
37287 MIB.addReg(MO.getReg());
37288 else
37289 MIB.add(MO);
37290 }
37291 MIB.setMemRefs(MMOs);
37292
37293 // Subtract the current SSP from the previous SSP.
37294 Register SspSubReg = MRI.createVirtualRegister(PtrRC);
37295 unsigned SubRROpc = (PVT == MVT::i64) ? X86::SUB64rr : X86::SUB32rr;
37296 BuildMI(fallMBB, MIMD, TII->get(SubRROpc), SspSubReg)
37297 .addReg(PrevSSPReg)
37298 .addReg(SSPCopyReg);
37299
37300 // Jump to sink in case PrevSSPReg <= SSPCopyReg.
37301 BuildMI(fallMBB, MIMD, TII->get(X86::JCC_1))
37302 .addMBB(sinkMBB)
37303 .addImm(X86::COND_BE);
37304 fallMBB->addSuccessor(sinkMBB);
37305 fallMBB->addSuccessor(fixShadowMBB);
37306
37307 // Shift right by 2/3 for 32/64 because incssp multiplies the argument by 4/8.
37308 unsigned ShrRIOpc = (PVT == MVT::i64) ? X86::SHR64ri : X86::SHR32ri;
37309 unsigned Offset = (PVT == MVT::i64) ? 3 : 2;
37310 Register SspFirstShrReg = MRI.createVirtualRegister(PtrRC);
37311 BuildMI(fixShadowMBB, MIMD, TII->get(ShrRIOpc), SspFirstShrReg)
37312 .addReg(SspSubReg)
37313 .addImm(Offset);
37314
37315 // Increase the SSP, looking only at the lower 8 bits of the delta.
37316 unsigned IncsspOpc = (PVT == MVT::i64) ? X86::INCSSPQ : X86::INCSSPD;
37317 BuildMI(fixShadowMBB, MIMD, TII->get(IncsspOpc)).addReg(SspFirstShrReg);
37318
37319 // Reset the lower 8 bits.
37320 Register SspSecondShrReg = MRI.createVirtualRegister(PtrRC);
37321 BuildMI(fixShadowMBB, MIMD, TII->get(ShrRIOpc), SspSecondShrReg)
37322 .addReg(SspFirstShrReg)
37323 .addImm(8);
37324
37325 // Jump if the result of the shift is zero.
37326 BuildMI(fixShadowMBB, MIMD, TII->get(X86::JCC_1))
37327 .addMBB(sinkMBB)
37328 .addImm(X86::COND_E);
37329 fixShadowMBB->addSuccessor(sinkMBB);
37330 fixShadowMBB->addSuccessor(fixShadowLoopPrepareMBB);
37331
37332 // Do a single shift left.
37333 unsigned ShlR1Opc = (PVT == MVT::i64) ? X86::SHL64ri : X86::SHL32ri;
37334 Register SspAfterShlReg = MRI.createVirtualRegister(PtrRC);
37335 BuildMI(fixShadowLoopPrepareMBB, MIMD, TII->get(ShlR1Opc), SspAfterShlReg)
37336 .addReg(SspSecondShrReg)
37337 .addImm(1);
37338
37339 // Save the value 128 to a register (will be used next with incssp).
37340 Register Value128InReg = MRI.createVirtualRegister(PtrRC);
37341 unsigned MovRIOpc = (PVT == MVT::i64) ? X86::MOV64ri32 : X86::MOV32ri;
37342 BuildMI(fixShadowLoopPrepareMBB, MIMD, TII->get(MovRIOpc), Value128InReg)
37343 .addImm(128);
37344 fixShadowLoopPrepareMBB->addSuccessor(fixShadowLoopMBB);
37345
37346 // Since incssp only looks at the lower 8 bits, we might need to do several
37347 // iterations of incssp until we finish fixing the shadow stack.
37348 Register DecReg = MRI.createVirtualRegister(PtrRC);
37349 Register CounterReg = MRI.createVirtualRegister(PtrRC);
37350 BuildMI(fixShadowLoopMBB, MIMD, TII->get(X86::PHI), CounterReg)
37351 .addReg(SspAfterShlReg)
37352 .addMBB(fixShadowLoopPrepareMBB)
37353 .addReg(DecReg)
37354 .addMBB(fixShadowLoopMBB);
37355
37356 // Every iteration we increase the SSP by 128.
37357 BuildMI(fixShadowLoopMBB, MIMD, TII->get(IncsspOpc)).addReg(Value128InReg);
37358
37359 // Every iteration we decrement the counter by 1.
37360 unsigned DecROpc = (PVT == MVT::i64) ? X86::DEC64r : X86::DEC32r;
37361 BuildMI(fixShadowLoopMBB, MIMD, TII->get(DecROpc), DecReg).addReg(CounterReg);
37362
37363 // Jump if the counter is not zero yet.
37364 BuildMI(fixShadowLoopMBB, MIMD, TII->get(X86::JCC_1))
37365 .addMBB(fixShadowLoopMBB)
37366 .addImm(X86::COND_NE);
37367 fixShadowLoopMBB->addSuccessor(sinkMBB);
37368 fixShadowLoopMBB->addSuccessor(fixShadowLoopMBB);
37369
37370 return sinkMBB;
37371}
37372
37373MachineBasicBlock *
37374X86TargetLowering::emitEHSjLjLongJmp(MachineInstr &MI,
37375 MachineBasicBlock *MBB) const {
37376 const MIMetadata MIMD(MI);
37377 MachineFunction *MF = MBB->getParent();
37378 const TargetInstrInfo *TII = Subtarget.getInstrInfo();
37379 MachineRegisterInfo &MRI = MF->getRegInfo();
37380
37381 // Memory Reference
37382 SmallVector<MachineMemOperand *, 2> MMOs(MI.memoperands());
37383
37384 MVT PVT = getPointerTy(MF->getDataLayout());
37385 assert((PVT == MVT::i64 || PVT == MVT::i32) &&
37386 "Invalid Pointer Size!");
37387
37388 const TargetRegisterClass *RC =
37389 (PVT == MVT::i64) ? &X86::GR64RegClass : &X86::GR32RegClass;
37390 Register Tmp = MRI.createVirtualRegister(RC);
37391 // Since FP is only updated here but NOT referenced, it's treated as GPR.
37392 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
37393 Register FP = (PVT == MVT::i64) ? X86::RBP : X86::EBP;
37394 Register SP = RegInfo->getStackRegister();
37395
37396 MachineInstrBuilder MIB;
37397
37398 const int64_t LabelOffset = 1 * PVT.getStoreSize();
37399 const int64_t SPOffset = 2 * PVT.getStoreSize();
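// Jump buffer layout: slot 0 is the saved frame pointer, slot 1 the resume
// IP, slot 2 the stack pointer, and slot 3 the shadow stack pointer saved by
// emitSetJmpShadowStackFix.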
37400
37401 unsigned PtrLoadOpc = (PVT == MVT::i64) ? X86::MOV64rm : X86::MOV32rm;
37402 unsigned IJmpOpc = (PVT == MVT::i64) ? X86::JMP64r : X86::JMP32r;
37403
37404 MachineBasicBlock *thisMBB = MBB;
37405
37406 // When CET and the shadow stack are enabled, we need to fix the Shadow Stack.
37407 if (MF->getFunction().getParent()->getModuleFlag("cf-protection-return")) {
37408 thisMBB = emitLongJmpShadowStackFix(MI, thisMBB);
37409 }
37410
37411 // Reload FP
37412 MIB = BuildMI(*thisMBB, MI, MIMD, TII->get(PtrLoadOpc), FP);
37413 for (unsigned i = 0; i < X86::AddrNumOperands; ++i) {
37414 const MachineOperand &MO = MI.getOperand(i);
37415 if (MO.isReg()) // Don't add the whole operand, we don't want to
37416 // preserve kill flags.
37417 MIB.addReg(MO.getReg());
37418 else
37419 MIB.add(MO);
37420 }
37421 MIB.setMemRefs(MMOs);
37423
37424 // Reload IP
37425 MIB = BuildMI(*thisMBB, MI, MIMD, TII->get(PtrLoadOpc), Tmp);
37426 for (unsigned i = 0; i < X86::AddrNumOperands; ++i) {
37427 const MachineOperand &MO = MI.getOperand(i);
37428 if (i == X86::AddrDisp)
37429 MIB.addDisp(MO, LabelOffset);
37430 else if (MO.isReg()) // Don't add the whole operand, we don't want to
37431 // preserve kill flags.
37432 MIB.addReg(MO.getReg());
37433 else
37434 MIB.add(MO);
37435 }
37436 MIB.setMemRefs(MMOs);
37437
37438 // Reload SP
37439 MIB = BuildMI(*thisMBB, MI, MIMD, TII->get(PtrLoadOpc), SP);
37440 for (unsigned i = 0; i < X86::AddrNumOperands; ++i) {
37441 if (i == X86::AddrDisp)
37442 MIB.addDisp(MI.getOperand(i), SPOffset);
37443 else
37444 MIB.add(MI.getOperand(i)); // We can preserve the kill flags here, it's
37445 // the last instruction of the expansion.
37446 }
37447 MIB.setMemRefs(MMOs);
37449
37450 // Jump
37451 BuildMI(*thisMBB, MI, MIMD, TII->get(IJmpOpc)).addReg(Tmp);
37452
37453 MI.eraseFromParent();
37454 return thisMBB;
37455}
37456
37457void X86TargetLowering::SetupEntryBlockForSjLj(MachineInstr &MI,
37458 MachineBasicBlock *MBB,
37459 MachineBasicBlock *DispatchBB,
37460 int FI) const {
37461 const MIMetadata MIMD(MI);
37462 MachineFunction *MF = MBB->getParent();
37463 MachineRegisterInfo *MRI = &MF->getRegInfo();
37464 const X86InstrInfo *TII = Subtarget.getInstrInfo();
37465
37466 MVT PVT = getPointerTy(MF->getDataLayout());
37467 assert((PVT == MVT::i64 || PVT == MVT::i32) && "Invalid Pointer Size!");
37468
37469 unsigned Op = 0;
37470 Register VR;
37471
37472 bool UseImmLabel = (MF->getTarget().getCodeModel() == CodeModel::Small) &&
37473 !isPositionIndependent();
37474
37475 if (UseImmLabel) {
37476 Op = (PVT == MVT::i64) ? X86::MOV64mi32 : X86::MOV32mi;
37477 } else {
37478 const TargetRegisterClass *TRC =
37479 (PVT == MVT::i64) ? &X86::GR64RegClass : &X86::GR32RegClass;
37480 VR = MRI->createVirtualRegister(TRC);
37481 Op = (PVT == MVT::i64) ? X86::MOV64mr : X86::MOV32mr;
37482
37483 if (Subtarget.is64Bit())
37484 BuildMI(*MBB, MI, MIMD, TII->get(X86::LEA64r), VR)
37485 .addReg(X86::RIP)
37486 .addImm(1)
37487 .addReg(0)
37488 .addMBB(DispatchBB)
37489 .addReg(0);
37490 else
37491 BuildMI(*MBB, MI, MIMD, TII->get(X86::LEA32r), VR)
37492 .addReg(0) /* TII->getGlobalBaseReg(MF) */
37493 .addImm(1)
37494 .addReg(0)
37495 .addMBB(DispatchBB, Subtarget.classifyBlockAddressReference())
37496 .addReg(0);
37497 }
37498
37499 MachineInstrBuilder MIB = BuildMI(*MBB, MI, MIMD, TII->get(Op));
37500 addFrameReference(MIB, FI, Subtarget.is64Bit() ? 56 : 36);
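 // The fixed 56 (64-bit) / 36 (32-bit) byte offset selects a slot inside the
 // function context object at frame index FI; either the immediate block
 // address or the LEA'd VR is stored there, which is the slot that
 // EmitSjLjDispatchBlock's comment below refers to as the "return slot".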
37501 if (UseImmLabel)
37502 MIB.addMBB(DispatchBB);
37503 else
37504 MIB.addReg(VR);
37505}
37506
37507MachineBasicBlock *
37508X86TargetLowering::EmitSjLjDispatchBlock(MachineInstr &MI,
37509 MachineBasicBlock *BB) const {
37510 const MIMetadata MIMD(MI);
37511 MachineFunction *MF = BB->getParent();
37512 MachineRegisterInfo *MRI = &MF->getRegInfo();
37513 const X86InstrInfo *TII = Subtarget.getInstrInfo();
37514 int FI = MF->getFrameInfo().getFunctionContextIndex();
37515
37516 // Get a mapping of the call site numbers to all of the landing pads they're
37517 // associated with.
37518 DenseMap<unsigned, SmallVector<MachineBasicBlock *, 2>> CallSiteNumToLPad;
37519 unsigned MaxCSNum = 0;
37520 for (auto &MBB : *MF) {
37521 if (!MBB.isEHPad())
37522 continue;
37523
37524 MCSymbol *Sym = nullptr;
37525 for (const auto &MI : MBB) {
37526 if (MI.isDebugInstr())
37527 continue;
37528
37529 assert(MI.isEHLabel() && "expected EH_LABEL");
37530 Sym = MI.getOperand(0).getMCSymbol();
37531 break;
37532 }
37533
37534 if (!MF->hasCallSiteLandingPad(Sym))
37535 continue;
37536
37537 for (unsigned CSI : MF->getCallSiteLandingPad(Sym)) {
37538 CallSiteNumToLPad[CSI].push_back(&MBB);
37539 MaxCSNum = std::max(MaxCSNum, CSI);
37540 }
37541 }
37542
37543 // Get an ordered list of the machine basic blocks for the jump table.
37544 std::vector<MachineBasicBlock *> LPadList;
37545 SmallPtrSet<MachineBasicBlock *, 32> InvokeBBs;
37546 LPadList.reserve(CallSiteNumToLPad.size());
37547
37548 for (unsigned CSI = 1; CSI <= MaxCSNum; ++CSI) {
37549 for (auto &LP : CallSiteNumToLPad[CSI]) {
37550 LPadList.push_back(LP);
37551 InvokeBBs.insert_range(LP->predecessors());
37552 }
37553 }
37554
37555 assert(!LPadList.empty() &&
37556 "No landing pad destinations for the dispatch jump table!");
37557
37558 // Create the MBBs for the dispatch code.
37559
37560 // Shove the dispatch's address into the return slot in the function context.
37561 MachineBasicBlock *DispatchBB = MF->CreateMachineBasicBlock();
37562 DispatchBB->setIsEHPad(true);
37563
37564 MachineBasicBlock *TrapBB = MF->CreateMachineBasicBlock();
37565 BuildMI(TrapBB, MIMD, TII->get(X86::TRAP));
37566 DispatchBB->addSuccessor(TrapBB);
37567
37568 MachineBasicBlock *DispContBB = MF->CreateMachineBasicBlock();
37569 DispatchBB->addSuccessor(DispContBB);
37570
37571 // Insert MBBs.
37572 MF->push_back(DispatchBB);
37573 MF->push_back(DispContBB);
37574 MF->push_back(TrapBB);
37575
37576 // Insert code into the entry block that creates and registers the function
37577 // context.
37578 SetupEntryBlockForSjLj(MI, BB, DispatchBB, FI);
37579
37580 // Create the jump table and associated information
37581 unsigned JTE = getJumpTableEncoding();
37582 MachineJumpTableInfo *JTI = MF->getOrCreateJumpTableInfo(JTE);
37583 unsigned MJTI = JTI->createJumpTableIndex(LPadList);
37584
37585 const X86RegisterInfo &RI = TII->getRegisterInfo();
37586 // Add a register mask with no preserved registers. This results in all
37587 // registers being marked as clobbered.
37588 if (RI.hasBasePointer(*MF)) {
37589 const bool FPIs64Bit = Subtarget.isTarget64BitLP64();
37590 X86MachineFunctionInfo *MFI = MF->getInfo<X86MachineFunctionInfo>();
37591 MFI->setRestoreBasePointer(MF);
37592
37593 Register FP = RI.getFrameRegister(*MF);
37594 Register BP = RI.getBaseRegister();
37595 unsigned Op = FPIs64Bit ? X86::MOV64rm : X86::MOV32rm;
37596 addRegOffset(BuildMI(DispatchBB, MIMD, TII->get(Op), BP), FP, true,
37597 MFI->getRestoreBasePointerOffset())
37598 .addRegMask(RI.getNoPreservedMask());
37599 } else {
37600 BuildMI(DispatchBB, MIMD, TII->get(X86::NOOP))
37601 .addRegMask(RI.getNoPreservedMask());
37602 }
37603
37604 // IReg is used as an index in a memory operand and therefore can't be SP
37605 Register IReg = MRI->createVirtualRegister(&X86::GR32_NOSPRegClass);
37606 addFrameReference(BuildMI(DispatchBB, MIMD, TII->get(X86::MOV32rm), IReg), FI,
37607 Subtarget.is64Bit() ? 8 : 4);
37608 BuildMI(DispatchBB, MIMD, TII->get(X86::CMP32ri))
37609 .addReg(IReg)
37610 .addImm(LPadList.size());
37611 BuildMI(DispatchBB, MIMD, TII->get(X86::JCC_1))
37612 .addMBB(TrapBB)
37613 .addImm(X86::COND_AE);
37614
37615 if (Subtarget.is64Bit()) {
37616 Register BReg = MRI->createVirtualRegister(&X86::GR64RegClass);
37617 Register IReg64 = MRI->createVirtualRegister(&X86::GR64_NOSPRegClass);
37618
37619 // leaq .LJTI0_0(%rip), BReg
37620 BuildMI(DispContBB, MIMD, TII->get(X86::LEA64r), BReg)
37621 .addReg(X86::RIP)
37622 .addImm(1)
37623 .addReg(0)
37624 .addJumpTableIndex(MJTI)
37625 .addReg(0);
37626 // movzx IReg64, IReg
37627 BuildMI(DispContBB, MIMD, TII->get(TargetOpcode::SUBREG_TO_REG), IReg64)
37628 .addImm(0)
37629 .addReg(IReg)
37630 .addImm(X86::sub_32bit);
37631
37632 switch (JTE) {
37633 case MachineJumpTableInfo::EK_BlockAddress:
37634 // jmpq *(BReg,IReg64,8)
37635 BuildMI(DispContBB, MIMD, TII->get(X86::JMP64m))
37636 .addReg(BReg)
37637 .addImm(8)
37638 .addReg(IReg64)
37639 .addImm(0)
37640 .addReg(0);
37641 break;
37642 case MachineJumpTableInfo::EK_LabelDifference32: {
37643 Register OReg = MRI->createVirtualRegister(&X86::GR32RegClass);
37644 Register OReg64 = MRI->createVirtualRegister(&X86::GR64RegClass);
37645 Register TReg = MRI->createVirtualRegister(&X86::GR64RegClass);
37646
37647 // movl (BReg,IReg64,4), OReg
37648 BuildMI(DispContBB, MIMD, TII->get(X86::MOV32rm), OReg)
37649 .addReg(BReg)
37650 .addImm(4)
37651 .addReg(IReg64)
37652 .addImm(0)
37653 .addReg(0);
37654 // movsx OReg64, OReg
37655 BuildMI(DispContBB, MIMD, TII->get(X86::MOVSX64rr32), OReg64)
37656 .addReg(OReg);
37657 // addq BReg, OReg64, TReg
37658 BuildMI(DispContBB, MIMD, TII->get(X86::ADD64rr), TReg)
37659 .addReg(OReg64)
37660 .addReg(BReg);
37661 // jmpq *TReg
37662 BuildMI(DispContBB, MIMD, TII->get(X86::JMP64r)).addReg(TReg);
37663 break;
37664 }
37665 default:
37666 llvm_unreachable("Unexpected jump table encoding");
37667 }
37668 } else {
37669 // jmpl *.LJTI0_0(,IReg,4)
37670 BuildMI(DispContBB, MIMD, TII->get(X86::JMP32m))
37671 .addReg(0)
37672 .addImm(4)
37673 .addReg(IReg)
37674 .addJumpTableIndex(MJTI)
37675 .addReg(0);
37676 }
37677
37678 // Add the jump table entries as successors to the MBB.
37679 SmallPtrSet<MachineBasicBlock *, 8> SeenMBBs;
37680 for (auto &LP : LPadList)
37681 if (SeenMBBs.insert(LP).second)
37682 DispContBB->addSuccessor(LP);
37683
37684 // N.B. the order the invoke BBs are processed in doesn't matter here.
37685 SmallVector<MachineBasicBlock *, 64> MBBLPads;
37686 const MCPhysReg *SavedRegs = MF->getRegInfo().getCalleeSavedRegs();
37687 for (MachineBasicBlock *MBB : InvokeBBs) {
37688 // Remove the landing pad successor from the invoke block and replace it
37689 // with the new dispatch block.
37690 // Keep a copy of Successors since it's modified inside the loop.
37691 SmallVector<MachineBasicBlock *, 8> Successors(MBB->succ_rbegin(),
37692 MBB->succ_rend());
37693 // FIXME: Avoid quadratic complexity.
37694 for (auto *MBBS : Successors) {
37695 if (MBBS->isEHPad()) {
37696 MBB->removeSuccessor(MBBS);
37697 MBBLPads.push_back(MBBS);
37698 }
37699 }
37700
37701 MBB->addSuccessor(DispatchBB);
37702
37703 // Find the invoke call and mark all of the callee-saved registers as
37704 // 'implicit defined' so that they're spilled. This prevents code from
37705 // moving instructions to before the EH block, where they will never be
37706 // executed.
37707 for (auto &II : reverse(*MBB)) {
37708 if (!II.isCall())
37709 continue;
37710
37711 DenseSet<Register> DefRegs;
37712 for (auto &MOp : II.operands())
37713 if (MOp.isReg())
37714 DefRegs.insert(MOp.getReg());
37715
37716 MachineInstrBuilder MIB(*MF, &II);
37717 for (unsigned RegIdx = 0; SavedRegs[RegIdx]; ++RegIdx) {
37718 Register Reg = SavedRegs[RegIdx];
37719 if (!DefRegs.contains(Reg))
37720 MIB.addReg(Reg, RegState::ImplicitDefine | RegState::Dead);
37721 }
37722
37723 break;
37724 }
37725 }
37726
37727 // Mark all former landing pads as non-landing pads. The dispatch is the only
37728 // landing pad now.
37729 for (auto &LP : MBBLPads)
37730 LP->setIsEHPad(false);
37731
37732 // The instruction is gone now.
37733 MI.eraseFromParent();
37734 return BB;
37735}
37736
37737MachineBasicBlock *
37738X86TargetLowering::emitPatchableEventCall(MachineInstr &MI,
37739 MachineBasicBlock *BB) const {
37740 // Wrap patchable event calls in CALLSEQ_START/CALLSEQ_END, as tracing
37741 // calls may require proper stack alignment.
37742 const TargetInstrInfo &TII = *Subtarget.getInstrInfo();
37743 const MIMetadata MIMD(MI);
37744 MachineFunction &MF = *BB->getParent();
37745
37746 // Emit CALLSEQ_START right before the instruction.
37747 MF.getFrameInfo().setAdjustsStack(true);
37748 unsigned AdjStackDown = TII.getCallFrameSetupOpcode();
37749 MachineInstrBuilder CallseqStart =
37750 BuildMI(MF, MIMD, TII.get(AdjStackDown)).addImm(0).addImm(0).addImm(0);
37751 BB->insert(MachineBasicBlock::iterator(MI), CallseqStart);
37752
37753 // Emit CALLSEQ_END right after the instruction.
37754 unsigned AdjStackUp = TII.getCallFrameDestroyOpcode();
37755 MachineInstrBuilder CallseqEnd =
37756 BuildMI(MF, MIMD, TII.get(AdjStackUp)).addImm(0).addImm(0);
37757 BB->insertAfter(MachineBasicBlock::iterator(MI), CallseqEnd);
37758
37759 return BB;
37760}
37761
37762MachineBasicBlock *
37763X86TargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI,
37764 MachineBasicBlock *BB) const {
37765 MachineFunction *MF = BB->getParent();
37766 const TargetInstrInfo *TII = Subtarget.getInstrInfo();
37767 const MIMetadata MIMD(MI);
37768
37769 auto TMMImmToTMMReg = [](unsigned Imm) {
37770 assert (Imm < 8 && "Illegal tmm index");
37771 return X86::TMM0 + Imm;
37772 };
37773 auto TMMImmToTMMPair = [](unsigned Imm) {
37774 assert(Imm < 8 && "Illegal tmm pair index.");
37775 return X86::TMM0_TMM1 + Imm / 2;
37776 };
37777 switch (MI.getOpcode()) {
37778 default:
37779 llvm_unreachable("Unexpected instr type to insert");
37780 case X86::INDIRECT_THUNK_CALL32:
37781 case X86::INDIRECT_THUNK_CALL64:
37782 case X86::INDIRECT_THUNK_TCRETURN32:
37783 case X86::INDIRECT_THUNK_TCRETURN64:
37784 return EmitLoweredIndirectThunk(MI, BB);
37785 case X86::CATCHRET:
37786 return EmitLoweredCatchRet(MI, BB);
37787 case X86::SEG_ALLOCA_32:
37788 case X86::SEG_ALLOCA_64:
37789 return EmitLoweredSegAlloca(MI, BB);
37790 case X86::PROBED_ALLOCA_32:
37791 case X86::PROBED_ALLOCA_64:
37792 return EmitLoweredProbedAlloca(MI, BB);
37793 case X86::TLSCall_32:
37794 case X86::TLSCall_64:
37795 return EmitLoweredTLSCall(MI, BB);
37796 case X86::CMOV_FR16:
37797 case X86::CMOV_FR16X:
37798 case X86::CMOV_FR32:
37799 case X86::CMOV_FR32X:
37800 case X86::CMOV_FR64:
37801 case X86::CMOV_FR64X:
37802 case X86::CMOV_GR8:
37803 case X86::CMOV_GR16:
37804 case X86::CMOV_GR32:
37805 case X86::CMOV_RFP32:
37806 case X86::CMOV_RFP64:
37807 case X86::CMOV_RFP80:
37808 case X86::CMOV_VR64:
37809 case X86::CMOV_VR128:
37810 case X86::CMOV_VR128X:
37811 case X86::CMOV_VR256:
37812 case X86::CMOV_VR256X:
37813 case X86::CMOV_VR512:
37814 case X86::CMOV_VK1:
37815 case X86::CMOV_VK2:
37816 case X86::CMOV_VK4:
37817 case X86::CMOV_VK8:
37818 case X86::CMOV_VK16:
37819 case X86::CMOV_VK32:
37820 case X86::CMOV_VK64:
37821 return EmitLoweredSelect(MI, BB);
37822
37823 case X86::FP80_ADDr:
37824 case X86::FP80_ADDm32: {
37825 // Change the floating point control register to use double extended
37826 // precision when performing the addition.
37827 int OrigCWFrameIdx =
37828 MF->getFrameInfo().CreateStackObject(2, Align(2), false);
37829 addFrameReference(BuildMI(*BB, MI, MIMD, TII->get(X86::FNSTCW16m)),
37830 OrigCWFrameIdx);
37831
37832 // Load the old value of the control word...
37833 Register OldCW = MF->getRegInfo().createVirtualRegister(&X86::GR32RegClass);
37834 addFrameReference(BuildMI(*BB, MI, MIMD, TII->get(X86::MOVZX32rm16), OldCW),
37835 OrigCWFrameIdx);
37836
37837 // OR 0b11 into bit 8 and 9. 0b11 is the encoding for double extended
37838 // precision.
37839 Register NewCW = MF->getRegInfo().createVirtualRegister(&X86::GR32RegClass);
37840 BuildMI(*BB, MI, MIMD, TII->get(X86::OR32ri), NewCW)
37841 .addReg(OldCW, RegState::Kill)
37842 .addImm(0x300);
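 // Bits 8-9 of the x87 control word form the precision-control field;
 // OR-ing in 0x300 forces it to 0b11 (64-bit significand, i.e. double
 // extended precision). For example 0x027F | 0x0300 == 0x037F.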
37843
37844 // Extract to 16 bits.
37845 Register NewCW16 =
37846 MF->getRegInfo().createVirtualRegister(&X86::GR16RegClass);
37847 BuildMI(*BB, MI, MIMD, TII->get(TargetOpcode::COPY), NewCW16)
37848 .addReg(NewCW, RegState::Kill, X86::sub_16bit);
37849
37850 // Prepare memory for FLDCW.
37851 int NewCWFrameIdx =
37852 MF->getFrameInfo().CreateStackObject(2, Align(2), false);
37853 addFrameReference(BuildMI(*BB, MI, MIMD, TII->get(X86::MOV16mr)),
37854 NewCWFrameIdx)
37855 .addReg(NewCW16, RegState::Kill);
37856
37857 // Reload the modified control word now...
37858 addFrameReference(BuildMI(*BB, MI, MIMD, TII->get(X86::FLDCW16m)),
37859 NewCWFrameIdx);
37860
37861 // Do the addition.
37862 if (MI.getOpcode() == X86::FP80_ADDr) {
37863 BuildMI(*BB, MI, MIMD, TII->get(X86::ADD_Fp80))
37864 .add(MI.getOperand(0))
37865 .add(MI.getOperand(1))
37866 .add(MI.getOperand(2));
37867 } else {
37868 BuildMI(*BB, MI, MIMD, TII->get(X86::ADD_Fp80m32))
37869 .add(MI.getOperand(0))
37870 .add(MI.getOperand(1))
37871 .add(MI.getOperand(2))
37872 .add(MI.getOperand(3))
37873 .add(MI.getOperand(4))
37874 .add(MI.getOperand(5))
37875 .add(MI.getOperand(6));
37876 }
37877
37878 // Reload the original control word now.
37879 addFrameReference(BuildMI(*BB, MI, MIMD, TII->get(X86::FLDCW16m)),
37880 OrigCWFrameIdx);
37881
37882 MI.eraseFromParent(); // The pseudo instruction is gone now.
37883 return BB;
37884 }
37885
37886 case X86::FP32_TO_INT16_IN_MEM:
37887 case X86::FP32_TO_INT32_IN_MEM:
37888 case X86::FP32_TO_INT64_IN_MEM:
37889 case X86::FP64_TO_INT16_IN_MEM:
37890 case X86::FP64_TO_INT32_IN_MEM:
37891 case X86::FP64_TO_INT64_IN_MEM:
37892 case X86::FP80_TO_INT16_IN_MEM:
37893 case X86::FP80_TO_INT32_IN_MEM:
37894 case X86::FP80_TO_INT64_IN_MEM: {
37895 // Change the floating point control register to use "round towards zero"
37896 // mode when truncating to an integer value.
37897 int OrigCWFrameIdx =
37898 MF->getFrameInfo().CreateStackObject(2, Align(2), false);
37899 addFrameReference(BuildMI(*BB, MI, MIMD, TII->get(X86::FNSTCW16m)),
37900 OrigCWFrameIdx);
37901
37902 // Load the old value of the control word...
37903 Register OldCW = MF->getRegInfo().createVirtualRegister(&X86::GR32RegClass);
37904 addFrameReference(BuildMI(*BB, MI, MIMD, TII->get(X86::MOVZX32rm16), OldCW),
37905 OrigCWFrameIdx);
37906
37907 // OR 0b11 into bit 10 and 11. 0b11 is the encoding for round toward zero.
37908 Register NewCW = MF->getRegInfo().createVirtualRegister(&X86::GR32RegClass);
37909 BuildMI(*BB, MI, MIMD, TII->get(X86::OR32ri), NewCW)
37910 .addReg(OldCW, RegState::Kill).addImm(0xC00);
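 // Bits 10-11 of the x87 control word form the rounding-control field;
 // OR-ing in 0xC00 forces it to 0b11 (round toward zero / truncate).
 // For example 0x037F | 0x0C00 == 0x0F7F.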
37911
37912 // Extract to 16 bits.
37913 Register NewCW16 =
37914 MF->getRegInfo().createVirtualRegister(&X86::GR16RegClass);
37915 BuildMI(*BB, MI, MIMD, TII->get(TargetOpcode::COPY), NewCW16)
37916 .addReg(NewCW, RegState::Kill, X86::sub_16bit);
37917
37918 // Prepare memory for FLDCW.
37919 int NewCWFrameIdx =
37920 MF->getFrameInfo().CreateStackObject(2, Align(2), false);
37921 addFrameReference(BuildMI(*BB, MI, MIMD, TII->get(X86::MOV16mr)),
37922 NewCWFrameIdx)
37923 .addReg(NewCW16, RegState::Kill);
37924
37925 // Reload the modified control word now...
37926 addFrameReference(BuildMI(*BB, MI, MIMD,
37927 TII->get(X86::FLDCW16m)), NewCWFrameIdx);
37928
37929 // Get the X86 opcode to use.
37930 unsigned Opc;
37931 switch (MI.getOpcode()) {
37932 // clang-format off
37933 default: llvm_unreachable("illegal opcode!");
37934 case X86::FP32_TO_INT16_IN_MEM: Opc = X86::IST_Fp16m32; break;
37935 case X86::FP32_TO_INT32_IN_MEM: Opc = X86::IST_Fp32m32; break;
37936 case X86::FP32_TO_INT64_IN_MEM: Opc = X86::IST_Fp64m32; break;
37937 case X86::FP64_TO_INT16_IN_MEM: Opc = X86::IST_Fp16m64; break;
37938 case X86::FP64_TO_INT32_IN_MEM: Opc = X86::IST_Fp32m64; break;
37939 case X86::FP64_TO_INT64_IN_MEM: Opc = X86::IST_Fp64m64; break;
37940 case X86::FP80_TO_INT16_IN_MEM: Opc = X86::IST_Fp16m80; break;
37941 case X86::FP80_TO_INT32_IN_MEM: Opc = X86::IST_Fp32m80; break;
37942 case X86::FP80_TO_INT64_IN_MEM: Opc = X86::IST_Fp64m80; break;
37943 // clang-format on
37944 }
37945
37946 X86AddressMode AM = getAddressFromInstr(&MI, 0);
37947 addFullAddress(BuildMI(*BB, MI, MIMD, TII->get(Opc)), AM)
37948 .addReg(MI.getOperand(X86::AddrNumOperands).getReg());
37949
37950 // Reload the original control word now.
37951 addFrameReference(BuildMI(*BB, MI, MIMD, TII->get(X86::FLDCW16m)),
37952 OrigCWFrameIdx);
37953
37954 MI.eraseFromParent(); // The pseudo instruction is gone now.
37955 return BB;
37956 }
37957
37958 // xbegin
37959 case X86::XBEGIN:
37960 return emitXBegin(MI, BB, Subtarget.getInstrInfo());
37961
37962 case X86::VAARG_64:
37963 case X86::VAARG_X32:
37964 return EmitVAARGWithCustomInserter(MI, BB);
37965
37966 case X86::EH_SjLj_SetJmp32:
37967 case X86::EH_SjLj_SetJmp64:
37968 return emitEHSjLjSetJmp(MI, BB);
37969
37970 case X86::EH_SjLj_LongJmp32:
37971 case X86::EH_SjLj_LongJmp64:
37972 return emitEHSjLjLongJmp(MI, BB);
37973
37974 case X86::Int_eh_sjlj_setup_dispatch:
37975 return EmitSjLjDispatchBlock(MI, BB);
37976
37977 case TargetOpcode::STATEPOINT:
37978 // As an implementation detail, STATEPOINT shares the STACKMAP format at
37979 // this point in the process. We diverge later.
37980 return emitPatchPoint(MI, BB);
37981
37982 case TargetOpcode::STACKMAP:
37983 case TargetOpcode::PATCHPOINT:
37984 return emitPatchPoint(MI, BB);
37985
37986 case TargetOpcode::PATCHABLE_EVENT_CALL:
37987 case TargetOpcode::PATCHABLE_TYPED_EVENT_CALL:
37988 return emitPatchableEventCall(MI, BB);
37989
37990 case X86::LCMPXCHG8B: {
37991 const X86RegisterInfo *TRI = Subtarget.getRegisterInfo();
37992 // In addition to the four E[ABCD] registers implied by its encoding,
37993 // CMPXCHG8B requires a memory operand. If the current architecture is
37994 // i686 and the current function needs a base pointer - which is ESI on
37995 // i686 - the register allocator would not be able to allocate registers
37996 // for an address of the form X(%reg, %reg, Y): there would never be
37997 // enough unreserved registers during regalloc (without the need for a
37998 // base pointer the only option would be X(%edi, %esi, Y)).
37999 // We give the register allocator a hand by precomputing the address in
38000 // a new vreg using LEA.
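 // For example (illustrative), an operand such as 8(%esi,%edi,4), with ESI
 // acting as the base pointer, is rewritten as
 //   leal 8(%esi,%edi,4), %tmp
 //   lock cmpxchg8b (%tmp)
 // which is what the LEA and setDirectAddressInInstr below implement
 // (%tmp standing for the new virtual register).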
38001
38002 // If it is not i686 or there is no base pointer - nothing to do here.
38003 if (!Subtarget.is32Bit() || !TRI->hasBasePointer(*MF))
38004 return BB;
38005
38006 // Even though this code does not necessarily need the base pointer to
38007 // be ESI, we check for that. The reason: if this assert fails, something
38008 // has changed in the compiler's base pointer handling, which most
38009 // probably has to be addressed somehow here.
38010 assert(TRI->getBaseRegister() == X86::ESI &&
38011 "LCMPXCHG8B custom insertion for i686 is written with X86::ESI as a "
38012 "base pointer in mind");
38013
38014 MachineRegisterInfo &MRI = MF->getRegInfo();
38015 MVT SPTy = getPointerTy(MF->getDataLayout());
38016 const TargetRegisterClass *AddrRegClass = getRegClassFor(SPTy);
38017 Register computedAddrVReg = MRI.createVirtualRegister(AddrRegClass);
38018
38019 X86AddressMode AM = getAddressFromInstr(&MI, 0);
38020 // Regalloc does not need any help when the memory operand of CMPXCHG8B
38021 // does not use an index register.
38022 if (AM.IndexReg == X86::NoRegister)
38023 return BB;
38024
38025 // After X86TargetLowering::ReplaceNodeResults CMPXCHG8B is glued to its
38026 // four operand definitions that are E[ABCD] registers. We skip them and
38027 // then insert the LEA.
38028 MachineBasicBlock::reverse_iterator RMBBI(MI.getReverseIterator());
38029 while (RMBBI != BB->rend() &&
38030 (RMBBI->definesRegister(X86::EAX, /*TRI=*/nullptr) ||
38031 RMBBI->definesRegister(X86::EBX, /*TRI=*/nullptr) ||
38032 RMBBI->definesRegister(X86::ECX, /*TRI=*/nullptr) ||
38033 RMBBI->definesRegister(X86::EDX, /*TRI=*/nullptr))) {
38034 ++RMBBI;
38035 }
38036 MachineBasicBlock::iterator MBBI(RMBBI);
38037 addFullAddress(
38038 BuildMI(*BB, *MBBI, MIMD, TII->get(X86::LEA32r), computedAddrVReg), AM);
38039
38040 setDirectAddressInInstr(&MI, 0, computedAddrVReg);
38041
38042 return BB;
38043 }
38044 case X86::LCMPXCHG16B_NO_RBX: {
38045 const X86RegisterInfo *TRI = Subtarget.getRegisterInfo();
38046 Register BasePtr = TRI->getBaseRegister();
38047 if (TRI->hasBasePointer(*MF) &&
38048 (BasePtr == X86::RBX || BasePtr == X86::EBX)) {
38049 if (!BB->isLiveIn(BasePtr))
38050 BB->addLiveIn(BasePtr);
38051 // Save RBX into a virtual register.
38052 Register SaveRBX =
38053 MF->getRegInfo().createVirtualRegister(&X86::GR64RegClass);
38054 BuildMI(*BB, MI, MIMD, TII->get(TargetOpcode::COPY), SaveRBX)
38055 .addReg(X86::RBX);
38056 Register Dst = MF->getRegInfo().createVirtualRegister(&X86::GR64RegClass);
38057 MachineInstrBuilder MIB =
38058 BuildMI(*BB, MI, MIMD, TII->get(X86::LCMPXCHG16B_SAVE_RBX), Dst);
38059 for (unsigned Idx = 0; Idx < X86::AddrNumOperands; ++Idx)
38060 MIB.add(MI.getOperand(Idx));
38061 MIB.add(MI.getOperand(X86::AddrNumOperands));
38062 MIB.addReg(SaveRBX);
38063 } else {
38064 // Simple case, just copy the virtual register to RBX.
38065 BuildMI(*BB, MI, MIMD, TII->get(TargetOpcode::COPY), X86::RBX)
38066 .add(MI.getOperand(X86::AddrNumOperands));
38067 MachineInstrBuilder MIB =
38068 BuildMI(*BB, MI, MIMD, TII->get(X86::LCMPXCHG16B));
38069 for (unsigned Idx = 0; Idx < X86::AddrNumOperands; ++Idx)
38070 MIB.add(MI.getOperand(Idx));
38071 }
38072 MI.eraseFromParent();
38073 return BB;
38074 }
38075 case X86::MWAITX: {
38076 const X86RegisterInfo *TRI = Subtarget.getRegisterInfo();
38077 Register BasePtr = TRI->getBaseRegister();
38078 bool IsRBX = (BasePtr == X86::RBX || BasePtr == X86::EBX);
38079 // If no need to save the base pointer, we generate MWAITXrrr,
38080 // else we generate pseudo MWAITX_SAVE_RBX.
38081 if (!IsRBX || !TRI->hasBasePointer(*MF)) {
38082 BuildMI(*BB, MI, MIMD, TII->get(TargetOpcode::COPY), X86::ECX)
38083 .addReg(MI.getOperand(0).getReg());
38084 BuildMI(*BB, MI, MIMD, TII->get(TargetOpcode::COPY), X86::EAX)
38085 .addReg(MI.getOperand(1).getReg());
38086 BuildMI(*BB, MI, MIMD, TII->get(TargetOpcode::COPY), X86::EBX)
38087 .addReg(MI.getOperand(2).getReg());
38088 BuildMI(*BB, MI, MIMD, TII->get(X86::MWAITXrrr));
38089 MI.eraseFromParent();
38090 } else {
38091 if (!BB->isLiveIn(BasePtr)) {
38092 BB->addLiveIn(BasePtr);
38093 }
38094 // Parameters can be copied into ECX and EAX but not EBX yet.
38095 BuildMI(*BB, MI, MIMD, TII->get(TargetOpcode::COPY), X86::ECX)
38096 .addReg(MI.getOperand(0).getReg());
38097 BuildMI(*BB, MI, MIMD, TII->get(TargetOpcode::COPY), X86::EAX)
38098 .addReg(MI.getOperand(1).getReg());
38099 assert(Subtarget.is64Bit() && "Expected 64-bit mode!");
38100 // Save RBX into a virtual register.
38101 Register SaveRBX =
38102 MF->getRegInfo().createVirtualRegister(&X86::GR64RegClass);
38103 BuildMI(*BB, MI, MIMD, TII->get(TargetOpcode::COPY), SaveRBX)
38104 .addReg(X86::RBX);
38105 // Generate mwaitx pseudo.
38106 Register Dst = MF->getRegInfo().createVirtualRegister(&X86::GR64RegClass);
38107 BuildMI(*BB, MI, MIMD, TII->get(X86::MWAITX_SAVE_RBX))
38108 .addDef(Dst) // Destination tied in with SaveRBX.
38109 .addReg(MI.getOperand(2).getReg()) // input value of EBX.
38110 .addUse(SaveRBX); // Save of base pointer.
38111 MI.eraseFromParent();
38112 }
38113 return BB;
38114 }
38115 case TargetOpcode::PREALLOCATED_SETUP: {
38116 assert(Subtarget.is32Bit() && "preallocated only used in 32-bit");
38117 auto *MFI = MF->getInfo<X86MachineFunctionInfo>();
38118 MFI->setHasPreallocatedCall(true);
38119 int64_t PreallocatedId = MI.getOperand(0).getImm();
38120 size_t StackAdjustment = MFI->getPreallocatedStackSize(PreallocatedId);
38121 assert(StackAdjustment != 0 && "0 stack adjustment");
38122 LLVM_DEBUG(dbgs() << "PREALLOCATED_SETUP stack adjustment "
38123 << StackAdjustment << "\n");
38124 BuildMI(*BB, MI, MIMD, TII->get(X86::SUB32ri), X86::ESP)
38125 .addReg(X86::ESP)
38126 .addImm(StackAdjustment);
38127 MI.eraseFromParent();
38128 return BB;
38129 }
38130 case TargetOpcode::PREALLOCATED_ARG: {
38131 assert(Subtarget.is32Bit() && "preallocated calls only used in 32-bit");
38132 int64_t PreallocatedId = MI.getOperand(1).getImm();
38133 int64_t ArgIdx = MI.getOperand(2).getImm();
38134 auto *MFI = MF->getInfo<X86MachineFunctionInfo>();
38135 size_t ArgOffset = MFI->getPreallocatedArgOffsets(PreallocatedId)[ArgIdx];
38136 LLVM_DEBUG(dbgs() << "PREALLOCATED_ARG arg index " << ArgIdx
38137 << ", arg offset " << ArgOffset << "\n");
38138 // stack pointer + offset
38139 addRegOffset(BuildMI(*BB, MI, MIMD, TII->get(X86::LEA32r),
38140 MI.getOperand(0).getReg()),
38141 X86::ESP, false, ArgOffset);
38142 MI.eraseFromParent();
38143 return BB;
38144 }
38145 case X86::PTDPBSSD:
38146 case X86::PTDPBSUD:
38147 case X86::PTDPBUSD:
38148 case X86::PTDPBUUD:
38149 case X86::PTDPBF16PS:
38150 case X86::PTDPFP16PS:
38151 case X86::PTCMMIMFP16PS:
38152 case X86::PTCMMRLFP16PS:
38153 case X86::PTDPBF8PS:
38154 case X86::PTDPBHF8PS:
38155 case X86::PTDPHBF8PS:
38156 case X86::PTDPHF8PS:
38157 case X86::PTTDPBF16PS:
38158 case X86::PTTDPFP16PS:
38159 case X86::PTTCMMIMFP16PS:
38160 case X86::PTTCMMRLFP16PS:
38161 case X86::PTCONJTCMMIMFP16PS:
38162 case X86::PTMMULTF32PS:
38163 case X86::PTTMMULTF32PS: {
38164 unsigned Opc;
38165 switch (MI.getOpcode()) {
38166 default: llvm_unreachable("illegal opcode!");
38167 case X86::PTDPBSSD: Opc = X86::TDPBSSD; break;
38168 case X86::PTDPBSUD: Opc = X86::TDPBSUD; break;
38169 case X86::PTDPBUSD: Opc = X86::TDPBUSD; break;
38170 case X86::PTDPBUUD: Opc = X86::TDPBUUD; break;
38171 case X86::PTDPBF16PS: Opc = X86::TDPBF16PS; break;
38172 case X86::PTDPFP16PS: Opc = X86::TDPFP16PS; break;
38173 case X86::PTCMMIMFP16PS:
38174 Opc = X86::TCMMIMFP16PS;
38175 break;
38176 case X86::PTCMMRLFP16PS:
38177 Opc = X86::TCMMRLFP16PS;
38178 break;
38179 case X86::PTDPBF8PS: Opc = X86::TDPBF8PS; break;
38180 case X86::PTDPBHF8PS: Opc = X86::TDPBHF8PS; break;
38181 case X86::PTDPHBF8PS: Opc = X86::TDPHBF8PS; break;
38182 case X86::PTDPHF8PS: Opc = X86::TDPHF8PS; break;
38183 case X86::PTTDPBF16PS:
38184 Opc = X86::TTDPBF16PS;
38185 break;
38186 case X86::PTTDPFP16PS:
38187 Opc = X86::TTDPFP16PS;
38188 break;
38189 case X86::PTTCMMIMFP16PS:
38190 Opc = X86::TTCMMIMFP16PS;
38191 break;
38192 case X86::PTTCMMRLFP16PS:
38193 Opc = X86::TTCMMRLFP16PS;
38194 break;
38195 case X86::PTCONJTCMMIMFP16PS:
38196 Opc = X86::TCONJTCMMIMFP16PS;
38197 break;
38198 case X86::PTMMULTF32PS:
38199 Opc = X86::TMMULTF32PS;
38200 break;
38201 case X86::PTTMMULTF32PS:
38202 Opc = X86::TTMMULTF32PS;
38203 break;
38204 }
38205
38206 MachineInstrBuilder MIB = BuildMI(*BB, MI, MIMD, TII->get(Opc));
38207 MIB.addReg(TMMImmToTMMReg(MI.getOperand(0).getImm()), RegState::Define);
38208 MIB.addReg(TMMImmToTMMReg(MI.getOperand(0).getImm()), RegState::Undef);
38209 MIB.addReg(TMMImmToTMMReg(MI.getOperand(1).getImm()), RegState::Undef);
38210 MIB.addReg(TMMImmToTMMReg(MI.getOperand(2).getImm()), RegState::Undef);
38211
38212 MI.eraseFromParent(); // The pseudo is gone now.
38213 return BB;
38214 }
38215 case X86::PTILEZERO: {
38216 unsigned Imm = MI.getOperand(0).getImm();
38217 BuildMI(*BB, MI, MIMD, TII->get(X86::TILEZERO), TMMImmToTMMReg(Imm));
38218 MI.eraseFromParent(); // The pseudo is gone now.
38219 auto *MFI = MF->getInfo<X86MachineFunctionInfo>();
38220 MFI->setAMXProgModel(AMXProgModelEnum::DirectReg);
38221 return BB;
38222 }
38223 case X86::PTILEZEROV: {
38224 auto *MFI = MF->getInfo<X86MachineFunctionInfo>();
38225 MFI->setAMXProgModel(AMXProgModelEnum::ManagedRA);
38226 return BB;
38227 }
38228 case X86::PTILELOADDRS:
38229 case X86::PTILELOADDRST1:
38230 case X86::PTILELOADD:
38231 case X86::PTILELOADDT1:
38232 case X86::PTILESTORED: {
38233 unsigned Opc;
38234 switch (MI.getOpcode()) {
38235 default: llvm_unreachable("illegal opcode!");
38236#define GET_EGPR_IF_ENABLED(OPC) (Subtarget.hasEGPR() ? OPC##_EVEX : OPC)
38237 case X86::PTILELOADD:
38238 Opc = GET_EGPR_IF_ENABLED(X86::TILELOADD);
38239 break;
38240 case X86::PTILELOADDT1:
38241 Opc = GET_EGPR_IF_ENABLED(X86::TILELOADDT1);
38242 break;
38243 case X86::PTILESTORED:
38244 Opc = GET_EGPR_IF_ENABLED(X86::TILESTORED);
38245 break;
38246 case X86::PTILELOADDRS:
38247 Opc = GET_EGPR_IF_ENABLED(X86::TILELOADDRS);
38248 break;
38249 case X86::PTILELOADDRST1:
38250 Opc = GET_EGPR_IF_ENABLED(X86::TILELOADDRST1);
38251 break;
38252 }
38253#undef GET_EGPR_IF_ENABLED
38254
38255 MachineInstrBuilder MIB = BuildMI(*BB, MI, MIMD, TII->get(Opc));
38256 unsigned CurOp = 0;
38257 if (Opc != X86::TILESTORED && Opc != X86::TILESTORED_EVEX)
38258 MIB.addReg(TMMImmToTMMReg(MI.getOperand(CurOp++).getImm()),
38259 RegState::Define);
38260
38261 MIB.add(MI.getOperand(CurOp++)); // base
38262 MIB.add(MI.getOperand(CurOp++)); // scale
38263 MIB.add(MI.getOperand(CurOp++)); // index -- stride
38264 MIB.add(MI.getOperand(CurOp++)); // displacement
38265 MIB.add(MI.getOperand(CurOp++)); // segment
38266
38267 if (Opc == X86::TILESTORED || Opc == X86::TILESTORED_EVEX)
38268 MIB.addReg(TMMImmToTMMReg(MI.getOperand(CurOp++).getImm()),
38269 RegState::Undef);
38270
38271 MI.eraseFromParent(); // The pseudo is gone now.
38272 return BB;
38273 }
38274 case X86::PT2RPNTLVWZ0:
38275 case X86::PT2RPNTLVWZ0T1:
38276 case X86::PT2RPNTLVWZ1:
38277 case X86::PT2RPNTLVWZ1T1:
38278 case X86::PT2RPNTLVWZ0RS:
38279 case X86::PT2RPNTLVWZ0RST1:
38280 case X86::PT2RPNTLVWZ1RS:
38281 case X86::PT2RPNTLVWZ1RST1: {
38282 const DebugLoc &DL = MI.getDebugLoc();
38283 unsigned Opc;
38284#define GET_EGPR_IF_ENABLED(OPC) (Subtarget.hasEGPR() ? OPC##_EVEX : OPC)
38285 switch (MI.getOpcode()) {
38286 default:
38287 llvm_unreachable("Unexpected instruction!");
38288 case X86::PT2RPNTLVWZ0:
38289 Opc = GET_EGPR_IF_ENABLED(X86::T2RPNTLVWZ0);
38290 break;
38291 case X86::PT2RPNTLVWZ0T1:
38292 Opc = GET_EGPR_IF_ENABLED(X86::T2RPNTLVWZ0T1);
38293 break;
38294 case X86::PT2RPNTLVWZ1:
38295 Opc = GET_EGPR_IF_ENABLED(X86::T2RPNTLVWZ1);
38296 break;
38297 case X86::PT2RPNTLVWZ1T1:
38298 Opc = GET_EGPR_IF_ENABLED(X86::T2RPNTLVWZ1T1);
38299 break;
38300 case X86::PT2RPNTLVWZ0RS:
38301 Opc = GET_EGPR_IF_ENABLED(X86::T2RPNTLVWZ0RS);
38302 break;
38303 case X86::PT2RPNTLVWZ0RST1:
38304 Opc = GET_EGPR_IF_ENABLED(X86::T2RPNTLVWZ0RST1);
38305 break;
38306 case X86::PT2RPNTLVWZ1RS:
38307 Opc = GET_EGPR_IF_ENABLED(X86::T2RPNTLVWZ1RS);
38308 break;
38309 case X86::PT2RPNTLVWZ1RST1:
38310 Opc = GET_EGPR_IF_ENABLED(X86::T2RPNTLVWZ1RST1);
38311 break;
38312 }
38313#undef GET_EGPR_IF_ENABLED
38314 MachineInstrBuilder MIB = BuildMI(*BB, MI, DL, TII->get(Opc));
38315 MIB.addReg(TMMImmToTMMPair(MI.getOperand(0).getImm()), RegState::Define);
38316
38317 MIB.add(MI.getOperand(1)); // base
38318 MIB.add(MI.getOperand(2)); // scale
38319 MIB.add(MI.getOperand(3)); // index
38320 MIB.add(MI.getOperand(4)); // displacement
38321 MIB.add(MI.getOperand(5)); // segment
38322 MI.eraseFromParent(); // The pseudo is gone now.
38323 return BB;
38324 }
38325 case X86::PTTRANSPOSED:
38326 case X86::PTCONJTFP16: {
38327 const DebugLoc &DL = MI.getDebugLoc();
38328 unsigned Opc = MI.getOpcode() == X86::PTTRANSPOSED ? X86::TTRANSPOSED
38329 : X86::TCONJTFP16;
38330
38331 MachineInstrBuilder MIB = BuildMI(*BB, MI, DL, TII->get(Opc));
38332 MIB.addReg(TMMImmToTMMReg(MI.getOperand(0).getImm()), RegState::Define);
38333 MIB.addReg(TMMImmToTMMReg(MI.getOperand(1).getImm()), RegState::Undef);
38334
38335 MI.eraseFromParent(); // The pseudo is gone now.
38336 return BB;
38337 }
38338 case X86::PTCVTROWPS2BF16Hrri:
38339 case X86::PTCVTROWPS2BF16Lrri:
38340 case X86::PTCVTROWPS2PHHrri:
38341 case X86::PTCVTROWPS2PHLrri:
38342 case X86::PTCVTROWD2PSrri:
38343 case X86::PTILEMOVROWrri: {
38344 const DebugLoc &DL = MI.getDebugLoc();
38345 unsigned Opc;
38346 switch (MI.getOpcode()) {
38347 default:
38348 llvm_unreachable("Unexpected instruction!");
38349 case X86::PTCVTROWD2PSrri:
38350 Opc = X86::TCVTROWD2PSrri;
38351 break;
38352 case X86::PTCVTROWPS2BF16Hrri:
38353 Opc = X86::TCVTROWPS2BF16Hrri;
38354 break;
38355 case X86::PTCVTROWPS2PHHrri:
38356 Opc = X86::TCVTROWPS2PHHrri;
38357 break;
38358 case X86::PTCVTROWPS2BF16Lrri:
38359 Opc = X86::TCVTROWPS2BF16Lrri;
38360 break;
38361 case X86::PTCVTROWPS2PHLrri:
38362 Opc = X86::TCVTROWPS2PHLrri;
38363 break;
38364 case X86::PTILEMOVROWrri:
38365 Opc = X86::TILEMOVROWrri;
38366 break;
38367 }
38368 MachineInstrBuilder MIB = BuildMI(*BB, MI, DL, TII->get(Opc));
38369 MIB.add(MI.getOperand(0));
38370 MIB.addReg(TMMImmToTMMReg(MI.getOperand(1).getImm()), RegState::Undef);
38371 MIB.addImm(MI.getOperand(2).getImm());
38372
38373 MI.eraseFromParent(); // The pseudo is gone now.
38374 return BB;
38375 }
38376 case X86::PTCVTROWPS2BF16Hrre:
38377 case X86::PTCVTROWPS2BF16Lrre:
38378 case X86::PTCVTROWPS2PHHrre:
38379 case X86::PTCVTROWPS2PHLrre:
38380 case X86::PTCVTROWD2PSrre:
38381 case X86::PTILEMOVROWrre: {
38382 const DebugLoc &DL = MI.getDebugLoc();
38383 unsigned Opc;
38384 switch (MI.getOpcode()) {
38385 default:
38386 llvm_unreachable("Unexpected instruction!");
38387 case X86::PTCVTROWD2PSrre:
38388 Opc = X86::TCVTROWD2PSrre;
38389 break;
38390 case X86::PTCVTROWPS2BF16Hrre:
38391 Opc = X86::TCVTROWPS2BF16Hrre;
38392 break;
38393 case X86::PTCVTROWPS2BF16Lrre:
38394 Opc = X86::TCVTROWPS2BF16Lrre;
38395 break;
38396 case X86::PTCVTROWPS2PHHrre:
38397 Opc = X86::TCVTROWPS2PHHrre;
38398 break;
38399 case X86::PTCVTROWPS2PHLrre:
38400 Opc = X86::TCVTROWPS2PHLrre;
38401 break;
38402 case X86::PTILEMOVROWrre:
38403 Opc = X86::TILEMOVROWrre;
38404 break;
38405 }
38406 MachineInstrBuilder MIB = BuildMI(*BB, MI, DL, TII->get(Opc));
38407 MIB.add(MI.getOperand(0));
38408 MIB.addReg(TMMImmToTMMReg(MI.getOperand(1).getImm()), RegState::Undef);
38409 MIB.add(MI.getOperand(2));
38410
38411 MI.eraseFromParent(); // The pseudo is gone now.
38412 return BB;
38413 }
38414 }
38415}
38416
38417//===----------------------------------------------------------------------===//
38418// X86 Optimization Hooks
38419//===----------------------------------------------------------------------===//
38420
38421bool
38422X86TargetLowering::targetShrinkDemandedConstant(SDValue Op,
38423 const APInt &DemandedBits,
38424 const APInt &DemandedElts,
38425 TargetLoweringOpt &TLO) const {
38426 EVT VT = Op.getValueType();
38427 unsigned Opcode = Op.getOpcode();
38428 unsigned EltSize = VT.getScalarSizeInBits();
38429
38430 if (VT.isVector()) {
38431 // If the constant is all sign bits within the active bits, then we should
38432 // extend it to the entire constant to allow it to act as a boolean constant
38433 // vector.
38434 auto NeedsSignExtension = [&](SDValue V, unsigned ActiveBits) {
38435 if (!ISD::isBuildVectorOfConstantSDNodes(V.getNode()))
38436 return false;
38437 for (unsigned i = 0, e = V.getNumOperands(); i != e; ++i) {
38438 if (!DemandedElts[i] || V.getOperand(i).isUndef())
38439 continue;
38440 const APInt &Val = V.getConstantOperandAPInt(i);
38441 if (Val.getBitWidth() > Val.getNumSignBits() &&
38442 Val.trunc(ActiveBits).getNumSignBits() == ActiveBits)
38443 return true;
38444 }
38445 return false;
38446 };
38447 // For vectors - if we have a constant, then try to sign extend.
38448 // TODO: Handle AND cases.
38449 unsigned ActiveBits = DemandedBits.getActiveBits();
38450 if (EltSize > ActiveBits && EltSize > 1 && isTypeLegal(VT) &&
38451 (Opcode == ISD::OR || Opcode == ISD::XOR || Opcode == X86ISD::ANDNP) &&
38452 NeedsSignExtension(Op.getOperand(1), ActiveBits)) {
38453 EVT ExtSVT = EVT::getIntegerVT(*TLO.DAG.getContext(), ActiveBits);
38454 EVT ExtVT = EVT::getVectorVT(*TLO.DAG.getContext(), ExtSVT,
38455 VT.getVectorNumElements());
38456 SDValue NewC =
38457 TLO.DAG.getNode(ISD::SIGN_EXTEND_INREG, SDLoc(Op), VT,
38458 Op.getOperand(1), TLO.DAG.getValueType(ExtVT));
38459 SDValue NewOp =
38460 TLO.DAG.getNode(Opcode, SDLoc(Op), VT, Op.getOperand(0), NewC);
38461 return TLO.CombineTo(Op, NewOp);
38462 }
38463 return false;
38464 }
38465
38466 // Only optimize Ands to prevent shrinking a constant that could be
38467 // matched by movzx.
38468 if (Opcode != ISD::AND)
38469 return false;
38470
38471 // Make sure the RHS really is a constant.
38472 ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op.getOperand(1));
38473 if (!C)
38474 return false;
38475
38476 const APInt &Mask = C->getAPIntValue();
38477
38478 // Clear all non-demanded bits initially.
38479 APInt ShrunkMask = Mask & DemandedBits;
38480
38481 // Find the width of the shrunk mask.
38482 unsigned Width = ShrunkMask.getActiveBits();
38483
38484 // If the mask is all 0s there's nothing to do here.
38485 if (Width == 0)
38486 return false;
38487
38488 // Find the next power of 2 width, rounding up to a byte.
38489 Width = llvm::bit_ceil(std::max(Width, 8U));
38490 // Truncate the width to size to handle illegal types.
38491 Width = std::min(Width, EltSize);
38492
38493 // Calculate a possible zero extend mask for this constant.
38494 APInt ZeroExtendMask = APInt::getLowBitsSet(EltSize, Width);
38495
38496 // If we aren't changing the mask, just return true to keep it and prevent
38497 // the caller from optimizing.
38498 if (ZeroExtendMask == Mask)
38499 return true;
38500
38501 // Make sure the new mask can be represented by a combination of mask bits
38502 // and non-demanded bits.
38503 if (!ZeroExtendMask.isSubsetOf(Mask | ~DemandedBits))
38504 return false;
38505
38506 // Replace the constant with the zero extend mask.
38507 SDLoc DL(Op);
38508 SDValue NewC = TLO.DAG.getConstant(ZeroExtendMask, DL, VT);
38509 SDValue NewOp = TLO.DAG.getNode(ISD::AND, DL, VT, Op.getOperand(0), NewC);
38510 return TLO.CombineTo(Op, NewOp);
38511}
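// Worked example for the scalar AND path above (illustrative): with
//   Mask = 0x1F and DemandedBits = 0x0F
// ShrunkMask is 0x0F, Width is 4 and rounds up to 8, so ZeroExtendMask is
// 0xFF. It differs from Mask and is a subset of Mask | ~DemandedBits, so the
// constant is widened to 0xFF and the AND can later be matched as a zero
// extension (movzx) rather than an 'and' with an awkward immediate.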
38512
38513static void computeKnownBitsForPSADBW(SDValue LHS, SDValue RHS,
38514 KnownBits &Known,
38515 const APInt &DemandedElts,
38516 const SelectionDAG &DAG, unsigned Depth) {
38517 KnownBits Known2;
38518 unsigned NumSrcElts = LHS.getValueType().getVectorNumElements();
38519 APInt DemandedSrcElts = APIntOps::ScaleBitMask(DemandedElts, NumSrcElts);
38520 Known = DAG.computeKnownBits(RHS, DemandedSrcElts, Depth + 1);
38521 Known2 = DAG.computeKnownBits(LHS, DemandedSrcElts, Depth + 1);
38522 Known = KnownBits::abdu(Known, Known2).zext(16);
38523 // Known = (((D0 + D1) + (D2 + D3)) + ((D4 + D5) + (D6 + D7)))
38524 Known = KnownBits::add(Known, Known, /*NSW=*/true, /*NUW=*/true);
38525 Known = KnownBits::add(Known, Known, /*NSW=*/true, /*NUW=*/true);
38526 Known = KnownBits::add(Known, Known, /*NSW=*/true, /*NUW=*/true);
38527 Known = Known.zext(64);
38528}
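// Illustrative bound: each of the eight byte-wise absolute differences is at
// most 255, so their sum is at most 8 * 255 = 2040 and fits in 11 bits; the
// three NSW/NUW add steps above preserve that bound, so bits 11..63 of every
// i64 result element are known zero even with no information about the inputs.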
38529
38530static void computeKnownBitsForPMADDWD(SDValue LHS, SDValue RHS,
38531 KnownBits &Known,
38532 const APInt &DemandedElts,
38533 const SelectionDAG &DAG,
38534 unsigned Depth) {
38535 unsigned NumSrcElts = LHS.getValueType().getVectorNumElements();
38536
38537 // Multiply signed i16 elements to create i32 values and add Lo/Hi pairs.
38538 APInt DemandedSrcElts = APIntOps::ScaleBitMask(DemandedElts, NumSrcElts);
38539 APInt DemandedLoElts =
38540 DemandedSrcElts & APInt::getSplat(NumSrcElts, APInt(2, 0b01));
38541 APInt DemandedHiElts =
38542 DemandedSrcElts & APInt::getSplat(NumSrcElts, APInt(2, 0b10));
38543 KnownBits LHSLo = DAG.computeKnownBits(LHS, DemandedLoElts, Depth + 1);
38544 KnownBits LHSHi = DAG.computeKnownBits(LHS, DemandedHiElts, Depth + 1);
38545 KnownBits RHSLo = DAG.computeKnownBits(RHS, DemandedLoElts, Depth + 1);
38546 KnownBits RHSHi = DAG.computeKnownBits(RHS, DemandedHiElts, Depth + 1);
38547 KnownBits Lo = KnownBits::mul(LHSLo.sext(32), RHSLo.sext(32));
38548 KnownBits Hi = KnownBits::mul(LHSHi.sext(32), RHSHi.sext(32));
38549 Known = KnownBits::add(Lo, Hi, /*NSW=*/false, /*NUW=*/false);
38550}
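// Illustrative: for each i32 result lane, (v)pmaddwd computes
//   LHS[2i] * RHS[2i] + LHS[2i+1] * RHS[2i+1]   (i16 x i16 -> i32 products)
// which is why the demanded elements are split into even (Lo) and odd (Hi)
// source lanes, the products are formed on sign-extended known bits, and the
// final add uses conservative (no NSW/NUW) wrap flags.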
38551
38552static void computeKnownBitsForPMADDUBSW(SDValue LHS, SDValue RHS,
38553 KnownBits &Known,
38554 const APInt &DemandedElts,
38555 const SelectionDAG &DAG,
38556 unsigned Depth) {
38557 unsigned NumSrcElts = LHS.getValueType().getVectorNumElements();
38558
38559 // Multiply unsigned/signed i8 elements to create i16 values and add_sat Lo/Hi
38560 // pairs.
38561 APInt DemandedSrcElts = APIntOps::ScaleBitMask(DemandedElts, NumSrcElts);
38562 APInt DemandedLoElts =
38563 DemandedSrcElts & APInt::getSplat(NumSrcElts, APInt(2, 0b01));
38564 APInt DemandedHiElts =
38565 DemandedSrcElts & APInt::getSplat(NumSrcElts, APInt(2, 0b10));
38566 KnownBits LHSLo = DAG.computeKnownBits(LHS, DemandedLoElts, Depth + 1);
38567 KnownBits LHSHi = DAG.computeKnownBits(LHS, DemandedHiElts, Depth + 1);
38568 KnownBits RHSLo = DAG.computeKnownBits(RHS, DemandedLoElts, Depth + 1);
38569 KnownBits RHSHi = DAG.computeKnownBits(RHS, DemandedHiElts, Depth + 1);
38570 KnownBits Lo = KnownBits::mul(LHSLo.zext(16), RHSLo.sext(16));
38571 KnownBits Hi = KnownBits::mul(LHSHi.zext(16), RHSHi.sext(16));
38572 Known = KnownBits::sadd_sat(Lo, Hi);
38573}
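// Illustrative: (v)pmaddubsw multiplies unsigned bytes from the first operand
// with signed bytes from the second (hence zext(16) for LHS and sext(16) for
// RHS above) and combines each even/odd pair of i16 products with a signed
// saturating add, which KnownBits::sadd_sat models directly.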
38574
38575static KnownBits computeKnownBitsForHorizontalOperation(
38576 const SDValue Op, const APInt &DemandedElts, unsigned Depth,
38577 const SelectionDAG &DAG,
38578 const function_ref<KnownBits(const KnownBits &, const KnownBits &)>
38579 KnownBitsFunc) {
38580 APInt DemandedEltsLHS, DemandedEltsRHS;
38581 getHorizDemandedEltsForFirstOperand(Op.getValueType().getSizeInBits(),
38582 DemandedElts, DemandedEltsLHS,
38583 DemandedEltsRHS);
38584
38585 const auto ComputeForSingleOpFunc =
38586 [&DAG, Depth, KnownBitsFunc](SDValue Op, APInt &DemandedEltsOp) {
38587 return KnownBitsFunc(
38588 DAG.computeKnownBits(Op, DemandedEltsOp, Depth + 1),
38589 DAG.computeKnownBits(Op, DemandedEltsOp << 1, Depth + 1));
38590 };
38591
38592 if (DemandedEltsRHS.isZero())
38593 return ComputeForSingleOpFunc(Op.getOperand(0), DemandedEltsLHS);
38594 if (DemandedEltsLHS.isZero())
38595 return ComputeForSingleOpFunc(Op.getOperand(1), DemandedEltsRHS);
38596
38597 return ComputeForSingleOpFunc(Op.getOperand(0), DemandedEltsLHS)
38598 .intersectWith(ComputeForSingleOpFunc(Op.getOperand(1), DemandedEltsRHS));
38599}
38600
38601void X86TargetLowering::computeKnownBitsForTargetNode(const SDValue Op,
38602 KnownBits &Known,
38603 const APInt &DemandedElts,
38604 const SelectionDAG &DAG,
38605 unsigned Depth) const {
38606 unsigned BitWidth = Known.getBitWidth();
38607 unsigned NumElts = DemandedElts.getBitWidth();
38608 unsigned Opc = Op.getOpcode();
38609 EVT VT = Op.getValueType();
38610 assert((Opc >= ISD::BUILTIN_OP_END ||
38611 Opc == ISD::INTRINSIC_WO_CHAIN ||
38612 Opc == ISD::INTRINSIC_W_CHAIN ||
38613 Opc == ISD::INTRINSIC_VOID) &&
38614 "Should use MaskedValueIsZero if you don't know whether Op"
38615 " is a target node!");
38616
38617 Known.resetAll();
38618 switch (Opc) {
38619 default: break;
38620 case X86ISD::MUL_IMM: {
38621 KnownBits Known2;
38622 Known = DAG.computeKnownBits(Op.getOperand(1), DemandedElts, Depth + 1);
38623 Known2 = DAG.computeKnownBits(Op.getOperand(0), DemandedElts, Depth + 1);
38624 Known = KnownBits::mul(Known, Known2);
38625 break;
38626 }
38627 case X86ISD::BSF: {
38629
38630 KnownBits Known2;
38631 Known2 = DAG.computeKnownBits(Op.getOperand(1), DemandedElts, Depth + 1);
38632 if (Known2.isNonZero()) {
38633 // If we have a known 1, its position is our upper bound.
38634 unsigned PossibleTZ = Known2.countMaxTrailingZeros();
38635 unsigned LowBits = llvm::bit_width(PossibleTZ);
38636 Known.Zero.setBitsFrom(LowBits);
38637 } else if (!Op.getOperand(0).isUndef()) {
38638 Known2 = DAG.computeKnownBits(Op.getOperand(0), DemandedElts, Depth + 1);
38639 Known = Known.intersectWith(Known2);
38640 }
38641 break;
38642 }
38643 case X86ISD::BSR: {
38644 // TODO: Bound with input known bits?
38646
38647 if (!Op.getOperand(0).isUndef() &&
38648 !DAG.isKnownNeverZero(Op.getOperand(1), Depth + 1)) {
38649 KnownBits Known2;
38650 Known2 = DAG.computeKnownBits(Op.getOperand(0), DemandedElts, Depth + 1);
38651 Known = Known.intersectWith(Known2);
38652 }
38653 break;
38654 }
38655 case X86ISD::SETCC:
38656 Known.Zero.setBitsFrom(1);
38657 break;
38658 case X86ISD::MOVMSK: {
38659 unsigned NumLoBits = Op.getOperand(0).getValueType().getVectorNumElements();
38660 Known.Zero.setBitsFrom(NumLoBits);
38661 break;
38662 }
38663 case X86ISD::PEXTRB:
38664 case X86ISD::PEXTRW: {
38665 SDValue Src = Op.getOperand(0);
38666 EVT SrcVT = Src.getValueType();
38667 APInt DemandedElt = APInt::getOneBitSet(SrcVT.getVectorNumElements(),
38668 Op.getConstantOperandVal(1));
38669 Known = DAG.computeKnownBits(Src, DemandedElt, Depth + 1);
38670 Known = Known.anyextOrTrunc(BitWidth);
38671 Known.Zero.setBitsFrom(SrcVT.getScalarSizeInBits());
38672 break;
38673 }
38674 case X86ISD::VSRAI:
38675 case X86ISD::VSHLI:
38676 case X86ISD::VSRLI: {
38677 unsigned ShAmt = Op.getConstantOperandVal(1);
38678 if (ShAmt >= VT.getScalarSizeInBits()) {
38679 // Out of range logical bit shifts are guaranteed to be zero.
38680 // Out of range arithmetic bit shifts splat the sign bit.
38681 if (Opc != X86ISD::VSRAI) {
38682 Known.setAllZero();
38683 break;
38684 }
38685
38686 ShAmt = VT.getScalarSizeInBits() - 1;
38687 }
38688
38689 Known = DAG.computeKnownBits(Op.getOperand(0), DemandedElts, Depth + 1);
38690 if (Opc == X86ISD::VSHLI) {
38691 Known <<= ShAmt;
38692 // Low bits are known zero.
38693 Known.Zero.setLowBits(ShAmt);
38694 } else if (Opc == X86ISD::VSRLI) {
38695 Known >>= ShAmt;
38696 // High bits are known zero.
38697 Known.Zero.setHighBits(ShAmt);
38698 } else {
38699 Known.Zero.ashrInPlace(ShAmt);
38700 Known.One.ashrInPlace(ShAmt);
38701 }
38702 break;
38703 }
38704 case X86ISD::PACKUS: {
38705 // PACKUS is just a truncation if the upper half is zero.
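 // For example, packuswb narrows each i16 source element to a u8 with
 // unsigned saturation; the countMinLeadingZeros() check below verifies that
 // the upper half of every demanded source element is known zero, in which
 // case no saturation can occur and the pack behaves exactly like a
 // truncation of the common known bits.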
38706 APInt DemandedLHS, DemandedRHS;
38707 getPackDemandedElts(VT, DemandedElts, DemandedLHS, DemandedRHS);
38708
38709 Known.One = APInt::getAllOnes(BitWidth * 2);
38710 Known.Zero = APInt::getAllOnes(BitWidth * 2);
38711
38712 KnownBits Known2;
38713 if (!!DemandedLHS) {
38714 Known2 = DAG.computeKnownBits(Op.getOperand(0), DemandedLHS, Depth + 1);
38715 Known = Known.intersectWith(Known2);
38716 }
38717 if (!!DemandedRHS) {
38718 Known2 = DAG.computeKnownBits(Op.getOperand(1), DemandedRHS, Depth + 1);
38719 Known = Known.intersectWith(Known2);
38720 }
38721
38722 if (Known.countMinLeadingZeros() < BitWidth)
38723 Known.resetAll();
38724 Known = Known.trunc(BitWidth);
38725 break;
38726 }
38727 case X86ISD::PSHUFB: {
38728 SDValue Src = Op.getOperand(0);
38729 SDValue Idx = Op.getOperand(1);
38730
38731 // If the index vector is never negative (MSB is zero), then all elements
38732 // come from the source vector. This is useful for cases where
38733 // PSHUFB is being used as a LUT (ctpop etc.) - the target shuffle handling
38734 // below will handle the more common constant shuffle mask case.
38735 KnownBits KnownIdx = DAG.computeKnownBits(Idx, DemandedElts, Depth + 1);
38736 if (KnownIdx.isNonNegative())
38737 Known = DAG.computeKnownBits(Src, Depth + 1);
38738 break;
38739 }
38740 case X86ISD::VBROADCAST: {
38741 SDValue Src = Op.getOperand(0);
38742 if (!Src.getSimpleValueType().isVector()) {
38743 Known = DAG.computeKnownBits(Src, Depth + 1);
38744 return;
38745 }
38746 break;
38747 }
38748 case X86ISD::AND: {
38749 if (Op.getResNo() == 0) {
38750 KnownBits Known2;
38751 Known = DAG.computeKnownBits(Op.getOperand(1), DemandedElts, Depth + 1);
38752 Known2 = DAG.computeKnownBits(Op.getOperand(0), DemandedElts, Depth + 1);
38753 Known &= Known2;
38754 }
38755 break;
38756 }
38757 case X86ISD::ANDNP: {
38758 KnownBits Known2;
38759 Known = DAG.computeKnownBits(Op.getOperand(1), DemandedElts, Depth + 1);
38760 Known2 = DAG.computeKnownBits(Op.getOperand(0), DemandedElts, Depth + 1);
38761
38762 // ANDNP = (~X & Y);
38763 Known.One &= Known2.Zero;
38764 Known.Zero |= Known2.One;
38765 break;
38766 }
38767 case X86ISD::FOR: {
38768 KnownBits Known2;
38769 Known = DAG.computeKnownBits(Op.getOperand(1), DemandedElts, Depth + 1);
38770 Known2 = DAG.computeKnownBits(Op.getOperand(0), DemandedElts, Depth + 1);
38771
38772 Known |= Known2;
38773 break;
38774 }
38775 case X86ISD::PSADBW: {
38776 SDValue LHS = Op.getOperand(0);
38777 SDValue RHS = Op.getOperand(1);
38778 assert(VT.getScalarType() == MVT::i64 &&
38779 LHS.getValueType() == RHS.getValueType() &&
38780 LHS.getValueType().getScalarType() == MVT::i8 &&
38781 "Unexpected PSADBW types");
38782 computeKnownBitsForPSADBW(LHS, RHS, Known, DemandedElts, DAG, Depth);
38783 break;
38784 }
38785 case X86ISD::PCMPGT:
38786 case X86ISD::PCMPEQ: {
38787 KnownBits KnownLhs =
38788 DAG.computeKnownBits(Op.getOperand(0), DemandedElts, Depth + 1);
38789 KnownBits KnownRhs =
38790 DAG.computeKnownBits(Op.getOperand(1), DemandedElts, Depth + 1);
38791 std::optional<bool> Res = Opc == X86ISD::PCMPEQ
38792 ? KnownBits::eq(KnownLhs, KnownRhs)
38793 : KnownBits::sgt(KnownLhs, KnownRhs);
38794 if (Res) {
38795 if (*Res)
38796 Known.setAllOnes();
38797 else
38798 Known.setAllZero();
38799 }
38800 break;
38801 }
38802 case X86ISD::VPMADDWD: {
38803 SDValue LHS = Op.getOperand(0);
38804 SDValue RHS = Op.getOperand(1);
38805 assert(VT.getVectorElementType() == MVT::i32 &&
38806 LHS.getValueType() == RHS.getValueType() &&
38807 LHS.getValueType().getVectorElementType() == MVT::i16 &&
38808 "Unexpected PMADDWD types");
38809 computeKnownBitsForPMADDWD(LHS, RHS, Known, DemandedElts, DAG, Depth);
38810 break;
38811 }
38812 case X86ISD::VPMADDUBSW: {
38813 SDValue LHS = Op.getOperand(0);
38814 SDValue RHS = Op.getOperand(1);
38815 assert(VT.getVectorElementType() == MVT::i16 &&
38816 LHS.getValueType() == RHS.getValueType() &&
38817 LHS.getValueType().getVectorElementType() == MVT::i8 &&
38818 "Unexpected PMADDUBSW types");
38819 computeKnownBitsForPMADDUBSW(LHS, RHS, Known, DemandedElts, DAG, Depth);
38820 break;
38821 }
38822 case X86ISD::PMULUDQ: {
38823 KnownBits Known2;
38824 Known = DAG.computeKnownBits(Op.getOperand(1), DemandedElts, Depth + 1);
38825 Known2 = DAG.computeKnownBits(Op.getOperand(0), DemandedElts, Depth + 1);
38826
38827 Known = Known.trunc(BitWidth / 2).zext(BitWidth);
38828 Known2 = Known2.trunc(BitWidth / 2).zext(BitWidth);
38829 Known = KnownBits::mul(Known, Known2);
38830 break;
38831 }
38832 case X86ISD::CMOV: {
38833 Known = DAG.computeKnownBits(Op.getOperand(1), Depth + 1);
38834 // If we don't know any bits, early out.
38835 if (Known.isUnknown())
38836 break;
38837 KnownBits Known2 = DAG.computeKnownBits(Op.getOperand(0), Depth + 1);
38838
38839 // Only known if known in both the LHS and RHS.
38840 Known = Known.intersectWith(Known2);
38841 break;
38842 }
38843 case X86ISD::BEXTR:
38844 case X86ISD::BEXTRI: {
38845 SDValue Op0 = Op.getOperand(0);
38846 SDValue Op1 = Op.getOperand(1);
38847
38848 if (auto* Cst1 = dyn_cast<ConstantSDNode>(Op1)) {
38849 unsigned Shift = Cst1->getAPIntValue().extractBitsAsZExtValue(8, 0);
38850 unsigned Length = Cst1->getAPIntValue().extractBitsAsZExtValue(8, 8);
38851
38852 // If the length is 0, the result is 0.
38853 if (Length == 0) {
38854 Known.setAllZero();
38855 break;
38856 }
38857
38858 if ((Shift + Length) <= BitWidth) {
38859 Known = DAG.computeKnownBits(Op0, Depth + 1);
38860 Known = Known.extractBits(Length, Shift);
38861 Known = Known.zextOrTrunc(BitWidth);
38862 }
38863 }
38864 break;
38865 }
38866 case X86ISD::PDEP: {
38867 KnownBits Known2;
38868 Known = DAG.computeKnownBits(Op.getOperand(1), DemandedElts, Depth + 1);
38869 Known2 = DAG.computeKnownBits(Op.getOperand(0), DemandedElts, Depth + 1);
38870 // Zeros are retained from the mask operand. But not ones.
38871 Known.One.clearAllBits();
38872 // The result will have at least as many trailing zeros as the non-mask
38873 // operand since bits can only map to the same or higher bit position.
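 // For example, with mask 0b1111 pdep simply copies the low four source
 // bits, so a source with two trailing zeros yields a result with at least
 // two trailing zeros; a sparser mask only moves those bits to higher
 // positions, never lower ones.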
38874 Known.Zero.setLowBits(Known2.countMinTrailingZeros());
38875 break;
38876 }
38877 case X86ISD::PEXT: {
38878 Known = DAG.computeKnownBits(Op.getOperand(1), DemandedElts, Depth + 1);
38879 // The result has as many leading zeros as the number of zeroes in the mask.
38880 unsigned Count = Known.Zero.popcount();
38881 Known.Zero = APInt::getHighBitsSet(BitWidth, Count);
38882 Known.One.clearAllBits();
38883 break;
38884 }
38885 case X86ISD::VTRUNC:
38886 case X86ISD::VTRUNCS:
38887 case X86ISD::VTRUNCUS:
38888 case X86ISD::CVTSI2P:
38889 case X86ISD::CVTUI2P:
38890 case X86ISD::CVTP2SI:
38891 case X86ISD::CVTP2UI:
38892 case X86ISD::MCVTP2SI:
38893 case X86ISD::MCVTP2UI:
38894 case X86ISD::CVTTP2SI:
38895 case X86ISD::CVTTP2UI:
38896 case X86ISD::MCVTTP2SI:
38897 case X86ISD::MCVTTP2UI:
38898 case X86ISD::MCVTSI2P:
38899 case X86ISD::MCVTUI2P:
38900 case X86ISD::VFPROUND:
38901 case X86ISD::VMFPROUND:
38902 case X86ISD::CVTPS2PH:
38903 case X86ISD::MCVTPS2PH:
38904 case X86ISD::MCVTTP2SIS:
38905 case X86ISD::MCVTTP2UIS: {
38906 // Truncations/Conversions - upper elements are known zero.
38907 EVT SrcVT = Op.getOperand(0).getValueType();
38908 if (SrcVT.isVector()) {
38909 unsigned NumSrcElts = SrcVT.getVectorNumElements();
38910 if (NumElts > NumSrcElts && DemandedElts.countr_zero() >= NumSrcElts)
38911 Known.setAllZero();
38912 }
38913 break;
38914 }
38915 case X86ISD::STRICT_CVTTP2SI:
38916 case X86ISD::STRICT_CVTTP2UI:
38917 case X86ISD::STRICT_CVTSI2P:
38918 case X86ISD::STRICT_CVTUI2P:
38919 case X86ISD::STRICT_VFPROUND:
38920 case X86ISD::STRICT_CVTPS2PH: {
38921 // Strict Conversions - upper elements are known zero.
38922 EVT SrcVT = Op.getOperand(1).getValueType();
38923 if (SrcVT.isVector()) {
38924 unsigned NumSrcElts = SrcVT.getVectorNumElements();
38925 if (NumElts > NumSrcElts && DemandedElts.countr_zero() >= NumSrcElts)
38926 Known.setAllZero();
38927 }
38928 break;
38929 }
38930 case X86ISD::MOVQ2DQ: {
38931 // Move from MMX to XMM. Upper half of XMM should be 0.
38932 if (DemandedElts.countr_zero() >= (NumElts / 2))
38933 Known.setAllZero();
38934 break;
38935 }
38937 APInt UndefElts;
38938 SmallVector<APInt, 16> EltBits;
38939 if (getTargetConstantBitsFromNode(Op, BitWidth, UndefElts, EltBits,
38940 /*AllowWholeUndefs*/ false,
38941 /*AllowPartialUndefs*/ false)) {
38942 Known.Zero.setAllBits();
38943 Known.One.setAllBits();
38944 for (unsigned I = 0; I != NumElts; ++I) {
38945 if (!DemandedElts[I])
38946 continue;
38947 if (UndefElts[I]) {
38948 Known.resetAll();
38949 break;
38950 }
38951 KnownBits Known2 = KnownBits::makeConstant(EltBits[I]);
38952 Known = Known.intersectWith(Known2);
38953 }
38954 return;
38955 }
38956 break;
38957 }
38958 case X86ISD::HADD:
38959 case X86ISD::HSUB: {
38960 Known = computeKnownBitsForHorizontalOperation(
38961 Op, DemandedElts, Depth, DAG,
38962 [Opc](const KnownBits &KnownLHS, const KnownBits &KnownRHS) {
38963 return KnownBits::computeForAddSub(
38964 /*Add=*/Opc == X86ISD::HADD, /*NSW=*/false, /*NUW=*/false,
38965 KnownLHS, KnownRHS);
38966 });
38967 break;
38968 }
38969 case ISD::INTRINSIC_WO_CHAIN: {
38970 switch (Op->getConstantOperandVal(0)) {
38971 case Intrinsic::x86_sse2_pmadd_wd:
38972 case Intrinsic::x86_avx2_pmadd_wd:
38973 case Intrinsic::x86_avx512_pmaddw_d_512: {
38974 SDValue LHS = Op.getOperand(1);
38975 SDValue RHS = Op.getOperand(2);
38976 assert(VT.getScalarType() == MVT::i32 &&
38977 LHS.getValueType() == RHS.getValueType() &&
38978 LHS.getValueType().getScalarType() == MVT::i16 &&
38979 "Unexpected PMADDWD types");
38980 computeKnownBitsForPMADDWD(LHS, RHS, Known, DemandedElts, DAG, Depth);
38981 break;
38982 }
38983 case Intrinsic::x86_ssse3_pmadd_ub_sw_128:
38984 case Intrinsic::x86_avx2_pmadd_ub_sw:
38985 case Intrinsic::x86_avx512_pmaddubs_w_512: {
38986 SDValue LHS = Op.getOperand(1);
38987 SDValue RHS = Op.getOperand(2);
38988 assert(VT.getScalarType() == MVT::i16 &&
38989 LHS.getValueType() == RHS.getValueType() &&
38990 LHS.getValueType().getScalarType() == MVT::i8 &&
38991 "Unexpected PMADDUBSW types");
38992 computeKnownBitsForPMADDUBSW(LHS, RHS, Known, DemandedElts, DAG, Depth);
38993 break;
38994 }
38995 case Intrinsic::x86_sse2_psad_bw:
38996 case Intrinsic::x86_avx2_psad_bw:
38997 case Intrinsic::x86_avx512_psad_bw_512: {
38998 SDValue LHS = Op.getOperand(1);
38999 SDValue RHS = Op.getOperand(2);
39000 assert(VT.getScalarType() == MVT::i64 &&
39001 LHS.getValueType() == RHS.getValueType() &&
39002 LHS.getValueType().getScalarType() == MVT::i8 &&
39003 "Unexpected PSADBW types");
39004 computeKnownBitsForPSADBW(LHS, RHS, Known, DemandedElts, DAG, Depth);
39005 break;
39006 }
39007 }
39008 break;
39009 }
39010 case X86ISD::VPMADD52L:
39011 case X86ISD::VPMADD52H: {
39012 assert(Op.getValueType().isVector() &&
39013 Op.getValueType().getScalarType() == MVT::i64 &&
39014 "Unexpected VPMADD52 type");
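// VPMADD52L/VPMADD52H multiply the low 52 bits of the first two operands to
// form a 104-bit product and add its low (L) or high (H) 52 bits to the
// 64-bit accumulator, so model the multiply at 52 bits, widen to 64 bits and
// add the accumulator's known bits.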
39015 KnownBits K0 =
39016 DAG.computeKnownBits(Op.getOperand(0), DemandedElts, Depth + 1);
39017 KnownBits K1 =
39018 DAG.computeKnownBits(Op.getOperand(1), DemandedElts, Depth + 1);
39019 KnownBits KAcc =
39020 DAG.computeKnownBits(Op.getOperand(2), DemandedElts, Depth + 1);
39021 K0 = K0.trunc(52);
39022 K1 = K1.trunc(52);
39023 KnownBits KnownMul = (Op.getOpcode() == X86ISD::VPMADD52L)
39024 ? KnownBits::mul(K0, K1)
39025 : KnownBits::mulhu(K0, K1);
39026 KnownMul = KnownMul.zext(64);
39027 Known = KnownBits::add(KAcc, KnownMul);
39028 return;
39029 }
39030 }
39031
39032 // Handle target shuffles.
39033 // TODO - use resolveTargetShuffleInputs once we can limit recursive depth.
39034 if (isTargetShuffle(Opc)) {
39035 SmallVector<int, 64> Mask;
39036 SmallVector<SDValue, 2> Ops;
39037 if (getTargetShuffleMask(Op, true, Ops, Mask)) {
39038 unsigned NumOps = Ops.size();
39039 unsigned NumElts = VT.getVectorNumElements();
39040 if (Mask.size() == NumElts) {
39041 SmallVector<APInt, 2> DemandedOps(NumOps, APInt(NumElts, 0));
39042 Known.Zero.setAllBits(); Known.One.setAllBits();
39043 for (unsigned i = 0; i != NumElts; ++i) {
39044 if (!DemandedElts[i])
39045 continue;
39046 int M = Mask[i];
39047 if (M == SM_SentinelUndef) {
39048 // For UNDEF elements, we don't know anything about the common state
39049 // of the shuffle result.
39050 Known.resetAll();
39051 break;
39052 }
39053 if (M == SM_SentinelZero) {
39054 Known.One.clearAllBits();
39055 continue;
39056 }
39057 assert(0 <= M && (unsigned)M < (NumOps * NumElts) &&
39058 "Shuffle index out of range");
39059
39060 unsigned OpIdx = (unsigned)M / NumElts;
39061 unsigned EltIdx = (unsigned)M % NumElts;
39062 if (Ops[OpIdx].getValueType() != VT) {
39063 // TODO - handle target shuffle ops with different value types.
39064 Known.resetAll();
39065 break;
39066 }
39067 DemandedOps[OpIdx].setBit(EltIdx);
39068 }
39069 // Known bits are the values that are shared by every demanded element.
39070 for (unsigned i = 0; i != NumOps && !Known.isUnknown(); ++i) {
39071 if (!DemandedOps[i])
39072 continue;
39073 KnownBits Known2 =
39074 DAG.computeKnownBits(Ops[i], DemandedOps[i], Depth + 1);
39075 Known = Known.intersectWith(Known2);
39076 }
39077 }
39078 }
39079 }
39080}
39081
39082 unsigned X86TargetLowering::ComputeNumSignBitsForTargetNode(
39083 SDValue Op, const APInt &DemandedElts, const SelectionDAG &DAG,
39084 unsigned Depth) const {
39085 EVT VT = Op.getValueType();
39086 unsigned VTBits = VT.getScalarSizeInBits();
39087 unsigned Opcode = Op.getOpcode();
39088 switch (Opcode) {
39089 case X86ISD::SETCC_CARRY:
39090 // SETCC_CARRY sets the dest to ~0 for true or 0 for false.
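// Both possible results have every bit equal to the sign bit, so all VTBits
// bits are sign bits.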
39091 return VTBits;
39092
39093 case X86ISD::VTRUNC: {
39094 SDValue Src = Op.getOperand(0);
39095 MVT SrcVT = Src.getSimpleValueType();
39096 unsigned NumSrcBits = SrcVT.getScalarSizeInBits();
39097 assert(VTBits < NumSrcBits && "Illegal truncation input type");
39098 APInt DemandedSrc = DemandedElts.zextOrTrunc(SrcVT.getVectorNumElements());
39099 unsigned Tmp = DAG.ComputeNumSignBits(Src, DemandedSrc, Depth + 1);
39100 if (Tmp > (NumSrcBits - VTBits))
39101 return Tmp - (NumSrcBits - VTBits);
39102 return 1;
39103 }
39104
39105 case X86ISD::PACKSS: {
39106 // PACKSS is just a truncation if the sign bits extend to the packed size.
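// If every source element has more than (SrcBits - VTBits) sign bits then no
// saturation occurs and the pack keeps Tmp - (SrcBits - VTBits) of them.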
39107 APInt DemandedLHS, DemandedRHS;
39108 getPackDemandedElts(Op.getValueType(), DemandedElts, DemandedLHS,
39109 DemandedRHS);
39110
39111 // Helper to detect PACKSSDW(BITCAST(PACKSSDW(X)),BITCAST(PACKSSDW(Y)))
39112 // patterns often used to compact vXi64 allsignbit patterns.
39113 auto NumSignBitsPACKSS = [&](SDValue V, const APInt &Elts) -> unsigned {
39114 SDValue BC = peekThroughBitcasts(V);
39115 if (BC.getOpcode() == X86ISD::PACKSS &&
39116 BC.getScalarValueSizeInBits() == 16 &&
39117 V.getScalarValueSizeInBits() == 32) {
39118 SDValue BC0 = peekThroughBitcasts(BC.getOperand(0));
39119 SDValue BC1 = peekThroughBitcasts(BC.getOperand(1));
39120 if (BC0.getScalarValueSizeInBits() == 64 &&
39121 BC1.getScalarValueSizeInBits() == 64 &&
39122 DAG.ComputeNumSignBits(BC0, Depth + 1) == 64 &&
39123 DAG.ComputeNumSignBits(BC1, Depth + 1) == 64)
39124 return 32;
39125 }
39126 return DAG.ComputeNumSignBits(V, Elts, Depth + 1);
39127 };
39128
39129 unsigned SrcBits = Op.getOperand(0).getScalarValueSizeInBits();
39130 unsigned Tmp0 = SrcBits, Tmp1 = SrcBits;
39131 if (!!DemandedLHS)
39132 Tmp0 = NumSignBitsPACKSS(Op.getOperand(0), DemandedLHS);
39133 if (!!DemandedRHS)
39134 Tmp1 = NumSignBitsPACKSS(Op.getOperand(1), DemandedRHS);
39135 unsigned Tmp = std::min(Tmp0, Tmp1);
39136 if (Tmp > (SrcBits - VTBits))
39137 return Tmp - (SrcBits - VTBits);
39138 return 1;
39139 }
39140
39141 case X86ISD::VBROADCAST: {
39142 SDValue Src = Op.getOperand(0);
39143 if (!Src.getSimpleValueType().isVector())
39144 return DAG.ComputeNumSignBits(Src, Depth + 1);
39145 break;
39146 }
39147
39148 case X86ISD::VSHLI: {
39149 SDValue Src = Op.getOperand(0);
39150 const APInt &ShiftVal = Op.getConstantOperandAPInt(1);
39151 if (ShiftVal.uge(VTBits))
39152 return VTBits; // Shifted all bits out --> zero.
39153 unsigned Tmp = DAG.ComputeNumSignBits(Src, DemandedElts, Depth + 1);
39154 if (ShiftVal.uge(Tmp))
39155 return 1; // Shifted all sign bits out --> unknown.
39156 return Tmp - ShiftVal.getZExtValue();
39157 }
39158
39159 case X86ISD::VSRAI: {
39160 SDValue Src = Op.getOperand(0);
39161 APInt ShiftVal = Op.getConstantOperandAPInt(1);
39162 if (ShiftVal.uge(VTBits - 1))
39163 return VTBits; // Sign splat.
39164 unsigned Tmp = DAG.ComputeNumSignBits(Src, DemandedElts, Depth + 1);
39165 ShiftVal += Tmp;
39166 return ShiftVal.uge(VTBits) ? VTBits : ShiftVal.getZExtValue();
39167 }
39168
39169 case X86ISD::FSETCC:
39170 // cmpss/cmpsd return zero/all-bits result values in the bottom element.
39171 if (VT == MVT::f32 || VT == MVT::f64 ||
39172 ((VT == MVT::v4f32 || VT == MVT::v2f64) && DemandedElts == 1))
39173 return VTBits;
39174 break;
39175
39176 case X86ISD::PCMPGT:
39177 case X86ISD::PCMPEQ:
39178 case X86ISD::CMPP:
39179 case X86ISD::VPCOM:
39180 case X86ISD::VPCOMU:
39181 // Vector compares return zero/all-bits result values.
39182 return VTBits;
39183
39184 case X86ISD::ANDNP: {
39185 unsigned Tmp0 =
39186 DAG.ComputeNumSignBits(Op.getOperand(0), DemandedElts, Depth + 1);
39187 if (Tmp0 == 1) return 1; // Early out.
39188 unsigned Tmp1 =
39189 DAG.ComputeNumSignBits(Op.getOperand(1), DemandedElts, Depth + 1);
39190 return std::min(Tmp0, Tmp1);
39191 }
39192
39193 case X86ISD::CMOV: {
39194 unsigned Tmp0 = DAG.ComputeNumSignBits(Op.getOperand(0), Depth+1);
39195 if (Tmp0 == 1) return 1; // Early out.
39196 unsigned Tmp1 = DAG.ComputeNumSignBits(Op.getOperand(1), Depth+1);
39197 return std::min(Tmp0, Tmp1);
39198 }
39199 }
39200
39201 // Handle target shuffles.
39202 // TODO - use resolveTargetShuffleInputs once we can limit recursive depth.
39203 if (isTargetShuffle(Opcode)) {
39204 SmallVector<int, 64> Mask;
39205 SmallVector<SDValue, 2> Ops;
39206 if (getTargetShuffleMask(Op, true, Ops, Mask)) {
39207 unsigned NumOps = Ops.size();
39208 unsigned NumElts = VT.getVectorNumElements();
39209 if (Mask.size() == NumElts) {
39210 SmallVector<APInt, 2> DemandedOps(NumOps, APInt(NumElts, 0));
39211 for (unsigned i = 0; i != NumElts; ++i) {
39212 if (!DemandedElts[i])
39213 continue;
39214 int M = Mask[i];
39215 if (M == SM_SentinelUndef) {
39216 // For UNDEF elements, we don't know anything about the common state
39217 // of the shuffle result.
39218 return 1;
39219 } else if (M == SM_SentinelZero) {
39220 // Zero = all sign bits.
39221 continue;
39222 }
39223 assert(0 <= M && (unsigned)M < (NumOps * NumElts) &&
39224 "Shuffle index out of range");
39225
39226 unsigned OpIdx = (unsigned)M / NumElts;
39227 unsigned EltIdx = (unsigned)M % NumElts;
39228 if (Ops[OpIdx].getValueType() != VT) {
39229 // TODO - handle target shuffle ops with different value types.
39230 return 1;
39231 }
39232 DemandedOps[OpIdx].setBit(EltIdx);
39233 }
39234 unsigned Tmp0 = VTBits;
39235 for (unsigned i = 0; i != NumOps && Tmp0 > 1; ++i) {
39236 if (!DemandedOps[i])
39237 continue;
39238 unsigned Tmp1 =
39239 DAG.ComputeNumSignBits(Ops[i], DemandedOps[i], Depth + 1);
39240 Tmp0 = std::min(Tmp0, Tmp1);
39241 }
39242 return Tmp0;
39243 }
39244 }
39245 }
39246
39247 // Fallback case.
39248 return 1;
39249}
39250
39251 static SDValue unwrapAddress(SDValue N) {
39252 if (N->getOpcode() == X86ISD::Wrapper || N->getOpcode() == X86ISD::WrapperRIP)
39253 return N->getOperand(0);
39254 return N;
39255}
39256
39257// Helper to look for a normal load that can be narrowed into a vzload with the
39258// specified VT and memory VT. Returns SDValue() on failure.
39259 static SDValue narrowLoadToVZLoad(LoadSDNode *LN, MVT MemVT, MVT VT,
39260 SelectionDAG &DAG) {
39261 // Can't if the load is volatile or atomic.
39262 if (!LN->isSimple())
39263 return SDValue();
39264
39265 SDVTList Tys = DAG.getVTList(VT, MVT::Other);
39266 SDValue Ops[] = {LN->getChain(), LN->getBasePtr()};
39267 return DAG.getMemIntrinsicNode(X86ISD::VZEXT_LOAD, SDLoc(LN), Tys, Ops, MemVT,
39268 LN->getPointerInfo(), LN->getBaseAlign(),
39269 LN->getMemOperand()->getFlags());
39270}
39271
39272// Attempt to match a combined shuffle mask against supported unary shuffle
39273// instructions.
39274// TODO: Investigate sharing more of this with shuffle lowering.
39275static bool matchUnaryShuffle(MVT MaskVT, ArrayRef<int> Mask,
39276 bool AllowFloatDomain, bool AllowIntDomain,
39277 SDValue V1, const SelectionDAG &DAG,
39278 const X86Subtarget &Subtarget, unsigned &Shuffle,
39279 MVT &SrcVT, MVT &DstVT) {
39280 unsigned NumMaskElts = Mask.size();
39281 unsigned MaskEltSize = MaskVT.getScalarSizeInBits();
39282
39283 // Match against a VZEXT_MOVL vXi32 and vXi16 zero-extending instruction.
39284 if (Mask[0] == 0 &&
39285 (MaskEltSize == 32 || (MaskEltSize == 16 && Subtarget.hasFP16()))) {
39286 if ((isUndefOrZero(Mask[1]) && isUndefInRange(Mask, 2, NumMaskElts - 2)) ||
39287 (V1.getOpcode() == ISD::SCALAR_TO_VECTOR &&
39288 isUndefOrZeroInRange(Mask, 1, NumMaskElts - 1))) {
39289 Shuffle = X86ISD::VZEXT_MOVL;
39290 if (MaskEltSize == 16)
39291 SrcVT = DstVT = MaskVT.changeVectorElementType(MVT::f16);
39292 else
39293 SrcVT = DstVT = !Subtarget.hasSSE2() ? MVT::v4f32 : MaskVT;
39294 return true;
39295 }
39296 }
39297
39298 // Match against an ANY/SIGN/ZERO_EXTEND_VECTOR_INREG instruction.
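// For example, with Scale == 2 a v8i16 mask {0, Z, 1, Z, 2, Z, 3, Z} (Z being
// a zero element) acts as a zero-extension of the low four i16 elements to
// i32; with undefs instead of zeros an any-extend is also acceptable.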
39299 if (AllowIntDomain &&
39300 ((MaskVT.is128BitVector() && Subtarget.hasSSE41()) ||
39301 (MaskVT.is256BitVector() && Subtarget.hasInt256()) ||
39302 (MaskVT.is512BitVector() && Subtarget.useAVX512Regs()))) {
39303 unsigned MaxScale = 64 / MaskEltSize;
39304 bool UseSign = V1.getScalarValueSizeInBits() == MaskEltSize &&
39305 DAG.ComputeNumSignBits(V1) == MaskEltSize;
39306 for (unsigned Scale = 2; Scale <= MaxScale; Scale *= 2) {
39307 // Skip 512-bit VPMOV?XBW on non-AVX512BW targets.
39308 if (Scale == 2 && MaskVT == MVT::v64i8 && !Subtarget.useBWIRegs())
39309 continue;
39310 bool MatchAny = true;
39311 bool MatchZero = true;
39312 bool MatchSign = UseSign;
39313 unsigned NumDstElts = NumMaskElts / Scale;
39314 for (unsigned i = 0;
39315 i != NumDstElts && (MatchAny || MatchSign || MatchZero); ++i) {
39316 if (!isUndefOrEqual(Mask[i * Scale], (int)i)) {
39317 MatchAny = MatchSign = MatchZero = false;
39318 break;
39319 }
39320 unsigned Pos = (i * Scale) + 1;
39321 unsigned Len = Scale - 1;
39322 MatchAny &= isUndefInRange(Mask, Pos, Len);
39323 MatchZero &= isUndefOrZeroInRange(Mask, Pos, Len);
39324 MatchSign &= isUndefOrEqualInRange(Mask, (int)i, Pos, Len);
39325 }
39326 if (MatchAny || MatchSign || MatchZero) {
39327 assert((MatchSign || MatchZero) &&
39328 "Failed to match sext/zext but matched aext?");
39329 unsigned SrcSize = std::max(128u, NumDstElts * MaskEltSize);
39330 MVT ScalarTy = MaskVT.isInteger() ? MaskVT.getScalarType()
39331 : MVT::getIntegerVT(MaskEltSize);
39332 SrcVT = MVT::getVectorVT(ScalarTy, SrcSize / MaskEltSize);
39333
39334 Shuffle = unsigned(
39335 MatchAny ? ISD::ANY_EXTEND
39336 : (MatchSign ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND));
39337 if (SrcVT.getVectorNumElements() != NumDstElts)
39338 Shuffle = DAG.getOpcode_EXTEND_VECTOR_INREG(Shuffle);
39339
39340 DstVT = MVT::getIntegerVT(Scale * MaskEltSize);
39341 DstVT = MVT::getVectorVT(DstVT, NumDstElts);
39342 return true;
39343 }
39344 }
39345 }
39346
39347 // Match against a VZEXT_MOVL instruction; SSE1 only supports 32-bit elements (MOVSS).
39348 if (((MaskEltSize == 32) || (MaskEltSize == 64 && Subtarget.hasSSE2()) ||
39349 (MaskEltSize == 16 && Subtarget.hasFP16())) &&
39350 isUndefOrEqual(Mask[0], 0) &&
39351 isUndefOrZeroInRange(Mask, 1, NumMaskElts - 1)) {
39352 Shuffle = X86ISD::VZEXT_MOVL;
39353 if (MaskEltSize == 16)
39354 SrcVT = DstVT = MaskVT.changeVectorElementType(MVT::f16);
39355 else
39356 SrcVT = DstVT = !Subtarget.hasSSE2() ? MVT::v4f32 : MaskVT;
39357 return true;
39358 }
39359
39360 // Check if we have SSE3 which will let us use MOVDDUP etc. The
39361 // instructions are no slower than UNPCKLPD but have the option to
39362 // fold the input operand into even an unaligned memory load.
39363 if (MaskVT.is128BitVector() && Subtarget.hasSSE3() && AllowFloatDomain) {
39364 if (isTargetShuffleEquivalent(MaskVT, Mask, {0, 0}, DAG, V1)) {
39365 Shuffle = X86ISD::MOVDDUP;
39366 SrcVT = DstVT = MVT::v2f64;
39367 return true;
39368 }
39369 if (isTargetShuffleEquivalent(MaskVT, Mask, {0, 0, 2, 2}, DAG, V1)) {
39370 Shuffle = X86ISD::MOVSLDUP;
39371 SrcVT = DstVT = MVT::v4f32;
39372 return true;
39373 }
39374 if (isTargetShuffleEquivalent(MaskVT, Mask, {1, 1, 3, 3}, DAG, V1)) {
39375 Shuffle = X86ISD::MOVSHDUP;
39376 SrcVT = DstVT = MVT::v4f32;
39377 return true;
39378 }
39379 }
39380
39381 if (MaskVT.is256BitVector() && AllowFloatDomain) {
39382 assert(Subtarget.hasAVX() && "AVX required for 256-bit vector shuffles");
39383 if (isTargetShuffleEquivalent(MaskVT, Mask, {0, 0, 2, 2}, DAG, V1)) {
39384 Shuffle = X86ISD::MOVDDUP;
39385 SrcVT = DstVT = MVT::v4f64;
39386 return true;
39387 }
39388 if (isTargetShuffleEquivalent(MaskVT, Mask, {0, 0, 2, 2, 4, 4, 6, 6}, DAG,
39389 V1)) {
39390 Shuffle = X86ISD::MOVSLDUP;
39391 SrcVT = DstVT = MVT::v8f32;
39392 return true;
39393 }
39394 if (isTargetShuffleEquivalent(MaskVT, Mask, {1, 1, 3, 3, 5, 5, 7, 7}, DAG,
39395 V1)) {
39396 Shuffle = X86ISD::MOVSHDUP;
39397 SrcVT = DstVT = MVT::v8f32;
39398 return true;
39399 }
39400 }
39401
39402 if (MaskVT.is512BitVector() && AllowFloatDomain) {
39403 assert(Subtarget.hasAVX512() &&
39404 "AVX512 required for 512-bit vector shuffles");
39405 if (isTargetShuffleEquivalent(MaskVT, Mask, {0, 0, 2, 2, 4, 4, 6, 6}, DAG,
39406 V1)) {
39407 Shuffle = X86ISD::MOVDDUP;
39408 SrcVT = DstVT = MVT::v8f64;
39409 return true;
39410 }
39411 if (isTargetShuffleEquivalent(
39412 MaskVT, Mask,
39413 {0, 0, 2, 2, 4, 4, 6, 6, 8, 8, 10, 10, 12, 12, 14, 14}, DAG, V1)) {
39414 Shuffle = X86ISD::MOVSLDUP;
39415 SrcVT = DstVT = MVT::v16f32;
39416 return true;
39417 }
39418 if (isTargetShuffleEquivalent(
39419 MaskVT, Mask,
39420 {1, 1, 3, 3, 5, 5, 7, 7, 9, 9, 11, 11, 13, 13, 15, 15}, DAG, V1)) {
39421 Shuffle = X86ISD::MOVSHDUP;
39422 SrcVT = DstVT = MVT::v16f32;
39423 return true;
39424 }
39425 }
39426
39427 return false;
39428}
39429
39430// Attempt to match a combined shuffle mask against supported unary immediate
39431// permute instructions.
39432// TODO: Investigate sharing more of this with shuffle lowering.
39433 static bool matchUnaryPermuteShuffle(MVT MaskVT, ArrayRef<int> Mask,
39434 const APInt &Zeroable,
39435 bool AllowFloatDomain, bool AllowIntDomain,
39436 const SelectionDAG &DAG,
39437 const X86Subtarget &Subtarget,
39438 unsigned &Shuffle, MVT &ShuffleVT,
39439 unsigned &PermuteImm) {
39440 unsigned NumMaskElts = Mask.size();
39441 unsigned InputSizeInBits = MaskVT.getSizeInBits();
39442 unsigned MaskScalarSizeInBits = InputSizeInBits / NumMaskElts;
39443 MVT MaskEltVT = MVT::getIntegerVT(MaskScalarSizeInBits);
39444 bool ContainsZeros = isAnyZero(Mask);
39445
39446 // Handle VPERMI/VPERMILPD vXi64/vXf64 patterns.
39447 if (!ContainsZeros && MaskScalarSizeInBits == 64) {
39448 // Check for lane crossing permutes.
39449 if (is128BitLaneCrossingShuffleMask(MaskEltVT, Mask)) {
39450 // PERMPD/PERMQ permutes within a 256-bit vector (AVX2+).
39451 if (Subtarget.hasAVX2() && MaskVT.is256BitVector()) {
39452 Shuffle = X86ISD::VPERMI;
39453 ShuffleVT = (AllowFloatDomain ? MVT::v4f64 : MVT::v4i64);
39454 PermuteImm = getV4X86ShuffleImm(Mask);
39455 return true;
39456 }
39457 if (Subtarget.hasAVX512() && MaskVT.is512BitVector()) {
39458 SmallVector<int, 4> RepeatedMask;
39459 if (is256BitLaneRepeatedShuffleMask(MVT::v8f64, Mask, RepeatedMask)) {
39460 Shuffle = X86ISD::VPERMI;
39461 ShuffleVT = (AllowFloatDomain ? MVT::v8f64 : MVT::v8i64);
39462 PermuteImm = getV4X86ShuffleImm(RepeatedMask);
39463 return true;
39464 }
39465 }
39466 } else if (AllowFloatDomain && Subtarget.hasAVX()) {
39467 // VPERMILPD can permute with a non-repeating shuffle.
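// VPERMILPI over f64 elements takes one selection bit per element: bit i
// picks the low or high double within element i's 128-bit lane, which is
// exactly what a non-lane-crossing vXi64 mask encodes via (Mask[i] & 1).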
39468 Shuffle = X86ISD::VPERMILPI;
39469 ShuffleVT = MVT::getVectorVT(MVT::f64, Mask.size());
39470 PermuteImm = 0;
39471 for (int i = 0, e = Mask.size(); i != e; ++i) {
39472 int M = Mask[i];
39473 if (M == SM_SentinelUndef)
39474 continue;
39475 assert(((M / 2) == (i / 2)) && "Out of range shuffle mask index");
39476 PermuteImm |= (M & 1) << i;
39477 }
39478 return true;
39479 }
39480 }
39481
39482 // We check for both a shuffle match and a shift match. Loop twice so we can
39483 // control which we try to match first, depending on target preference.
39484 for (unsigned Order = 0; Order < 2; ++Order) {
39485 if (Subtarget.preferLowerShuffleAsShift() ? (Order == 1) : (Order == 0)) {
39486 // Handle PSHUFD/VPERMILPI vXi32/vXf32 repeated patterns.
39487 // AVX introduced the VPERMILPD/VPERMILPS float permutes, before then we
39488 // had to use 2-input SHUFPD/SHUFPS shuffles (not handled here).
39489 if ((MaskScalarSizeInBits == 64 || MaskScalarSizeInBits == 32) &&
39490 !ContainsZeros && (AllowIntDomain || Subtarget.hasAVX())) {
39491 SmallVector<int, 4> RepeatedMask;
39492 if (is128BitLaneRepeatedShuffleMask(MaskEltVT, Mask, RepeatedMask)) {
39493 // Narrow the repeated mask to create 32-bit element permutes.
39494 SmallVector<int, 4> WordMask = RepeatedMask;
39495 if (MaskScalarSizeInBits == 64)
39496 narrowShuffleMaskElts(2, RepeatedMask, WordMask);
39497
39498 Shuffle = (AllowIntDomain ? X86ISD::PSHUFD : X86ISD::VPERMILPI);
39499 ShuffleVT = (AllowIntDomain ? MVT::i32 : MVT::f32);
39500 ShuffleVT = MVT::getVectorVT(ShuffleVT, InputSizeInBits / 32);
39501 PermuteImm = getV4X86ShuffleImm(WordMask);
39502 return true;
39503 }
39504 }
39505
39506 // Handle PSHUFLW/PSHUFHW vXi16 repeated patterns.
39507 if (!ContainsZeros && AllowIntDomain && MaskScalarSizeInBits == 16 &&
39508 ((MaskVT.is128BitVector() && Subtarget.hasSSE2()) ||
39509 (MaskVT.is256BitVector() && Subtarget.hasAVX2()) ||
39510 (MaskVT.is512BitVector() && Subtarget.hasBWI()))) {
39511 SmallVector<int, 4> RepeatedMask;
39512 if (is128BitLaneRepeatedShuffleMask(MaskEltVT, Mask, RepeatedMask)) {
39513 ArrayRef<int> LoMask(RepeatedMask.data() + 0, 4);
39514 ArrayRef<int> HiMask(RepeatedMask.data() + 4, 4);
39515
39516 // PSHUFLW: permute lower 4 elements only.
39517 if (isUndefOrInRange(LoMask, 0, 4) &&
39518 isSequentialOrUndefInRange(HiMask, 0, 4, 4)) {
39519 Shuffle = X86ISD::PSHUFLW;
39520 ShuffleVT = MVT::getVectorVT(MVT::i16, InputSizeInBits / 16);
39521 PermuteImm = getV4X86ShuffleImm(LoMask);
39522 return true;
39523 }
39524
39525 // PSHUFHW: permute upper 4 elements only.
39526 if (isUndefOrInRange(HiMask, 4, 8) &&
39527 isSequentialOrUndefInRange(LoMask, 0, 4, 0)) {
39528 // Offset the HiMask so that we can create the shuffle immediate.
39529 int OffsetHiMask[4];
39530 for (int i = 0; i != 4; ++i)
39531 OffsetHiMask[i] = (HiMask[i] < 0 ? HiMask[i] : HiMask[i] - 4);
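// e.g. a HiMask of {5, 4, 7, 6} becomes {1, 0, 3, 2}, which getV4X86ShuffleImm
// can then encode relative to the upper half.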
39532
39533 Shuffle = X86ISD::PSHUFHW;
39534 ShuffleVT = MVT::getVectorVT(MVT::i16, InputSizeInBits / 16);
39535 PermuteImm = getV4X86ShuffleImm(OffsetHiMask);
39536 return true;
39537 }
39538 }
39539 }
39540 } else {
39541 // Attempt to match against bit rotates.
39542 if (!ContainsZeros && AllowIntDomain && MaskScalarSizeInBits < 64 &&
39543 ((MaskVT.is128BitVector() && Subtarget.hasXOP()) ||
39544 Subtarget.hasAVX512())) {
39545 int RotateAmt = matchShuffleAsBitRotate(ShuffleVT, MaskScalarSizeInBits,
39546 Subtarget, Mask);
39547 if (0 < RotateAmt) {
39548 Shuffle = X86ISD::VROTLI;
39549 PermuteImm = (unsigned)RotateAmt;
39550 return true;
39551 }
39552 }
39553 }
39554 // Attempt to match against byte/bit shifts.
39555 if (AllowIntDomain &&
39556 ((MaskVT.is128BitVector() && Subtarget.hasSSE2()) ||
39557 (MaskVT.is256BitVector() && Subtarget.hasAVX2()) ||
39558 (MaskVT.is512BitVector() && Subtarget.hasAVX512()))) {
39559 int ShiftAmt =
39560 matchShuffleAsShift(ShuffleVT, Shuffle, MaskScalarSizeInBits, Mask, 0,
39561 Zeroable, Subtarget);
39562 if (0 < ShiftAmt && (!ShuffleVT.is512BitVector() || Subtarget.hasBWI() ||
39563 32 <= ShuffleVT.getScalarSizeInBits())) {
39564 // Byte shifts can be slower so only match them on second attempt.
39565 if (Order == 0 &&
39566 (Shuffle == X86ISD::VSHLDQ || Shuffle == X86ISD::VSRLDQ))
39567 continue;
39568
39569 PermuteImm = (unsigned)ShiftAmt;
39570 return true;
39571 }
39572
39573 }
39574 }
39575
39576 return false;
39577}
39578
39579// Attempt to match a combined unary shuffle mask against supported binary
39580// shuffle instructions.
39581// TODO: Investigate sharing more of this with shuffle lowering.
39582static bool matchBinaryShuffle(MVT MaskVT, ArrayRef<int> Mask,
39583 bool AllowFloatDomain, bool AllowIntDomain,
39584 SDValue &V1, SDValue &V2, const SDLoc &DL,
39585 SelectionDAG &DAG, const X86Subtarget &Subtarget,
39586 unsigned &Shuffle, MVT &SrcVT, MVT &DstVT,
39587 bool IsUnary) {
39588 unsigned NumMaskElts = Mask.size();
39589 unsigned EltSizeInBits = MaskVT.getScalarSizeInBits();
39590 unsigned SizeInBits = MaskVT.getSizeInBits();
39591
39592 if (MaskVT.is128BitVector()) {
39593 if (isTargetShuffleEquivalent(MaskVT, Mask, {0, 0}, DAG) &&
39594 AllowFloatDomain) {
39595 V2 = V1;
39596 V1 = (SM_SentinelUndef == Mask[0] ? DAG.getUNDEF(MVT::v4f32) : V1);
39597 Shuffle = Subtarget.hasSSE2() ? X86ISD::UNPCKL : X86ISD::MOVLHPS;
39598 SrcVT = DstVT = Subtarget.hasSSE2() ? MVT::v2f64 : MVT::v4f32;
39599 return true;
39600 }
39601 if (isTargetShuffleEquivalent(MaskVT, Mask, {1, 1}, DAG) &&
39602 AllowFloatDomain) {
39603 V2 = V1;
39604 Shuffle = Subtarget.hasSSE2() ? X86ISD::UNPCKH : X86ISD::MOVHLPS;
39605 SrcVT = DstVT = Subtarget.hasSSE2() ? MVT::v2f64 : MVT::v4f32;
39606 return true;
39607 }
39608 if (isTargetShuffleEquivalent(MaskVT, Mask, {0, 3}, DAG) &&
39609 Subtarget.hasSSE2() && (AllowFloatDomain || !Subtarget.hasSSE41())) {
39610 std::swap(V1, V2);
39611 Shuffle = X86ISD::MOVSD;
39612 SrcVT = DstVT = MVT::v2f64;
39613 return true;
39614 }
39615 if (isTargetShuffleEquivalent(MaskVT, Mask, {4, 1, 2, 3}, DAG) &&
39616 (AllowFloatDomain || !Subtarget.hasSSE41())) {
39617 Shuffle = X86ISD::MOVSS;
39618 SrcVT = DstVT = MVT::v4f32;
39619 return true;
39620 }
39621 if (isTargetShuffleEquivalent(MaskVT, Mask, {8, 1, 2, 3, 4, 5, 6, 7},
39622 DAG) &&
39623 Subtarget.hasFP16()) {
39624 Shuffle = X86ISD::MOVSH;
39625 SrcVT = DstVT = MVT::v8f16;
39626 return true;
39627 }
39628 }
39629
39630 // Attempt to match against either a unary or binary PACKSS/PACKUS shuffle.
39631 if (((MaskVT == MVT::v8i16 || MaskVT == MVT::v16i8) && Subtarget.hasSSE2()) ||
39632 ((MaskVT == MVT::v16i16 || MaskVT == MVT::v32i8) && Subtarget.hasInt256()) ||
39633 ((MaskVT == MVT::v32i16 || MaskVT == MVT::v64i8) && Subtarget.hasBWI())) {
39634 if (matchShuffleWithPACK(MaskVT, SrcVT, V1, V2, Shuffle, Mask, DAG,
39635 Subtarget)) {
39636 DstVT = MaskVT;
39637 return true;
39638 }
39639 }
39640 // TODO: Can we handle this inside matchShuffleWithPACK?
39641 if (MaskVT == MVT::v4i32 && Subtarget.hasSSE2() &&
39642 isTargetShuffleEquivalent(MaskVT, Mask, {0, 2, 4, 6}, DAG) &&
39643 V1.getScalarValueSizeInBits() == 64 &&
39644 V2.getScalarValueSizeInBits() == 64) {
39645 // Use (SSE41) PACKUSDW if the leading zero bits extend down to the lowest 16 bits.
39646 unsigned MinLZV1 = DAG.computeKnownBits(V1).countMinLeadingZeros();
39647 unsigned MinLZV2 = DAG.computeKnownBits(V2).countMinLeadingZeros();
39648 if (Subtarget.hasSSE41() && MinLZV1 >= 48 && MinLZV2 >= 48) {
39649 SrcVT = MVT::v4i32;
39650 DstVT = MVT::v8i16;
39651 Shuffle = X86ISD::PACKUS;
39652 return true;
39653 }
39654 // Use PACKUSWB if the leading zero bits extend down to the lowest 8 bits.
39655 if (MinLZV1 >= 56 && MinLZV2 >= 56) {
39656 SrcVT = MVT::v8i16;
39657 DstVT = MVT::v16i8;
39658 Shuffle = X86ISD::PACKUS;
39659 return true;
39660 }
39661 // Use PACKSSDW if the sign bits extend down to the lowest 16 bits.
39662 if (DAG.ComputeNumSignBits(V1) > 48 && DAG.ComputeNumSignBits(V2) > 48) {
39663 SrcVT = MVT::v4i32;
39664 DstVT = MVT::v8i16;
39665 Shuffle = X86ISD::PACKSS;
39666 return true;
39667 }
39668 }
39669
39670 // Attempt to match against either a unary or binary UNPCKL/UNPCKH shuffle.
39671 if ((MaskVT == MVT::v4f32 && Subtarget.hasSSE1()) ||
39672 (MaskVT.is128BitVector() && Subtarget.hasSSE2()) ||
39673 (MaskVT.is256BitVector() && 32 <= EltSizeInBits && Subtarget.hasAVX()) ||
39674 (MaskVT.is256BitVector() && Subtarget.hasAVX2()) ||
39675 (MaskVT.is512BitVector() && Subtarget.hasAVX512() &&
39676 (32 <= EltSizeInBits || Subtarget.hasBWI()))) {
39677 if (matchShuffleWithUNPCK(MaskVT, V1, V2, Shuffle, IsUnary, Mask, DL, DAG,
39678 Subtarget)) {
39679 SrcVT = DstVT = MaskVT;
39680 if (MaskVT.is256BitVector() && !Subtarget.hasAVX2())
39681 SrcVT = DstVT = (32 == EltSizeInBits ? MVT::v8f32 : MVT::v4f64);
39682 return true;
39683 }
39684 }
39685
39686 // Attempt to match against an OR if we're performing a blend shuffle and the
39687 // non-blended source element is zero in each case.
39688 // TODO: Handle cases where the V1/V2 sizes don't match SizeInBits.
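// e.g. a v4i32 mask {0, 5, 2, 7} is an OR if elements 1 and 3 of V1 and
// elements 0 and 2 of V2 are known to be zero.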
39689 if (SizeInBits == V1.getValueSizeInBits() &&
39690 SizeInBits == V2.getValueSizeInBits() &&
39691 (EltSizeInBits % V1.getScalarValueSizeInBits()) == 0 &&
39692 (EltSizeInBits % V2.getScalarValueSizeInBits()) == 0) {
39693 bool IsBlend = true;
39694 unsigned NumV1Elts = V1.getValueType().getVectorNumElements();
39695 unsigned NumV2Elts = V2.getValueType().getVectorNumElements();
39696 unsigned Scale1 = NumV1Elts / NumMaskElts;
39697 unsigned Scale2 = NumV2Elts / NumMaskElts;
39698 APInt DemandedZeroV1 = APInt::getZero(NumV1Elts);
39699 APInt DemandedZeroV2 = APInt::getZero(NumV2Elts);
39700 for (unsigned i = 0; i != NumMaskElts; ++i) {
39701 int M = Mask[i];
39702 if (M == SM_SentinelUndef)
39703 continue;
39704 if (M == SM_SentinelZero) {
39705 DemandedZeroV1.setBits(i * Scale1, (i + 1) * Scale1);
39706 DemandedZeroV2.setBits(i * Scale2, (i + 1) * Scale2);
39707 continue;
39708 }
39709 if (M == (int)i) {
39710 DemandedZeroV2.setBits(i * Scale2, (i + 1) * Scale2);
39711 continue;
39712 }
39713 if (M == (int)(i + NumMaskElts)) {
39714 DemandedZeroV1.setBits(i * Scale1, (i + 1) * Scale1);
39715 continue;
39716 }
39717 IsBlend = false;
39718 break;
39719 }
39720 if (IsBlend) {
39721 if (DAG.MaskedVectorIsZero(V1, DemandedZeroV1) &&
39722 DAG.MaskedVectorIsZero(V2, DemandedZeroV2)) {
39723 Shuffle = ISD::OR;
39724 SrcVT = DstVT = MaskVT.changeTypeToInteger();
39725 return true;
39726 }
39727 if (NumV1Elts == NumV2Elts && NumV1Elts == NumMaskElts) {
39728 // FIXME: handle mismatched sizes?
39729 // TODO: investigate if `ISD::OR` handling in
39730 // `TargetLowering::SimplifyDemandedVectorElts` can be improved instead.
39731 auto computeKnownBitsElementWise = [&DAG](SDValue V) {
39732 unsigned NumElts = V.getValueType().getVectorNumElements();
39733 KnownBits Known(NumElts);
39734 for (unsigned EltIdx = 0; EltIdx != NumElts; ++EltIdx) {
39735 APInt Mask = APInt::getOneBitSet(NumElts, EltIdx);
39736 KnownBits PeepholeKnown = DAG.computeKnownBits(V, Mask);
39737 if (PeepholeKnown.isZero())
39738 Known.Zero.setBit(EltIdx);
39739 if (PeepholeKnown.isAllOnes())
39740 Known.One.setBit(EltIdx);
39741 }
39742 return Known;
39743 };
39744
39745 KnownBits V1Known = computeKnownBitsElementWise(V1);
39746 KnownBits V2Known = computeKnownBitsElementWise(V2);
39747
39748 for (unsigned i = 0; i != NumMaskElts && IsBlend; ++i) {
39749 int M = Mask[i];
39750 if (M == SM_SentinelUndef)
39751 continue;
39752 if (M == SM_SentinelZero) {
39753 IsBlend &= V1Known.Zero[i] && V2Known.Zero[i];
39754 continue;
39755 }
39756 if (M == (int)i) {
39757 IsBlend &= V2Known.Zero[i] || V1Known.One[i];
39758 continue;
39759 }
39760 if (M == (int)(i + NumMaskElts)) {
39761 IsBlend &= V1Known.Zero[i] || V2Known.One[i];
39762 continue;
39763 }
39764 llvm_unreachable("will not get here.");
39765 }
39766 if (IsBlend) {
39767 Shuffle = ISD::OR;
39768 SrcVT = DstVT = MaskVT.changeTypeToInteger();
39769 return true;
39770 }
39771 }
39772 }
39773 }
39774
39775 return false;
39776}
39777
39778 static bool matchBinaryPermuteShuffle(
39779 MVT MaskVT, ArrayRef<int> Mask, const APInt &Zeroable,
39780 bool AllowFloatDomain, bool AllowIntDomain, SDValue &V1, SDValue &V2,
39781 const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget,
39782 unsigned &Shuffle, MVT &ShuffleVT, unsigned &PermuteImm) {
39783 unsigned NumMaskElts = Mask.size();
39784 unsigned EltSizeInBits = MaskVT.getScalarSizeInBits();
39785
39786 // Attempt to match against VALIGND/VALIGNQ rotate.
39787 if (AllowIntDomain && (EltSizeInBits == 64 || EltSizeInBits == 32) &&
39788 ((MaskVT.is128BitVector() && Subtarget.hasVLX()) ||
39789 (MaskVT.is256BitVector() && Subtarget.hasVLX()) ||
39790 (MaskVT.is512BitVector() && Subtarget.hasAVX512()))) {
39791 MVT AlignVT = MVT::getVectorVT(MVT::getIntegerVT(EltSizeInBits),
39792 MaskVT.getSizeInBits() / EltSizeInBits);
39793 if (!isAnyZero(Mask)) {
39794 int Rotation = matchShuffleAsElementRotate(V1, V2, Mask);
39795 if (0 < Rotation) {
39796 Shuffle = X86ISD::VALIGN;
39797 ShuffleVT = AlignVT;
39798 PermuteImm = Rotation;
39799 return true;
39800 }
39801 }
39802 // See if we can use VALIGN as a cross-lane version of VSHLDQ/VSRLDQ.
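// The shifted-in zero elements are supplied by turning one of the VALIGN
// operands into an all-zeros vector, so the rotate pulls zeros in from that
// side.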
39803 unsigned ZeroLo = Zeroable.countr_one();
39804 unsigned ZeroHi = Zeroable.countl_one();
39805 assert((ZeroLo + ZeroHi) < NumMaskElts && "Zeroable shuffle detected");
39806 if (ZeroLo) {
39807 SmallVector<int, 16> ShiftMask(NumMaskElts, SM_SentinelZero);
39808 std::iota(ShiftMask.begin() + ZeroLo, ShiftMask.end(), 0);
39809 if (isTargetShuffleEquivalent(MaskVT, Mask, ShiftMask, DAG, V1)) {
39810 V2 = getZeroVector(AlignVT, Subtarget, DAG, DL);
39811 Shuffle = X86ISD::VALIGN;
39812 ShuffleVT = AlignVT;
39813 PermuteImm = NumMaskElts - ZeroLo;
39814 return true;
39815 }
39816 }
39817 if (ZeroHi) {
39818 SmallVector<int, 16> ShiftMask(NumMaskElts, SM_SentinelZero);
39819 std::iota(ShiftMask.begin(), ShiftMask.begin() + NumMaskElts - ZeroHi,
39820 ZeroHi);
39821 if (isTargetShuffleEquivalent(MaskVT, Mask, ShiftMask, DAG, V1)) {
39822 V2 = V1;
39823 V1 = getZeroVector(AlignVT, Subtarget, DAG, DL);
39824 Shuffle = X86ISD::VALIGN;
39825 ShuffleVT = AlignVT;
39826 PermuteImm = ZeroHi;
39827 return true;
39828 }
39829 }
39830 }
39831
39832 // Attempt to match against PALIGNR byte rotate.
39833 if (AllowIntDomain && ((MaskVT.is128BitVector() && Subtarget.hasSSSE3()) ||
39834 (MaskVT.is256BitVector() && Subtarget.hasAVX2()) ||
39835 (MaskVT.is512BitVector() && Subtarget.hasBWI()))) {
39836 int ByteRotation = matchShuffleAsByteRotate(MaskVT, V1, V2, Mask);
39837 if (0 < ByteRotation) {
39838 Shuffle = X86ISD::PALIGNR;
39839 ShuffleVT = MVT::getVectorVT(MVT::i8, MaskVT.getSizeInBits() / 8);
39840 PermuteImm = ByteRotation;
39841 return true;
39842 }
39843 }
39844
39845 // Attempt to combine to X86ISD::BLENDI.
39846 if ((NumMaskElts <= 8 && ((Subtarget.hasSSE41() && MaskVT.is128BitVector()) ||
39847 (Subtarget.hasAVX() && MaskVT.is256BitVector()))) ||
39848 (MaskVT == MVT::v16i16 && Subtarget.hasAVX2())) {
39849 uint64_t BlendMask = 0;
39850 bool ForceV1Zero = false, ForceV2Zero = false;
39851 SmallVector<int, 8> TargetMask(Mask);
39852 if (matchShuffleAsBlend(MaskVT, V1, V2, TargetMask, Zeroable, ForceV1Zero,
39853 ForceV2Zero, BlendMask)) {
39854 if (MaskVT == MVT::v16i16) {
39855 // We can only use v16i16 PBLENDW if the lanes are repeated.
39856 SmallVector<int, 8> RepeatedMask;
39857 if (isRepeatedTargetShuffleMask(128, MaskVT, TargetMask,
39858 RepeatedMask)) {
39859 assert(RepeatedMask.size() == 8 &&
39860 "Repeated mask size doesn't match!");
39861 PermuteImm = 0;
39862 for (int i = 0; i < 8; ++i)
39863 if (RepeatedMask[i] >= 8)
39864 PermuteImm |= 1 << i;
39865 V1 = ForceV1Zero ? getZeroVector(MaskVT, Subtarget, DAG, DL) : V1;
39866 V2 = ForceV2Zero ? getZeroVector(MaskVT, Subtarget, DAG, DL) : V2;
39867 Shuffle = X86ISD::BLENDI;
39868 ShuffleVT = MaskVT;
39869 return true;
39870 }
39871 } else {
39872 V1 = ForceV1Zero ? getZeroVector(MaskVT, Subtarget, DAG, DL) : V1;
39873 V2 = ForceV2Zero ? getZeroVector(MaskVT, Subtarget, DAG, DL) : V2;
39874 PermuteImm = (unsigned)BlendMask;
39875 Shuffle = X86ISD::BLENDI;
39876 ShuffleVT = MaskVT;
39877 return true;
39878 }
39879 }
39880 }
39881
39882 // Attempt to combine to INSERTPS, but only if it has elements that need to
39883 // be set to zero.
39884 if (AllowFloatDomain && EltSizeInBits == 32 && Subtarget.hasSSE41() &&
39885 MaskVT.is128BitVector() && isAnyZero(Mask) &&
39886 matchShuffleAsInsertPS(V1, V2, PermuteImm, Zeroable, Mask, DAG)) {
39887 Shuffle = X86ISD::INSERTPS;
39888 ShuffleVT = MVT::v4f32;
39889 return true;
39890 }
39891
39892 // Attempt to combine to SHUFPD.
39893 if (AllowFloatDomain && EltSizeInBits == 64 &&
39894 ((MaskVT.is128BitVector() && Subtarget.hasSSE2()) ||
39895 (MaskVT.is256BitVector() && Subtarget.hasAVX()) ||
39896 (MaskVT.is512BitVector() && Subtarget.hasAVX512()))) {
39897 bool ForceV1Zero = false, ForceV2Zero = false;
39898 if (matchShuffleWithSHUFPD(MaskVT, V1, V2, ForceV1Zero, ForceV2Zero,
39899 PermuteImm, Mask, Zeroable)) {
39900 V1 = ForceV1Zero ? getZeroVector(MaskVT, Subtarget, DAG, DL) : V1;
39901 V2 = ForceV2Zero ? getZeroVector(MaskVT, Subtarget, DAG, DL) : V2;
39902 Shuffle = X86ISD::SHUFP;
39903 ShuffleVT = MVT::getVectorVT(MVT::f64, MaskVT.getSizeInBits() / 64);
39904 return true;
39905 }
39906 }
39907
39908 // Attempt to combine to SHUFPS.
39909 if (AllowFloatDomain && EltSizeInBits == 32 &&
39910 ((MaskVT.is128BitVector() && Subtarget.hasSSE1()) ||
39911 (MaskVT.is256BitVector() && Subtarget.hasAVX()) ||
39912 (MaskVT.is512BitVector() && Subtarget.hasAVX512()))) {
39913 SmallVector<int, 4> RepeatedMask;
39914 if (isRepeatedTargetShuffleMask(128, MaskVT, Mask, RepeatedMask)) {
39915 // Match each half of the repeated mask to determine if it's just
39916 // referencing one of the vectors, is zeroable, or is entirely undef.
39917 auto MatchHalf = [&](unsigned Offset, int &S0, int &S1) {
39918 int M0 = RepeatedMask[Offset];
39919 int M1 = RepeatedMask[Offset + 1];
39920
39921 if (isUndefInRange(RepeatedMask, Offset, 2)) {
39922 return DAG.getUNDEF(MaskVT);
39923 } else if (isUndefOrZeroInRange(RepeatedMask, Offset, 2)) {
39924 S0 = (SM_SentinelUndef == M0 ? -1 : 0);
39925 S1 = (SM_SentinelUndef == M1 ? -1 : 1);
39926 return getZeroVector(MaskVT, Subtarget, DAG, DL);
39927 } else if (isUndefOrInRange(M0, 0, 4) && isUndefOrInRange(M1, 0, 4)) {
39928 S0 = (SM_SentinelUndef == M0 ? -1 : M0 & 3);
39929 S1 = (SM_SentinelUndef == M1 ? -1 : M1 & 3);
39930 return V1;
39931 } else if (isUndefOrInRange(M0, 4, 8) && isUndefOrInRange(M1, 4, 8)) {
39932 S0 = (SM_SentinelUndef == M0 ? -1 : M0 & 3);
39933 S1 = (SM_SentinelUndef == M1 ? -1 : M1 & 3);
39934 return V2;
39935 }
39936
39937 return SDValue();
39938 };
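// SHUFPS reads its two low result elements from the first source and its two
// high result elements from the second, so the match only succeeds if each
// half of the repeated mask resolves to a single operand (or to zero/undef).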
39939
39940 int ShufMask[4] = {-1, -1, -1, -1};
39941 SDValue Lo = MatchHalf(0, ShufMask[0], ShufMask[1]);
39942 SDValue Hi = MatchHalf(2, ShufMask[2], ShufMask[3]);
39943
39944 if (Lo && Hi) {
39945 V1 = Lo;
39946 V2 = Hi;
39947 Shuffle = X86ISD::SHUFP;
39948 ShuffleVT = MVT::getVectorVT(MVT::f32, MaskVT.getSizeInBits() / 32);
39949 PermuteImm = getV4X86ShuffleImm(ShufMask);
39950 return true;
39951 }
39952 }
39953 }
39954
39955 // Attempt to combine to INSERTPS more generally if X86ISD::SHUFP failed.
39956 if (AllowFloatDomain && EltSizeInBits == 32 && Subtarget.hasSSE41() &&
39957 MaskVT.is128BitVector() &&
39958 matchShuffleAsInsertPS(V1, V2, PermuteImm, Zeroable, Mask, DAG)) {
39959 Shuffle = X86ISD::INSERTPS;
39960 ShuffleVT = MVT::v4f32;
39961 return true;
39962 }
39963
39964 return false;
39965}
39966
39967 static SDValue combineX86ShuffleChainWithExtract(
39968 ArrayRef<SDValue> Inputs, unsigned RootOpcode, MVT RootVT,
39969 ArrayRef<int> BaseMask, int Depth, ArrayRef<const SDNode *> SrcNodes,
39970 bool AllowVariableCrossLaneMask, bool AllowVariablePerLaneMask,
39971 bool IsMaskedShuffle, SelectionDAG &DAG, const SDLoc &DL,
39972 const X86Subtarget &Subtarget);
39973
39974/// Combine an arbitrary chain of shuffles into a single instruction if
39975/// possible.
39976///
39977/// This is the leaf of the recursive combine below. When we have found some
39978/// chain of single-use x86 shuffle instructions and accumulated the combined
39979/// shuffle mask represented by them, this will try to pattern match that mask
39980/// into either a single instruction if there is a special purpose instruction
39981/// for this operation, or into a PSHUFB instruction which is a fully general
39982/// instruction but should only be used to replace chains over a certain depth.
39983 static SDValue combineX86ShuffleChain(
39984 ArrayRef<SDValue> Inputs, unsigned RootOpc, MVT RootVT,
39985 ArrayRef<int> BaseMask, int Depth, ArrayRef<const SDNode *> SrcNodes,
39986 bool AllowVariableCrossLaneMask, bool AllowVariablePerLaneMask,
39987 bool IsMaskedShuffle, SelectionDAG &DAG, const SDLoc &DL,
39988 const X86Subtarget &Subtarget) {
39989 assert(!BaseMask.empty() && "Cannot combine an empty shuffle mask!");
39990 assert((Inputs.size() == 1 || Inputs.size() == 2) &&
39991 "Unexpected number of shuffle inputs!");
39992 unsigned RootSizeInBits = RootVT.getSizeInBits();
39993 unsigned NumRootElts = RootVT.getVectorNumElements();
39994
39995 // Canonicalize shuffle input op to the requested type.
39996 auto CanonicalizeShuffleInput = [&](MVT VT, SDValue Op) {
39997 if (VT.getSizeInBits() > Op.getValueSizeInBits())
39998 Op = widenSubVector(Op, false, Subtarget, DAG, DL, VT.getSizeInBits());
39999 else if (VT.getSizeInBits() < Op.getValueSizeInBits())
40000 Op = extractSubVector(Op, 0, DAG, DL, VT.getSizeInBits());
40001 return DAG.getBitcast(VT, Op);
40002 };
40003
40004 // Find the inputs that enter the chain. Note that multiple uses are OK
40005 // here, we're not going to remove the operands we find.
40006 bool UnaryShuffle = (Inputs.size() == 1);
40007 SDValue V1 = peekThroughBitcasts(Inputs[0]);
40008 SDValue V2 = (UnaryShuffle ? DAG.getUNDEF(V1.getValueType())
40009 : peekThroughBitcasts(Inputs[1]));
40010
40011 MVT VT1 = V1.getSimpleValueType();
40012 MVT VT2 = V2.getSimpleValueType();
40013 assert((RootSizeInBits % VT1.getSizeInBits()) == 0 &&
40014 (RootSizeInBits % VT2.getSizeInBits()) == 0 && "Vector size mismatch");
40015
40016 SDValue Res;
40017
40018 unsigned NumBaseMaskElts = BaseMask.size();
40019 if (NumBaseMaskElts == 1) {
40020 assert(BaseMask[0] == 0 && "Invalid shuffle index found!");
40021 return CanonicalizeShuffleInput(RootVT, V1);
40022 }
40023
40024 bool OptForSize = DAG.shouldOptForSize();
40025 unsigned BaseMaskEltSizeInBits = RootSizeInBits / NumBaseMaskElts;
40026 bool FloatDomain = VT1.isFloatingPoint() || VT2.isFloatingPoint() ||
40027 (RootVT.isFloatingPoint() && Depth >= 1) ||
40028 (RootVT.is256BitVector() && !Subtarget.hasAVX2());
40029
40030 // If we are shuffling a splat (and not introducing zeros) then we can just
40031 // use it directly. This works for smaller elements as well, since they
40032 // already repeat across each mask element.
40033 if (UnaryShuffle && !isAnyZero(BaseMask) &&
40034 V1.getValueSizeInBits() >= RootSizeInBits &&
40035 (BaseMaskEltSizeInBits % V1.getScalarValueSizeInBits()) == 0 &&
40036 DAG.isSplatValue(V1, /*AllowUndefs*/ false)) {
40037 return CanonicalizeShuffleInput(RootVT, V1);
40038 }
40039
40040 SmallVector<int, 64> Mask(BaseMask);
40041
40042 // See if the shuffle is a hidden identity shuffle - repeated args in HOPs
40043 // etc. can be simplified.
40044 if (VT1 == VT2 && VT1.getSizeInBits() == RootSizeInBits && VT1.isVector()) {
40045 SmallVector<int> ScaledMask, IdentityMask;
40046 unsigned NumElts = VT1.getVectorNumElements();
40047 if (Mask.size() <= NumElts &&
40048 scaleShuffleElements(Mask, NumElts, ScaledMask)) {
40049 for (unsigned i = 0; i != NumElts; ++i)
40050 IdentityMask.push_back(i);
40051 if (isTargetShuffleEquivalent(RootVT, ScaledMask, IdentityMask, DAG, V1,
40052 V2))
40053 return CanonicalizeShuffleInput(RootVT, V1);
40054 }
40055 }
40056
40057 // Handle 128/256-bit lane shuffles of 512-bit vectors.
40058 if (RootVT.is512BitVector() &&
40059 (NumBaseMaskElts == 2 || NumBaseMaskElts == 4)) {
40060 // If the upper subvectors are zeroable, then an extract+insert is more
40061 // optimal than using X86ISD::SHUF128. The insertion is free, even if it has
40062 // to zero the upper subvectors.
40063 if (isUndefOrZeroInRange(Mask, 1, NumBaseMaskElts - 1)) {
40064 if (Depth == 0 && RootOpc == ISD::INSERT_SUBVECTOR)
40065 return SDValue(); // Nothing to do!
40066 assert(isInRange(Mask[0], 0, NumBaseMaskElts) &&
40067 "Unexpected lane shuffle");
40068 Res = CanonicalizeShuffleInput(RootVT, V1);
40069 unsigned SubIdx = Mask[0] * (NumRootElts / NumBaseMaskElts);
40070 bool UseZero = isAnyZero(Mask);
40071 Res = extractSubVector(Res, SubIdx, DAG, DL, BaseMaskEltSizeInBits);
40072 return widenSubVector(Res, UseZero, Subtarget, DAG, DL, RootSizeInBits);
40073 }
40074
40075 // Narrow shuffle mask to v4x128.
40076 SmallVector<int, 4> ScaledMask;
40077 assert((BaseMaskEltSizeInBits % 128) == 0 && "Illegal mask size");
40078 narrowShuffleMaskElts(BaseMaskEltSizeInBits / 128, Mask, ScaledMask);
40079
40080 // Try to lower to vshuf64x2/vshuf32x4.
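// SHUF128 builds the result from two 128-bit chunks of each source: result
// chunks 0-1 come from the first operand and chunks 2-3 from the second, with
// a 2-bit chunk index per result position encoded in the immediate.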
40081 auto MatchSHUF128 = [&](MVT ShuffleVT, const SDLoc &DL,
40082 ArrayRef<int> ScaledMask, SDValue V1, SDValue V2,
40083 SelectionDAG &DAG) {
40084 int PermMask[4] = {-1, -1, -1, -1};
40085 // Ensure elements came from the same Op.
40086 SDValue Ops[2] = {DAG.getUNDEF(ShuffleVT), DAG.getUNDEF(ShuffleVT)};
40087 for (int i = 0; i < 4; ++i) {
40088 assert(ScaledMask[i] >= -1 && "Illegal shuffle sentinel value");
40089 if (ScaledMask[i] < 0)
40090 continue;
40091
40092 SDValue Op = ScaledMask[i] >= 4 ? V2 : V1;
40093 unsigned OpIndex = i / 2;
40094 if (Ops[OpIndex].isUndef())
40095 Ops[OpIndex] = Op;
40096 else if (Ops[OpIndex] != Op)
40097 return SDValue();
40098
40099 PermMask[i] = ScaledMask[i] % 4;
40100 }
40101
40102 return DAG.getNode(X86ISD::SHUF128, DL, ShuffleVT,
40103 CanonicalizeShuffleInput(ShuffleVT, Ops[0]),
40104 CanonicalizeShuffleInput(ShuffleVT, Ops[1]),
40105 getV4X86ShuffleImm8ForMask(PermMask, DL, DAG));
40106 };
40107
40108 // FIXME: Is there a better way to do this? is256BitLaneRepeatedShuffleMask
40109 // doesn't work because our mask is for 128 bits and we don't have an MVT
40110 // to match that.
40111 bool PreferPERMQ = UnaryShuffle && !isFreeToSplitVector(V1, DAG) &&
40112 isUndefOrInRange(ScaledMask[0], 0, 2) &&
40113 isUndefOrInRange(ScaledMask[1], 0, 2) &&
40114 isUndefOrInRange(ScaledMask[2], 2, 4) &&
40115 isUndefOrInRange(ScaledMask[3], 2, 4) &&
40116 (ScaledMask[0] < 0 || ScaledMask[2] < 0 ||
40117 ScaledMask[0] == (ScaledMask[2] % 2)) &&
40118 (ScaledMask[1] < 0 || ScaledMask[3] < 0 ||
40119 ScaledMask[1] == (ScaledMask[3] % 2));
40120
40121 if (!isAnyZero(ScaledMask) && !PreferPERMQ) {
40122 if (Depth == 0 && RootOpc == X86ISD::SHUF128)
40123 return SDValue(); // Nothing to do!
40124 MVT ShuffleVT = (FloatDomain ? MVT::v8f64 : MVT::v8i64);
40125 if (SDValue V = MatchSHUF128(ShuffleVT, DL, ScaledMask, V1, V2, DAG))
40126 return DAG.getBitcast(RootVT, V);
40127 }
40128 }
40129
40130 // Handle 128-bit lane shuffles of 256-bit vectors.
40131 if (RootVT.is256BitVector() && NumBaseMaskElts == 2) {
40132 // If the upper half is zeroable, then an extract+insert is more optimal
40133 // than using X86ISD::VPERM2X128. The insertion is free, even if it has to
40134 // zero the upper half.
40135 if (isUndefOrZero(Mask[1])) {
40136 if (Depth == 0 && RootOpc == ISD::INSERT_SUBVECTOR)
40137 return SDValue(); // Nothing to do!
40138 assert(isInRange(Mask[0], 0, 2) && "Unexpected lane shuffle");
40139 Res = CanonicalizeShuffleInput(RootVT, V1);
40140 Res = extract128BitVector(Res, Mask[0] * (NumRootElts / 2), DAG, DL);
40141 return widenSubVector(Res, Mask[1] == SM_SentinelZero, Subtarget, DAG, DL,
40142 256);
40143 }
40144
40145 // If we're inserting the low subvector, an insert-subvector 'concat'
40146 // pattern is quicker than VPERM2X128.
40147 if (BaseMask[0] == 0 && (BaseMask[1] == 0 || BaseMask[1] == 2) &&
40148 !Subtarget.hasAVX2()) {
40149 if (Depth == 0 && RootOpc == ISD::INSERT_SUBVECTOR)
40150 return SDValue(); // Nothing to do!
40151 SDValue Lo = CanonicalizeShuffleInput(RootVT, V1);
40152 SDValue Hi = CanonicalizeShuffleInput(RootVT, BaseMask[1] == 0 ? V1 : V2);
40153 Hi = extractSubVector(Hi, 0, DAG, DL, 128);
40154 return insertSubVector(Lo, Hi, NumRootElts / 2, DAG, DL, 128);
40155 }
40156
40157 // Don't lower to VPERM2X128 here if we have AVX2+, prefer to use
40158 // VPERMQ/VPERMPD for unary shuffles unless we need to use the zeroing
40159 // feature.
40160 // Prefer blends for sequential shuffles unless we are optimizing for size.
40161 if (UnaryShuffle &&
40162 !(Subtarget.hasAVX2() && isUndefOrInRange(Mask, 0, 2)) &&
40163 (OptForSize || !isSequentialOrUndefOrZeroInRange(Mask, 0, 2, 0))) {
40164 if (Depth == 0 && RootOpc == X86ISD::VPERM2X128)
40165 return SDValue(); // Nothing to do!
40166 unsigned PermMask = 0;
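// VPERM2X128 immediate encoding: bits 1:0 select the 128-bit half written to
// the low result half, bits 5:4 the half written to the high result half, and
// setting bit 3 or bit 7 zeroes the corresponding result half instead.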
40167 PermMask |= ((Mask[0] < 0 ? 0x8 : (Mask[0] & 1)) << 0);
40168 PermMask |= ((Mask[1] < 0 ? 0x8 : (Mask[1] & 1)) << 4);
40169 return DAG.getNode(
40170 X86ISD::VPERM2X128, DL, RootVT, CanonicalizeShuffleInput(RootVT, V1),
40171 DAG.getUNDEF(RootVT), DAG.getTargetConstant(PermMask, DL, MVT::i8));
40172 }
40173
40174 if (Depth == 0 && RootOpc == X86ISD::SHUF128)
40175 return SDValue(); // Nothing to do!
40176
40177 // TODO - handle AVX512VL cases with X86ISD::SHUF128.
40178 if (!UnaryShuffle && !IsMaskedShuffle) {
40179 assert(llvm::all_of(Mask, [](int M) { return 0 <= M && M < 4; }) &&
40180 "Unexpected shuffle sentinel value");
40181 // Prefer blends to X86ISD::VPERM2X128.
40182 if (!((Mask[0] == 0 && Mask[1] == 3) || (Mask[0] == 2 && Mask[1] == 1))) {
40183 if (Depth == 0 && RootOpc == X86ISD::VPERM2X128)
40184 return SDValue(); // Nothing to do!
40185 unsigned PermMask = 0;
40186 PermMask |= ((Mask[0] & 3) << 0);
40187 PermMask |= ((Mask[1] & 3) << 4);
40188 SDValue LHS = isInRange(Mask[0], 0, 2) ? V1 : V2;
40189 SDValue RHS = isInRange(Mask[1], 0, 2) ? V1 : V2;
40190 return DAG.getNode(X86ISD::VPERM2X128, DL, RootVT,
40191 CanonicalizeShuffleInput(RootVT, LHS),
40192 CanonicalizeShuffleInput(RootVT, RHS),
40193 DAG.getTargetConstant(PermMask, DL, MVT::i8));
40194 }
40195 }
40196 }
40197
40198 // For masks that have been widened to 128-bit elements or more,
40199 // narrow back down to 64-bit elements.
40200 if (BaseMaskEltSizeInBits > 64) {
40201 assert((BaseMaskEltSizeInBits % 64) == 0 && "Illegal mask size");
40202 int MaskScale = BaseMaskEltSizeInBits / 64;
40203 SmallVector<int, 64> ScaledMask;
40204 narrowShuffleMaskElts(MaskScale, Mask, ScaledMask);
40205 Mask = std::move(ScaledMask);
40206 }
40207
40208 // For masked shuffles, we're trying to match the root width for better
40209 // writemask folding; attempt to scale the mask.
40210 // TODO - variable shuffles might need this to be widened again.
40211 if (IsMaskedShuffle && NumRootElts > Mask.size()) {
40212 assert((NumRootElts % Mask.size()) == 0 && "Illegal mask size");
40213 int MaskScale = NumRootElts / Mask.size();
40214 SmallVector<int, 64> ScaledMask;
40215 narrowShuffleMaskElts(MaskScale, Mask, ScaledMask);
40216 Mask = std::move(ScaledMask);
40217 }
40218
40219 unsigned NumMaskElts = Mask.size();
40220 unsigned MaskEltSizeInBits = RootSizeInBits / NumMaskElts;
40221 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
40222
40223 // Determine the effective mask value type.
40224 FloatDomain &= (32 <= MaskEltSizeInBits);
40225 MVT MaskVT = FloatDomain ? MVT::getFloatingPointVT(MaskEltSizeInBits)
40226 : MVT::getIntegerVT(MaskEltSizeInBits);
40227 MaskVT = MVT::getVectorVT(MaskVT, NumMaskElts);
40228
40229 // Only allow legal mask types.
40230 if (!TLI.isTypeLegal(MaskVT))
40231 return SDValue();
40232
40233 // Attempt to match the mask against known shuffle patterns.
40234 MVT ShuffleSrcVT, ShuffleVT;
40235 unsigned Shuffle, PermuteImm;
40236
40237 // Which shuffle domains are permitted?
40238 // Permit domain crossing at higher combine depths.
40239 // TODO: Should we indicate which domain is preferred if both are allowed?
40240 bool AllowFloatDomain = FloatDomain || (Depth >= 3);
40241 bool AllowIntDomain = (!FloatDomain || (Depth >= 3)) && Subtarget.hasSSE2() &&
40242 (!MaskVT.is256BitVector() || Subtarget.hasAVX2());
40243
40244 // Determine zeroable mask elements.
40245 APInt KnownUndef, KnownZero;
40246 resolveZeroablesFromTargetShuffle(Mask, KnownUndef, KnownZero);
40247 APInt Zeroable = KnownUndef | KnownZero;
40248
40249 if (UnaryShuffle) {
40250 // Attempt to match against broadcast-from-vector.
40251 // Limit AVX1 to cases where we're loading+broadcasting a scalar element.
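// A mask whose entries are all undef or index 0 selects the first element for
// every lane, i.e. a splat of element 0.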
40252 if ((Subtarget.hasAVX2() ||
40253 (Subtarget.hasAVX() && 32 <= MaskEltSizeInBits)) &&
40254 (!IsMaskedShuffle || NumRootElts == NumMaskElts)) {
40255 if (isUndefOrEqual(Mask, 0)) {
40256 if (V1.getValueType() == MaskVT &&
40257 V1.getOpcode() == ISD::SCALAR_TO_VECTOR &&
40258 X86::mayFoldLoad(V1.getOperand(0), Subtarget)) {
40259 if (Depth == 0 && RootOpc == X86ISD::VBROADCAST)
40260 return SDValue(); // Nothing to do!
40261 Res = V1.getOperand(0);
40262 Res = DAG.getNode(X86ISD::VBROADCAST, DL, MaskVT, Res);
40263 return DAG.getBitcast(RootVT, Res);
40264 }
40265 if (Subtarget.hasAVX2()) {
40266 if (Depth == 0 && RootOpc == X86ISD::VBROADCAST)
40267 return SDValue(); // Nothing to do!
40268 Res = CanonicalizeShuffleInput(MaskVT, V1);
40269 Res = DAG.getNode(X86ISD::VBROADCAST, DL, MaskVT, Res);
40270 return DAG.getBitcast(RootVT, Res);
40271 }
40272 }
40273 }
40274
40275 if (matchUnaryShuffle(MaskVT, Mask, AllowFloatDomain, AllowIntDomain, V1,
40276 DAG, Subtarget, Shuffle, ShuffleSrcVT, ShuffleVT) &&
40277 (!IsMaskedShuffle ||
40278 (NumRootElts == ShuffleVT.getVectorNumElements()))) {
40279 if (Depth == 0 && RootOpc == Shuffle)
40280 return SDValue(); // Nothing to do!
40281 Res = CanonicalizeShuffleInput(ShuffleSrcVT, V1);
40282 Res = DAG.getNode(Shuffle, DL, ShuffleVT, Res);
40283 return DAG.getBitcast(RootVT, Res);
40284 }
40285
40286 if (matchUnaryPermuteShuffle(MaskVT, Mask, Zeroable, AllowFloatDomain,
40287 AllowIntDomain, DAG, Subtarget, Shuffle, ShuffleVT,
40288 PermuteImm) &&
40289 (!IsMaskedShuffle ||
40290 (NumRootElts == ShuffleVT.getVectorNumElements()))) {
40291 if (Depth == 0 && RootOpc == Shuffle)
40292 return SDValue(); // Nothing to do!
40293 Res = CanonicalizeShuffleInput(ShuffleVT, V1);
40294 Res = DAG.getNode(Shuffle, DL, ShuffleVT, Res,
40295 DAG.getTargetConstant(PermuteImm, DL, MVT::i8));
40296 return DAG.getBitcast(RootVT, Res);
40297 }
40298 }
40299
40300 // Attempt to combine to INSERTPS, but only if the inserted element has come
40301 // from a scalar.
40302 // TODO: Handle other insertions here as well?
40303 if (!UnaryShuffle && AllowFloatDomain && RootSizeInBits == 128 &&
40304 Subtarget.hasSSE41() &&
40305 !isTargetShuffleEquivalent(MaskVT, Mask, {4, 1, 2, 3}, DAG)) {
40306 if (MaskEltSizeInBits == 32) {
40307 SDValue SrcV1 = V1, SrcV2 = V2;
40308 if (matchShuffleAsInsertPS(SrcV1, SrcV2, PermuteImm, Zeroable, Mask,
40309 DAG) &&
40310 SrcV2.getOpcode() == ISD::SCALAR_TO_VECTOR) {
40311 if (Depth == 0 && RootOpc == X86ISD::INSERTPS)
40312 return SDValue(); // Nothing to do!
40313 Res = DAG.getNode(X86ISD::INSERTPS, DL, MVT::v4f32,
40314 CanonicalizeShuffleInput(MVT::v4f32, SrcV1),
40315 CanonicalizeShuffleInput(MVT::v4f32, SrcV2),
40316 DAG.getTargetConstant(PermuteImm, DL, MVT::i8));
40317 return DAG.getBitcast(RootVT, Res);
40318 }
40319 }
40320 if (MaskEltSizeInBits == 64 &&
40321 isTargetShuffleEquivalent(MaskVT, Mask, {0, 2}, DAG) &&
40322 V2.getOpcode() == ISD::SCALAR_TO_VECTOR &&
40323 V2.getScalarValueSizeInBits() <= 32) {
40324 if (Depth == 0 && RootOpc == X86ISD::INSERTPS)
40325 return SDValue(); // Nothing to do!
40326 PermuteImm = (/*DstIdx*/ 2 << 4) | (/*SrcIdx*/ 0 << 0);
40327 Res = DAG.getNode(X86ISD::INSERTPS, DL, MVT::v4f32,
40328 CanonicalizeShuffleInput(MVT::v4f32, V1),
40329 CanonicalizeShuffleInput(MVT::v4f32, V2),
40330 DAG.getTargetConstant(PermuteImm, DL, MVT::i8));
40331 return DAG.getBitcast(RootVT, Res);
40332 }
40333 }
40334
40335 SDValue NewV1 = V1; // Save operands in case early exit happens.
40336 SDValue NewV2 = V2;
40337 if (matchBinaryShuffle(MaskVT, Mask, AllowFloatDomain, AllowIntDomain, NewV1,
40338 NewV2, DL, DAG, Subtarget, Shuffle, ShuffleSrcVT,
40339 ShuffleVT, UnaryShuffle) &&
40340 (!IsMaskedShuffle || (NumRootElts == ShuffleVT.getVectorNumElements()))) {
40341 if (Depth == 0 && RootOpc == Shuffle)
40342 return SDValue(); // Nothing to do!
40343 NewV1 = CanonicalizeShuffleInput(ShuffleSrcVT, NewV1);
40344 NewV2 = CanonicalizeShuffleInput(ShuffleSrcVT, NewV2);
40345 Res = DAG.getNode(Shuffle, DL, ShuffleVT, NewV1, NewV2);
40346 return DAG.getBitcast(RootVT, Res);
40347 }
40348
40349 NewV1 = V1; // Save operands in case early exit happens.
40350 NewV2 = V2;
40351 if (matchBinaryPermuteShuffle(MaskVT, Mask, Zeroable, AllowFloatDomain,
40352 AllowIntDomain, NewV1, NewV2, DL, DAG,
40353 Subtarget, Shuffle, ShuffleVT, PermuteImm) &&
40354 (!IsMaskedShuffle || (NumRootElts == ShuffleVT.getVectorNumElements()))) {
40355 if (Depth == 0 && RootOpc == Shuffle)
40356 return SDValue(); // Nothing to do!
40357 NewV1 = CanonicalizeShuffleInput(ShuffleVT, NewV1);
40358 NewV2 = CanonicalizeShuffleInput(ShuffleVT, NewV2);
40359 Res = DAG.getNode(Shuffle, DL, ShuffleVT, NewV1, NewV2,
40360 DAG.getTargetConstant(PermuteImm, DL, MVT::i8));
40361 return DAG.getBitcast(RootVT, Res);
40362 }
40363
40364 // Typically from here on, we need an integer version of MaskVT.
40365 MVT IntMaskVT = MVT::getIntegerVT(MaskEltSizeInBits);
40366 IntMaskVT = MVT::getVectorVT(IntMaskVT, NumMaskElts);
40367
40368 // Annoyingly, SSE4A instructions don't map into the above match helpers.
40369 if (Subtarget.hasSSE4A() && AllowIntDomain && RootSizeInBits == 128) {
40370 uint64_t BitLen, BitIdx;
40371 if (matchShuffleAsEXTRQ(IntMaskVT, V1, V2, Mask, BitLen, BitIdx,
40372 Zeroable)) {
40373 if (Depth == 0 && RootOpc == X86ISD::EXTRQI)
40374 return SDValue(); // Nothing to do!
40375 V1 = CanonicalizeShuffleInput(IntMaskVT, V1);
40376 Res = DAG.getNode(X86ISD::EXTRQI, DL, IntMaskVT, V1,
40377 DAG.getTargetConstant(BitLen, DL, MVT::i8),
40378 DAG.getTargetConstant(BitIdx, DL, MVT::i8));
40379 return DAG.getBitcast(RootVT, Res);
40380 }
40381
40382 if (matchShuffleAsINSERTQ(IntMaskVT, V1, V2, Mask, BitLen, BitIdx)) {
40383 if (Depth == 0 && RootOpc == X86ISD::INSERTQI)
40384 return SDValue(); // Nothing to do!
40385 V1 = CanonicalizeShuffleInput(IntMaskVT, V1);
40386 V2 = CanonicalizeShuffleInput(IntMaskVT, V2);
40387 Res = DAG.getNode(X86ISD::INSERTQI, DL, IntMaskVT, V1, V2,
40388 DAG.getTargetConstant(BitLen, DL, MVT::i8),
40389 DAG.getTargetConstant(BitIdx, DL, MVT::i8));
40390 return DAG.getBitcast(RootVT, Res);
40391 }
40392 }
40393
40394 // Match shuffle against TRUNCATE patterns.
40395 if (AllowIntDomain && MaskEltSizeInBits < 64 && Subtarget.hasAVX512()) {
40396 // Match against a VTRUNC instruction, accounting for src/dst sizes.
40397 if (matchShuffleAsVTRUNC(ShuffleSrcVT, ShuffleVT, IntMaskVT, Mask, Zeroable,
40398 Subtarget)) {
40399 bool IsTRUNCATE = ShuffleVT.getVectorNumElements() ==
40400 ShuffleSrcVT.getVectorNumElements();
40401 unsigned Opc =
40402 IsTRUNCATE ? (unsigned)ISD::TRUNCATE : (unsigned)X86ISD::VTRUNC;
40403 if (Depth == 0 && RootOpc == Opc)
40404 return SDValue(); // Nothing to do!
40405 V1 = CanonicalizeShuffleInput(ShuffleSrcVT, V1);
40406 Res = DAG.getNode(Opc, DL, ShuffleVT, V1);
40407 if (ShuffleVT.getSizeInBits() < RootSizeInBits)
40408 Res = widenSubVector(Res, true, Subtarget, DAG, DL, RootSizeInBits);
40409 return DAG.getBitcast(RootVT, Res);
40410 }
40411
40412 // Do we need a more general binary truncation pattern?
40413 if (RootSizeInBits < 512 &&
40414 ((RootVT.is256BitVector() && Subtarget.useAVX512Regs()) ||
40415 (RootVT.is128BitVector() && Subtarget.hasVLX())) &&
40416 (MaskEltSizeInBits > 8 || Subtarget.hasBWI()) &&
40417 isSequentialOrUndefInRange(Mask, 0, NumMaskElts, 0, 2)) {
40418 // Bail if this was already a truncation or PACK node.
40419 // We sometimes fail to match PACK if we demand known undef elements.
40420 if (Depth == 0 &&
40421 (RootOpc == ISD::TRUNCATE || RootOpc == X86ISD::PACKSS ||
40422 RootOpc == X86ISD::PACKUS))
40423 return SDValue(); // Nothing to do!
40424 ShuffleSrcVT = MVT::getIntegerVT(MaskEltSizeInBits * 2);
40425 ShuffleSrcVT = MVT::getVectorVT(ShuffleSrcVT, NumMaskElts / 2);
40426 V1 = CanonicalizeShuffleInput(ShuffleSrcVT, V1);
40427 V2 = CanonicalizeShuffleInput(ShuffleSrcVT, V2);
40428 ShuffleSrcVT = MVT::getIntegerVT(MaskEltSizeInBits * 2);
40429 ShuffleSrcVT = MVT::getVectorVT(ShuffleSrcVT, NumMaskElts);
40430 Res = DAG.getNode(ISD::CONCAT_VECTORS, DL, ShuffleSrcVT, V1, V2);
40431 Res = DAG.getNode(ISD::TRUNCATE, DL, IntMaskVT, Res);
40432 return DAG.getBitcast(RootVT, Res);
40433 }
40434 }
40435
40436 // Don't try to re-form single instruction chains under any circumstances now
40437 // that we've done encoding canonicalization for them.
40438 if (Depth < 1)
40439 return SDValue();
40440
40441 int NumVariableMasks = llvm::count_if(SrcNodes, [](const SDNode *N) {
40442 return isTargetShuffleVariableMask(N->getOpcode());
40443 });
40444 bool HasSlowVariableMask = llvm::any_of(SrcNodes, [](const SDNode *N) {
40445 return (N->getOpcode() == X86ISD::VPERMV3 ||
40446 N->getOpcode() == X86ISD::VPERMV);
40447 });
40448
40449 // Depth threshold above which we can efficiently use variable mask shuffles.
40450 int VariableCrossLaneShuffleDepth =
40451 Subtarget.hasFastVariableCrossLaneShuffle() ? 1 : 2;
40452 int VariablePerLaneShuffleDepth =
40453 Subtarget.hasFastVariablePerLaneShuffle() ? 1 : 2;
40454 AllowVariableCrossLaneMask &=
40455 (Depth >= VariableCrossLaneShuffleDepth) || NumVariableMasks;
40456 AllowVariablePerLaneMask &=
40457 (Depth >= VariablePerLaneShuffleDepth) || NumVariableMasks;
40458 // VPERMI2W/VPERMI2B are 3 uops on Skylake and Icelake so we require a
40459 // higher depth before combining them.
40460 int BWIVPERMV3ShuffleDepth =
40461 VariableCrossLaneShuffleDepth + 2 - NumVariableMasks;
40462 bool AllowBWIVPERMV3 =
40463 (Depth >= BWIVPERMV3ShuffleDepth || HasSlowVariableMask);
40464
40465 // If root was a VPERMV/VPERMV3 node, always allow a variable shuffle.
40466 if ((UnaryShuffle && RootOpc == X86ISD::VPERMV) || RootOpc == X86ISD::VPERMV3)
40467 AllowVariableCrossLaneMask = AllowVariablePerLaneMask = true;
40468
40469 bool MaskContainsZeros = isAnyZero(Mask);
40470
40471 if (is128BitLaneCrossingShuffleMask(MaskVT, Mask)) {
40472 // If we have a single input lane-crossing shuffle then lower to VPERMV.
40473 if (UnaryShuffle && AllowVariableCrossLaneMask && !MaskContainsZeros) {
40474 if (Subtarget.hasAVX2() &&
40475 (MaskVT == MVT::v8f32 || MaskVT == MVT::v8i32)) {
40476 SDValue VPermMask = getConstVector(Mask, IntMaskVT, DAG, DL, true);
40477 Res = CanonicalizeShuffleInput(MaskVT, V1);
40478 Res = DAG.getNode(X86ISD::VPERMV, DL, MaskVT, VPermMask, Res);
40479 return DAG.getBitcast(RootVT, Res);
40480 }
40481 // AVX512 variants (non-VLX will pad to 512-bit shuffles).
40482 if ((Subtarget.hasAVX512() &&
40483 (MaskVT == MVT::v8f64 || MaskVT == MVT::v8i64 ||
40484 MaskVT == MVT::v16f32 || MaskVT == MVT::v16i32)) ||
40485 (Subtarget.hasBWI() &&
40486 (MaskVT == MVT::v16i16 || MaskVT == MVT::v32i16)) ||
40487 (Subtarget.hasVBMI() &&
40488 (MaskVT == MVT::v32i8 || MaskVT == MVT::v64i8))) {
40489 V1 = CanonicalizeShuffleInput(MaskVT, V1);
40490 V2 = DAG.getUNDEF(MaskVT);
40491 Res = lowerShuffleWithPERMV(DL, MaskVT, Mask, V1, V2, Subtarget, DAG);
40492 return DAG.getBitcast(RootVT, Res);
40493 }
40494 }
40495
40496 // Lower a unary+zero lane-crossing shuffle as VPERMV3 with a zero
40497 // vector as the second source (non-VLX will pad to 512-bit shuffles).
40498 if (UnaryShuffle && AllowVariableCrossLaneMask &&
40499 ((Subtarget.hasAVX512() &&
40500 (MaskVT == MVT::v8f64 || MaskVT == MVT::v8i64 ||
40501 MaskVT == MVT::v4f64 || MaskVT == MVT::v4i64 ||
40502 MaskVT == MVT::v8f32 || MaskVT == MVT::v8i32 ||
40503 MaskVT == MVT::v16f32 || MaskVT == MVT::v16i32)) ||
40504 (Subtarget.hasBWI() && AllowBWIVPERMV3 &&
40505 (MaskVT == MVT::v16i16 || MaskVT == MVT::v32i16)) ||
40506 (Subtarget.hasVBMI() && AllowBWIVPERMV3 &&
40507 (MaskVT == MVT::v32i8 || MaskVT == MVT::v64i8)))) {
40508 // Adjust shuffle mask - replace SM_SentinelZero with second source index.
40509 for (unsigned i = 0; i != NumMaskElts; ++i)
40510 if (Mask[i] == SM_SentinelZero)
40511 Mask[i] = NumMaskElts + i;
40512 V1 = CanonicalizeShuffleInput(MaskVT, V1);
40513 V2 = getZeroVector(MaskVT, Subtarget, DAG, DL);
40514 Res = lowerShuffleWithPERMV(DL, MaskVT, Mask, V1, V2, Subtarget, DAG);
40515 return DAG.getBitcast(RootVT, Res);
40516 }
40517
40518 // If that failed and either input is extracted then try to combine as a
40519 // shuffle with the larger type.
40520 if (SDValue WideShuffle = combineX86ShuffleChainWithExtract(
40521 Inputs, RootOpc, RootVT, BaseMask, Depth, SrcNodes,
40522 AllowVariableCrossLaneMask, AllowVariablePerLaneMask,
40523 IsMaskedShuffle, DAG, DL, Subtarget))
40524 return WideShuffle;
40525
40526 // If we have a dual input lane-crossing shuffle then lower to VPERMV3,
40527 // (non-VLX will pad to 512-bit shuffles).
40528 if (AllowVariableCrossLaneMask && !MaskContainsZeros &&
40529 ((Subtarget.hasAVX512() &&
40530 (MaskVT == MVT::v8f64 || MaskVT == MVT::v8i64 ||
40531 MaskVT == MVT::v4f64 || MaskVT == MVT::v4i64 ||
40532 MaskVT == MVT::v16f32 || MaskVT == MVT::v16i32 ||
40533 MaskVT == MVT::v8f32 || MaskVT == MVT::v8i32)) ||
40534 (Subtarget.hasBWI() && AllowBWIVPERMV3 &&
40535 (MaskVT == MVT::v16i16 || MaskVT == MVT::v32i16)) ||
40536 (Subtarget.hasVBMI() && AllowBWIVPERMV3 &&
40537 (MaskVT == MVT::v32i8 || MaskVT == MVT::v64i8)))) {
40538 V1 = CanonicalizeShuffleInput(MaskVT, V1);
40539 V2 = CanonicalizeShuffleInput(MaskVT, V2);
40540 Res = lowerShuffleWithPERMV(DL, MaskVT, Mask, V1, V2, Subtarget, DAG);
40541 return DAG.getBitcast(RootVT, Res);
40542 }
40543 return SDValue();
40544 }
40545
40546 // See if we can combine a single input shuffle with zeros to a bit-mask,
40547 // which is much simpler than any shuffle.
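// E.g. a v4i32 mask <0,Z,2,Z> becomes an AND with the constant <-1,0,-1,0>,
// keeping elements 0 and 2 and zeroing the rest.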
40548 if (UnaryShuffle && MaskContainsZeros && AllowVariablePerLaneMask &&
40549 isSequentialOrUndefOrZeroInRange(Mask, 0, NumMaskElts, 0) &&
40550 TLI.isTypeLegal(MaskVT)) {
40551 APInt Zero = APInt::getZero(MaskEltSizeInBits);
40552 APInt AllOnes = APInt::getAllOnes(MaskEltSizeInBits);
40553 APInt UndefElts(NumMaskElts, 0);
40554 SmallVector<APInt, 64> EltBits(NumMaskElts, Zero);
40555 for (unsigned i = 0; i != NumMaskElts; ++i) {
40556 int M = Mask[i];
40557 if (M == SM_SentinelUndef) {
40558 UndefElts.setBit(i);
40559 continue;
40560 }
40561 if (M == SM_SentinelZero)
40562 continue;
40563 EltBits[i] = AllOnes;
40564 }
40565 SDValue BitMask = getConstVector(EltBits, UndefElts, MaskVT, DAG, DL);
40566 Res = CanonicalizeShuffleInput(MaskVT, V1);
40567 unsigned AndOpcode =
40568 MaskVT.isFloatingPoint() ? unsigned(X86ISD::FAND) : unsigned(ISD::AND);
40569 Res = DAG.getNode(AndOpcode, DL, MaskVT, Res, BitMask);
40570 return DAG.getBitcast(RootVT, Res);
40571 }
40572
40573 // If we have a single input shuffle with different shuffle patterns in the
40574 // 128-bit lanes, use the variable mask to VPERMILPS.
40575 // TODO: Combine other mask types at higher depths.
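// E.g. a v8f32 mask <1,0,3,2,6,7,4,5> uses a different pattern in each 128-bit
// lane, so it is lowered with the per-element index vector <1,0,3,2,2,3,0,1>
// (each index taken modulo 4).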
40576 if (UnaryShuffle && AllowVariablePerLaneMask && !MaskContainsZeros &&
40577 ((MaskVT == MVT::v8f32 && Subtarget.hasAVX()) ||
40578 (MaskVT == MVT::v16f32 && Subtarget.hasAVX512()))) {
40579 SmallVector<SDValue, 16> VPermIdx;
40580 for (int M : Mask) {
40581 SDValue Idx =
40582 M < 0 ? DAG.getUNDEF(MVT::i32) : DAG.getConstant(M % 4, DL, MVT::i32);
40583 VPermIdx.push_back(Idx);
40584 }
40585 SDValue VPermMask = DAG.getBuildVector(IntMaskVT, DL, VPermIdx);
40586 Res = CanonicalizeShuffleInput(MaskVT, V1);
40587 Res = DAG.getNode(X86ISD::VPERMILPV, DL, MaskVT, Res, VPermMask);
40588 return DAG.getBitcast(RootVT, Res);
40589 }
40590
40591 // With XOP, binary shuffles of 128/256-bit floating point vectors can combine
40592 // to VPERMIL2PD/VPERMIL2PS.
40593 if (AllowVariablePerLaneMask && Subtarget.hasXOP() &&
40594 (MaskVT == MVT::v2f64 || MaskVT == MVT::v4f64 || MaskVT == MVT::v4f32 ||
40595 MaskVT == MVT::v8f32)) {
40596 // VPERMIL2 Operation.
40597 // Bits[3] - Match Bit.
40598 // Bits[2:1] - (Per Lane) PD Shuffle Mask.
40599 // Bits[2:0] - (Per Lane) PS Shuffle Mask.
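// E.g. for a v4f32 shuffle with mask <4,1,Z,3>, the index vector below becomes
// <4,1,8,3>: per-lane indices 0-3 pick from V1, 4-7 from V2, and 8 (match bit
// set) selects zero once M2ZImm is set to 2.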
40600 unsigned NumLanes = MaskVT.getSizeInBits() / 128;
40601 unsigned NumEltsPerLane = NumMaskElts / NumLanes;
40602 SmallVector<int, 8> VPerm2Idx;
40603 unsigned M2ZImm = 0;
40604 for (int M : Mask) {
40605 if (M == SM_SentinelUndef) {
40606 VPerm2Idx.push_back(-1);
40607 continue;
40608 }
40609 if (M == SM_SentinelZero) {
40610 M2ZImm = 2;
40611 VPerm2Idx.push_back(8);
40612 continue;
40613 }
40614 int Index = (M % NumEltsPerLane) + ((M / NumMaskElts) * NumEltsPerLane);
40615 Index = (MaskVT.getScalarSizeInBits() == 64 ? Index << 1 : Index);
40616 VPerm2Idx.push_back(Index);
40617 }
40618 V1 = CanonicalizeShuffleInput(MaskVT, V1);
40619 V2 = CanonicalizeShuffleInput(MaskVT, V2);
40620 SDValue VPerm2MaskOp = getConstVector(VPerm2Idx, IntMaskVT, DAG, DL, true);
40621 Res = DAG.getNode(X86ISD::VPERMIL2, DL, MaskVT, V1, V2, VPerm2MaskOp,
40622 DAG.getTargetConstant(M2ZImm, DL, MVT::i8));
40623 return DAG.getBitcast(RootVT, Res);
40624 }
40625
40626 // If we have 3 or more shuffle instructions or a chain involving a variable
40627 // mask, we can replace them with a single PSHUFB instruction profitably.
40628 // Intel's manuals suggest only using PSHUFB if doing so replaces 5
40629 // instructions, but in practice PSHUFB tends to be *very* fast so we're
40630 // more aggressive.
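// E.g. a v4i32 mask <1,Z,3,U> on a 128-bit root expands (Ratio = 4) to the
// byte mask <4,5,6,7, 0x80,0x80,0x80,0x80, 12,13,14,15, U,U,U,U>.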
40631 if (UnaryShuffle && AllowVariablePerLaneMask &&
40632 ((RootVT.is128BitVector() && Subtarget.hasSSSE3()) ||
40633 (RootVT.is256BitVector() && Subtarget.hasAVX2()) ||
40634 (RootVT.is512BitVector() && Subtarget.hasBWI()))) {
40635 SmallVector<SDValue, 16> PSHUFBMask;
40636 int NumBytes = RootVT.getSizeInBits() / 8;
40637 int Ratio = NumBytes / NumMaskElts;
40638 for (int i = 0; i < NumBytes; ++i) {
40639 int M = Mask[i / Ratio];
40640 if (M == SM_SentinelUndef) {
40641 PSHUFBMask.push_back(DAG.getUNDEF(MVT::i8));
40642 continue;
40643 }
40644 if (M == SM_SentinelZero) {
40645 PSHUFBMask.push_back(DAG.getConstant(0x80, DL, MVT::i8));
40646 continue;
40647 }
40648 M = Ratio * M + i % Ratio;
40649 assert((M / 16) == (i / 16) && "Lane crossing detected");
40650 PSHUFBMask.push_back(DAG.getConstant(M, DL, MVT::i8));
40651 }
40652 MVT ByteVT = MVT::getVectorVT(MVT::i8, NumBytes);
40653 Res = CanonicalizeShuffleInput(ByteVT, V1);
40654 SDValue PSHUFBMaskOp = DAG.getBuildVector(ByteVT, DL, PSHUFBMask);
40655 Res = DAG.getNode(X86ISD::PSHUFB, DL, ByteVT, Res, PSHUFBMaskOp);
40656 return DAG.getBitcast(RootVT, Res);
40657 }
40658
40659 // With XOP, if we have a 128-bit binary input shuffle we can always combine
40660 // to VPPERM. We match the depth requirement of PSHUFB - VPPERM is never
40661 // slower than PSHUFB on targets that support both.
40662 if (AllowVariablePerLaneMask && RootVT.is128BitVector() &&
40663 Subtarget.hasXOP()) {
40664 // VPPERM Mask Operation
40665 // Bits[4:0] - Byte Index (0 - 31)
40666 // Bits[7:5] - Permute Operation (0 - Source byte, 4 - ZERO)
40667 SmallVector<SDValue, 16> VPPERMMask;
40668 int NumBytes = 16;
40669 int Ratio = NumBytes / NumMaskElts;
40670 for (int i = 0; i < NumBytes; ++i) {
40671 int M = Mask[i / Ratio];
40672 if (M == SM_SentinelUndef) {
40673 VPPERMMask.push_back(DAG.getUNDEF(MVT::i8));
40674 continue;
40675 }
40676 if (M == SM_SentinelZero) {
40677 VPPERMMask.push_back(DAG.getConstant(0x80, DL, MVT::i8));
40678 continue;
40679 }
40680 M = Ratio * M + i % Ratio;
40681 VPPERMMask.push_back(DAG.getConstant(M, DL, MVT::i8));
40682 }
40683 MVT ByteVT = MVT::v16i8;
40684 V1 = CanonicalizeShuffleInput(ByteVT, V1);
40685 V2 = CanonicalizeShuffleInput(ByteVT, V2);
40686 SDValue VPPERMMaskOp = DAG.getBuildVector(ByteVT, DL, VPPERMMask);
40687 Res = DAG.getNode(X86ISD::VPPERM, DL, ByteVT, V1, V2, VPPERMMaskOp);
40688 return DAG.getBitcast(RootVT, Res);
40689 }
40690
40691 // If that failed and either input is extracted then try to combine as a
40692 // shuffle with the larger type.
40693 if (SDValue WideShuffle = combineX86ShuffleChainWithExtract(
40694 Inputs, RootOpc, RootVT, BaseMask, Depth, SrcNodes,
40695 AllowVariableCrossLaneMask, AllowVariablePerLaneMask, IsMaskedShuffle,
40696 DAG, DL, Subtarget))
40697 return WideShuffle;
40698
40699 // If we have a dual input shuffle then lower to VPERMV3,
40700 // (non-VLX will pad to 512-bit shuffles)
40701 if (!UnaryShuffle && AllowVariablePerLaneMask && !MaskContainsZeros &&
40702 ((Subtarget.hasAVX512() &&
40703 (MaskVT == MVT::v2f64 || MaskVT == MVT::v4f64 || MaskVT == MVT::v8f64 ||
40704 MaskVT == MVT::v2i64 || MaskVT == MVT::v4i64 || MaskVT == MVT::v8i64 ||
40705 MaskVT == MVT::v4f32 || MaskVT == MVT::v4i32 || MaskVT == MVT::v8f32 ||
40706 MaskVT == MVT::v8i32 || MaskVT == MVT::v16f32 ||
40707 MaskVT == MVT::v16i32)) ||
40708 (Subtarget.hasBWI() && AllowBWIVPERMV3 &&
40709 (MaskVT == MVT::v8i16 || MaskVT == MVT::v16i16 ||
40710 MaskVT == MVT::v32i16)) ||
40711 (Subtarget.hasVBMI() && AllowBWIVPERMV3 &&
40712 (MaskVT == MVT::v16i8 || MaskVT == MVT::v32i8 ||
40713 MaskVT == MVT::v64i8)))) {
40714 V1 = CanonicalizeShuffleInput(MaskVT, V1);
40715 V2 = CanonicalizeShuffleInput(MaskVT, V2);
40716 Res = lowerShuffleWithPERMV(DL, MaskVT, Mask, V1, V2, Subtarget, DAG);
40717 return DAG.getBitcast(RootVT, Res);
40718 }
40719
40720 // Failed to find any combines.
40721 return SDValue();
40722}
40723
40724// Combine an arbitrary chain of shuffles + extract_subvectors into a single
40725// instruction if possible.
40726//
40727// Wrapper for combineX86ShuffleChain that extends the shuffle mask to a larger
40728// type size to attempt to combine:
40729// shuffle(extract_subvector(x,c1),extract_subvector(y,c2),m1)
40730// -->
40731// extract_subvector(shuffle(x,y,m2),0)
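// E.g. a 128-bit shuffle of the low and high halves extracted from the same
// 256-bit source can instead be performed as a 256-bit shuffle of that source,
// with the mask widened and offset accordingly, followed by extracting the low
// 128 bits of the result.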
40732 static SDValue combineX86ShuffleChainWithExtract(
40733 ArrayRef<SDValue> Inputs, unsigned RootOpcode, MVT RootVT,
40734 ArrayRef<int> BaseMask, int Depth, ArrayRef<const SDNode *> SrcNodes,
40735 bool AllowVariableCrossLaneMask, bool AllowVariablePerLaneMask,
40736 bool IsMaskedShuffle, SelectionDAG &DAG, const SDLoc &DL,
40737 const X86Subtarget &Subtarget) {
40738 unsigned NumMaskElts = BaseMask.size();
40739 unsigned NumInputs = Inputs.size();
40740 if (NumInputs == 0)
40741 return SDValue();
40742
40743 unsigned RootSizeInBits = RootVT.getSizeInBits();
40744 unsigned RootEltSizeInBits = RootSizeInBits / NumMaskElts;
40745 assert((RootSizeInBits % NumMaskElts) == 0 && "Unexpected root shuffle mask");
40746
40747 // Peek through subvectors to find widest legal vector.
40748 // TODO: Handle ISD::TRUNCATE
40749 unsigned WideSizeInBits = RootSizeInBits;
40750 for (SDValue Input : Inputs) {
40751 Input = peekThroughBitcasts(Input);
40752 while (1) {
40753 if (Input.getOpcode() == ISD::EXTRACT_SUBVECTOR) {
40754 Input = peekThroughBitcasts(Input.getOperand(0));
40755 continue;
40756 }
40757 if (Input.getOpcode() == ISD::INSERT_SUBVECTOR &&
40758 Input.getOperand(0).isUndef() &&
40759 isNullConstant(Input.getOperand(2))) {
40760 Input = peekThroughBitcasts(Input.getOperand(1));
40761 continue;
40762 }
40763 break;
40764 }
40765 if (DAG.getTargetLoweringInfo().isTypeLegal(Input.getValueType()) &&
40766 WideSizeInBits < Input.getValueSizeInBits())
40767 WideSizeInBits = Input.getValueSizeInBits();
40768 }
40769
40770 // Bail if we fail to find a source larger than the existing root.
40771 if (WideSizeInBits <= RootSizeInBits ||
40772 (WideSizeInBits % RootSizeInBits) != 0)
40773 return SDValue();
40774
40775 // Create new mask for larger type.
40776 SmallVector<int, 64> WideMask;
40777 growShuffleMask(BaseMask, WideMask, RootSizeInBits, WideSizeInBits);
40778
40779 // Attempt to peek through inputs and adjust mask when we extract from an
40780 // upper subvector.
40781 int AdjustedMasks = 0;
40782 SmallVector<SDValue, 4> WideInputs(Inputs);
40783 for (unsigned I = 0; I != NumInputs; ++I) {
40784 SDValue &Input = WideInputs[I];
40785 Input = peekThroughBitcasts(Input);
40786 while (1) {
40787 if (Input.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
40788 Input.getOperand(0).getValueSizeInBits() <= WideSizeInBits) {
40789 uint64_t Idx = Input.getConstantOperandVal(1);
40790 if (Idx != 0) {
40791 ++AdjustedMasks;
40792 unsigned InputEltSizeInBits = Input.getScalarValueSizeInBits();
40793 Idx = (Idx * InputEltSizeInBits) / RootEltSizeInBits;
40794
40795 int lo = I * WideMask.size();
40796 int hi = (I + 1) * WideMask.size();
40797 for (int &M : WideMask)
40798 if (lo <= M && M < hi)
40799 M += Idx;
40800 }
40801 Input = peekThroughBitcasts(Input.getOperand(0));
40802 continue;
40803 }
40804 // TODO: Handle insertions into upper subvectors.
40805 if (Input.getOpcode() == ISD::INSERT_SUBVECTOR &&
40806 Input.getOperand(0).isUndef() &&
40807 isNullConstant(Input.getOperand(2))) {
40808 Input = peekThroughBitcasts(Input.getOperand(1));
40809 continue;
40810 }
40811 break;
40812 }
40813 }
40814
40815 // Remove unused/repeated shuffle source ops.
40816 resolveTargetShuffleInputsAndMask(WideInputs, WideMask);
40817 assert(!WideInputs.empty() && "Shuffle with no inputs detected");
40818
40819 // Bail if we're always extracting from the lowest subvectors,
40820 // combineX86ShuffleChain should match this for the current width, or the
40821 // shuffle still references too many inputs.
40822 if (AdjustedMasks == 0 || WideInputs.size() > 2)
40823 return SDValue();
40824
40825 // Minor canonicalization of the accumulated shuffle mask to make it easier
40826 // to match below. All this does is detect masks with sequential pairs of
40827 // elements, and shrink them to the half-width mask. It does this in a loop
40828 // so it will reduce the size of the mask to the minimal width mask which
40829 // performs an equivalent shuffle.
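// E.g. the widened mask <0,1,4,5> pairs up into <0,2>, which describes the
// same shuffle at twice the element width.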
40830 while (WideMask.size() > 1) {
40831 SmallVector<int, 64> WidenedMask;
40832 if (!canWidenShuffleElements(WideMask, WidenedMask))
40833 break;
40834 WideMask = std::move(WidenedMask);
40835 }
40836
40837 // Canonicalization of binary shuffle masks to improve pattern matching by
40838 // commuting the inputs.
40839 if (WideInputs.size() == 2 && canonicalizeShuffleMaskWithCommute(WideMask)) {
40840 ShuffleVectorSDNode::commuteMask(WideMask);
40841 std::swap(WideInputs[0], WideInputs[1]);
40842 }
40843
40844 // Increase depth for every upper subvector we've peeked through.
40845 Depth += AdjustedMasks;
40846
40847 // Attempt to combine wider chain.
40848 // TODO: Can we use a better Root?
40849 SDValue WideRoot = WideInputs.front().getValueSizeInBits() >
40850 WideInputs.back().getValueSizeInBits()
40851 ? WideInputs.front()
40852 : WideInputs.back();
40853 assert(WideRoot.getValueSizeInBits() == WideSizeInBits &&
40854 "WideRootSize mismatch");
40855
40856 if (SDValue WideShuffle = combineX86ShuffleChain(
40857 WideInputs, RootOpcode, WideRoot.getSimpleValueType(), WideMask,
40858 Depth, SrcNodes, AllowVariableCrossLaneMask, AllowVariablePerLaneMask,
40859 IsMaskedShuffle, DAG, SDLoc(WideRoot), Subtarget)) {
40860 WideShuffle = extractSubVector(WideShuffle, 0, DAG, DL, RootSizeInBits);
40861 return DAG.getBitcast(RootVT, WideShuffle);
40862 }
40863
40864 return SDValue();
40865}
40866
40867// Canonicalize the combined shuffle mask chain with horizontal ops.
40868// NOTE: This may update the Ops and Mask.
40869 static SDValue canonicalizeShuffleMaskWithHorizOp(
40870 MutableArrayRef<SDValue> Ops, MutableArrayRef<int> Mask,
40871 unsigned RootSizeInBits, const SDLoc &DL, SelectionDAG &DAG,
40872 const X86Subtarget &Subtarget) {
40873 if (Mask.empty() || Ops.empty())
40874 return SDValue();
40875
40876 SmallVector<SDValue> BC;
40877 for (SDValue Op : Ops)
40878 BC.push_back(Op.getOpcode() == ISD::BITCAST ? Op.getOperand(0) : Op);
40879
40880 // All ops must be the same horizop + type.
40881 SDValue BC0 = BC[0];
40882 EVT VT0 = BC0.getValueType();
40883 unsigned Opcode0 = BC0.getOpcode();
40884 if (VT0.getSizeInBits() != RootSizeInBits || llvm::any_of(BC, [&](SDValue V) {
40885 return V.getOpcode() != Opcode0 || V.getValueType() != VT0;
40886 }))
40887 return SDValue();
40888
40889 bool isHoriz = (Opcode0 == X86ISD::FHADD || Opcode0 == X86ISD::HADD ||
40890 Opcode0 == X86ISD::FHSUB || Opcode0 == X86ISD::HSUB);
40891 bool isPack = (Opcode0 == X86ISD::PACKSS || Opcode0 == X86ISD::PACKUS);
40892 if (!isHoriz && !isPack)
40893 return SDValue();
40894
40895 // Do all ops have a single use?
40896 bool OneUseOps = llvm::all_of(Ops, [](SDValue Op) {
40897 return Op.hasOneUse() &&
40898 peekThroughBitcasts(Op) == peekThroughOneUseBitcasts(Op);
40899 });
40900
40901 int NumElts = VT0.getVectorNumElements();
40902 int NumLanes = VT0.getSizeInBits() / 128;
40903 int NumEltsPerLane = NumElts / NumLanes;
40904 int NumHalfEltsPerLane = NumEltsPerLane / 2;
40905 MVT SrcVT = BC0.getOperand(0).getSimpleValueType();
40906 unsigned EltSizeInBits = RootSizeInBits / Mask.size();
40907
40908 if (NumEltsPerLane >= 4 &&
40909 (isPack || shouldUseHorizontalOp(Ops.size() == 1, DAG, Subtarget))) {
40910 SmallVector<int> LaneMask, ScaledMask;
40911 if (isRepeatedTargetShuffleMask(128, EltSizeInBits, Mask, LaneMask) &&
40912 scaleShuffleElements(LaneMask, 4, ScaledMask)) {
40913 // See if we can remove the shuffle by resorting the HOP chain so that
40914 // the HOP args are pre-shuffled.
40915 // TODO: Generalize to any sized/depth chain.
40916 // TODO: Add support for PACKSS/PACKUS.
40917 if (isHoriz) {
40918 // Attempt to find a HOP(HOP(X,Y),HOP(Z,W)) source operand.
40919 auto GetHOpSrc = [&](int M) {
40920 if (M == SM_SentinelUndef)
40921 return DAG.getUNDEF(VT0);
40922 if (M == SM_SentinelZero)
40923 return getZeroVector(VT0.getSimpleVT(), Subtarget, DAG, DL);
40924 SDValue Src0 = BC[M / 4];
40925 SDValue Src1 = Src0.getOperand((M % 4) >= 2);
40926 if (Src1.getOpcode() == Opcode0 && Src0->isOnlyUserOf(Src1.getNode()))
40927 return Src1.getOperand(M % 2);
40928 return SDValue();
40929 };
40930 SDValue M0 = GetHOpSrc(ScaledMask[0]);
40931 SDValue M1 = GetHOpSrc(ScaledMask[1]);
40932 SDValue M2 = GetHOpSrc(ScaledMask[2]);
40933 SDValue M3 = GetHOpSrc(ScaledMask[3]);
40934 if (M0 && M1 && M2 && M3) {
40935 SDValue LHS = DAG.getNode(Opcode0, DL, SrcVT, M0, M1);
40936 SDValue RHS = DAG.getNode(Opcode0, DL, SrcVT, M2, M3);
40937 return DAG.getNode(Opcode0, DL, VT0, LHS, RHS);
40938 }
40939 }
40940 // shuffle(hop(x,y),hop(z,w)) -> permute(hop(x,z)) etc.
40941 if (Ops.size() >= 2) {
40942 SDValue LHS, RHS;
40943 auto GetHOpSrc = [&](int M, int &OutM) {
40944 // TODO: Support SM_SentinelZero
40945 if (M < 0)
40946 return M == SM_SentinelUndef;
40947 SDValue Src = BC[M / 4].getOperand((M % 4) >= 2);
40948 if (!LHS || LHS == Src) {
40949 LHS = Src;
40950 OutM = (M % 2);
40951 return true;
40952 }
40953 if (!RHS || RHS == Src) {
40954 RHS = Src;
40955 OutM = (M % 2) + 2;
40956 return true;
40957 }
40958 return false;
40959 };
40960 int PostMask[4] = {-1, -1, -1, -1};
40961 if (GetHOpSrc(ScaledMask[0], PostMask[0]) &&
40962 GetHOpSrc(ScaledMask[1], PostMask[1]) &&
40963 GetHOpSrc(ScaledMask[2], PostMask[2]) &&
40964 GetHOpSrc(ScaledMask[3], PostMask[3])) {
40965 LHS = DAG.getBitcast(SrcVT, LHS);
40966 RHS = DAG.getBitcast(SrcVT, RHS ? RHS : LHS);
40967 SDValue Res = DAG.getNode(Opcode0, DL, VT0, LHS, RHS);
40968 // Use SHUFPS for the permute so this will work on SSE2 targets,
40969 // shuffle combining and domain handling will simplify this later on.
40970 MVT ShuffleVT = MVT::getVectorVT(MVT::f32, RootSizeInBits / 32);
40971 Res = DAG.getBitcast(ShuffleVT, Res);
40972 return DAG.getNode(X86ISD::SHUFP, DL, ShuffleVT, Res, Res,
40973 getV4X86ShuffleImm8ForMask(PostMask, DL, DAG));
40974 }
40975 }
40976 }
40977 }
40978
40979 if (2 < Ops.size())
40980 return SDValue();
40981
40982 SDValue BC1 = BC[BC.size() - 1];
40983 if (Mask.size() == VT0.getVectorNumElements()) {
40984 // Canonicalize binary shuffles of horizontal ops that use the
40985 // same sources to a unary shuffle.
40986 // TODO: Try to perform this fold even if the shuffle remains.
40987 if (Ops.size() == 2) {
40988 auto ContainsOps = [](SDValue HOp, SDValue Op) {
40989 return Op == HOp.getOperand(0) || Op == HOp.getOperand(1);
40990 };
40991 // Commute if all BC0's ops are contained in BC1.
40992 if (ContainsOps(BC1, BC0.getOperand(0)) &&
40993 ContainsOps(BC1, BC0.getOperand(1))) {
40994 ShuffleVectorSDNode::commuteMask(Mask);
40995 std::swap(Ops[0], Ops[1]);
40996 std::swap(BC0, BC1);
40997 }
40998
40999 // If BC1 can be represented by BC0, then convert to unary shuffle.
41000 if (ContainsOps(BC0, BC1.getOperand(0)) &&
41001 ContainsOps(BC0, BC1.getOperand(1))) {
41002 for (int &M : Mask) {
41003 if (M < NumElts) // BC0 element or UNDEF/Zero sentinel.
41004 continue;
41005 int SubLane = ((M % NumEltsPerLane) >= NumHalfEltsPerLane) ? 1 : 0;
41006 M -= NumElts + (SubLane * NumHalfEltsPerLane);
41007 if (BC1.getOperand(SubLane) != BC0.getOperand(0))
41008 M += NumHalfEltsPerLane;
41009 }
41010 }
41011 }
41012
41013 // Canonicalize unary horizontal ops to only refer to lower halves.
41014 for (int i = 0; i != NumElts; ++i) {
41015 int &M = Mask[i];
41016 if (isUndefOrZero(M))
41017 continue;
41018 if (M < NumElts && BC0.getOperand(0) == BC0.getOperand(1) &&
41019 (M % NumEltsPerLane) >= NumHalfEltsPerLane)
41020 M -= NumHalfEltsPerLane;
41021 if (NumElts <= M && BC1.getOperand(0) == BC1.getOperand(1) &&
41022 (M % NumEltsPerLane) >= NumHalfEltsPerLane)
41023 M -= NumHalfEltsPerLane;
41024 }
41025 }
41026
41027 // Combine binary shuffle of 2 similar 'Horizontal' instructions into a
41028 // single instruction. Attempt to match a v2X64 repeating shuffle pattern that
41029 // represents the LHS/RHS inputs for the lower/upper halves.
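// E.g. shuffle(HADD(X,Y),HADD(Z,W)) with mask <0,1,4,5> only uses the X and Z
// halves of the two hops and can be rebuilt as the single node HADD(X,Z).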
41030 SmallVector<int, 16> TargetMask128, WideMask128;
41031 if (isRepeatedTargetShuffleMask(128, EltSizeInBits, Mask, TargetMask128) &&
41032 scaleShuffleElements(TargetMask128, 2, WideMask128)) {
41033 assert(isUndefOrZeroOrInRange(WideMask128, 0, 4) && "Illegal shuffle");
41034 bool SingleOp = (Ops.size() == 1);
41035 if (isPack || OneUseOps ||
41036 shouldUseHorizontalOp(SingleOp, DAG, Subtarget)) {
41037 SDValue Lo = isInRange(WideMask128[0], 0, 2) ? BC0 : BC1;
41038 SDValue Hi = isInRange(WideMask128[1], 0, 2) ? BC0 : BC1;
41039 Lo = Lo.getOperand(WideMask128[0] & 1);
41040 Hi = Hi.getOperand(WideMask128[1] & 1);
41041 if (SingleOp) {
41042 SDValue Undef = DAG.getUNDEF(SrcVT);
41043 SDValue Zero = getZeroVector(SrcVT, Subtarget, DAG, DL);
41044 Lo = (WideMask128[0] == SM_SentinelZero ? Zero : Lo);
41045 Hi = (WideMask128[1] == SM_SentinelZero ? Zero : Hi);
41046 Lo = (WideMask128[0] == SM_SentinelUndef ? Undef : Lo);
41047 Hi = (WideMask128[1] == SM_SentinelUndef ? Undef : Hi);
41048 }
41049 return DAG.getNode(Opcode0, DL, VT0, Lo, Hi);
41050 }
41051 }
41052
41053 // If we are post-shuffling a 256-bit hop and not requiring the upper
41054 // elements, then try to narrow to a 128-bit hop directly.
41055 SmallVector<int, 16> WideMask64;
41056 if (Ops.size() == 1 && NumLanes == 2 &&
41057 scaleShuffleElements(Mask, 4, WideMask64) &&
41058 isUndefInRange(WideMask64, 2, 2)) {
41059 int M0 = WideMask64[0];
41060 int M1 = WideMask64[1];
41061 if (isInRange(M0, 0, 4) && isInRange(M1, 0, 4)) {
41062 MVT HalfVT = VT0.getHalfNumVectorElementsVT();
41063 unsigned Idx0 = (M0 & 2) ? (SrcVT.getVectorNumElements() / 2) : 0;
41064 unsigned Idx1 = (M1 & 2) ? (SrcVT.getVectorNumElements() / 2) : 0;
41065 SDValue V0 = extract128BitVector(BC[0].getOperand(M0 & 1), Idx0, DAG, DL);
41066 SDValue V1 = extract128BitVector(BC[0].getOperand(M1 & 1), Idx1, DAG, DL);
41067 SDValue Res = DAG.getNode(Opcode0, DL, HalfVT, V0, V1);
41068 return widenSubVector(Res, false, Subtarget, DAG, DL, 256);
41069 }
41070 }
41071
41072 return SDValue();
41073}
41074
41075// Attempt to constant fold all of the constant source ops.
41076// Returns true if the entire shuffle is folded to a constant.
41077// TODO: Extend this to merge multiple constant Ops and update the mask.
41078 static SDValue combineX86ShufflesConstants(MVT VT, ArrayRef<SDValue> Ops,
41079 ArrayRef<int> Mask,
41080 ArrayRef<const SDNode *> SrcNodes,
41081 SelectionDAG &DAG, const SDLoc &DL,
41082 const X86Subtarget &Subtarget) {
41083 unsigned SizeInBits = VT.getSizeInBits();
41084 unsigned NumMaskElts = Mask.size();
41085 unsigned MaskSizeInBits = SizeInBits / NumMaskElts;
41086 unsigned NumOps = Ops.size();
41087
41088 // Extract constant bits from each source op.
41089 SmallVector<APInt, 16> UndefEltsOps(NumOps);
41090 SmallVector<SmallVector<APInt, 16>, 16> RawBitsOps(NumOps);
41091 for (unsigned I = 0; I != NumOps; ++I)
41092 if (!getTargetConstantBitsFromNode(Ops[I], MaskSizeInBits, UndefEltsOps[I],
41093 RawBitsOps[I],
41094 /*AllowWholeUndefs*/ true,
41095 /*AllowPartialUndefs*/ true))
41096 return SDValue();
41097
41098 // If we're optimizing for size, only fold if at least one of the constants
41099 // is only used once or the combined shuffle has included a variable mask
41100 // shuffle; this is to avoid constant pool bloat.
41101 bool IsOptimizingSize = DAG.shouldOptForSize();
41102 bool HasVariableMask = llvm::any_of(SrcNodes, [](const SDNode *N) {
41103 return isTargetShuffleVariableMask(N->getOpcode());
41104 });
41105 if (IsOptimizingSize && !HasVariableMask &&
41106 llvm::none_of(Ops, [](SDValue SrcOp) { return SrcOp->hasOneUse(); }))
41107 return SDValue();
41108
41109 // Shuffle the constant bits according to the mask.
41110 APInt UndefElts(NumMaskElts, 0);
41111 APInt ZeroElts(NumMaskElts, 0);
41112 APInt ConstantElts(NumMaskElts, 0);
41113 SmallVector<APInt, 8> ConstantBitData(NumMaskElts,
41114 APInt::getZero(MaskSizeInBits));
41115 for (unsigned i = 0; i != NumMaskElts; ++i) {
41116 int M = Mask[i];
41117 if (M == SM_SentinelUndef) {
41118 UndefElts.setBit(i);
41119 continue;
41120 } else if (M == SM_SentinelZero) {
41121 ZeroElts.setBit(i);
41122 continue;
41123 }
41124 assert(0 <= M && M < (int)(NumMaskElts * NumOps));
41125
41126 unsigned SrcOpIdx = (unsigned)M / NumMaskElts;
41127 unsigned SrcMaskIdx = (unsigned)M % NumMaskElts;
41128
41129 auto &SrcUndefElts = UndefEltsOps[SrcOpIdx];
41130 if (SrcUndefElts[SrcMaskIdx]) {
41131 UndefElts.setBit(i);
41132 continue;
41133 }
41134
41135 auto &SrcEltBits = RawBitsOps[SrcOpIdx];
41136 APInt &Bits = SrcEltBits[SrcMaskIdx];
41137 if (!Bits) {
41138 ZeroElts.setBit(i);
41139 continue;
41140 }
41141
41142 ConstantElts.setBit(i);
41143 ConstantBitData[i] = Bits;
41144 }
41145 assert((UndefElts | ZeroElts | ConstantElts).isAllOnes());
41146
41147 // Attempt to create a zero vector.
41148 if ((UndefElts | ZeroElts).isAllOnes())
41149 return getZeroVector(VT, Subtarget, DAG, DL);
41150
41151 // Create the constant data.
41152 MVT MaskSVT;
41153 if (VT.isFloatingPoint() && (MaskSizeInBits == 32 || MaskSizeInBits == 64))
41154 MaskSVT = MVT::getFloatingPointVT(MaskSizeInBits);
41155 else
41156 MaskSVT = MVT::getIntegerVT(MaskSizeInBits);
41157
41158 MVT MaskVT = MVT::getVectorVT(MaskSVT, NumMaskElts);
41159 if (!DAG.getTargetLoweringInfo().isTypeLegal(MaskVT))
41160 return SDValue();
41161
41162 SDValue CstOp = getConstVector(ConstantBitData, UndefElts, MaskVT, DAG, DL);
41163 return DAG.getBitcast(VT, CstOp);
41164}
41165
41166namespace llvm {
41167 namespace X86 {
41168 enum {
41169 MaxShuffleCombineDepth = 8
41170 };
41171 } // namespace X86
41172} // namespace llvm
41173
41174/// Fully generic combining of x86 shuffle instructions.
41175///
41176/// This should be the last combine run over the x86 shuffle instructions. Once
41177/// they have been fully optimized, this will recursively consider all chains
41178/// of single-use shuffle instructions, build a generic model of the cumulative
41179/// shuffle operation, and check for simpler instructions which implement this
41180/// operation. We use this primarily for two purposes:
41181///
41182/// 1) Collapse generic shuffles to specialized single instructions when
41183/// equivalent. In most cases, this is just an encoding size win, but
41184/// sometimes we will collapse multiple generic shuffles into a single
41185/// special-purpose shuffle.
41186/// 2) Look for sequences of shuffle instructions with 3 or more total
41187/// instructions, and replace them with the slightly more expensive SSSE3
41188/// PSHUFB instruction if available. We do this as the last combining step
41189/// to ensure we avoid using PSHUFB if we can implement the shuffle with
41190/// a suitable short sequence of other instructions. The PSHUFB will either
41191/// use a register or have to read from memory and so is slightly (but only
41192/// slightly) more expensive than the other shuffle instructions.
41193///
41194/// Because this is inherently a quadratic operation (for each shuffle in
41195/// a chain, we recurse up the chain), the depth is limited to 8 instructions.
41196/// This should never be an issue in practice as the shuffle lowering doesn't
41197/// produce sequences of more than 8 instructions.
41198///
41199/// FIXME: We will currently miss some cases where the redundant shuffling
41200/// would simplify under the threshold for PSHUFB formation because of
41201/// combine-ordering. To fix this, we should do the redundant instruction
41202/// combining in this recursive walk.
41203 static SDValue combineX86ShufflesRecursively(
41204 ArrayRef<SDValue> SrcOps, int SrcOpIndex, unsigned RootOpc, MVT RootVT,
41205 ArrayRef<int> RootMask, ArrayRef<const SDNode *> SrcNodes, unsigned Depth,
41206 unsigned MaxDepth, bool AllowVariableCrossLaneMask,
41207 bool AllowVariablePerLaneMask, bool IsMaskedShuffle, SelectionDAG &DAG,
41208 const SDLoc &DL, const X86Subtarget &Subtarget) {
41209 assert(!RootMask.empty() &&
41210 (RootMask.size() > 1 || (RootMask[0] == 0 && SrcOpIndex == 0)) &&
41211 "Illegal shuffle root mask");
41212 assert(RootVT.isVector() && "Shuffles operate on vector types!");
41213 unsigned RootSizeInBits = RootVT.getSizeInBits();
41214 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
41215
41216 // Bound the depth of our recursive combine because this is ultimately
41217 // quadratic in nature.
41218 if (Depth >= MaxDepth)
41219 return SDValue();
41220
41221 // Directly rip through bitcasts to find the underlying operand.
41222 SDValue Op = SrcOps[SrcOpIndex];
41223 Op = peekThroughBitcasts(Op);
41224
41225 EVT VT = Op.getValueType();
41226 if (!VT.isVector() || !VT.isSimple())
41227 return SDValue(); // Bail if we hit a non-simple non-vector.
41228
41229 // FIXME: Just bail on f16 for now.
41230 if (VT.getVectorElementType() == MVT::f16)
41231 return SDValue();
41232
41233 assert((RootSizeInBits % VT.getSizeInBits()) == 0 &&
41234 "Can only combine shuffles upto size of the root op.");
41235
41236 // Create a demanded elts mask from the referenced elements of Op.
41237 APInt OpDemandedElts = APInt::getZero(RootMask.size());
41238 for (int M : RootMask) {
41239 int BaseIdx = RootMask.size() * SrcOpIndex;
41240 if (isInRange(M, BaseIdx, BaseIdx + RootMask.size()))
41241 OpDemandedElts.setBit(M - BaseIdx);
41242 }
41243 if (RootSizeInBits != VT.getSizeInBits()) {
41244 // Op is smaller than Root - extract the demanded elts for the subvector.
41245 unsigned Scale = RootSizeInBits / VT.getSizeInBits();
41246 unsigned NumOpMaskElts = RootMask.size() / Scale;
41247 assert((RootMask.size() % Scale) == 0 && "Root mask size mismatch");
41248 assert(OpDemandedElts
41249 .extractBits(RootMask.size() - NumOpMaskElts, NumOpMaskElts)
41250 .isZero() &&
41251 "Out of range elements referenced in root mask");
41252 OpDemandedElts = OpDemandedElts.extractBits(NumOpMaskElts, 0);
41253 }
41254 OpDemandedElts =
41255 APIntOps::ScaleBitMask(OpDemandedElts, VT.getVectorNumElements());
41256
41257 // Extract target shuffle mask and resolve sentinels and inputs.
41258 SmallVector<int, 64> OpMask;
41259 SmallVector<SDValue, 2> OpInputs;
41260 APInt OpUndef, OpZero;
41261 if (getTargetShuffleInputs(Op, OpDemandedElts, OpInputs, OpMask, OpUndef,
41262 OpZero, DAG, Depth, false)) {
41263 // Shuffle inputs must not be larger than the shuffle result.
41264 // TODO: Relax this for single input faux shuffles (e.g. trunc).
41265 if (llvm::any_of(OpInputs, [VT](SDValue OpInput) {
41266 return OpInput.getValueSizeInBits() > VT.getSizeInBits();
41267 }))
41268 return SDValue();
41269 } else if (Op.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
41270 (RootSizeInBits % Op.getOperand(0).getValueSizeInBits()) == 0 &&
41271 !isNullConstant(Op.getOperand(1))) {
41272 SDValue SrcVec = Op.getOperand(0);
41273 int ExtractIdx = Op.getConstantOperandVal(1);
41274 unsigned NumElts = VT.getVectorNumElements();
41275 OpInputs.assign({SrcVec});
41276 OpMask.assign(NumElts, SM_SentinelUndef);
41277 std::iota(OpMask.begin(), OpMask.end(), ExtractIdx);
41278 OpZero = OpUndef = APInt::getZero(NumElts);
41279 } else {
41280 return SDValue();
41281 }
41282
41283 // If the shuffle result was smaller than the root, we need to adjust the
41284 // mask indices and pad the mask with undefs.
41285 if (RootSizeInBits > VT.getSizeInBits()) {
41286 unsigned NumSubVecs = RootSizeInBits / VT.getSizeInBits();
41287 unsigned OpMaskSize = OpMask.size();
41288 if (OpInputs.size() > 1) {
41289 unsigned PaddedMaskSize = NumSubVecs * OpMaskSize;
41290 for (int &M : OpMask) {
41291 if (M < 0)
41292 continue;
41293 int EltIdx = M % OpMaskSize;
41294 int OpIdx = M / OpMaskSize;
41295 M = (PaddedMaskSize * OpIdx) + EltIdx;
41296 }
41297 }
41298 OpZero = OpZero.zext(NumSubVecs * OpMaskSize);
41299 OpUndef = OpUndef.zext(NumSubVecs * OpMaskSize);
41300 OpMask.append((NumSubVecs - 1) * OpMaskSize, SM_SentinelUndef);
41301 }
41302
41303 SmallVector<int, 64> Mask;
41304 SmallVector<SDValue, 16> Ops;
41305
41306 // We don't need to merge masks if the root is empty.
41307 bool EmptyRoot = (Depth == 0) && (RootMask.size() == 1);
41308 if (EmptyRoot) {
41309 // Only resolve zeros if it will remove an input, otherwise we might end
41310 // up in an infinite loop.
41311 bool ResolveKnownZeros = true;
41312 if (!OpZero.isZero()) {
41313 APInt UsedInputs = APInt::getZero(OpInputs.size());
41314 for (int i = 0, e = OpMask.size(); i != e; ++i) {
41315 int M = OpMask[i];
41316 if (OpUndef[i] || OpZero[i] || isUndefOrZero(M))
41317 continue;
41318 UsedInputs.setBit(M / OpMask.size());
41319 if (UsedInputs.isAllOnes()) {
41320 ResolveKnownZeros = false;
41321 break;
41322 }
41323 }
41324 }
41325 resolveTargetShuffleFromZeroables(OpMask, OpUndef, OpZero,
41326 ResolveKnownZeros);
41327
41328 Mask = OpMask;
41329 Ops.append(OpInputs.begin(), OpInputs.end());
41330 } else {
41331 resolveTargetShuffleFromZeroables(OpMask, OpUndef, OpZero);
41332
41333 // Add the inputs to the Ops list, avoiding duplicates.
41334 Ops.append(SrcOps.begin(), SrcOps.end());
41335
41336 auto AddOp = [&Ops](SDValue Input, int InsertionPoint) -> int {
41337 // Attempt to find an existing match.
41338 SDValue InputBC = peekThroughBitcasts(Input);
41339 for (int i = 0, e = Ops.size(); i < e; ++i)
41340 if (InputBC == peekThroughBitcasts(Ops[i]))
41341 return i;
41342 // Match failed - should we replace an existing Op?
41343 if (InsertionPoint >= 0) {
41344 Ops[InsertionPoint] = Input;
41345 return InsertionPoint;
41346 }
41347 // Add to the end of the Ops list.
41348 Ops.push_back(Input);
41349 return Ops.size() - 1;
41350 };
41351
41352 SmallVector<int, 2> OpInputIdx;
41353 for (SDValue OpInput : OpInputs)
41354 OpInputIdx.push_back(
41355 AddOp(OpInput, OpInputIdx.empty() ? SrcOpIndex : -1));
41356
41357 assert(((RootMask.size() > OpMask.size() &&
41358 RootMask.size() % OpMask.size() == 0) ||
41359 (OpMask.size() > RootMask.size() &&
41360 OpMask.size() % RootMask.size() == 0) ||
41361 OpMask.size() == RootMask.size()) &&
41362 "The smaller number of elements must divide the larger.");
41363
41364 // This function can be performance-critical, so we rely on the power-of-2
41365 // knowledge that we have about the mask sizes to replace div/rem ops with
41366 // bit-masks and shifts.
41367 assert(llvm::has_single_bit<uint32_t>(RootMask.size()) &&
41368 "Non-power-of-2 shuffle mask sizes");
41369 assert(llvm::has_single_bit<uint32_t>(OpMask.size()) &&
41370 "Non-power-of-2 shuffle mask sizes");
41371 unsigned RootMaskSizeLog2 = llvm::countr_zero(RootMask.size());
41372 unsigned OpMaskSizeLog2 = llvm::countr_zero(OpMask.size());
41373
41374 unsigned MaskWidth = std::max<unsigned>(OpMask.size(), RootMask.size());
41375 unsigned RootRatio =
41376 std::max<unsigned>(1, OpMask.size() >> RootMaskSizeLog2);
41377 unsigned OpRatio = std::max<unsigned>(1, RootMask.size() >> OpMaskSizeLog2);
41378 assert((RootRatio == 1 || OpRatio == 1) &&
41379 "Must not have a ratio for both incoming and op masks!");
41380
41381 assert(isPowerOf2_32(MaskWidth) && "Non-power-of-2 shuffle mask sizes");
41382 assert(isPowerOf2_32(RootRatio) && "Non-power-of-2 shuffle mask sizes");
41383 assert(isPowerOf2_32(OpRatio) && "Non-power-of-2 shuffle mask sizes");
41384 unsigned RootRatioLog2 = llvm::countr_zero(RootRatio);
41385 unsigned OpRatioLog2 = llvm::countr_zero(OpRatio);
41386
41387 Mask.resize(MaskWidth, SM_SentinelUndef);
41388
41389 // Merge this shuffle operation's mask into our accumulated mask. Note that
41390 // this shuffle's mask will be the first applied to the input, followed by
41391 // the root mask to get us all the way to the root value arrangement. The
41392 // reason for this order is that we are recursing up the operation chain.
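// E.g. a v2i64 root mask <1,0> on top of an op with v4i32 mask <2,3,0,1>
// scales the root mask to <2,3,0,1> at width 4 and then maps each entry
// through the op mask, giving the identity mask <0,1,2,3> (the two half-swaps
// cancel out).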
41393 for (unsigned i = 0; i < MaskWidth; ++i) {
41394 unsigned RootIdx = i >> RootRatioLog2;
41395 if (RootMask[RootIdx] < 0) {
41396 // This is a zero or undef lane, we're done.
41397 Mask[i] = RootMask[RootIdx];
41398 continue;
41399 }
41400
41401 unsigned RootMaskedIdx =
41402 RootRatio == 1
41403 ? RootMask[RootIdx]
41404 : (RootMask[RootIdx] << RootRatioLog2) + (i & (RootRatio - 1));
41405
41406 // Just insert the scaled root mask value if it references an input other
41407 // than the SrcOp we're currently inserting.
41408 if ((RootMaskedIdx < (SrcOpIndex * MaskWidth)) ||
41409 (((SrcOpIndex + 1) * MaskWidth) <= RootMaskedIdx)) {
41410 Mask[i] = RootMaskedIdx;
41411 continue;
41412 }
41413
41414 RootMaskedIdx = RootMaskedIdx & (MaskWidth - 1);
41415 unsigned OpIdx = RootMaskedIdx >> OpRatioLog2;
41416 if (OpMask[OpIdx] < 0) {
41417 // The incoming lanes are zero or undef, it doesn't matter which ones we
41418 // are using.
41419 Mask[i] = OpMask[OpIdx];
41420 continue;
41421 }
41422
41423 // Ok, we have non-zero lanes, map them through to one of the Op's inputs.
41424 unsigned OpMaskedIdx = OpRatio == 1 ? OpMask[OpIdx]
41425 : (OpMask[OpIdx] << OpRatioLog2) +
41426 (RootMaskedIdx & (OpRatio - 1));
41427
41428 OpMaskedIdx = OpMaskedIdx & (MaskWidth - 1);
41429 int InputIdx = OpMask[OpIdx] / (int)OpMask.size();
41430 assert(0 <= OpInputIdx[InputIdx] && "Unknown target shuffle input");
41431 OpMaskedIdx += OpInputIdx[InputIdx] * MaskWidth;
41432
41433 Mask[i] = OpMaskedIdx;
41434 }
41435 }
41436
41437 // Peek through any free bitcasts to insert_subvector vector widenings or
41438 // extract_subvector nodes back to root size.
41439 // TODO: Can resolveTargetShuffleInputsAndMask do some of this?
41440 for (auto [I, Op] : enumerate(Ops)) {
41441 SDValue BC = Op;
41442 while (1) {
41443 if (BC.getOpcode() == ISD::BITCAST && BC.hasOneUse()) {
41444 BC = BC.getOperand(0);
41445 continue;
41446 }
41447 if (BC.getOpcode() == ISD::INSERT_SUBVECTOR &&
41448 BC.getOperand(0).isUndef() && isNullConstant(BC.getOperand(2))) {
41449 // Set out of bounds mask indices to undef.
41450 Op = BC = BC.getOperand(1);
41451 unsigned Scale = RootSizeInBits / Op.getValueSizeInBits();
41452 int Lo = I * Mask.size();
41453 int Hi = (I + 1) * Mask.size();
41454 int NewHi = Lo + (Mask.size() / Scale);
41455 for (int &M : Mask) {
41456 if (Lo <= M && NewHi <= M && M < Hi)
41457 M = SM_SentinelUndef;
41458 }
41459 continue;
41460 }
41461 if (BC.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
41462 (RootSizeInBits % BC.getOperand(0).getValueSizeInBits()) == 0 &&
41463 isNullConstant(BC.getOperand(1))) {
41464 Op = BC = BC.getOperand(0);
41465 continue;
41466 }
41467 break;
41468 }
41469 }
41470
41471 // Remove unused/repeated shuffle source ops.
41472 resolveTargetShuffleInputsAndMask(Ops, Mask);
41473
41474 // Handle the all undef/zero/ones cases early.
41475 if (all_of(Mask, [](int Idx) { return Idx == SM_SentinelUndef; }))
41476 return DAG.getUNDEF(RootVT);
41477 if (all_of(Mask, [](int Idx) { return Idx < 0; }))
41478 return getZeroVector(RootVT, Subtarget, DAG, DL);
41479 if (Ops.size() == 1 && ISD::isBuildVectorAllOnes(Ops[0].getNode()) &&
41480 !llvm::is_contained(Mask, SM_SentinelZero))
41481 return getOnesVector(RootVT, DAG, DL);
41482
41483 assert(!Ops.empty() && "Shuffle with no inputs detected");
41484
41485 // Update the list of shuffle nodes that have been combined so far.
41486 SmallVector<const SDNode *, 16> CombinedNodes(SrcNodes);
41487 CombinedNodes.push_back(Op.getNode());
41488
41489 // See if we can recurse into each shuffle source op (if it's a target
41490 // shuffle). The source op should only be generally combined if it either has
41491 // a single use (i.e. current Op) or all its users have already been combined;
41492 // if not, we can still combine but should prevent generation of variable
41493 // shuffles to avoid constant pool bloat.
41494 // Don't recurse if we already have more source ops than we can combine in
41495 // the remaining recursion depth.
41496 if (Ops.size() < (MaxDepth - Depth)) {
41497 for (int i = 0, e = Ops.size(); i < e; ++i) {
41498 // For empty roots, we need to resolve zeroable elements before combining
41499 // them with other shuffles.
41500 SmallVector<int, 64> ResolvedMask = Mask;
41501 if (EmptyRoot)
41502 resolveTargetShuffleFromZeroables(ResolvedMask, OpUndef, OpZero);
41503 bool AllowCrossLaneVar = false;
41504 bool AllowPerLaneVar = false;
41505 if (Ops[i].getNode()->hasOneUse() ||
41506 SDNode::areOnlyUsersOf(CombinedNodes, Ops[i].getNode())) {
41507 AllowCrossLaneVar = AllowVariableCrossLaneMask;
41508 AllowPerLaneVar = AllowVariablePerLaneMask;
41509 }
41510 if (SDValue Res = combineX86ShufflesRecursively(
41511 Ops, i, RootOpc, RootVT, ResolvedMask, CombinedNodes, Depth + 1,
41512 MaxDepth, AllowCrossLaneVar, AllowPerLaneVar, IsMaskedShuffle,
41513 DAG, DL, Subtarget))
41514 return Res;
41515 }
41516 }
41517
41518 // Attempt to constant fold all of the constant source ops.
41519 if (SDValue Cst = combineX86ShufflesConstants(
41520 RootVT, Ops, Mask, CombinedNodes, DAG, DL, Subtarget))
41521 return Cst;
41522
41523 // If constant fold failed and we only have constants - then we have
41524 // multiple uses by a single non-variable shuffle - just bail.
41525 if (Depth == 0 && llvm::all_of(Ops, [&](SDValue Op) {
41526 APInt UndefElts;
41527 SmallVector<APInt> RawBits;
41528 unsigned EltSizeInBits = RootSizeInBits / Mask.size();
41529 return getTargetConstantBitsFromNode(Op, EltSizeInBits, UndefElts,
41530 RawBits,
41531 /*AllowWholeUndefs*/ true,
41532 /*AllowPartialUndefs*/ true);
41533 })) {
41534 return SDValue();
41535 }
41536
41537 // Canonicalize the combined shuffle mask chain with horizontal ops.
41538 // NOTE: This will update the Ops and Mask.
41539 if (SDValue HOp = canonicalizeShuffleMaskWithHorizOp(
41540 Ops, Mask, RootSizeInBits, DL, DAG, Subtarget))
41541 return DAG.getBitcast(RootVT, HOp);
41542
41543 // Try to refine our inputs given our knowledge of target shuffle mask.
41544 for (auto I : enumerate(Ops)) {
41545 int OpIdx = I.index();
41546 SDValue &Op = I.value();
41547
41548 // What range of shuffle mask element values results in picking from Op?
41549 int Lo = OpIdx * Mask.size();
41550 int Hi = Lo + Mask.size();
41551
41552 // Which elements of Op do we demand, given the mask's granularity?
41553 APInt OpDemandedElts(Mask.size(), 0);
41554 for (int MaskElt : Mask) {
41555 if (isInRange(MaskElt, Lo, Hi)) { // Picks from Op?
41556 int OpEltIdx = MaskElt - Lo;
41557 OpDemandedElts.setBit(OpEltIdx);
41558 }
41559 }
41560
41561 // Is the shuffle result smaller than the root?
41562 if (Op.getValueSizeInBits() < RootSizeInBits) {
41563 // We padded the mask with undefs. But we now need to undo that.
41564 unsigned NumExpectedVectorElts = Mask.size();
41565 unsigned EltSizeInBits = RootSizeInBits / NumExpectedVectorElts;
41566 unsigned NumOpVectorElts = Op.getValueSizeInBits() / EltSizeInBits;
41567 assert(!OpDemandedElts.extractBits(
41568 NumExpectedVectorElts - NumOpVectorElts, NumOpVectorElts) &&
41569 "Demanding the virtual undef widening padding?");
41570 OpDemandedElts = OpDemandedElts.trunc(NumOpVectorElts); // NUW
41571 }
41572
41573 // The Op itself may be of different VT, so we need to scale the mask.
41574 unsigned NumOpElts = Op.getValueType().getVectorNumElements();
41575 APInt OpScaledDemandedElts = APIntOps::ScaleBitMask(OpDemandedElts, NumOpElts);
41576
41577 // Can this operand be simplified any further, given its demanded elements?
41578 if (SDValue NewOp = TLI.SimplifyMultipleUseDemandedVectorElts(
41579 Op, OpScaledDemandedElts, DAG))
41580 Op = NewOp;
41581 }
41582 // FIXME: should we rerun resolveTargetShuffleInputsAndMask() now?
41583
41584 // Widen any subvector shuffle inputs we've collected.
41585 // TODO: Remove this to avoid generating temporary nodes, we should only
41586 // widen once combineX86ShuffleChain has found a match.
41587 if (any_of(Ops, [RootSizeInBits](SDValue Op) {
41588 return Op.getValueSizeInBits() < RootSizeInBits;
41589 })) {
41590 for (SDValue &Op : Ops)
41591 if (Op.getValueSizeInBits() < RootSizeInBits)
41592 Op = widenSubVector(Op, false, Subtarget, DAG, SDLoc(Op),
41593 RootSizeInBits);
41594 // Reresolve - we might have repeated subvector sources.
41595 resolveTargetShuffleInputsAndMask(Ops, Mask);
41596 }
41597
41598 // Handle the all undef/zero/ones cases.
41599 if (all_of(Mask, [](int Idx) { return Idx == SM_SentinelUndef; }))
41600 return DAG.getUNDEF(RootVT);
41601 if (all_of(Mask, [](int Idx) { return Idx < 0; }))
41602 return getZeroVector(RootVT, Subtarget, DAG, DL);
41603 if (Ops.size() == 1 && ISD::isBuildVectorAllOnes(Ops[0].getNode()) &&
41604 !llvm::is_contained(Mask, SM_SentinelZero))
41605 return getOnesVector(RootVT, DAG, DL);
41606
41607 assert(!Ops.empty() && "Shuffle with no inputs detected");
41608
41609 // We can only combine unary and binary shuffle mask cases.
41610 if (Ops.size() <= 2) {
41611 // Minor canonicalization of the accumulated shuffle mask to make it easier
41612 // to match below. All this does is detect masks with sequential pairs of
41613 // elements, and shrink them to the half-width mask. It does this in a loop
41614 // so it will reduce the size of the mask to the minimal width mask which
41615 // performs an equivalent shuffle.
41616 while (Mask.size() > 1) {
41617 SmallVector<int, 64> WidenedMask;
41618 if (!canWidenShuffleElements(Mask, WidenedMask))
41619 break;
41620 Mask = std::move(WidenedMask);
41621 }
41622
41623 // Canonicalization of binary shuffle masks to improve pattern matching by
41624 // commuting the inputs.
41625 if (Ops.size() == 2 && canonicalizeShuffleMaskWithCommute(Mask)) {
41626 ShuffleVectorSDNode::commuteMask(Mask);
41627 std::swap(Ops[0], Ops[1]);
41628 }
41629
41630 // Try to combine into a single shuffle instruction.
41631 if (SDValue Shuffle = combineX86ShuffleChain(
41632 Ops, RootOpc, RootVT, Mask, Depth, CombinedNodes,
41633 AllowVariableCrossLaneMask, AllowVariablePerLaneMask,
41634 IsMaskedShuffle, DAG, DL, Subtarget))
41635 return Shuffle;
41636
41637 // If all the operands come from the same larger vector, fallthrough and try
41638 // to use combineX86ShuffleChainWithExtract.
41639 SDValue LHS = peekThroughBitcasts(Ops.front());
41640 SDValue RHS = peekThroughBitcasts(Ops.back());
41641 if (Ops.size() != 2 || !Subtarget.hasAVX2() || RootSizeInBits != 128 ||
41642 (RootSizeInBits / Mask.size()) != 64 ||
41643 LHS.getOpcode() != ISD::EXTRACT_SUBVECTOR ||
41644 RHS.getOpcode() != ISD::EXTRACT_SUBVECTOR ||
41645 LHS.getOperand(0) != RHS.getOperand(0))
41646 return SDValue();
41647 }
41648
41649 // If that failed and any input is extracted then try to combine as a
41650 // shuffle with the larger type.
41651 return combineX86ShuffleChainWithExtract(
41652 Ops, RootOpc, RootVT, Mask, Depth, CombinedNodes,
41653 AllowVariableCrossLaneMask, AllowVariablePerLaneMask, IsMaskedShuffle,
41654 DAG, DL, Subtarget);
41655}
41656
41657/// Helper entry wrapper to combineX86ShufflesRecursively.
41658 static SDValue combineX86ShufflesRecursively(SDValue Op, SelectionDAG &DAG,
41659 const X86Subtarget &Subtarget) {
41660 return combineX86ShufflesRecursively(
41661 {Op}, 0, Op.getOpcode(), Op.getSimpleValueType(), {0}, {}, /*Depth=*/0,
41662 X86::MaxShuffleCombineDepth, /*AllowVariableCrossLaneMask=*/true,
41663 /*AllowVariablePerLaneMask=*/true, isMaskableNode(Op, Subtarget), DAG,
41664 SDLoc(Op), Subtarget);
41665}
41666
41667/// Get the PSHUF-style mask from PSHUF node.
41668///
41669 /// This is a very minor wrapper around getTargetShuffleMask to ease forming v4
41670/// PSHUF-style masks that can be reused with such instructions.
41671 static SmallVector<int, 4> getPSHUFShuffleMask(SDValue N) {
41672 MVT VT = N.getSimpleValueType();
41673 SmallVector<int, 4> Mask;
41674 SmallVector<SDValue, 2> Ops;
41675 bool HaveMask = getTargetShuffleMask(N, false, Ops, Mask);
41676 (void)HaveMask;
41677 assert(HaveMask);
41678
41679 // If we have more than 128-bits, only the low 128-bits of shuffle mask
41680 // matter. Check that the upper masks are repeats and remove them.
41681 if (VT.getSizeInBits() > 128) {
41682 int LaneElts = 128 / VT.getScalarSizeInBits();
41683#ifndef NDEBUG
41684 for (int i = 1, NumLanes = VT.getSizeInBits() / 128; i < NumLanes; ++i)
41685 for (int j = 0; j < LaneElts; ++j)
41686 assert(Mask[j] == Mask[i * LaneElts + j] - (LaneElts * i) &&
41687 "Mask doesn't repeat in high 128-bit lanes!");
41688#endif
41689 Mask.resize(LaneElts);
41690 }
41691
41692 switch (N.getOpcode()) {
41693 case X86ISD::PSHUFD:
41694 return Mask;
41695 case X86ISD::PSHUFLW:
41696 Mask.resize(4);
41697 return Mask;
41698 case X86ISD::PSHUFHW:
41699 Mask.erase(Mask.begin(), Mask.begin() + 4);
41700 for (int &M : Mask)
41701 M -= 4;
41702 return Mask;
41703 default:
41704 llvm_unreachable("No valid shuffle instruction found!");
41705 }
41706}
41707
41708/// Get the expanded blend mask from a BLENDI node.
41709/// For v16i16 nodes, this will splat the repeated i8 mask.
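/// E.g. an 8-bit v16i16 blend immediate 0b10110001 is expanded to the 16-bit
/// mask 0b1011000110110001 via APInt::getSplat.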
41710 static APInt getBLENDIBlendMask(SDValue V) {
41711 assert(V.getOpcode() == X86ISD::BLENDI && "Unknown blend shuffle");
41712 unsigned NumElts = V.getSimpleValueType().getVectorNumElements();
41713 APInt Mask = V.getConstantOperandAPInt(2);
41714 if (Mask.getBitWidth() > NumElts)
41715 Mask = Mask.trunc(NumElts);
41716 if (NumElts == 16) {
41717 assert(Mask.getBitWidth() == 8 && "Unexpected v16i16 blend mask width");
41718 Mask = APInt::getSplat(16, Mask);
41719 }
41720 assert(Mask.getBitWidth() == NumElts && "Unexpected blend mask width");
41721 return Mask;
41722}
41723
41724/// Search for a combinable shuffle across a chain ending in pshufd.
41725///
41726/// We walk up the chain and look for a combinable shuffle, skipping over
41727/// shuffles that we could hoist this shuffle's transformation past without
41728/// altering anything.
41729 static SDValue combineRedundantDwordShuffle(SDValue N,
41730 MutableArrayRef<int> Mask,
41731 const SDLoc &DL,
41732 SelectionDAG &DAG) {
41733 assert(N.getOpcode() == X86ISD::PSHUFD &&
41734 "Called with something other than an x86 128-bit half shuffle!");
41735
41736 // Walk up a single-use chain looking for a combinable shuffle. Keep a stack
41737 // of the shuffles in the chain so that we can form a fresh chain to replace
41738 // this one.
41739 SmallVector<SDValue, 8> Chain;
41740 SDValue V = N.getOperand(0);
41741 for (; V.hasOneUse(); V = V.getOperand(0)) {
41742 switch (V.getOpcode()) {
41743 default:
41744 return SDValue(); // Nothing combined!
41745
41746 case ISD::BITCAST:
41747 // Skip bitcasts as we always know the type for the target specific
41748 // instructions.
41749 continue;
41750
41751 case X86ISD::PSHUFD:
41752 // Found another dword shuffle.
41753 break;
41754
41755 case X86ISD::PSHUFLW:
41756 // Check that the low words (being shuffled) are the identity in the
41757 // dword shuffle, and the high words are self-contained.
41758 if (Mask[0] != 0 || Mask[1] != 1 ||
41759 !(Mask[2] >= 2 && Mask[2] < 4 && Mask[3] >= 2 && Mask[3] < 4))
41760 return SDValue();
41761
41762 Chain.push_back(V);
41763 continue;
41764
41765 case X86ISD::PSHUFHW:
41766 // Check that the high words (being shuffled) are the identity in the
41767 // dword shuffle, and the low words are self-contained.
41768 if (Mask[2] != 2 || Mask[3] != 3 ||
41769 !(Mask[0] >= 0 && Mask[0] < 2 && Mask[1] >= 0 && Mask[1] < 2))
41770 return SDValue();
41771
41772 Chain.push_back(V);
41773 continue;
41774
41775 case X86ISD::UNPCKL:
41776 case X86ISD::UNPCKH:
41777 // For either i8 -> i16 or i16 -> i32 unpacks, we can combine a dword
41778 // shuffle into a preceding word shuffle.
41779 if (V.getSimpleValueType().getVectorElementType() != MVT::i8 &&
41780 V.getSimpleValueType().getVectorElementType() != MVT::i16)
41781 return SDValue();
41782
41783 // Search for a half-shuffle which we can combine with.
41784 unsigned CombineOp =
41785 V.getOpcode() == X86ISD::UNPCKL ? X86ISD::PSHUFLW : X86ISD::PSHUFHW;
41786 if (V.getOperand(0) != V.getOperand(1) ||
41787 !V->isOnlyUserOf(V.getOperand(0).getNode()))
41788 return SDValue();
41789 Chain.push_back(V);
41790 V = V.getOperand(0);
41791 do {
41792 switch (V.getOpcode()) {
41793 default:
41794 return SDValue(); // Nothing to combine.
41795
41796 case X86ISD::PSHUFLW:
41797 case X86ISD::PSHUFHW:
41798 if (V.getOpcode() == CombineOp)
41799 break;
41800
41801 Chain.push_back(V);
41802
41803 [[fallthrough]];
41804 case ISD::BITCAST:
41805 V = V.getOperand(0);
41806 continue;
41807 }
41808 break;
41809 } while (V.hasOneUse());
41810 break;
41811 }
41812 // Break out of the loop if we break out of the switch.
41813 break;
41814 }
41815
41816 if (!V.hasOneUse())
41817 // We fell out of the loop without finding a viable combining instruction.
41818 return SDValue();
41819
41820 // Merge this node's mask and our incoming mask.
41821 SmallVector<int, 4> VMask = getPSHUFShuffleMask(V);
41822 for (int &M : Mask)
41823 M = VMask[M];
41824 V = DAG.getNode(V.getOpcode(), DL, V.getValueType(), V.getOperand(0),
41825 getV4X86ShuffleImm8ForMask(Mask, DL, DAG));
41826
41827 // Rebuild the chain around this new shuffle.
41828 while (!Chain.empty()) {
41829 SDValue W = Chain.pop_back_val();
41830
41831 if (V.getValueType() != W.getOperand(0).getValueType())
41832 V = DAG.getBitcast(W.getOperand(0).getValueType(), V);
41833
41834 switch (W.getOpcode()) {
41835 default:
41836 llvm_unreachable("Only PSHUF and UNPCK instructions get here!");
41837
41838 case X86ISD::UNPCKL:
41839 case X86ISD::UNPCKH:
41840 V = DAG.getNode(W.getOpcode(), DL, W.getValueType(), V, V);
41841 break;
41842
41843 case X86ISD::PSHUFD:
41844 case X86ISD::PSHUFLW:
41845 case X86ISD::PSHUFHW:
41846 V = DAG.getNode(W.getOpcode(), DL, W.getValueType(), V, W.getOperand(1));
41847 break;
41848 }
41849 }
41850 if (V.getValueType() != N.getValueType())
41851 V = DAG.getBitcast(N.getValueType(), V);
41852
41853 // Return the new chain to replace N.
41854 return V;
41855}
41856
41857// Attempt to commute shufps LHS loads:
41858// permilps(shufps(load(),x)) --> permilps(shufps(x,load()))
41859 static SDValue combineCommutableSHUFP(SDValue N, MVT VT, const SDLoc &DL,
41860 SelectionDAG &DAG) {
41861 // TODO: Add vXf64 support.
41862 if (VT != MVT::v4f32 && VT != MVT::v8f32 && VT != MVT::v16f32)
41863 return SDValue();
41864
41865 // SHUFP(LHS, RHS) -> SHUFP(RHS, LHS) iff LHS is foldable + RHS is not.
41866 auto commuteSHUFP = [&VT, &DL, &DAG](SDValue Parent, SDValue V) {
41867 if (V.getOpcode() != X86ISD::SHUFP || !Parent->isOnlyUserOf(V.getNode()))
41868 return SDValue();
41869 SDValue N0 = V.getOperand(0);
41870 SDValue N1 = V.getOperand(1);
41871 unsigned Imm = V.getConstantOperandVal(2);
41872 const X86Subtarget &Subtarget = DAG.getSubtarget<X86Subtarget>();
41873 if (!X86::mayFoldLoad(peekThroughOneUseBitcasts(N0), Subtarget) ||
41874 X86::mayFoldLoad(peekThroughOneUseBitcasts(N1), Subtarget))
41875 return SDValue();
41876 Imm = ((Imm & 0x0F) << 4) | ((Imm & 0xF0) >> 4);
41877 return DAG.getNode(X86ISD::SHUFP, DL, VT, N1, N0,
41878 DAG.getTargetConstant(Imm, DL, MVT::i8));
41879 };
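// Note: within each 128-bit lane, SHUFP's immediate selects the two low result
// elements from the first operand (bits 3:0) and the two high result elements
// from the second operand (bits 7:4), so commuting the operands swaps the two
// nibbles. The commuted node yields the same elements with its 64-bit halves
// exchanged, which the callers below undo by toggling bit 1 of the affected
// 2-bit selectors in the outer shuffle immediate (the 0xAA/0x0A/0xA0 XORs).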
41880
41881 switch (N.getOpcode()) {
41882 case X86ISD::VPERMILPI:
41883 if (SDValue NewSHUFP = commuteSHUFP(N, N.getOperand(0))) {
41884 unsigned Imm = N.getConstantOperandVal(1);
41885 return DAG.getNode(X86ISD::VPERMILPI, DL, VT, NewSHUFP,
41886 DAG.getTargetConstant(Imm ^ 0xAA, DL, MVT::i8));
41887 }
41888 break;
41889 case X86ISD::SHUFP: {
41890 SDValue N0 = N.getOperand(0);
41891 SDValue N1 = N.getOperand(1);
41892 unsigned Imm = N.getConstantOperandVal(2);
41893 if (N0 == N1) {
41894 if (SDValue NewSHUFP = commuteSHUFP(N, N0))
41895 return DAG.getNode(X86ISD::SHUFP, DL, VT, NewSHUFP, NewSHUFP,
41896 DAG.getTargetConstant(Imm ^ 0xAA, DL, MVT::i8));
41897 } else if (SDValue NewSHUFP = commuteSHUFP(N, N0)) {
41898 return DAG.getNode(X86ISD::SHUFP, DL, VT, NewSHUFP, N1,
41899 DAG.getTargetConstant(Imm ^ 0x0A, DL, MVT::i8));
41900 } else if (SDValue NewSHUFP = commuteSHUFP(N, N1)) {
41901 return DAG.getNode(X86ISD::SHUFP, DL, VT, N0, NewSHUFP,
41902 DAG.getTargetConstant(Imm ^ 0xA0, DL, MVT::i8));
41903 }
41904 break;
41905 }
41906 }
41907
41908 return SDValue();
41909}
41910
41911// Attempt to fold BLEND(PERMUTE(X),PERMUTE(Y)) -> PERMUTE(BLEND(X,Y))
41912// iff we don't demand the same element index for both X and Y.
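// Example: BLEND(PSHUFD(X,{3,u,1,u}), PSHUFD(Y,{u,0,u,2})) with blend mask
// {0,5,2,7} demands elements {1,3} of X and {0,2} of Y (disjoint), so it can
// be rewritten as PERMUTE(BLEND(X,Y,{4,1,6,3}), {3,0,1,2}).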
41913static SDValue
41914 combineBlendOfPermutes(MVT VT, SDValue N0, SDValue N1, ArrayRef<int> BlendMask,
41915 const APInt &DemandedElts, SelectionDAG &DAG,
41916 const X86Subtarget &Subtarget, const SDLoc &DL) {
41917 assert(isBlendOrUndef(BlendMask) && "Blend shuffle expected");
41918 if (!N0.hasOneUse() || !N1.hasOneUse())
41919 return SDValue();
41920
41921 unsigned NumElts = VT.getVectorNumElements();
41922 SDValue BC0 = peekThroughOneUseBitcasts(N0);
41923 SDValue BC1 = peekThroughOneUseBitcasts(N1);
41924
41925 // See if both operands are shuffles, and that we can scale the shuffle masks
41926 // to the same width as the blend mask.
41927 // TODO: Support SM_SentinelZero?
41928 SmallVector<SDValue, 2> Ops0, Ops1;
41929 SmallVector<int, 32> Mask0, Mask1, ScaledMask0, ScaledMask1;
41930 if (!getTargetShuffleMask(BC0, /*AllowSentinelZero=*/false, Ops0, Mask0) ||
41931 !getTargetShuffleMask(BC1, /*AllowSentinelZero=*/false, Ops1, Mask1) ||
41932 !scaleShuffleElements(Mask0, NumElts, ScaledMask0) ||
41933 !scaleShuffleElements(Mask1, NumElts, ScaledMask1))
41934 return SDValue();
41935
41936 // Determine the demanded elts from both permutes.
41937 APInt Demanded0, DemandedLHS0, DemandedRHS0;
41938 APInt Demanded1, DemandedLHS1, DemandedRHS1;
41939 if (!getShuffleDemandedElts(NumElts, BlendMask, DemandedElts, Demanded0,
41940 Demanded1,
41941 /*AllowUndefElts=*/true) ||
41942 !getShuffleDemandedElts(NumElts, ScaledMask0, Demanded0, DemandedLHS0,
41943 DemandedRHS0, /*AllowUndefElts=*/true) ||
41944 !getShuffleDemandedElts(NumElts, ScaledMask1, Demanded1, DemandedLHS1,
41945 DemandedRHS1, /*AllowUndefElts=*/true))
41946 return SDValue();
41947
41948 // Confirm that we only use a single operand from both permutes and that we
41949 // don't demand the same index from both.
41950 if (!DemandedRHS0.isZero() || !DemandedRHS1.isZero() ||
41951 DemandedLHS0.intersects(DemandedLHS1))
41952 return SDValue();
41953
41954 // Use the permute demanded elts masks as the new blend mask.
41955 // Create the new permute mask as a blend of the 2 original permute masks.
41956 SmallVector<int, 32> NewBlendMask(NumElts, SM_SentinelUndef);
41957 SmallVector<int, 32> NewPermuteMask(NumElts, SM_SentinelUndef);
41958 for (unsigned I = 0; I != NumElts; ++I) {
41959 if (Demanded0[I]) {
41960 int M = ScaledMask0[I];
41961 if (0 <= M) {
41962 assert(isUndefOrEqual(NewBlendMask[M], M) &&
41963 "BlendMask demands LHS AND RHS");
41964 NewBlendMask[M] = M;
41965 NewPermuteMask[I] = M;
41966 }
41967 } else if (Demanded1[I]) {
41968 int M = ScaledMask1[I];
41969 if (0 <= M) {
41970 assert(isUndefOrEqual(NewBlendMask[M], M + NumElts) &&
41971 "BlendMask demands LHS AND RHS");
41972 NewBlendMask[M] = M + NumElts;
41973 NewPermuteMask[I] = M;
41974 }
41975 }
41976 }
41977 assert(isBlendOrUndef(NewBlendMask) && "Bad blend");
41978 assert(isUndefOrInRange(NewPermuteMask, 0, NumElts) && "Bad permute");
41979
41980 // v16i16 shuffles can explode in complexity very easily, only accept them if
41981 // the blend mask is the same in the 128-bit subvectors (or can widen to
41982 // v8i32) and the permute can be widened as well.
41983 if (VT == MVT::v16i16) {
41984 if (!is128BitLaneRepeatedShuffleMask(VT, NewBlendMask) &&
41985 !canWidenShuffleElements(NewBlendMask))
41986 return SDValue();
41987 if (!canWidenShuffleElements(NewPermuteMask))
41988 return SDValue();
41989 }
41990
41991 // Don't introduce lane-crossing permutes without AVX2, unless it can be
41992 // widened to a lane permute (vperm2f128).
41993 if (VT.is256BitVector() && !Subtarget.hasAVX2() &&
41994 isLaneCrossingShuffleMask(128, VT.getScalarSizeInBits(),
41995 NewPermuteMask) &&
41996 !canScaleShuffleElements(NewPermuteMask, 2))
41997 return SDValue();
41998
41999 SDValue NewBlend =
42000 DAG.getVectorShuffle(VT, DL, DAG.getBitcast(VT, Ops0[0]),
42001 DAG.getBitcast(VT, Ops1[0]), NewBlendMask);
42002 return DAG.getVectorShuffle(VT, DL, NewBlend, DAG.getUNDEF(VT),
42003 NewPermuteMask);
42004}
42005
42006// TODO - move this to TLI like isBinOp?
42007static bool isUnaryOp(unsigned Opcode) {
42008 switch (Opcode) {
42009 case ISD::CTLZ:
42010 case ISD::CTTZ:
42011 case ISD::CTPOP:
42012 return true;
42013 }
42014 return false;
42015}
42016
42017// Canonicalize SHUFFLE(UNARYOP(X)) -> UNARYOP(SHUFFLE(X)).
42018// Canonicalize SHUFFLE(BINOP(X,Y)) -> BINOP(SHUFFLE(X),SHUFFLE(Y)).
42019 static SDValue canonicalizeShuffleWithOp(SDValue N, SelectionDAG &DAG,
42020 const SDLoc &DL) {
42021 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
42022 EVT ShuffleVT = N.getValueType();
42023 unsigned Opc = N.getOpcode();
42024
42025 auto IsMergeableWithShuffle = [Opc, &DAG](SDValue Op, bool FoldShuf = true) {
42026 // AllZeros/AllOnes constants are freely shuffled and will peek through
42027 // bitcasts. Other constant build vectors do not peek through bitcasts. Only
42028 // merge with target shuffles if it has one use so shuffle combining is
42029 // likely to kick in. Shuffles of splats are expected to be removed.
42030 return ISD::isBuildVectorAllOnes(Op.getNode()) ||
42031 ISD::isBuildVectorAllZeros(Op.getNode()) ||
42032 ISD::isBuildVectorOfConstantSDNodes(Op.getNode()) ||
42033 ISD::isBuildVectorOfConstantFPSDNodes(Op.getNode()) ||
42034 getTargetConstantFromNode(dyn_cast<LoadSDNode>(Op)) ||
42035 (Op.getOpcode() == Opc && Op->hasOneUse()) ||
42036 (Op.getOpcode() == ISD::INSERT_SUBVECTOR && Op->hasOneUse()) ||
42037 (Op.getOpcode() == ISD::CONCAT_VECTORS && Op->hasOneUse()) ||
42038 (FoldShuf && isTargetShuffle(Op.getOpcode()) && Op->hasOneUse()) ||
42039 DAG.isSplatValue(Op, /*AllowUndefs*/ false);
42040 };
42041 auto IsSafeToMoveShuffle = [ShuffleVT](SDValue Op, unsigned BinOp) {
42042 // Ensure we only shuffle whole vector src elements, unless it's a logical
42043 // binop where we can more aggressively move shuffles from dst to src.
42044 return isLogicOp(BinOp) ||
42045 (Op.getScalarValueSizeInBits() <= ShuffleVT.getScalarSizeInBits());
42046 };
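// Bitwise logic ops act independently on every bit, so shuffling their result
// at any element granularity is the same as shuffling each source first; other
// binops must not have their source elements split or merged by the shuffle.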
42047
42048 switch (Opc) {
42049 // Unary and Unary+Permute Shuffles.
42050 case X86ISD::PSHUFB: {
42051 // Don't merge PSHUFB if it contains zero'd elements.
42052 SmallVector<int> Mask;
42053 SmallVector<SDValue> Ops;
42054 if (!getTargetShuffleMask(N, false, Ops, Mask))
42055 break;
42056 [[fallthrough]];
42057 }
42058 case X86ISD::VBROADCAST:
42059 case X86ISD::MOVDDUP:
42060 case X86ISD::PSHUFD:
42061 case X86ISD::PSHUFHW:
42062 case X86ISD::PSHUFLW:
42063 case X86ISD::VPERMV:
42064 case X86ISD::VPERMI:
42065 case X86ISD::VPERMILPI: {
42066 unsigned SrcIdx = Opc == X86ISD::VPERMV ? 1 : 0;
42067 if (N.getOperand(SrcIdx).getValueType() == ShuffleVT &&
42068 N->isOnlyUserOf(N.getOperand(SrcIdx).getNode())) {
42069 SDValue N0 = peekThroughOneUseBitcasts(N.getOperand(SrcIdx));
42070 unsigned SrcOpcode = N0.getOpcode();
42071 EVT OpVT = N0.getValueType();
42072 if (TLI.isBinOp(SrcOpcode) && IsSafeToMoveShuffle(N0, SrcOpcode)) {
42073 SDValue Op00 = peekThroughOneUseBitcasts(N0.getOperand(0));
42074 SDValue Op01 = peekThroughOneUseBitcasts(N0.getOperand(1));
42075 bool FoldShuf = Opc != X86ISD::VPERMI && Opc != X86ISD::VPERMV;
42076 if (IsMergeableWithShuffle(Op00, FoldShuf) ||
42077 IsMergeableWithShuffle(Op01, FoldShuf)) {
42078 SDValue LHS, RHS;
42079 Op00 = DAG.getBitcast(ShuffleVT, Op00);
42080 Op01 = DAG.getBitcast(ShuffleVT, Op01);
42081 if (Opc == X86ISD::VPERMV) {
42082 LHS = DAG.getNode(Opc, DL, ShuffleVT, N.getOperand(0), Op00);
42083 RHS = DAG.getNode(Opc, DL, ShuffleVT, N.getOperand(0), Op01);
42084 } else if (N.getNumOperands() == 2) {
42085 LHS = DAG.getNode(Opc, DL, ShuffleVT, Op00, N.getOperand(1));
42086 RHS = DAG.getNode(Opc, DL, ShuffleVT, Op01, N.getOperand(1));
42087 } else {
42088 LHS = DAG.getNode(Opc, DL, ShuffleVT, Op00);
42089 RHS = DAG.getNode(Opc, DL, ShuffleVT, Op01);
42090 }
42091 return DAG.getBitcast(ShuffleVT,
42092 DAG.getNode(SrcOpcode, DL, OpVT,
42093 DAG.getBitcast(OpVT, LHS),
42094 DAG.getBitcast(OpVT, RHS)));
42095 }
42096 }
42097 if (SrcOpcode == ISD::SINT_TO_FP && IsSafeToMoveShuffle(N0, SrcOpcode) &&
42098 OpVT.getScalarSizeInBits() ==
42099 N0.getOperand(0).getScalarValueSizeInBits()) {
42100 SDValue Res = DAG.getBitcast(ShuffleVT, N0.getOperand(0));
42101 if (Opc == X86ISD::VPERMV)
42102 Res = DAG.getNode(Opc, DL, ShuffleVT, N.getOperand(0), Res);
42103 else if (N.getNumOperands() == 2)
42104 Res = DAG.getNode(Opc, DL, ShuffleVT, Res, N.getOperand(1));
42105 else
42106 Res = DAG.getNode(Opc, DL, ShuffleVT, Res);
42107 Res = DAG.getBitcast(N0.getOperand(0).getValueType(), Res);
42108 return DAG.getBitcast(ShuffleVT, DAG.getNode(SrcOpcode, DL, OpVT, Res));
42109 }
42110 }
42111 break;
42112 }
42113 // Binary and Binary+Permute Shuffles.
42114 case X86ISD::INSERTPS: {
42115 // Don't merge INSERTPS if it contains zero'd elements.
42116 unsigned InsertPSMask = N.getConstantOperandVal(2);
42117 unsigned ZeroMask = InsertPSMask & 0xF;
42118 if (ZeroMask != 0)
42119 break;
42120 [[fallthrough]];
42121 }
42122 case X86ISD::MOVSD:
42123 case X86ISD::MOVSS:
42124 case X86ISD::BLENDI:
42125 case X86ISD::SHUFP:
42126 case X86ISD::UNPCKH:
42127 case X86ISD::UNPCKL: {
42128 if (N->isOnlyUserOf(N.getOperand(0).getNode()) &&
42129 N->isOnlyUserOf(N.getOperand(1).getNode())) {
42130 SDValue N0 = peekThroughOneUseBitcasts(N.getOperand(0));
42131 SDValue N1 = peekThroughOneUseBitcasts(N.getOperand(1));
42132 unsigned SrcOpcode = N0.getOpcode();
42133 if (TLI.isBinOp(SrcOpcode) && N1.getOpcode() == SrcOpcode &&
42134 N0.getValueType() == N1.getValueType() &&
42135 IsSafeToMoveShuffle(N0, SrcOpcode) &&
42136 IsSafeToMoveShuffle(N1, SrcOpcode)) {
42137 SDValue Op00 = peekThroughOneUseBitcasts(N0.getOperand(0));
42138 SDValue Op10 = peekThroughOneUseBitcasts(N1.getOperand(0));
42139 SDValue Op01 = peekThroughOneUseBitcasts(N0.getOperand(1));
42140 SDValue Op11 = peekThroughOneUseBitcasts(N1.getOperand(1));
42141 // Ensure the total number of shuffles doesn't increase by folding this
42142 // shuffle through to the source ops.
42143 if (((IsMergeableWithShuffle(Op00) && IsMergeableWithShuffle(Op10)) ||
42144 (IsMergeableWithShuffle(Op01) && IsMergeableWithShuffle(Op11))) ||
42145 ((IsMergeableWithShuffle(Op00) || IsMergeableWithShuffle(Op10)) &&
42146 (IsMergeableWithShuffle(Op01) || IsMergeableWithShuffle(Op11)))) {
42147 SDValue LHS, RHS;
42148 Op00 = DAG.getBitcast(ShuffleVT, Op00);
42149 Op10 = DAG.getBitcast(ShuffleVT, Op10);
42150 Op01 = DAG.getBitcast(ShuffleVT, Op01);
42151 Op11 = DAG.getBitcast(ShuffleVT, Op11);
42152 if (N.getNumOperands() == 3) {
42153 LHS = DAG.getNode(Opc, DL, ShuffleVT, Op00, Op10, N.getOperand(2));
42154 RHS = DAG.getNode(Opc, DL, ShuffleVT, Op01, Op11, N.getOperand(2));
42155 } else {
42156 LHS = DAG.getNode(Opc, DL, ShuffleVT, Op00, Op10);
42157 RHS = DAG.getNode(Opc, DL, ShuffleVT, Op01, Op11);
42158 }
42159 EVT OpVT = N0.getValueType();
42160 return DAG.getBitcast(ShuffleVT,
42161 DAG.getNode(SrcOpcode, DL, OpVT,
42162 DAG.getBitcast(OpVT, LHS),
42163 DAG.getBitcast(OpVT, RHS)));
42164 }
42165 }
42166 if (isUnaryOp(SrcOpcode) && N1.getOpcode() == SrcOpcode &&
42167 N0.getValueType() == N1.getValueType() &&
42168 IsSafeToMoveShuffle(N0, SrcOpcode) &&
42169 IsSafeToMoveShuffle(N1, SrcOpcode)) {
42170 SDValue Op00 = peekThroughOneUseBitcasts(N0.getOperand(0));
42171 SDValue Op10 = peekThroughOneUseBitcasts(N1.getOperand(0));
42172 SDValue Res;
42173 Op00 = DAG.getBitcast(ShuffleVT, Op00);
42174 Op10 = DAG.getBitcast(ShuffleVT, Op10);
42175 if (N.getNumOperands() == 3) {
42176 Res = DAG.getNode(Opc, DL, ShuffleVT, Op00, Op10, N.getOperand(2));
42177 } else {
42178 Res = DAG.getNode(Opc, DL, ShuffleVT, Op00, Op10);
42179 }
42180 EVT OpVT = N0.getValueType();
42181 return DAG.getBitcast(
42182 ShuffleVT,
42183 DAG.getNode(SrcOpcode, DL, OpVT, DAG.getBitcast(OpVT, Res)));
42184 }
42185 // TODO: We can generalize this for other shuffles/conversions.
42186 if (Opc == X86ISD::UNPCKL && SrcOpcode == X86ISD::CVTPH2PS &&
42187 N1.getOpcode() == SrcOpcode &&
42188 N0.getValueType() == N1.getValueType() &&
42189 N0.getOperand(0).getValueType() == N1.getOperand(0).getValueType() &&
42190 ShuffleVT.getScalarSizeInBits() == N0.getScalarValueSizeInBits() &&
42191 IsSafeToMoveShuffle(N0, SrcOpcode) &&
42192 IsSafeToMoveShuffle(N1, SrcOpcode)) {
42193 EVT OpSrcVT = N0.getOperand(0).getValueType();
42194 EVT OpDstVT = N0.getValueType();
42195 SDValue Res =
42196 DAG.getNode(Opc, DL, OpSrcVT, N0.getOperand(0), N1.getOperand(0));
42197 return DAG.getBitcast(ShuffleVT,
42198 DAG.getNode(SrcOpcode, DL, OpDstVT, Res));
42199 }
42200 }
42201 break;
42202 }
42203 }
42204 return SDValue();
42205}
42206
42207/// Attempt to fold vpermf128(op(),op()) -> op(vpermf128(),vpermf128()).
42208 static SDValue canonicalizeLaneShuffleWithRepeatedOps(SDValue V,
42209 SelectionDAG &DAG,
42210 const SDLoc &DL) {
42211 assert(V.getOpcode() == X86ISD::VPERM2X128 && "Unknown lane shuffle");
42212
42213 MVT VT = V.getSimpleValueType();
42214 SDValue Src0 = peekThroughBitcasts(V.getOperand(0));
42215 SDValue Src1 = peekThroughBitcasts(V.getOperand(1));
42216 unsigned SrcOpc0 = Src0.getOpcode();
42217 unsigned SrcOpc1 = Src1.getOpcode();
42218 EVT SrcVT0 = Src0.getValueType();
42219 EVT SrcVT1 = Src1.getValueType();
42220
42221 if (!Src1.isUndef() && (SrcVT0 != SrcVT1 || SrcOpc0 != SrcOpc1))
42222 return SDValue();
42223
42224 switch (SrcOpc0) {
42225 case X86ISD::MOVDDUP: {
42226 SDValue LHS = Src0.getOperand(0);
42227 SDValue RHS = Src1.isUndef() ? Src1 : Src1.getOperand(0);
42228 SDValue Res =
42229 DAG.getNode(X86ISD::VPERM2X128, DL, SrcVT0, LHS, RHS, V.getOperand(2));
42230 Res = DAG.getNode(SrcOpc0, DL, SrcVT0, Res);
42231 return DAG.getBitcast(VT, Res);
42232 }
42233 case X86ISD::VPERMILPI:
42234 // TODO: Handle v4f64 permutes with different low/high lane masks.
42235 if (SrcVT0 == MVT::v4f64) {
42236 uint64_t Mask = Src0.getConstantOperandVal(1);
42237 if ((Mask & 0x3) != ((Mask >> 2) & 0x3))
42238 break;
42239 }
42240 [[fallthrough]];
42241 case X86ISD::VSHLI:
42242 case X86ISD::VSRLI:
42243 case X86ISD::VSRAI:
42244 case X86ISD::PSHUFD:
42245 if (Src1.isUndef() || Src0.getOperand(1) == Src1.getOperand(1)) {
42246 SDValue LHS = Src0.getOperand(0);
42247 SDValue RHS = Src1.isUndef() ? Src1 : Src1.getOperand(0);
42248 SDValue Res = DAG.getNode(X86ISD::VPERM2X128, DL, SrcVT0, LHS, RHS,
42249 V.getOperand(2));
42250 Res = DAG.getNode(SrcOpc0, DL, SrcVT0, Res, Src0.getOperand(1));
42251 return DAG.getBitcast(VT, Res);
42252 }
42253 break;
42254 }
42255
42256 return SDValue();
42257}
42258
42259/// Try to combine x86 target specific shuffles.
42260 static SDValue combineTargetShuffle(SDValue N, const SDLoc &DL,
42261 SelectionDAG &DAG,
42262 TargetLowering::DAGCombinerInfo &DCI,
42263 const X86Subtarget &Subtarget) {
42264 using namespace SDPatternMatch;
42265
42266 MVT VT = N.getSimpleValueType();
42267 unsigned NumElts = VT.getVectorNumElements();
42268 SmallVector<int, 4> Mask;
42269 unsigned Opcode = N.getOpcode();
42270 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
42271
42272 if (SDValue R = combineCommutableSHUFP(N, VT, DL, DAG))
42273 return R;
42274
42275 // Handle specific target shuffles.
42276 switch (Opcode) {
42277 case X86ISD::MOVDDUP: {
42278 SDValue Src = N.getOperand(0);
42279 // Turn a 128-bit MOVDDUP of a full vector load into movddup+vzload.
42280 if (VT == MVT::v2f64 && Src.hasOneUse() &&
42281 ISD::isNormalLoad(Src.getNode())) {
42282 LoadSDNode *LN = cast<LoadSDNode>(Src);
42283 if (SDValue VZLoad = narrowLoadToVZLoad(LN, MVT::f64, MVT::v2f64, DAG)) {
42284 SDValue Movddup = DAG.getNode(X86ISD::MOVDDUP, DL, MVT::v2f64, VZLoad);
42285 DCI.CombineTo(N.getNode(), Movddup);
42286 DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), VZLoad.getValue(1));
42287 DCI.recursivelyDeleteUnusedNodes(LN);
42288 return N; // Return N so it doesn't get rechecked!
42289 }
42290 }
42291
42292 return SDValue();
42293 }
42294 case X86ISD::VBROADCAST: {
42295 SDValue Src = N.getOperand(0);
42296 SDValue BC = peekThroughBitcasts(Src);
42297 EVT SrcVT = Src.getValueType();
42298 EVT BCVT = BC.getValueType();
42299
42300 // If broadcasting from another shuffle, attempt to simplify it.
42301 // TODO - we really need a general SimplifyDemandedVectorElts mechanism.
42302 if (isTargetShuffle(BC.getOpcode()) &&
42303 VT.getScalarSizeInBits() % BCVT.getScalarSizeInBits() == 0) {
42304 unsigned Scale = VT.getScalarSizeInBits() / BCVT.getScalarSizeInBits();
42305 SmallVector<int, 16> DemandedMask(BCVT.getVectorNumElements(),
42306 SM_SentinelUndef);
42307 for (unsigned i = 0; i != Scale; ++i)
42308 DemandedMask[i] = i;
42309 if (SDValue Res = combineX86ShufflesRecursively(
42310 {BC}, 0, BC.getOpcode(), BC.getSimpleValueType(), DemandedMask,
42311 {}, /*Depth=*/0, X86::MaxShuffleCombineDepth,
42312 /*AllowVariableCrossLaneMask=*/true,
42313 /*AllowVariablePerLaneMask=*/true,
42314 /*IsMaskedShuffle=*/false, DAG, DL, Subtarget))
42315 return DAG.getNode(X86ISD::VBROADCAST, DL, VT,
42316 DAG.getBitcast(SrcVT, Res));
42317 }
42318
42319 // broadcast(bitcast(src)) -> bitcast(broadcast(src))
42320 // 32-bit targets have to bitcast i64 to f64, so it's better to bitcast upward.
42321 if (Src.getOpcode() == ISD::BITCAST &&
42322 SrcVT.getScalarSizeInBits() == BCVT.getScalarSizeInBits() &&
42323 TLI.isTypeLegal(BCVT) &&
42324 FixedVectorType::isValidElementType(
42325 BCVT.getScalarType().getTypeForEVT(*DAG.getContext()))) {
42326 EVT NewVT = EVT::getVectorVT(*DAG.getContext(), BCVT.getScalarType(),
42327 VT.getVectorNumElements());
42328 return DAG.getBitcast(VT, DAG.getNode(X86ISD::VBROADCAST, DL, NewVT, BC));
42329 }
42330
42331 // vbroadcast(bitcast(vbroadcast(src))) -> bitcast(vbroadcast(src))
42332 // If we're re-broadcasting a smaller type then broadcast with that type and
42333 // bitcast.
42334 // TODO: Do this for any splat?
42335 if (Src.getOpcode() == ISD::BITCAST &&
42336 (BC.getOpcode() == X86ISD::VBROADCAST ||
42337 BC.getOpcode() == X86ISD::VBROADCAST_LOAD) &&
42338 (VT.getScalarSizeInBits() % BCVT.getScalarSizeInBits()) == 0 &&
42339 (VT.getSizeInBits() % BCVT.getSizeInBits()) == 0) {
42340 MVT NewVT =
42341 MVT::getVectorVT(BCVT.getSimpleVT().getScalarType(),
42342 VT.getSizeInBits() / BCVT.getScalarSizeInBits());
42343 return DAG.getBitcast(VT, DAG.getNode(X86ISD::VBROADCAST, DL, NewVT, BC));
42344 }
42345
42346 // Reduce broadcast source vector to lowest 128-bits.
42347 if (SrcVT.getSizeInBits() > 128)
42348 return DAG.getNode(X86ISD::VBROADCAST, DL, VT,
42349 extract128BitVector(Src, 0, DAG, DL));
42350
42351 // broadcast(scalar_to_vector(x)) -> broadcast(x).
42352 if (Src.getOpcode() == ISD::SCALAR_TO_VECTOR &&
42353 Src.getValueType().getScalarType() == Src.getOperand(0).getValueType())
42354 return DAG.getNode(X86ISD::VBROADCAST, DL, VT, Src.getOperand(0));
42355
42356 // broadcast(extract_vector_elt(x, 0)) -> broadcast(x).
42357 if (Src.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
42358 isNullConstant(Src.getOperand(1)) &&
42359 Src.getValueType() ==
42360 Src.getOperand(0).getValueType().getScalarType() &&
42361 TLI.isTypeLegal(Src.getOperand(0).getValueType()))
42362 return DAG.getNode(X86ISD::VBROADCAST, DL, VT, Src.getOperand(0));
42363
42364 // Share broadcast with the longest vector and extract low subvector (free).
42365 // Ensure the same SDValue from the SDNode use is being used.
42366 for (SDNode *User : Src->users())
42367 if (User != N.getNode() && User->getOpcode() == X86ISD::VBROADCAST &&
42368 Src == User->getOperand(0) &&
42369 User->getValueSizeInBits(0).getFixedValue() >
42370 VT.getFixedSizeInBits()) {
42371 return extractSubVector(SDValue(User, 0), 0, DAG, DL,
42372 VT.getSizeInBits());
42373 }
42374
42375 // vbroadcast(scalarload X) -> vbroadcast_load X
42376 // For float loads, extract other uses of the scalar from the broadcast.
42377 if (!SrcVT.isVector() && (Src.hasOneUse() || VT.isFloatingPoint()) &&
42378 ISD::isNormalLoad(Src.getNode())) {
42379 LoadSDNode *LN = cast<LoadSDNode>(Src);
42380 SDVTList Tys = DAG.getVTList(VT, MVT::Other);
42381 SDValue Ops[] = { LN->getChain(), LN->getBasePtr() };
42382 SDValue BcastLd =
42383 DAG.getMemIntrinsicNode(X86ISD::VBROADCAST_LOAD, DL, Tys, Ops,
42384 LN->getMemoryVT(), LN->getMemOperand());
42385 // If the load value is used only by N, replace it via CombineTo N.
42386 bool NoReplaceExtract = Src.hasOneUse();
42387 DCI.CombineTo(N.getNode(), BcastLd);
42388 if (NoReplaceExtract) {
42389 DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), BcastLd.getValue(1));
42390 DCI.recursivelyDeleteUnusedNodes(LN);
42391 } else {
42392 SDValue Scl = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, SrcVT, BcastLd,
42393 DAG.getVectorIdxConstant(0, DL));
42394 DCI.CombineTo(LN, Scl, BcastLd.getValue(1));
42395 }
42396 return N; // Return N so it doesn't get rechecked!
42397 }
42398
42399 // Due to isTypeDesirableForOp, we won't always shrink a load truncated to
42400 // i16. So shrink it ourselves if we can make a broadcast_load.
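// E.g. (vbroadcast (i16 (trunc (i32 (load p))))) can instead broadcast from a
// 16-bit load of p, dropping both the wider load and the truncate.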
42401 if (SrcVT == MVT::i16 && Src.getOpcode() == ISD::TRUNCATE &&
42402 Src.hasOneUse() && Src.getOperand(0).hasOneUse()) {
42403 assert(Subtarget.hasAVX2() && "Expected AVX2");
42404 SDValue TruncIn = Src.getOperand(0);
42405
42406 // If this is a truncate of a non-extending load, we can just narrow it to
42407 // use a broadcast_load.
42408 if (ISD::isNormalLoad(TruncIn.getNode())) {
42409 LoadSDNode *LN = cast<LoadSDNode>(TruncIn);
42410 // Unless it's volatile or atomic.
42411 if (LN->isSimple()) {
42412 SDVTList Tys = DAG.getVTList(VT, MVT::Other);
42413 SDValue Ops[] = { LN->getChain(), LN->getBasePtr() };
42414 SDValue BcastLd = DAG.getMemIntrinsicNode(
42415 X86ISD::VBROADCAST_LOAD, DL, Tys, Ops, MVT::i16,
42416 LN->getPointerInfo(), LN->getBaseAlign(),
42417 LN->getMemOperand()->getFlags());
42418 DCI.CombineTo(N.getNode(), BcastLd);
42419 DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), BcastLd.getValue(1));
42420 DCI.recursivelyDeleteUnusedNodes(Src.getNode());
42421 return N; // Return N so it doesn't get rechecked!
42422 }
42423 }
42424
42425 // If this is a truncate of an i16 extload, we can directly replace it.
42426 if (ISD::isUNINDEXEDLoad(Src.getOperand(0).getNode()) &&
42427 ISD::isEXTLoad(Src.getOperand(0).getNode())) {
42428 LoadSDNode *LN = cast<LoadSDNode>(Src.getOperand(0));
42429 if (LN->getMemoryVT().getSizeInBits() == 16) {
42430 SDVTList Tys = DAG.getVTList(VT, MVT::Other);
42431 SDValue Ops[] = { LN->getChain(), LN->getBasePtr() };
42432 SDValue BcastLd =
42433 DAG.getMemIntrinsicNode(X86ISD::VBROADCAST_LOAD, DL, Tys, Ops,
42434 LN->getMemoryVT(), LN->getMemOperand());
42435 DCI.CombineTo(N.getNode(), BcastLd);
42436 DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), BcastLd.getValue(1));
42437 DCI.recursivelyDeleteUnusedNodes(Src.getNode());
42438 return N; // Return N so it doesn't get rechecked!
42439 }
42440 }
42441
42442 // If this is a truncate of a load that has been shifted right, we can
42443 // offset the pointer and use a narrower load.
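// E.g. (i16 (trunc (srl (i64 (load p)), 32))) is the 16 bits at byte offset 4,
// so broadcast from a 16-bit load of p+4 instead.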
42444 if (TruncIn.getOpcode() == ISD::SRL &&
42445 TruncIn.getOperand(0).hasOneUse() &&
42446 isa<ConstantSDNode>(TruncIn.getOperand(1)) &&
42447 ISD::isNormalLoad(TruncIn.getOperand(0).getNode())) {
42448 LoadSDNode *LN = cast<LoadSDNode>(TruncIn.getOperand(0));
42449 unsigned ShiftAmt = TruncIn.getConstantOperandVal(1);
42450 // Make sure the shift amount and the load size are divisible by 16.
42451 // Don't do this if the load is volatile or atomic.
42452 if (ShiftAmt % 16 == 0 && TruncIn.getValueSizeInBits() % 16 == 0 &&
42453 LN->isSimple()) {
42454 unsigned Offset = ShiftAmt / 8;
42455 SDVTList Tys = DAG.getVTList(VT, MVT::Other);
42456 SDValue Ptr = DAG.getMemBasePlusOffset(LN->getBasePtr(),
42457 TypeSize::getFixed(Offset), DL);
42458 SDValue Ops[] = { LN->getChain(), Ptr };
42459 SDValue BcastLd = DAG.getMemIntrinsicNode(
42460 X86ISD::VBROADCAST_LOAD, DL, Tys, Ops, MVT::i16,
42461 LN->getPointerInfo().getWithOffset(Offset), LN->getBaseAlign(),
42462 LN->getMemOperand()->getFlags());
42463 DCI.CombineTo(N.getNode(), BcastLd);
42464 DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), BcastLd.getValue(1));
42465 DCI.recursivelyDeleteUnusedNodes(Src.getNode());
42466 return N; // Return N so it doesn't get rechecked!
42467 }
42468 }
42469 }
42470
42471 // vbroadcast(vzload X) -> vbroadcast_load X
42472 if (Src.getOpcode() == X86ISD::VZEXT_LOAD && Src.hasOneUse()) {
42473 auto *LN = cast<MemSDNode>(Src);
42474 if (LN->getMemoryVT().getSizeInBits() == VT.getScalarSizeInBits()) {
42475 SDVTList Tys = DAG.getVTList(VT, MVT::Other);
42476 SDValue Ops[] = { LN->getChain(), LN->getBasePtr() };
42477 SDValue BcastLd =
42478 DAG.getMemIntrinsicNode(X86ISD::VBROADCAST_LOAD, DL, Tys, Ops,
42479 LN->getMemoryVT(), LN->getMemOperand());
42480 DCI.CombineTo(N.getNode(), BcastLd);
42481 DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), BcastLd.getValue(1));
42482 DCI.recursivelyDeleteUnusedNodes(LN);
42483 return N; // Return N so it doesn't get rechecked!
42484 }
42485 }
42486
42487 // vbroadcast(vector load X) -> vbroadcast_load
42488 if (Src.hasOneUse() && ISD::isNormalLoad(Src.getNode())) {
42489 LoadSDNode *LN = cast<LoadSDNode>(Src);
42490 // Unless the load is volatile or atomic.
42491 if (LN->isSimple()) {
42492 SDVTList Tys = DAG.getVTList(VT, MVT::Other);
42493 SDValue Ops[] = {LN->getChain(), LN->getBasePtr()};
42494 SDValue BcastLd = DAG.getMemIntrinsicNode(
42495 X86ISD::VBROADCAST_LOAD, DL, Tys, Ops, LN->getMemoryVT(),
42496 LN->getPointerInfo(), LN->getBaseAlign(),
42497 LN->getMemOperand()->getFlags());
42498 DCI.CombineTo(N.getNode(), BcastLd);
42499 DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), BcastLd.getValue(1));
42500 DCI.recursivelyDeleteUnusedNodes(LN);
42501 return N; // Return N so it doesn't get rechecked!
42502 }
42503 }
42504
42505 return SDValue();
42506 }
42507 case X86ISD::VZEXT_MOVL: {
42508 SDValue N0 = N.getOperand(0);
42509
42510 // Fold (vzmovl (shift x, y)) -> (shift (vzmovl x), y)
42511 // Zeroing out the upper elements means we're just shifting a zero value.
42512 // TODO: Try harder to move vzmovl upward towards SCALAR_TO_VECTOR nodes.
42513 // TODO: Move this to canonicalizeShuffleWithOp once we add zero handling.
42514 if (N0.getOpcode() == X86ISD::VSHL || N0.getOpcode() == X86ISD::VSHLI ||
42515 N0.getOpcode() == X86ISD::VSRL || N0.getOpcode() == X86ISD::VSRLI ||
42516 N0.getOpcode() == X86ISD::VSRA || N0.getOpcode() == X86ISD::VSRAI) {
42517 if (N0.hasOneUse())
42518 return DAG.getNode(
42519 N0.getOpcode(), DL, VT,
42520 DAG.getNode(X86ISD::VZEXT_MOVL, DL, VT, N0.getOperand(0)),
42521 N0.getOperand(1));
42522 }
42523
42524 // If this is a vzmovl of a full vector load, replace it with a vzload, unless
42525 // the load is volatile.
42526 if (N0.hasOneUse() && ISD::isNormalLoad(N0.getNode())) {
42527 auto *LN = cast<LoadSDNode>(N0);
42528 if (SDValue VZLoad =
42529 narrowLoadToVZLoad(LN, VT.getVectorElementType(), VT, DAG)) {
42530 DCI.CombineTo(N.getNode(), VZLoad);
42531 DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), VZLoad.getValue(1));
42532 DCI.recursivelyDeleteUnusedNodes(LN);
42533 return N;
42534 }
42535 }
42536
42537 // If this is a VZEXT_MOVL of a VBROADCAST_LOAD, we don't need the broadcast
42538 // and can just use a VZEXT_LOAD.
42539 // FIXME: Is there some way to do this with SimplifyDemandedVectorElts?
42540 if (N0.hasOneUse() && N0.getOpcode() == X86ISD::VBROADCAST_LOAD) {
42541 auto *LN = cast<MemSDNode>(N0);
42542 if (VT.getScalarSizeInBits() == LN->getMemoryVT().getSizeInBits()) {
42543 SDVTList Tys = DAG.getVTList(VT, MVT::Other);
42544 SDValue Ops[] = {LN->getChain(), LN->getBasePtr()};
42545 SDValue VZLoad =
42546 DAG.getMemIntrinsicNode(X86ISD::VZEXT_LOAD, DL, Tys, Ops,
42547 LN->getMemoryVT(), LN->getMemOperand());
42548 DCI.CombineTo(N.getNode(), VZLoad);
42549 DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), VZLoad.getValue(1));
42550 DCI.recursivelyDeleteUnusedNodes(LN);
42551 return N;
42552 }
42553 }
42554
42555 // Turn (v2i64 (vzext_movl (scalar_to_vector (i64 X)))) into
42556 // (v2i64 (bitcast (v4i32 (vzext_movl (scalar_to_vector (i32 (trunc X)))))))
42557 // if the upper bits of the i64 are zero.
42558 if (N0.hasOneUse() && N0.getOpcode() == ISD::SCALAR_TO_VECTOR &&
42559 N0.getOperand(0).hasOneUse() &&
42560 N0.getOperand(0).getValueType() == MVT::i64) {
42561 SDValue In = N0.getOperand(0);
42562 APInt Mask = APInt::getHighBitsSet(64, 32);
42563 if (DAG.MaskedValueIsZero(In, Mask)) {
42564 SDValue Trunc = DAG.getNode(ISD::TRUNCATE, DL, MVT::i32, In);
42565 MVT VecVT = MVT::getVectorVT(MVT::i32, NumElts * 2);
42566 SDValue SclVec = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecVT, Trunc);
42567 SDValue Movl = DAG.getNode(X86ISD::VZEXT_MOVL, DL, VecVT, SclVec);
42568 return DAG.getBitcast(VT, Movl);
42569 }
42570 }
42571
42572 // Load a scalar integer constant directly to XMM instead of transferring an
42573 // immediate value from GPR.
42574 // vzext_movl (scalar_to_vector C) --> load [C,0...]
42575 if (N0.getOpcode() == ISD::SCALAR_TO_VECTOR) {
42576 if (auto *C = dyn_cast<ConstantSDNode>(N0.getOperand(0))) {
42577 // Create a vector constant - scalar constant followed by zeros.
42578 EVT ScalarVT = N0.getOperand(0).getValueType();
42579 Type *ScalarTy = ScalarVT.getTypeForEVT(*DAG.getContext());
42580 Constant *Zero = ConstantInt::getNullValue(ScalarTy);
42581 SmallVector<Constant *, 32> ConstantVec(NumElts, Zero);
42582 ConstantVec[0] = const_cast<ConstantInt *>(C->getConstantIntValue());
42583
42584 // Load the vector constant from constant pool.
42585 MVT PVT = TLI.getPointerTy(DAG.getDataLayout());
42586 SDValue CP = DAG.getConstantPool(ConstantVector::get(ConstantVec), PVT);
42587 MachinePointerInfo MPI =
42588 MachinePointerInfo::getConstantPool(DAG.getMachineFunction());
42589 Align Alignment = cast<ConstantPoolSDNode>(CP)->getAlign();
42590 return DAG.getLoad(VT, DL, DAG.getEntryNode(), CP, MPI, Alignment,
42591 MachineMemOperand::MOLoad);
42592 }
42593 }
42594
42595 // Pull subvector inserts into undef through VZEXT_MOVL by making it an
42596 // insert into a zero vector. This helps get VZEXT_MOVL closer to
42597 // scalar_to_vectors where 256/512 are canonicalized to an insert and a
42598 // 128-bit scalar_to_vector. This reduces the number of isel patterns.
42599 if (!DCI.isBeforeLegalizeOps() && N0.hasOneUse()) {
42600 SDValue V = peekThroughOneUseBitcasts(N0);
42601
42602 if (V.getOpcode() == ISD::INSERT_SUBVECTOR && V.getOperand(0).isUndef() &&
42603 isNullConstant(V.getOperand(2))) {
42604 SDValue In = V.getOperand(1);
42605 MVT SubVT = MVT::getVectorVT(VT.getVectorElementType(),
42606 In.getValueSizeInBits() /
42607 VT.getScalarSizeInBits());
42608 In = DAG.getBitcast(SubVT, In);
42609 SDValue Movl = DAG.getNode(X86ISD::VZEXT_MOVL, DL, SubVT, In);
42610 return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT,
42611 getZeroVector(VT, Subtarget, DAG, DL), Movl,
42612 V.getOperand(2));
42613 }
42614 }
42615
42616 return SDValue();
42617 }
42618 case X86ISD::BLENDI: {
42619 SDValue N0 = N.getOperand(0);
42620 SDValue N1 = N.getOperand(1);
42621 unsigned EltBits = VT.getScalarSizeInBits();
42622
42623 if (N0.getOpcode() == ISD::BITCAST && N1.getOpcode() == ISD::BITCAST) {
42624 // blend(bitcast(x),bitcast(y)) -> bitcast(blend(x,y)) to narrower types.
42625 // TODO: Handle MVT::v16i16 repeated blend mask.
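// E.g. a v4i64 blend of two v8i32 bitcasts with mask 0b0101 becomes a v8i32
// blend, each original mask bit duplicated for the two i32 halves: 0b00110011.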
42626 if (N0.getOperand(0).getValueType() == N1.getOperand(0).getValueType()) {
42627 MVT SrcVT = N0.getOperand(0).getSimpleValueType();
42628 unsigned SrcBits = SrcVT.getScalarSizeInBits();
42629 if ((EltBits % SrcBits) == 0 && SrcBits >= 32) {
42630 unsigned NewSize = SrcVT.getVectorNumElements();
42631 APInt BlendMask = getBLENDIBlendMask(N);
42632 APInt NewBlendMask = APIntOps::ScaleBitMask(BlendMask, NewSize);
42633 return DAG.getBitcast(
42634 VT, DAG.getNode(X86ISD::BLENDI, DL, SrcVT, N0.getOperand(0),
42635 N1.getOperand(0),
42636 DAG.getTargetConstant(NewBlendMask.getZExtValue(),
42637 DL, MVT::i8)));
42638 }
42639 }
42640 // Share PSHUFB masks:
42641 // blend(pshufb(x,m1),pshufb(y,m2))
42642 // --> m3 = blend(m1,m2)
42643 // blend(pshufb(x,m3),pshufb(y,m3))
42644 if (N0.hasOneUse() && N1.hasOneUse()) {
42645 SmallVector<int> Mask, ByteMask;
42646 SmallVector<SDValue> Ops;
42647 SDValue LHS = peekThroughOneUseBitcasts(N0);
42648 SDValue RHS = peekThroughOneUseBitcasts(N1);
42649 if (LHS.getOpcode() == X86ISD::PSHUFB &&
42650 RHS.getOpcode() == X86ISD::PSHUFB &&
42651 LHS.getOperand(1) != RHS.getOperand(1) &&
42652 LHS.getOperand(1).hasOneUse() && RHS.getOperand(1).hasOneUse() &&
42653 getTargetShuffleMask(N, /*AllowSentinelZero=*/false, Ops, Mask)) {
42654 assert(Ops.size() == 2 && LHS == peekThroughOneUseBitcasts(Ops[0]) &&
42655 RHS == peekThroughOneUseBitcasts(Ops[1]) &&
42656 "BLENDI decode mismatch");
42657 MVT ShufVT = LHS.getSimpleValueType();
42658 SDValue MaskLHS = LHS.getOperand(1);
42659 SDValue MaskRHS = RHS.getOperand(1);
42660 llvm::narrowShuffleMaskElts(EltBits / 8, Mask, ByteMask);
42661 if (SDValue NewMask = combineX86ShufflesConstants(
42662 ShufVT, {MaskLHS, MaskRHS}, ByteMask,
42663 {LHS.getNode(), RHS.getNode()}, DAG, DL, Subtarget)) {
42664 SDValue NewLHS = DAG.getNode(X86ISD::PSHUFB, DL, ShufVT,
42665 LHS.getOperand(0), NewMask);
42666 SDValue NewRHS = DAG.getNode(X86ISD::PSHUFB, DL, ShufVT,
42667 RHS.getOperand(0), NewMask);
42668 return DAG.getNode(X86ISD::BLENDI, DL, VT,
42669 DAG.getBitcast(VT, NewLHS),
42670 DAG.getBitcast(VT, NewRHS), N.getOperand(2));
42671 }
42672 }
42673 }
42674 }
42675 return SDValue();
42676 }
42677 case X86ISD::SHUFP: {
42678 // Fold shufps(shuffle(x),shuffle(y)) -> shufps(x,y).
42679 // This is a more relaxed shuffle combiner that can ignore oneuse limits.
42680 // TODO: Support types other than v4f32.
42681 if (VT == MVT::v4f32) {
42682 bool Updated = false;
42683 SmallVector<int> Mask;
42684 SmallVector<SDValue> Ops;
42685 if (getTargetShuffleMask(N, false, Ops, Mask) && Ops.size() == 2) {
42686 for (int i = 0; i != 2; ++i) {
42687 SmallVector<SDValue> SubOps;
42688 SmallVector<int> SubMask, SubScaledMask;
42689 SDValue Sub = peekThroughBitcasts(Ops[i]);
42690 // TODO: Scaling might be easier if we specify the demanded elts.
42691 if (getTargetShuffleInputs(Sub, SubOps, SubMask, DAG, 0, false) &&
42692 scaleShuffleElements(SubMask, 4, SubScaledMask) &&
42693 SubOps.size() == 1 && isUndefOrInRange(SubScaledMask, 0, 4)) {
42694 int Ofs = i * 2;
42695 Mask[Ofs + 0] = SubScaledMask[Mask[Ofs + 0] % 4] + (i * 4);
42696 Mask[Ofs + 1] = SubScaledMask[Mask[Ofs + 1] % 4] + (i * 4);
42697 Ops[i] = DAG.getBitcast(VT, SubOps[0]);
42698 Updated = true;
42699 }
42700 }
42701 }
42702 if (Updated) {
42703 for (int &M : Mask)
42704 M %= 4;
42705 Ops.push_back(getV4X86ShuffleImm8ForMask(Mask, DL, DAG));
42706 return DAG.getNode(X86ISD::SHUFP, DL, VT, Ops);
42707 }
42708 }
42709 return SDValue();
42710 }
42711 case X86ISD::VPERMI: {
42712 // vpermi(bitcast(x)) -> bitcast(vpermi(x)) for same number of elements.
42713 // TODO: Remove when we have preferred domains in combineX86ShuffleChain.
42714 SDValue N0 = N.getOperand(0);
42715 SDValue N1 = N.getOperand(1);
42716 unsigned EltSizeInBits = VT.getScalarSizeInBits();
42717 if (N0.getOpcode() == ISD::BITCAST &&
42718 N0.getOperand(0).getScalarValueSizeInBits() == EltSizeInBits) {
42719 SDValue Src = N0.getOperand(0);
42720 EVT SrcVT = Src.getValueType();
42721 SDValue Res = DAG.getNode(X86ISD::VPERMI, DL, SrcVT, Src, N1);
42722 return DAG.getBitcast(VT, Res);
42723 }
42724 return SDValue();
42725 }
42726 case X86ISD::SHUF128: {
42727 // If we're permuting the upper 256-bit subvectors of a concatenation, then
42728 // see if we can peek through and access the subvector directly.
42729 if (VT.is512BitVector()) {
42730 // 512-bit mask uses 4 x i2 indices - if the msb is always set then only
42731 // the upper subvector is used.
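// The immediate is four 2-bit lane indices: the low two fields pick 128-bit
// lanes of LHS, the high two pick lanes of RHS, and bit 1 of a field means the
// lane comes from that operand's upper 256 bits. So (Mask & 0x0A) == 0x0A
// means only the upper half of LHS is referenced (0xA0 likewise for RHS), and
// clearing those bits retargets the fields at the widened upper subvector.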
42732 SDValue LHS = peekThroughBitcasts(N->getOperand(0));
42733 SDValue RHS = peekThroughBitcasts(N->getOperand(1));
42734 uint64_t Mask = N->getConstantOperandVal(2);
42735 SmallVector<SDValue> LHSOps, RHSOps;
42736 SDValue NewLHS, NewRHS;
42737 if ((Mask & 0x0A) == 0x0A &&
42738 collectConcatOps(LHS.getNode(), LHSOps, DAG) && LHSOps.size() == 2) {
42739 NewLHS = widenSubVector(LHSOps[1], false, Subtarget, DAG, DL, 512);
42740 Mask &= ~0x0A;
42741 }
42742 if ((Mask & 0xA0) == 0xA0 &&
42743 collectConcatOps(RHS.getNode(), RHSOps, DAG) && RHSOps.size() == 2) {
42744 NewRHS = widenSubVector(RHSOps[1], false, Subtarget, DAG, DL, 512);
42745 Mask &= ~0xA0;
42746 }
42747 if (NewLHS || NewRHS)
42748 return DAG.getNode(X86ISD::SHUF128, DL, VT,
42749 DAG.getBitcast(VT, NewLHS ? NewLHS : LHS),
42750 DAG.getBitcast(VT, NewRHS ? NewRHS : RHS),
42751 DAG.getTargetConstant(Mask, DL, MVT::i8));
42752 }
42753 return SDValue();
42754 }
42755 case X86ISD::VPERM2X128: {
42756 SDValue LHS = N->getOperand(0);
42757 SDValue RHS = N->getOperand(1);
42758 unsigned Imm = N.getConstantOperandVal(2) & 255;
42759
42760 // Canonicalize unary/repeated operands to LHS.
42761 if (LHS.isUndef() && !RHS.isUndef())
42762 return DAG.getNode(X86ISD::VPERM2X128, DL, VT, RHS, LHS,
42763 DAG.getTargetConstant(Imm ^ 0x22, DL, MVT::i8));
42764 if (LHS == RHS)
42765 return DAG.getNode(X86ISD::VPERM2X128, DL, VT, LHS, DAG.getUNDEF(VT),
42766 DAG.getTargetConstant(Imm & ~0x22, DL, MVT::i8));
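// VPERM2X128's immediate uses bits 1:0 and 5:4 as 128-bit lane selectors
// (0/1 = lanes of the first operand, 2/3 = lanes of the second), so XORing
// with 0x22 flips which operand each selector refers to, and masking with
// ~0x22 points both selectors at the first operand once LHS == RHS.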
42767
42768 // Fold vperm2x128(bitcast(x),bitcast(y),c) -> bitcast(vperm2x128(x,y,c)).
42769 if (LHS.getOpcode() == ISD::BITCAST &&
42770 (RHS.getOpcode() == ISD::BITCAST || RHS.isUndef())) {
42771 EVT SrcVT = LHS.getOperand(0).getValueType();
42772 if (RHS.isUndef() || SrcVT == RHS.getOperand(0).getValueType()) {
42773 return DAG.getBitcast(VT, DAG.getNode(X86ISD::VPERM2X128, DL, SrcVT,
42774 DAG.getBitcast(SrcVT, LHS),
42775 DAG.getBitcast(SrcVT, RHS),
42776 N->getOperand(2)));
42777 }
42778 }
42779
42780 // Fold vperm2x128(op(),op()) -> op(vperm2x128(),vperm2x128()).
42781 if (SDValue Res = canonicalizeLaneShuffleWithRepeatedOps(N, DAG, DL))
42782 return Res;
42783
42784 // Fold vperm2x128 subvector shuffle with an inner concat pattern.
42785 // vperm2x128(concat(X,Y),concat(Z,W)) --> concat X,Y etc.
42786 auto FindSubVector128 = [&](unsigned Idx) {
42787 if (Idx > 3)
42788 return SDValue();
42789 SDValue Src = peekThroughBitcasts(N.getOperand(Idx < 2 ? 0 : 1));
42790 SmallVector<SDValue> SubOps;
42791 if (collectConcatOps(Src.getNode(), SubOps, DAG) && SubOps.size() == 2)
42792 return SubOps[Idx & 1];
42793 unsigned NumElts = Src.getValueType().getVectorNumElements();
42794 if ((Idx & 1) == 1 && Src.getOpcode() == ISD::INSERT_SUBVECTOR &&
42795 Src.getOperand(1).getValueSizeInBits() == 128 &&
42796 Src.getConstantOperandAPInt(2) == (NumElts / 2)) {
42797 return Src.getOperand(1);
42798 }
42799 return SDValue();
42800 };
42801 if (SDValue SubLo = FindSubVector128(Imm & 0x0F)) {
42802 if (SDValue SubHi = FindSubVector128((Imm & 0xF0) >> 4)) {
42803 MVT SubVT = VT.getHalfNumVectorElementsVT();
42804 SubLo = DAG.getBitcast(SubVT, SubLo);
42805 SubHi = DAG.getBitcast(SubVT, SubHi);
42806 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, SubLo, SubHi);
42807 }
42808 }
42809
42810 // Attempt to match VBROADCAST*128 subvector broadcast load.
42811 if (RHS.isUndef()) {
42812 SmallVector<int, 4> Mask;
42813 DecodeVPERM2X128Mask(4, Imm, Mask);
42814 if (isUndefOrInRange(Mask, 0, 4)) {
42815 bool SplatLo = isShuffleEquivalent(Mask, {0, 1, 0, 1}, LHS);
42816 bool SplatHi = isShuffleEquivalent(Mask, {2, 3, 2, 3}, LHS);
42817 if ((SplatLo || SplatHi) && !Subtarget.hasAVX512() &&
42818 X86::mayFoldLoad(LHS, Subtarget, /*AssumeSingleUse=*/true)) {
42819 MVT MemVT = VT.getHalfNumVectorElementsVT();
42820 unsigned Ofs = SplatLo ? 0 : MemVT.getStoreSize();
42821 return getBROADCAST_LOAD(X86ISD::SUBV_BROADCAST_LOAD, DL, VT, MemVT,
42822 cast<LoadSDNode>(LHS), Ofs, DAG);
42823 }
42824 }
42825 }
42826
42827 return SDValue();
42828 }
42829 case X86ISD::PSHUFD:
42830 case X86ISD::PSHUFLW:
42831 case X86ISD::PSHUFHW: {
42832 SDValue N0 = N.getOperand(0);
42833 SDValue N1 = N.getOperand(1);
42834 if (N0->hasOneUse()) {
42835 SDValue V = peekThroughOneUseBitcasts(N0);
42836 switch (V.getOpcode()) {
42837 case X86ISD::VSHL:
42838 case X86ISD::VSRL:
42839 case X86ISD::VSRA:
42840 case X86ISD::VSHLI:
42841 case X86ISD::VSRLI:
42842 case X86ISD::VSRAI:
42843 case X86ISD::VROTLI:
42844 case X86ISD::VROTRI: {
42845 MVT InnerVT = V.getSimpleValueType();
42846 if (InnerVT.getScalarSizeInBits() <= VT.getScalarSizeInBits()) {
42847 SDValue Res = DAG.getNode(Opcode, DL, VT,
42848 DAG.getBitcast(VT, V.getOperand(0)), N1);
42849 Res = DAG.getBitcast(InnerVT, Res);
42850 Res = DAG.getNode(V.getOpcode(), DL, InnerVT, Res, V.getOperand(1));
42851 return DAG.getBitcast(VT, Res);
42852 }
42853 break;
42854 }
42855 }
42856 }
42857
42858 Mask = getPSHUFShuffleMask(N);
42859 assert(Mask.size() == 4);
42860 break;
42861 }
42862 case X86ISD::MOVSD:
42863 case X86ISD::MOVSH:
42864 case X86ISD::MOVSS: {
42865 SDValue N0 = N.getOperand(0);
42866 SDValue N1 = N.getOperand(1);
42867
42868 // Canonicalize scalar FPOps:
42869 // MOVS*(N0, OP(N0, N1)) --> MOVS*(N0, SCALAR_TO_VECTOR(OP(N0[0], N1[0])))
42870 // If commutable, allow OP(N1[0], N0[0]).
42871 unsigned Opcode1 = N1.getOpcode();
42872 if (Opcode1 == ISD::FADD || Opcode1 == ISD::FMUL || Opcode1 == ISD::FSUB ||
42873 Opcode1 == ISD::FDIV) {
42874 SDValue N10 = N1.getOperand(0);
42875 SDValue N11 = N1.getOperand(1);
42876 if (N10 == N0 ||
42877 (N11 == N0 && (Opcode1 == ISD::FADD || Opcode1 == ISD::FMUL))) {
42878 if (N10 != N0)
42879 std::swap(N10, N11);
42880 MVT SVT = VT.getVectorElementType();
42881 SDValue ZeroIdx = DAG.getVectorIdxConstant(0, DL);
42882 N10 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, SVT, N10, ZeroIdx);
42883 N11 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, SVT, N11, ZeroIdx);
42884 SDValue Scl = DAG.getNode(Opcode1, DL, SVT, N10, N11);
42885 SDValue SclVec = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VT, Scl);
42886 return DAG.getNode(Opcode, DL, VT, N0, SclVec);
42887 }
42888 }
42889
42890 return SDValue();
42891 }
42892 case X86ISD::INSERTPS: {
42893 assert(VT == MVT::v4f32 && "INSERTPS ValueType must be MVT::v4f32");
42894 SDValue Op0 = N.getOperand(0);
42895 SDValue Op1 = N.getOperand(1);
42896 unsigned InsertPSMask = N.getConstantOperandVal(2);
42897 unsigned SrcIdx = (InsertPSMask >> 6) & 0x3;
42898 unsigned DstIdx = (InsertPSMask >> 4) & 0x3;
42899 unsigned ZeroMask = InsertPSMask & 0xF;
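// INSERTPS immediate layout: bits 7:6 pick the source element of Op1, bits 5:4
// pick the destination lane in Op0, and bits 3:0 are a mask of result lanes to
// force to zero.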
42900
42901 // If we zero out all elements from Op0 then we don't need to reference it.
42902 if (((ZeroMask | (1u << DstIdx)) == 0xF) && !Op0.isUndef())
42903 return DAG.getNode(X86ISD::INSERTPS, DL, VT, DAG.getUNDEF(VT), Op1,
42904 DAG.getTargetConstant(InsertPSMask, DL, MVT::i8));
42905
42906 // If we zero out the element from Op1 then we don't need to reference it.
42907 if ((ZeroMask & (1u << DstIdx)) && !Op1.isUndef())
42908 return DAG.getNode(X86ISD::INSERTPS, DL, VT, Op0, DAG.getUNDEF(VT),
42909 DAG.getTargetConstant(InsertPSMask, DL, MVT::i8));
42910
42911 // Attempt to merge insertps Op1 with an inner target shuffle node.
42912 SmallVector<int, 8> TargetMask1;
42913 SmallVector<SDValue, 2> Ops1;
42914 APInt KnownUndef1, KnownZero1;
42915 if (getTargetShuffleAndZeroables(Op1, TargetMask1, Ops1, KnownUndef1,
42916 KnownZero1)) {
42917 if (KnownUndef1[SrcIdx] || KnownZero1[SrcIdx]) {
42918 // Zero/UNDEF insertion - zero out element and remove dependency.
42919 InsertPSMask |= (1u << DstIdx);
42920 return DAG.getNode(X86ISD::INSERTPS, DL, VT, Op0, DAG.getUNDEF(VT),
42921 DAG.getTargetConstant(InsertPSMask, DL, MVT::i8));
42922 }
42923 // Update insertps mask srcidx and reference the source input directly.
42924 int M = TargetMask1[SrcIdx];
42925 assert(0 <= M && M < 8 && "Shuffle index out of range");
42926 InsertPSMask = (InsertPSMask & 0x3f) | ((M & 0x3) << 6);
42927 Op1 = Ops1[M < 4 ? 0 : 1];
42928 return DAG.getNode(X86ISD::INSERTPS, DL, VT, Op0, Op1,
42929 DAG.getTargetConstant(InsertPSMask, DL, MVT::i8));
42930 }
42931
42932 // Attempt to merge insertps Op0 with an inner target shuffle node.
42933 SmallVector<int, 8> TargetMask0;
42934 SmallVector<SDValue, 2> Ops0;
42935 APInt KnownUndef0, KnownZero0;
42936 if (getTargetShuffleAndZeroables(Op0, TargetMask0, Ops0, KnownUndef0,
42937 KnownZero0)) {
42938 bool Updated = false;
42939 bool UseInput00 = false;
42940 bool UseInput01 = false;
42941 for (int i = 0; i != 4; ++i) {
42942 if ((InsertPSMask & (1u << i)) || (i == (int)DstIdx)) {
42943 // No change if element is already zero or the inserted element.
42944 continue;
42945 }
42946
42947 if (KnownUndef0[i] || KnownZero0[i]) {
42948 // If the target mask is undef/zero then we must zero the element.
42949 InsertPSMask |= (1u << i);
42950 Updated = true;
42951 continue;
42952 }
42953
42954 // The input vector element must be inline.
42955 int M = TargetMask0[i];
42956 if (M != i && M != (i + 4))
42957 return SDValue();
42958
42959 // Determine which inputs of the target shuffle we're using.
42960 UseInput00 |= (0 <= M && M < 4);
42961 UseInput01 |= (4 <= M);
42962 }
42963
42964 // If we're not using both inputs of the target shuffle then use the
42965 // referenced input directly.
42966 if (UseInput00 && !UseInput01) {
42967 Updated = true;
42968 Op0 = Ops0[0];
42969 } else if (!UseInput00 && UseInput01) {
42970 Updated = true;
42971 Op0 = Ops0[1];
42972 }
42973
42974 if (Updated)
42975 return DAG.getNode(X86ISD::INSERTPS, DL, VT, Op0, Op1,
42976 DAG.getTargetConstant(InsertPSMask, DL, MVT::i8));
42977 }
42978
42979 // If we're inserting an element from a vbroadcast load, fold the
42980 // load into the X86insertps instruction. We need to convert the scalar
42981 // load to a vector and clear the source lane of the INSERTPS control.
42982 if (Op1.getOpcode() == X86ISD::VBROADCAST_LOAD && Op1.hasOneUse()) {
42983 auto *MemIntr = cast<MemIntrinsicSDNode>(Op1);
42984 if (MemIntr->getMemoryVT().getScalarSizeInBits() == 32) {
42985 SDValue Load = DAG.getLoad(MVT::f32, DL, MemIntr->getChain(),
42986 MemIntr->getBasePtr(),
42987 MemIntr->getMemOperand());
42988 SDValue Insert = DAG.getNode(X86ISD::INSERTPS, DL, VT, Op0,
42989 DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VT,
42990 Load),
42991 DAG.getTargetConstant(InsertPSMask & 0x3f, DL, MVT::i8));
42992 DAG.ReplaceAllUsesOfValueWith(SDValue(MemIntr, 1), Load.getValue(1));
42993 return Insert;
42994 }
42995 }
42996
42997 return SDValue();
42998 }
42999 case X86ISD::VPERMV: {
43000 // Combine VPERMV to VPERMV3 if the source operand can be freely split.
43001 SmallVector<int, 32> Mask;
43002 SmallVector<SDValue, 2> SrcOps, SubOps;
43003 SDValue Src = peekThroughBitcasts(N.getOperand(1));
43004 if ((Subtarget.hasVLX() || VT.is512BitVector()) &&
43005 getTargetShuffleMask(N, /*AllowSentinelZero=*/false, SrcOps, Mask) &&
43006 collectConcatOps(Src.getNode(), SubOps, DAG)) {
43007 assert(Mask.size() == NumElts && "Unexpected shuffle mask size");
43008 assert(SrcOps.size() == 1 && "Unexpected shuffle ops");
43009 assert((SubOps.size() == 2 || SubOps.size() == 4) &&
43010 "Unexpected split ops");
43011 // Bail if we were permuting a widened vector.
43012 if (SubOps[1].isUndef() &&
43013 (SubOps.size() == 2 || (SubOps[2].isUndef() && SubOps[3].isUndef())))
43014 return SDValue();
43015 // Bail if any subops would have folded into the concat.
43016 if (any_of(SubOps, isShuffleFoldableLoad))
43017 return SDValue();
43018 // Concat 4x128 back to 2x256.
43019 if (SubOps.size() == 4) {
43020 SubOps[0] = concatSubVectors(SubOps[0], SubOps[1], DAG, DL);
43021 SubOps[1] = concatSubVectors(SubOps[2], SubOps[3], DAG, DL);
43022 }
43023 // Convert mask to 2 operand shuffle.
43024 int HalfElts = NumElts / 2;
43025 for (int &M : Mask)
43026 M += M >= HalfElts ? HalfElts : 0;
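// Indices below HalfElts keep addressing the (widened) low half, the first
// source; indices in [HalfElts, NumElts) are rebased by +HalfElts so they
// address the same element of the widened high half via the second source.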
43027 SDValue Lo = widenSubVector(SubOps[0], false, Subtarget, DAG, DL,
43028 VT.getSizeInBits());
43029 SDValue Hi = widenSubVector(SubOps[1], false, Subtarget, DAG, DL,
43030 VT.getSizeInBits());
43031 return lowerShuffleWithPERMV(DL, VT, Mask, DAG.getBitcast(VT, Lo),
43032 DAG.getBitcast(VT, Hi), Subtarget, DAG);
43033 }
43034 return SDValue();
43035 }
43036 case X86ISD::VPERMV3: {
43037 MVT WideVT = VT.getDoubleNumVectorElementsVT();
43038 bool CanConcat = VT.is128BitVector() ||
43039 (VT.is256BitVector() && Subtarget.useAVX512Regs());
43040 SmallVector<SDValue, 2> SrcOps;
43041 SmallVector<int, 64> Mask;
43042 if (getTargetShuffleMask(N, /*AllowSentinelZero=*/false, SrcOps, Mask)) {
43043 assert(Mask.size() == NumElts && "Unexpected shuffle mask size");
43044 SDValue V1 = peekThroughBitcasts(N.getOperand(0));
43045 SDValue V2 = peekThroughBitcasts(N.getOperand(2));
43046 // Canonicalize to VPERMV if both sources are the same.
43047 if (V1 == V2) {
43048 for (int &M : Mask)
43049 M = (M < 0 ? M : (M & (NumElts - 1)));
43050 return lowerShuffleWithPERMV(DL, VT, Mask, N.getOperand(0),
43051 DAG.getUNDEF(VT), Subtarget, DAG);
43052 }
43053 // If sources are half width, then concat and use VPERMV with adjusted
43054 // mask.
43055 SDValue Ops[2];
43056 MVT HalfVT = VT.getHalfNumVectorElementsVT();
43057 if (sd_match(V1,
43058 m_InsertSubvector(m_Undef(), m_Value(Ops[0]), m_Zero())) &&
43059 sd_match(V2,
43060 m_InsertSubvector(m_Undef(), m_Value(Ops[1]), m_Zero())) &&
43061 Ops[0].getValueType() == HalfVT && Ops[1].getValueType() == HalfVT) {
43062 if (SDValue ConcatSrc =
43063 combineConcatVectorOps(DL, VT, Ops, DAG, Subtarget)) {
43064 for (int &M : Mask)
43065 M = (M < (int)NumElts ? M : (M - (NumElts / 2)));
43066 return lowerShuffleWithPERMV(DL, VT, Mask, ConcatSrc,
43067 DAG.getUNDEF(VT), Subtarget, DAG);
43068 }
43069 }
43070 // Commute foldable source to the RHS.
43071 if (isShuffleFoldableLoad(N.getOperand(0)) &&
43072 !isShuffleFoldableLoad(N.getOperand(2))) {
43073 ShuffleVectorSDNode::commuteMask(Mask);
43074 return lowerShuffleWithPERMV(DL, VT, Mask, N.getOperand(2),
43075 N.getOperand(0), Subtarget, DAG);
43076 }
43077 // Combine VPERMV3 to widened VPERMV if the two source operands can be
43078 // freely concatenated, with a commuted shuffle mask.
43079 if (CanConcat) {
43080 if (SDValue ConcatSrc = combineConcatVectorOps(
43081 DL, WideVT, {N.getOperand(2), N.getOperand(0)}, DAG,
43082 Subtarget)) {
43083 ShuffleVectorSDNode::commuteMask(Mask);
43084 Mask.append(NumElts, SM_SentinelUndef);
43085 SDValue Perm =
43086 lowerShuffleWithPERMV(DL, WideVT, Mask, ConcatSrc,
43087 DAG.getUNDEF(WideVT), Subtarget, DAG);
43088 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Perm,
43089 DAG.getVectorIdxConstant(0, DL));
43090 }
43091 }
43092 }
43093 // Combine VPERMV3 to widened VPERMV if the two source operands can be
43094 // freely concatenated.
43095 if (CanConcat) {
43096 if (SDValue ConcatSrc = combineConcatVectorOps(
43097 DL, WideVT, {N.getOperand(0), N.getOperand(2)}, DAG, Subtarget)) {
43098 SDValue Mask = widenSubVector(N.getOperand(1), false, Subtarget, DAG,
43099 DL, WideVT.getSizeInBits());
43100 SDValue Perm = DAG.getNode(X86ISD::VPERMV, DL, WideVT, Mask, ConcatSrc);
43101 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Perm,
43102 DAG.getVectorIdxConstant(0, DL));
43103 }
43104 }
43105 return SDValue();
43106 }
43107 default:
43108 return SDValue();
43109 }
43110
43111 // Nuke no-op shuffles that show up after combining.
43112 if (isNoopShuffleMask(Mask))
43113 return N.getOperand(0);
43114
43115 // Look for simplifications involving one or two shuffle instructions.
43116 SDValue V = N.getOperand(0);
43117 switch (N.getOpcode()) {
43118 default:
43119 break;
43120 case X86ISD::PSHUFLW:
43121 case X86ISD::PSHUFHW:
43122 assert(VT.getVectorElementType() == MVT::i16 && "Bad word shuffle type!");
43123
43124 // See if this reduces to a PSHUFD which is no more expensive and can
43125 // combine with more operations. Note that it has to at least flip the
43126 // dwords as otherwise it would have been removed as a no-op.
43127 if (ArrayRef<int>(Mask).equals({2, 3, 0, 1})) {
43128 int DMask[] = {0, 1, 2, 3};
43129 int DOffset = N.getOpcode() == X86ISD::PSHUFLW ? 0 : 2;
43130 DMask[DOffset + 0] = DOffset + 1;
43131 DMask[DOffset + 1] = DOffset + 0;
43132 MVT DVT = MVT::getVectorVT(MVT::i32, NumElts / 2);
43133 V = DAG.getBitcast(DVT, V);
43134 V = DAG.getNode(X86ISD::PSHUFD, DL, DVT, V,
43135 getV4X86ShuffleImm8ForMask(DMask, DL, DAG));
43136 return DAG.getBitcast(VT, V);
43137 }
43138
43139 // Look for shuffle patterns which can be implemented as a single unpack.
43140 // FIXME: This doesn't handle the location of the PSHUFD generically, and
43141 // only works when we have a PSHUFD followed by two half-shuffles.
43142 if (Mask[0] == Mask[1] && Mask[2] == Mask[3] &&
43143 (V.getOpcode() == X86ISD::PSHUFLW ||
43144 V.getOpcode() == X86ISD::PSHUFHW) &&
43145 V.getOpcode() != N.getOpcode() &&
43146 V.hasOneUse() && V.getOperand(0).hasOneUse()) {
43147 SDValue D = peekThroughOneUseBitcasts(V.getOperand(0));
43148 if (D.getOpcode() == X86ISD::PSHUFD) {
43149 SmallVector<int, 4> VMask = getPSHUFShuffleMask(V);
43150 SmallVector<int, 4> DMask = getPSHUFShuffleMask(D);
43151 int NOffset = N.getOpcode() == X86ISD::PSHUFLW ? 0 : 4;
43152 int VOffset = V.getOpcode() == X86ISD::PSHUFLW ? 0 : 4;
43153 int WordMask[8];
43154 for (int i = 0; i < 4; ++i) {
43155 WordMask[i + NOffset] = Mask[i] + NOffset;
43156 WordMask[i + VOffset] = VMask[i] + VOffset;
43157 }
43158 // Map the word mask through the DWord mask.
43159 int MappedMask[8];
43160 for (int i = 0; i < 8; ++i)
43161 MappedMask[i] = 2 * DMask[WordMask[i] / 2] + WordMask[i] % 2;
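// MappedMask[i] is the source word feeding result word i: the word shuffle
// picks word WordMask[i] of the dword-shuffled vector, which is word
// (WordMask[i] % 2) of input dword DMask[WordMask[i] / 2].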
43162 if (ArrayRef<int>(MappedMask).equals({0, 0, 1, 1, 2, 2, 3, 3}) ||
43163 ArrayRef<int>(MappedMask).equals({4, 4, 5, 5, 6, 6, 7, 7})) {
43164 // We can replace all three shuffles with an unpack.
43165 V = DAG.getBitcast(VT, D.getOperand(0));
43166 return DAG.getNode(MappedMask[0] == 0 ? X86ISD::UNPCKL
43167 : X86ISD::UNPCKH,
43168 DL, VT, V, V);
43169 }
43170 }
43171 }
43172
43173 break;
43174
43175 case X86ISD::PSHUFD:
43176 if (SDValue NewN = combineRedundantDWordShuffle(N, Mask, DL, DAG))
43177 return NewN;
43178
43179 break;
43180 }
43181
43182 return SDValue();
43183}
43184
43185/// Checks if the shuffle mask takes subsequent elements
43186/// alternately from two vectors.
43187/// For example <0, 5, 2, 7> or <8, 1, 10, 3, 12, 5, 14, 7> are both correct.
43188static bool isAddSubOrSubAddMask(ArrayRef<int> Mask, bool &Op0Even) {
43189
43190 int ParitySrc[2] = {-1, -1};
43191 unsigned Size = Mask.size();
43192 for (unsigned i = 0; i != Size; ++i) {
43193 int M = Mask[i];
43194 if (M < 0)
43195 continue;
43196
43197 // Make sure we are using the matching element from the input.
43198 if ((M % Size) != i)
43199 return false;
43200
43201 // Make sure we use the same input for all elements of the same parity.
43202 int Src = M / Size;
43203 if (ParitySrc[i % 2] >= 0 && ParitySrc[i % 2] != Src)
43204 return false;
43205 ParitySrc[i % 2] = Src;
43206 }
43207
43208 // Make sure each input is used.
43209 if (ParitySrc[0] < 0 || ParitySrc[1] < 0 || ParitySrc[0] == ParitySrc[1])
43210 return false;
43211
43212 Op0Even = ParitySrc[0] == 0;
43213 return true;
43214}
43215
43216/// Returns true iff the shuffle node \p N can be replaced with ADDSUB(SUBADD)
43217 /// operation. If true is returned then the operands of the ADDSUB(SUBADD)
43218 /// operation are written to the parameters \p Opnd0 and \p Opnd1.
43219///
43220 /// We combine shuffles to ADDSUB(SUBADD) directly on the abstract vector shuffle
43221 /// nodes so it is easier to match them generically. We also insert dummy vector
43222 /// shuffle nodes for the operands which explicitly discard the lanes which are
43223 /// unused by this operation, so that the fact that they are unused can flow
43224 /// through the rest of the combiner.
43225static bool isAddSubOrSubAdd(SDNode *N, const X86Subtarget &Subtarget,
43226 SelectionDAG &DAG, SDValue &Opnd0, SDValue &Opnd1,
43227 bool &IsSubAdd, bool &HasAllowContract) {
43228
43229 EVT VT = N->getValueType(0);
43230 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
43231 if (!Subtarget.hasSSE3() || !TLI.isTypeLegal(VT) ||
43232 !VT.getSimpleVT().isFloatingPoint())
43233 return false;
43234
43235 // We only handle target-independent shuffles.
43236 // FIXME: It would be easy and harmless to use the target shuffle mask
43237 // extraction tool to support more.
43238 if (N->getOpcode() != ISD::VECTOR_SHUFFLE)
43239 return false;
43240
43241 SDValue V1 = N->getOperand(0);
43242 SDValue V2 = N->getOperand(1);
43243
43244 // Make sure we have an FADD and an FSUB.
43245 if ((V1.getOpcode() != ISD::FADD && V1.getOpcode() != ISD::FSUB) ||
43246 (V2.getOpcode() != ISD::FADD && V2.getOpcode() != ISD::FSUB) ||
43247 V1.getOpcode() == V2.getOpcode())
43248 return false;
43249
43250 // If there are other uses of these operations we can't fold them.
43251 if (!V1->hasOneUse() || !V2->hasOneUse())
43252 return false;
43253
43254 // Ensure that both operations have the same operands. Note that we can
43255 // commute the FADD operands.
43256 SDValue LHS, RHS;
43257 if (V1.getOpcode() == ISD::FSUB) {
43258 LHS = V1->getOperand(0); RHS = V1->getOperand(1);
43259 if ((V2->getOperand(0) != LHS || V2->getOperand(1) != RHS) &&
43260 (V2->getOperand(0) != RHS || V2->getOperand(1) != LHS))
43261 return false;
43262 } else {
43263 assert(V2.getOpcode() == ISD::FSUB && "Unexpected opcode");
43264 LHS = V2->getOperand(0); RHS = V2->getOperand(1);
43265 if ((V1->getOperand(0) != LHS || V1->getOperand(1) != RHS) &&
43266 (V1->getOperand(0) != RHS || V1->getOperand(1) != LHS))
43267 return false;
43268 }
43269
43270 ArrayRef<int> Mask = cast<ShuffleVectorSDNode>(N)->getMask();
43271 bool Op0Even;
43272 if (!isAddSubOrSubAddMask(Mask, Op0Even))
43273 return false;
43274
43275 // It's a subadd if the vector in the even parity is an FADD.
43276 IsSubAdd = Op0Even ? V1->getOpcode() == ISD::FADD
43277 : V2->getOpcode() == ISD::FADD;
43278 HasAllowContract =
43279 V1->getFlags().hasAllowContract() && V2->getFlags().hasAllowContract();
43280
43281 Opnd0 = LHS;
43282 Opnd1 = RHS;
43283 return true;
43284}
43285
43286/// Combine shuffle of two fma nodes into FMAddSub or FMSubAdd.
43287 static SDValue combineShuffleToFMAddSub(SDNode *N, const SDLoc &DL,
43288 const X86Subtarget &Subtarget,
43289 SelectionDAG &DAG) {
43290 // We only handle target-independent shuffles.
43291 // FIXME: It would be easy and harmless to use the target shuffle mask
43292 // extraction tool to support more.
43293 if (N->getOpcode() != ISD::VECTOR_SHUFFLE)
43294 return SDValue();
43295
43296 MVT VT = N->getSimpleValueType(0);
43297 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
43298 if (!Subtarget.hasAnyFMA() || !TLI.isTypeLegal(VT))
43299 return SDValue();
43300
43301 // We're trying to match (shuffle fma(a, b, c), X86Fmsub(a, b, c)).
43302 SDValue Op0 = N->getOperand(0);
43303 SDValue Op1 = N->getOperand(1);
43304 SDValue FMAdd = Op0, FMSub = Op1;
43305 if (FMSub.getOpcode() != X86ISD::FMSUB)
43306 std::swap(FMAdd, FMSub);
43307
43308 if (FMAdd.getOpcode() != ISD::FMA || FMSub.getOpcode() != X86ISD::FMSUB ||
43309 FMAdd.getOperand(0) != FMSub.getOperand(0) || !FMAdd.hasOneUse() ||
43310 FMAdd.getOperand(1) != FMSub.getOperand(1) || !FMSub.hasOneUse() ||
43311 FMAdd.getOperand(2) != FMSub.getOperand(2))
43312 return SDValue();
43313
43314 // Check for correct shuffle mask.
43315 ArrayRef<int> Mask = cast<ShuffleVectorSDNode>(N)->getMask();
43316 bool Op0Even;
43317 if (!isAddSubOrSubAddMask(Mask, Op0Even))
43318 return SDValue();
43319
43320 // FMAddSub takes the zeroth operand from the FMSub node.
43321 bool IsSubAdd = Op0Even ? Op0 == FMAdd : Op1 == FMAdd;
43322 unsigned Opcode = IsSubAdd ? X86ISD::FMSUBADD : X86ISD::FMADDSUB;
43323 return DAG.getNode(Opcode, DL, VT, FMAdd.getOperand(0), FMAdd.getOperand(1),
43324 FMAdd.getOperand(2));
43325}
43326
43327/// Try to combine a shuffle into a target-specific add-sub or
43328/// mul-add-sub node.
43329static SDValue combineShuffleToAddSubOrFMAddSub(SDNode *N, const SDLoc &DL,
43330 const X86Subtarget &Subtarget,
43331 SelectionDAG &DAG) {
43332 if (SDValue V = combineShuffleToFMAddSub(N, DL, Subtarget, DAG))
43333 return V;
43334
43335 SDValue Opnd0, Opnd1;
43336 bool IsSubAdd;
43337 bool HasAllowContract;
43338 if (!isAddSubOrSubAdd(N, Subtarget, DAG, Opnd0, Opnd1, IsSubAdd,
43339 HasAllowContract))
43340 return SDValue();
43341
43342 MVT VT = N->getSimpleValueType(0);
43343
43344 // Try to generate X86ISD::FMADDSUB node here.
43345 SDValue Opnd2;
43346 if (isFMAddSubOrFMSubAdd(Subtarget, DAG, Opnd0, Opnd1, Opnd2, 2,
43347 HasAllowContract)) {
43348 unsigned Opc = IsSubAdd ? X86ISD::FMSUBADD : X86ISD::FMADDSUB;
43349 return DAG.getNode(Opc, DL, VT, Opnd0, Opnd1, Opnd2);
43350 }
43351
43352 if (IsSubAdd)
43353 return SDValue();
43354
43355 // Do not generate X86ISD::ADDSUB node for 512-bit types even though
43356 // the ADDSUB idiom has been successfully recognized. There are no known
43357 // X86 targets with 512-bit ADDSUB instructions!
43358 if (VT.is512BitVector())
43359 return SDValue();
43360
43361 // Do not generate X86ISD::ADDSUB node for FP16's vector types even though
43362 // the ADDSUB idiom has been successfully recognized. There are no known
43363 // X86 targets with FP16 ADDSUB instructions!
43364 if (VT.getVectorElementType() == MVT::f16)
43365 return SDValue();
43366
43367 return DAG.getNode(X86ISD::ADDSUB, DL, VT, Opnd0, Opnd1);
43368}
43369
43370/// If we have a shuffle of AVX/AVX512 (256/512 bit) vectors that only uses the
43371/// low half of each source vector and does not set any high half elements in
43372/// the destination vector, narrow the shuffle to half its original size.
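///
/// For example (illustrative only): a v8f32 shuffle whose mask only references
/// elements 0-3 of each source and leaves the upper half of the result undef
/// can be done as a v4f32 shuffle of the low halves, with the insert/extract
/// subvector ops being free subregister operations.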
43373static SDValue narrowShuffle(ShuffleVectorSDNode *Shuf, SelectionDAG &DAG) {
43374 EVT VT = Shuf->getValueType(0);
43375 if (!DAG.getTargetLoweringInfo().isTypeLegal(Shuf->getValueType(0)))
43376 return SDValue();
43377 if (!VT.is256BitVector() && !VT.is512BitVector())
43378 return SDValue();
43379
43380 // See if we can ignore all of the high elements of the shuffle.
43381 ArrayRef<int> Mask = Shuf->getMask();
43382 if (!isUndefUpperHalf(Mask))
43383 return SDValue();
43384
43385 // Check if the shuffle mask accesses only the low half of each input vector
43386 // (half-index output is 0 or 2).
43387 int HalfIdx1, HalfIdx2;
43388 SmallVector<int, 8> HalfMask(Mask.size() / 2);
43389 if (!getHalfShuffleMask(Mask, HalfMask, HalfIdx1, HalfIdx2) ||
43390 (HalfIdx1 % 2 == 1) || (HalfIdx2 % 2 == 1))
43391 return SDValue();
43392
43393 // Create a half-width shuffle to replace the unnecessarily wide shuffle.
43394 // The trick is knowing that all of the insert/extract are actually free
43395 // subregister (zmm<->ymm or ymm<->xmm) ops. That leaves us with a shuffle
43396 // of narrow inputs into a narrow output, and that is always cheaper than
43397 // the wide shuffle that we started with.
43398 return getShuffleHalfVectors(SDLoc(Shuf), Shuf->getOperand(0),
43399 Shuf->getOperand(1), HalfMask, HalfIdx1,
43400 HalfIdx2, false, DAG, /*UseConcat*/ true);
43401}
43402
43403static SDValue combineShuffle(SDNode *N, SelectionDAG &DAG,
43404 TargetLowering::DAGCombinerInfo &DCI,
43405 const X86Subtarget &Subtarget) {
43406 if (auto *Shuf = dyn_cast<ShuffleVectorSDNode>(N))
43407 if (SDValue V = narrowShuffle(Shuf, DAG))
43408 return V;
43409
43410 // If we have legalized the vector types, look for blends of FADD and FSUB
43411 // nodes that we can fuse into an ADDSUB, FMADDSUB, or FMSUBADD node.
43412 SDLoc dl(N);
43413 EVT VT = N->getValueType(0);
43414 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
43415 if (TLI.isTypeLegal(VT) && !isSoftF16(VT, Subtarget))
43416 if (SDValue AddSub =
43417 combineShuffleToAddSubOrFMAddSub(N, dl, Subtarget, DAG))
43418 return AddSub;
43419
43420 // Attempt to combine into a vector load/broadcast.
43421 if (SDValue LD = combineToConsecutiveLoads(
43422 VT, SDValue(N, 0), dl, DAG, Subtarget, /*IsAfterLegalize*/ true))
43423 return LD;
43424
43425 if (isTargetShuffle(N->getOpcode())) {
43426 SDValue Op(N, 0);
43427 if (SDValue Shuffle = combineTargetShuffle(Op, dl, DAG, DCI, Subtarget))
43428 return Shuffle;
43429
43430 // Try recursively combining arbitrary sequences of x86 shuffle
43431 // instructions into higher-order shuffles. We do this after combining
43432 // specific PSHUF instruction sequences into their minimal form so that we
43433 // can evaluate how many specialized shuffle instructions are involved in
43434 // a particular chain.
43435 if (SDValue Res = combineX86ShufflesRecursively(Op, DAG, Subtarget))
43436 return Res;
43437
43438 // Simplify source operands based on shuffle mask.
43439 // TODO - merge this into combineX86ShufflesRecursively.
43440 APInt DemandedElts = APInt::getAllOnes(VT.getVectorNumElements());
43441 if (TLI.SimplifyDemandedVectorElts(Op, DemandedElts, DCI))
43442 return SDValue(N, 0);
43443
43444 // Canonicalize SHUFFLE(UNARYOP(X)) -> UNARYOP(SHUFFLE(X)).
43445 // Canonicalize SHUFFLE(BINOP(X,Y)) -> BINOP(SHUFFLE(X),SHUFFLE(Y)).
43446 // Perform this after other shuffle combines to allow inner shuffles to be
43447 // combined away first.
43448 if (SDValue BinOp = canonicalizeShuffleWithOp(Op, DAG, dl))
43449 return BinOp;
43450 }
43451
43452 return SDValue();
43453}
43454
43455// Simplify variable target shuffle masks based on the demanded elements.
43456// TODO: Handle DemandedBits in mask indices as well?
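// For example (illustrative only): if a variable shuffle's mask is loaded from
// a constant pool and some result elements are not demanded, the corresponding
// mask elements can be rewritten to undef, which may enable further folds of
// the constant.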
43457bool X86TargetLowering::SimplifyDemandedVectorEltsForTargetShuffle(
43458 SDValue Op, const APInt &DemandedElts, unsigned MaskIndex,
43459 TargetLowering::TargetLoweringOpt &TLO, unsigned Depth) const {
43460 // If we're demanding all elements don't bother trying to simplify the mask.
43461 unsigned NumElts = DemandedElts.getBitWidth();
43462 if (DemandedElts.isAllOnes())
43463 return false;
43464
43465 SDValue Mask = Op.getOperand(MaskIndex);
43466 if (!Mask.hasOneUse())
43467 return false;
43468
43469 // Attempt to generically simplify the variable shuffle mask.
43470 APInt MaskUndef, MaskZero;
43471 if (SimplifyDemandedVectorElts(Mask, DemandedElts, MaskUndef, MaskZero, TLO,
43472 Depth + 1))
43473 return true;
43474
43475 // Attempt to extract+simplify a (constant pool load) shuffle mask.
43476 // TODO: Support other types from getTargetShuffleMaskIndices?
43477 SDValue BC = peekThroughOneUseBitcasts(Mask);
43478 EVT BCVT = BC.getValueType();
43479 auto *Load = dyn_cast<LoadSDNode>(BC);
43480 if (!Load || !Load->getBasePtr().hasOneUse())
43481 return false;
43482
43483 const Constant *C = getTargetConstantFromNode(Load);
43484 if (!C)
43485 return false;
43486
43487 Type *CTy = C->getType();
43488 if (!CTy->isVectorTy() ||
43489 CTy->getPrimitiveSizeInBits() != Mask.getValueSizeInBits())
43490 return false;
43491
43492 // Handle scaling for i64 elements on 32-bit targets.
43493 unsigned NumCstElts = cast<FixedVectorType>(CTy)->getNumElements();
43494 if (NumCstElts != NumElts && NumCstElts != (NumElts * 2))
43495 return false;
43496 unsigned Scale = NumCstElts / NumElts;
43497
43498 // Simplify mask if we have an undemanded element that is not undef.
43499 bool Simplified = false;
43500 SmallVector<Constant *, 32> ConstVecOps;
43501 for (unsigned i = 0; i != NumCstElts; ++i) {
43502 Constant *Elt = C->getAggregateElement(i);
43503 if (!DemandedElts[i / Scale] && !isa<UndefValue>(Elt)) {
43504 ConstVecOps.push_back(UndefValue::get(Elt->getType()));
43505 Simplified = true;
43506 continue;
43507 }
43508 ConstVecOps.push_back(Elt);
43509 }
43510 if (!Simplified)
43511 return false;
43512
43513 // Generate new constant pool entry + legalize immediately for the load.
43514 SDLoc DL(Op);
43515 SDValue CV = TLO.DAG.getConstantPool(ConstantVector::get(ConstVecOps), BCVT);
43516 SDValue LegalCV = LowerConstantPool(CV, TLO.DAG);
43517 SDValue NewMask = TLO.DAG.getLoad(
43518 BCVT, DL, TLO.DAG.getEntryNode(), LegalCV,
43519 MachinePointerInfo::getConstantPool(TLO.DAG.getMachineFunction()),
43520 Load->getAlign());
43521 return TLO.CombineTo(Mask, TLO.DAG.getBitcast(Mask.getValueType(), NewMask));
43522}
43523
43524bool X86TargetLowering::SimplifyDemandedVectorEltsForTargetNode(
43525 SDValue Op, const APInt &DemandedElts, APInt &KnownUndef, APInt &KnownZero,
43526 TargetLoweringOpt &TLO, unsigned Depth) const {
43527 int NumElts = DemandedElts.getBitWidth();
43528 unsigned Opc = Op.getOpcode();
43529 EVT VT = Op.getValueType();
43530
43531 // Handle special case opcodes.
43532 switch (Opc) {
43533 case X86ISD::PMULDQ:
43534 case X86ISD::PMULUDQ: {
43535 APInt LHSUndef, LHSZero;
43536 APInt RHSUndef, RHSZero;
43537 SDValue LHS = Op.getOperand(0);
43538 SDValue RHS = Op.getOperand(1);
43539 if (SimplifyDemandedVectorElts(LHS, DemandedElts, LHSUndef, LHSZero, TLO,
43540 Depth + 1))
43541 return true;
43542 if (SimplifyDemandedVectorElts(RHS, DemandedElts, RHSUndef, RHSZero, TLO,
43543 Depth + 1))
43544 return true;
43545 // Multiply by zero.
43546 KnownZero = LHSZero | RHSZero;
43547 break;
43548 }
43549 case X86ISD::VPMADDUBSW:
43550 case X86ISD::VPMADDWD: {
43551 APInt LHSUndef, LHSZero;
43552 APInt RHSUndef, RHSZero;
43553 SDValue LHS = Op.getOperand(0);
43554 SDValue RHS = Op.getOperand(1);
43555 APInt DemandedSrcElts = APIntOps::ScaleBitMask(DemandedElts, 2 * NumElts);
43556
43557 if (SimplifyDemandedVectorElts(LHS, DemandedSrcElts, LHSUndef, LHSZero, TLO,
43558 Depth + 1))
43559 return true;
43560 if (SimplifyDemandedVectorElts(RHS, DemandedSrcElts, RHSUndef, RHSZero, TLO,
43561 Depth + 1))
43562 return true;
43563
43564 // TODO: Multiply by zero.
43565
43566 // If RHS/LHS elements are known zero then we don't need the LHS/RHS equivalent.
43567 APInt DemandedLHSElts = DemandedSrcElts & ~RHSZero;
43568 if (SimplifyDemandedVectorElts(LHS, DemandedLHSElts, LHSUndef, LHSZero, TLO,
43569 Depth + 1))
43570 return true;
43571 APInt DemandedRHSElts = DemandedSrcElts & ~LHSZero;
43572 if (SimplifyDemandedVectorElts(RHS, DemandedRHSElts, RHSUndef, RHSZero, TLO,
43573 Depth + 1))
43574 return true;
43575 break;
43576 }
43577 case X86ISD::PSADBW: {
43578 SDValue LHS = Op.getOperand(0);
43579 SDValue RHS = Op.getOperand(1);
43580 assert(VT.getScalarType() == MVT::i64 &&
43581 LHS.getValueType() == RHS.getValueType() &&
43582 LHS.getValueType().getScalarType() == MVT::i8 &&
43583 "Unexpected PSADBW types");
43584
43585 // Aggressively peek through ops to get at the demanded elts.
43586 if (!DemandedElts.isAllOnes()) {
43587 unsigned NumSrcElts = LHS.getValueType().getVectorNumElements();
43588 APInt DemandedSrcElts = APIntOps::ScaleBitMask(DemandedElts, NumSrcElts);
43589 SDValue NewLHS = SimplifyMultipleUseDemandedVectorElts(
43590 LHS, DemandedSrcElts, TLO.DAG, Depth + 1);
43591 SDValue NewRHS = SimplifyMultipleUseDemandedVectorElts(
43592 RHS, DemandedSrcElts, TLO.DAG, Depth + 1);
43593 if (NewLHS || NewRHS) {
43594 NewLHS = NewLHS ? NewLHS : LHS;
43595 NewRHS = NewRHS ? NewRHS : RHS;
43596 return TLO.CombineTo(
43597 Op, TLO.DAG.getNode(Opc, SDLoc(Op), VT, NewLHS, NewRHS));
43598 }
43599 }
43600 break;
43601 }
43602 case X86ISD::VSHL:
43603 case X86ISD::VSRL:
43604 case X86ISD::VSRA: {
43605 // We only need the bottom 64-bits of the (128-bit) shift amount.
43606 SDValue Amt = Op.getOperand(1);
43607 MVT AmtVT = Amt.getSimpleValueType();
43608 assert(AmtVT.is128BitVector() && "Unexpected value type");
43609
43610 // If the shift amount is only ever reused as an SSE shift amount then we
43611 // know that only the bottom 64-bits are ever used.
43612 bool AssumeSingleUse = llvm::all_of(Amt->users(), [&Amt](SDNode *Use) {
43613 unsigned UseOpc = Use->getOpcode();
43614 return (UseOpc == X86ISD::VSHL || UseOpc == X86ISD::VSRL ||
43615 UseOpc == X86ISD::VSRA) &&
43616 Use->getOperand(0) != Amt;
43617 });
43618
43619 APInt AmtUndef, AmtZero;
43620 unsigned NumAmtElts = AmtVT.getVectorNumElements();
43621 APInt AmtElts = APInt::getLowBitsSet(NumAmtElts, NumAmtElts / 2);
43622 if (SimplifyDemandedVectorElts(Amt, AmtElts, AmtUndef, AmtZero, TLO,
43623 Depth + 1, AssumeSingleUse))
43624 return true;
43625 [[fallthrough]];
43626 }
43627 case X86ISD::VSHLI:
43628 case X86ISD::VSRLI:
43629 case X86ISD::VSRAI: {
43630 SDValue Src = Op.getOperand(0);
43631 APInt SrcUndef;
43632 if (SimplifyDemandedVectorElts(Src, DemandedElts, SrcUndef, KnownZero, TLO,
43633 Depth + 1))
43634 return true;
43635
43636 // Fold shift(0,x) -> 0
43637 if (DemandedElts.isSubsetOf(KnownZero))
43638 return TLO.CombineTo(
43639 Op, getZeroVector(VT.getSimpleVT(), Subtarget, TLO.DAG, SDLoc(Op)));
43640
43641 // Aggressively peek through ops to get at the demanded elts.
43642 if (!DemandedElts.isAllOnes())
43643 if (SDValue NewSrc = SimplifyMultipleUseDemandedVectorElts(
43644 Src, DemandedElts, TLO.DAG, Depth + 1))
43645 return TLO.CombineTo(
43646 Op, TLO.DAG.getNode(Opc, SDLoc(Op), VT, NewSrc, Op.getOperand(1)));
43647 break;
43648 }
43649 case X86ISD::VPSHA:
43650 case X86ISD::VPSHL:
43651 case X86ISD::VSHLV:
43652 case X86ISD::VSRLV:
43653 case X86ISD::VSRAV: {
43654 APInt LHSUndef, LHSZero;
43655 APInt RHSUndef, RHSZero;
43656 SDValue LHS = Op.getOperand(0);
43657 SDValue RHS = Op.getOperand(1);
43658 if (SimplifyDemandedVectorElts(LHS, DemandedElts, LHSUndef, LHSZero, TLO,
43659 Depth + 1))
43660 return true;
43661
43662 // Fold shift(0,x) -> 0
43663 if (DemandedElts.isSubsetOf(LHSZero))
43664 return TLO.CombineTo(
43665 Op, getZeroVector(VT.getSimpleVT(), Subtarget, TLO.DAG, SDLoc(Op)));
43666
43667 if (SimplifyDemandedVectorElts(RHS, DemandedElts, RHSUndef, RHSZero, TLO,
43668 Depth + 1))
43669 return true;
43670
43671 KnownZero = LHSZero;
43672 break;
43673 }
43674 case X86ISD::CMPM:
43675 case X86ISD::CMPP: {
43676 // Scalarize packed fp comparison if we only require element 0.
43677 if (DemandedElts == 1) {
43678 SDLoc dl(Op);
43679 MVT VT = Op.getSimpleValueType();
43680 MVT OpSVT = Op.getOperand(0).getSimpleValueType().getScalarType();
43681 SDValue LHS = TLO.DAG.getExtractVectorElt(dl, OpSVT, Op.getOperand(0), 0);
43682 SDValue RHS = TLO.DAG.getExtractVectorElt(dl, OpSVT, Op.getOperand(1), 0);
43683 SDValue CC = Op.getOperand(2);
43684 if (Opc == X86ISD::CMPM) {
43685 SDValue Cmp =
43686 TLO.DAG.getNode(X86ISD::FSETCCM, dl, MVT::v1i1, LHS, RHS, CC);
43687 return TLO.CombineTo(
43688 Op, TLO.DAG.getInsertSubvector(dl, TLO.DAG.getUNDEF(VT), Cmp, 0));
43689 }
43690 SDValue Cmp = TLO.DAG.getNode(X86ISD::FSETCC, dl, OpSVT, LHS, RHS, CC);
43691 return TLO.CombineTo(Op,
43692 TLO.DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Cmp));
43693 }
43694 break;
43695 }
43696 case X86ISD::PCMPEQ:
43697 case X86ISD::PCMPGT: {
43698 APInt LHSUndef, LHSZero;
43699 APInt RHSUndef, RHSZero;
43700 SDValue LHS = Op.getOperand(0);
43701 SDValue RHS = Op.getOperand(1);
43702 if (SimplifyDemandedVectorElts(LHS, DemandedElts, LHSUndef, LHSZero, TLO,
43703 Depth + 1))
43704 return true;
43705 if (SimplifyDemandedVectorElts(RHS, DemandedElts, RHSUndef, RHSZero, TLO,
43706 Depth + 1))
43707 return true;
43708 break;
43709 }
43710 case X86ISD::KSHIFTL: {
43711 SDValue Src = Op.getOperand(0);
43712 auto *Amt = cast<ConstantSDNode>(Op.getOperand(1));
43713 assert(Amt->getAPIntValue().ult(NumElts) && "Out of range shift amount");
43714 unsigned ShiftAmt = Amt->getZExtValue();
43715
43716 if (ShiftAmt == 0)
43717 return TLO.CombineTo(Op, Src);
43718
43719 // If this is ((X >>u C1) << ShAmt), see if we can simplify this into a
43720 // single shift. We can do this if the bottom bits (which are shifted
43721 // out) are never demanded.
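// e.g. (illustrative, v8i1): kshiftl (kshiftr X, 2), 3 --> kshiftl X, 1
// when none of the low 3 elements of the result are demanded.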
43722 if (Src.getOpcode() == X86ISD::KSHIFTR) {
43723 if (!DemandedElts.intersects(APInt::getLowBitsSet(NumElts, ShiftAmt))) {
43724 unsigned C1 = Src.getConstantOperandVal(1);
43725 unsigned NewOpc = X86ISD::KSHIFTL;
43726 int Diff = ShiftAmt - C1;
43727 if (Diff < 0) {
43728 Diff = -Diff;
43729 NewOpc = X86ISD::KSHIFTR;
43730 }
43731
43732 SDLoc dl(Op);
43733 SDValue NewSA = TLO.DAG.getTargetConstant(Diff, dl, MVT::i8);
43734 return TLO.CombineTo(
43735 Op, TLO.DAG.getNode(NewOpc, dl, VT, Src.getOperand(0), NewSA));
43736 }
43737 }
43738
43739 APInt DemandedSrc = DemandedElts.lshr(ShiftAmt);
43740 if (SimplifyDemandedVectorElts(Src, DemandedSrc, KnownUndef, KnownZero, TLO,
43741 Depth + 1))
43742 return true;
43743
43744 KnownUndef <<= ShiftAmt;
43745 KnownZero <<= ShiftAmt;
43746 KnownZero.setLowBits(ShiftAmt);
43747 break;
43748 }
43749 case X86ISD::KSHIFTR: {
43750 SDValue Src = Op.getOperand(0);
43751 auto *Amt = cast<ConstantSDNode>(Op.getOperand(1));
43752 assert(Amt->getAPIntValue().ult(NumElts) && "Out of range shift amount");
43753 unsigned ShiftAmt = Amt->getZExtValue();
43754
43755 if (ShiftAmt == 0)
43756 return TLO.CombineTo(Op, Src);
43757
43758 // If this is ((X << C1) >>u ShAmt), see if we can simplify this into a
43759 // single shift. We can do this if the top bits (which are shifted
43760 // out) are never demanded.
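// e.g. (illustrative, v8i1): kshiftr (kshiftl X, 2), 3 --> kshiftr X, 1
// when none of the top 3 elements of the result are demanded.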
43761 if (Src.getOpcode() == X86ISD::KSHIFTL) {
43762 if (!DemandedElts.intersects(APInt::getHighBitsSet(NumElts, ShiftAmt))) {
43763 unsigned C1 = Src.getConstantOperandVal(1);
43764 unsigned NewOpc = X86ISD::KSHIFTR;
43765 int Diff = ShiftAmt - C1;
43766 if (Diff < 0) {
43767 Diff = -Diff;
43768 NewOpc = X86ISD::KSHIFTL;
43769 }
43770
43771 SDLoc dl(Op);
43772 SDValue NewSA = TLO.DAG.getTargetConstant(Diff, dl, MVT::i8);
43773 return TLO.CombineTo(
43774 Op, TLO.DAG.getNode(NewOpc, dl, VT, Src.getOperand(0), NewSA));
43775 }
43776 }
43777
43778 APInt DemandedSrc = DemandedElts.shl(ShiftAmt);
43779 if (SimplifyDemandedVectorElts(Src, DemandedSrc, KnownUndef, KnownZero, TLO,
43780 Depth + 1))
43781 return true;
43782
43783 KnownUndef.lshrInPlace(ShiftAmt);
43784 KnownZero.lshrInPlace(ShiftAmt);
43785 KnownZero.setHighBits(ShiftAmt);
43786 break;
43787 }
43788 case X86ISD::ANDNP: {
43789 // ANDNP = (~LHS & RHS);
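// For example (illustrative only): if RHS is a constant whose element 2 is
// zero, element 2 of the result is zero regardless of LHS, so element 2 of LHS
// is not demanded; likewise an all-ones LHS element zeroes the result element,
// so nothing is demanded from the matching RHS element.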
43790 SDValue LHS = Op.getOperand(0);
43791 SDValue RHS = Op.getOperand(1);
43792
43793 auto GetDemandedMasks = [&](SDValue Op, bool Invert = false) {
43794 APInt UndefElts;
43795 SmallVector<APInt> EltBits;
43796 int NumElts = VT.getVectorNumElements();
43797 int EltSizeInBits = VT.getScalarSizeInBits();
43798 APInt OpBits = APInt::getAllOnes(EltSizeInBits);
43799 APInt OpElts = DemandedElts;
43800 if (getTargetConstantBitsFromNode(Op, EltSizeInBits, UndefElts,
43801 EltBits)) {
43802 OpBits.clearAllBits();
43803 OpElts.clearAllBits();
43804 for (int I = 0; I != NumElts; ++I) {
43805 if (!DemandedElts[I])
43806 continue;
43807 if (UndefElts[I]) {
43808 // We can't assume an undef src element gives an undef dst - the
43809 // other src might be zero.
43810 OpBits.setAllBits();
43811 OpElts.setBit(I);
43812 } else if ((Invert && !EltBits[I].isAllOnes()) ||
43813 (!Invert && !EltBits[I].isZero())) {
43814 OpBits |= Invert ? ~EltBits[I] : EltBits[I];
43815 OpElts.setBit(I);
43816 }
43817 }
43818 }
43819 return std::make_pair(OpBits, OpElts);
43820 };
43821 APInt BitsLHS, EltsLHS;
43822 APInt BitsRHS, EltsRHS;
43823 std::tie(BitsLHS, EltsLHS) = GetDemandedMasks(RHS);
43824 std::tie(BitsRHS, EltsRHS) = GetDemandedMasks(LHS, true);
43825
43826 APInt LHSUndef, LHSZero;
43827 APInt RHSUndef, RHSZero;
43828 if (SimplifyDemandedVectorElts(LHS, EltsLHS, LHSUndef, LHSZero, TLO,
43829 Depth + 1))
43830 return true;
43831 if (SimplifyDemandedVectorElts(RHS, EltsRHS, RHSUndef, RHSZero, TLO,
43832 Depth + 1))
43833 return true;
43834
43835 if (!DemandedElts.isAllOnes()) {
43836 SDValue NewLHS = SimplifyMultipleUseDemandedBits(LHS, BitsLHS, EltsLHS,
43837 TLO.DAG, Depth + 1);
43838 SDValue NewRHS = SimplifyMultipleUseDemandedBits(RHS, BitsRHS, EltsRHS,
43839 TLO.DAG, Depth + 1);
43840 if (NewLHS || NewRHS) {
43841 NewLHS = NewLHS ? NewLHS : LHS;
43842 NewRHS = NewRHS ? NewRHS : RHS;
43843 return TLO.CombineTo(
43844 Op, TLO.DAG.getNode(Opc, SDLoc(Op), VT, NewLHS, NewRHS));
43845 }
43846 }
43847 break;
43848 }
43849 case X86ISD::CVTSI2P:
43850 case X86ISD::CVTUI2P:
43851 case X86ISD::CVTPH2PS:
43852 case X86ISD::CVTPS2PH: {
43853 SDValue Src = Op.getOperand(0);
43854 EVT SrcVT = Src.getValueType();
43855 APInt SrcUndef, SrcZero;
43856 APInt SrcElts = DemandedElts.zextOrTrunc(SrcVT.getVectorNumElements());
43857 if (SimplifyDemandedVectorElts(Src, SrcElts, SrcUndef, SrcZero, TLO,
43858 Depth + 1))
43859 return true;
43860 break;
43861 }
43862 case X86ISD::PACKSS:
43863 case X86ISD::PACKUS: {
43864 SDValue N0 = Op.getOperand(0);
43865 SDValue N1 = Op.getOperand(1);
43866
43867 APInt DemandedLHS, DemandedRHS;
43868 getPackDemandedElts(VT, DemandedElts, DemandedLHS, DemandedRHS);
43869
43870 APInt LHSUndef, LHSZero;
43871 if (SimplifyDemandedVectorElts(N0, DemandedLHS, LHSUndef, LHSZero, TLO,
43872 Depth + 1))
43873 return true;
43874 APInt RHSUndef, RHSZero;
43875 if (SimplifyDemandedVectorElts(N1, DemandedRHS, RHSUndef, RHSZero, TLO,
43876 Depth + 1))
43877 return true;
43878
43879 // TODO - pass on known zero/undef.
43880
43881 // Aggressively peek through ops to get at the demanded elts.
43882 // TODO - we should do this for all target/faux shuffles ops.
43883 if (!DemandedElts.isAllOnes()) {
43884 SDValue NewN0 = SimplifyMultipleUseDemandedVectorElts(N0, DemandedLHS,
43885 TLO.DAG, Depth + 1);
43886 SDValue NewN1 = SimplifyMultipleUseDemandedVectorElts(N1, DemandedRHS,
43887 TLO.DAG, Depth + 1);
43888 if (NewN0 || NewN1) {
43889 NewN0 = NewN0 ? NewN0 : N0;
43890 NewN1 = NewN1 ? NewN1 : N1;
43891 return TLO.CombineTo(Op,
43892 TLO.DAG.getNode(Opc, SDLoc(Op), VT, NewN0, NewN1));
43893 }
43894 }
43895 break;
43896 }
43897 case X86ISD::HADD:
43898 case X86ISD::HSUB:
43899 case X86ISD::FHADD:
43900 case X86ISD::FHSUB: {
43901 SDValue N0 = Op.getOperand(0);
43902 SDValue N1 = Op.getOperand(1);
43903
43904 APInt DemandedLHS, DemandedRHS;
43905 getHorizDemandedElts(VT, DemandedElts, DemandedLHS, DemandedRHS);
43906
43907 APInt LHSUndef, LHSZero;
43908 if (SimplifyDemandedVectorElts(N0, DemandedLHS, LHSUndef, LHSZero, TLO,
43909 Depth + 1))
43910 return true;
43911 APInt RHSUndef, RHSZero;
43912 if (SimplifyDemandedVectorElts(N1, DemandedRHS, RHSUndef, RHSZero, TLO,
43913 Depth + 1))
43914 return true;
43915
43916 // TODO - pass on known zero/undef.
43917
43918 // Aggressively peek through ops to get at the demanded elts.
43919 // TODO: Handle repeated operands.
43920 if (N0 != N1 && !DemandedElts.isAllOnes()) {
43921 SDValue NewN0 = SimplifyMultipleUseDemandedVectorElts(N0, DemandedLHS,
43922 TLO.DAG, Depth + 1);
43923 SDValue NewN1 = SimplifyMultipleUseDemandedVectorElts(N1, DemandedRHS,
43924 TLO.DAG, Depth + 1);
43925 if (NewN0 || NewN1) {
43926 NewN0 = NewN0 ? NewN0 : N0;
43927 NewN1 = NewN1 ? NewN1 : N1;
43928 return TLO.CombineTo(Op,
43929 TLO.DAG.getNode(Opc, SDLoc(Op), VT, NewN0, NewN1));
43930 }
43931 }
43932 break;
43933 }
43934 case X86ISD::VTRUNC:
43935 case X86ISD::VTRUNCS:
43936 case X86ISD::VTRUNCUS: {
43937 SDValue Src = Op.getOperand(0);
43938 MVT SrcVT = Src.getSimpleValueType();
43939 APInt DemandedSrc = DemandedElts.zextOrTrunc(SrcVT.getVectorNumElements());
43940 APInt SrcUndef, SrcZero;
43941 if (SimplifyDemandedVectorElts(Src, DemandedSrc, SrcUndef, SrcZero, TLO,
43942 Depth + 1))
43943 return true;
43944 KnownZero = SrcZero.zextOrTrunc(NumElts);
43945 KnownUndef = SrcUndef.zextOrTrunc(NumElts);
43946 break;
43947 }
43948 case X86ISD::BLENDI: {
43949 SmallVector<int, 16> BlendMask;
43950 DecodeBLENDMask(NumElts, Op.getConstantOperandVal(2), BlendMask);
43951 if (SDValue R = combineBlendOfPermutes(
43952 VT.getSimpleVT(), Op.getOperand(0), Op.getOperand(1), BlendMask,
43953 DemandedElts, TLO.DAG, Subtarget, SDLoc(Op)))
43954 return TLO.CombineTo(Op, R);
43955 break;
43956 }
43957 case X86ISD::BLENDV: {
43958 APInt SelUndef, SelZero;
43959 if (SimplifyDemandedVectorElts(Op.getOperand(0), DemandedElts, SelUndef,
43960 SelZero, TLO, Depth + 1))
43961 return true;
43962
43963 // TODO: Use SelZero to adjust LHS/RHS DemandedElts.
43964 APInt LHSUndef, LHSZero;
43965 if (SimplifyDemandedVectorElts(Op.getOperand(1), DemandedElts, LHSUndef,
43966 LHSZero, TLO, Depth + 1))
43967 return true;
43968
43969 APInt RHSUndef, RHSZero;
43970 if (SimplifyDemandedVectorElts(Op.getOperand(2), DemandedElts, RHSUndef,
43971 RHSZero, TLO, Depth + 1))
43972 return true;
43973
43974 KnownZero = LHSZero & RHSZero;
43975 KnownUndef = LHSUndef & RHSUndef;
43976 break;
43977 }
43978 case X86ISD::VZEXT_MOVL: {
43979 // If upper demanded elements are already zero then we have nothing to do.
43980 SDValue Src = Op.getOperand(0);
43981 APInt DemandedUpperElts = DemandedElts;
43982 DemandedUpperElts.clearLowBits(1);
43983 if (TLO.DAG.MaskedVectorIsZero(Src, DemandedUpperElts, Depth + 1))
43984 return TLO.CombineTo(Op, Src);
43985 break;
43986 }
43987 case X86ISD::VZEXT_LOAD: {
43988 // If the upper elements are not demanded then simplify to a
43989 // scalar_to_vector(load()).
43990 MVT SVT = VT.getSimpleVT().getVectorElementType();
43991 if (DemandedElts == 1 && Op.getValue(1).use_empty() && isTypeLegal(SVT)) {
43992 SDLoc DL(Op);
43993 auto *Mem = cast<MemSDNode>(Op);
43994 SDValue Elt = TLO.DAG.getLoad(SVT, DL, Mem->getChain(), Mem->getBasePtr(),
43995 Mem->getMemOperand());
43996 SDValue Vec = TLO.DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VT, Elt);
43997 return TLO.CombineTo(Op, TLO.DAG.getBitcast(VT, Vec));
43998 }
43999 break;
44000 }
44001 case X86ISD::VBROADCAST: {
44002 SDValue Src = Op.getOperand(0);
44003 MVT SrcVT = Src.getSimpleValueType();
44004 // Don't bother broadcasting if we just need the 0'th element.
44005 if (DemandedElts == 1) {
44006 if (!SrcVT.isVector())
44007 Src = TLO.DAG.getNode(ISD::SCALAR_TO_VECTOR, SDLoc(Op), VT, Src);
44008 else if (Src.getValueType() != VT)
44009 Src = widenSubVector(VT.getSimpleVT(), Src, false, Subtarget, TLO.DAG,
44010 SDLoc(Op));
44011 return TLO.CombineTo(Op, Src);
44012 }
44013 if (!SrcVT.isVector())
44014 break;
44015 APInt SrcUndef, SrcZero;
44016 APInt SrcElts = APInt::getOneBitSet(SrcVT.getVectorNumElements(), 0);
44017 if (SimplifyDemandedVectorElts(Src, SrcElts, SrcUndef, SrcZero, TLO,
44018 Depth + 1))
44019 return true;
44020 // Aggressively peek through src to get at the demanded elt.
44021 // TODO - we should do this for all target/faux shuffles ops.
44022 if (SDValue NewSrc = SimplifyMultipleUseDemandedVectorElts(
44023 Src, SrcElts, TLO.DAG, Depth + 1))
44024 return TLO.CombineTo(Op, TLO.DAG.getNode(Opc, SDLoc(Op), VT, NewSrc));
44025 break;
44026 }
44027 case X86ISD::VPERMV:
44028 if (SimplifyDemandedVectorEltsForTargetShuffle(Op, DemandedElts, 0, TLO,
44029 Depth))
44030 return true;
44031 break;
44032 case X86ISD::PSHUFB:
44033 case X86ISD::VPERMV3:
44034 case X86ISD::VPERMILPV:
44035 if (SimplifyDemandedVectorEltsForTargetShuffle(Op, DemandedElts, 1, TLO,
44036 Depth))
44037 return true;
44038 break;
44039 case X86ISD::VPPERM:
44040 case X86ISD::VPERMIL2:
44041 if (SimplifyDemandedVectorEltsForTargetShuffle(Op, DemandedElts, 2, TLO,
44042 Depth))
44043 return true;
44044 break;
44045 }
44046
44047 // For 256/512-bit ops that are 128/256-bit ops glued together, if we do not
44048 // demand any of the high elements, then narrow the op to 128/256-bits: e.g.
44049 // (op ymm0, ymm1) --> insert undef, (op xmm0, xmm1), 0
44050 if ((VT.is256BitVector() || VT.is512BitVector()) &&
44051 DemandedElts.lshr(NumElts / 2) == 0) {
44052 unsigned SizeInBits = VT.getSizeInBits();
44053 unsigned ExtSizeInBits = SizeInBits / 2;
44054
44055 // See if 512-bit ops only use the bottom 128-bits.
44056 if (VT.is512BitVector() && DemandedElts.lshr(NumElts / 4) == 0)
44057 ExtSizeInBits = SizeInBits / 4;
44058
44059 switch (Opc) {
44060 // Scalar broadcast.
44061 case X86ISD::VBROADCAST: {
44062 SDLoc DL(Op);
44063 SDValue Src = Op.getOperand(0);
44064 if (Src.getValueSizeInBits() > ExtSizeInBits)
44065 Src = extractSubVector(Src, 0, TLO.DAG, DL, ExtSizeInBits);
44066 EVT BcstVT = EVT::getVectorVT(*TLO.DAG.getContext(), VT.getScalarType(),
44067 ExtSizeInBits / VT.getScalarSizeInBits());
44068 SDValue Bcst = TLO.DAG.getNode(X86ISD::VBROADCAST, DL, BcstVT, Src);
44069 return TLO.CombineTo(Op, insertSubVector(TLO.DAG.getUNDEF(VT), Bcst, 0,
44070 TLO.DAG, DL, ExtSizeInBits));
44071 }
44072 case X86ISD::VBROADCAST_LOAD: {
44073 SDLoc DL(Op);
44074 auto *MemIntr = cast<MemIntrinsicSDNode>(Op);
44075 EVT BcstVT = EVT::getVectorVT(*TLO.DAG.getContext(), VT.getScalarType(),
44076 ExtSizeInBits / VT.getScalarSizeInBits());
44077 SDVTList Tys = TLO.DAG.getVTList(BcstVT, MVT::Other);
44078 SDValue Ops[] = {MemIntr->getOperand(0), MemIntr->getOperand(1)};
44079 SDValue Bcst = TLO.DAG.getMemIntrinsicNode(
44080 X86ISD::VBROADCAST_LOAD, DL, Tys, Ops, MemIntr->getMemoryVT(),
44081 MemIntr->getMemOperand());
44082 TLO.DAG.makeEquivalentMemoryOrdering(SDValue(MemIntr, 1),
44083 Bcst.getValue(1));
44084 return TLO.CombineTo(Op, insertSubVector(TLO.DAG.getUNDEF(VT), Bcst, 0,
44085 TLO.DAG, DL, ExtSizeInBits));
44086 }
44087 // Subvector broadcast.
44088 case X86ISD::SUBV_BROADCAST_LOAD: {
44089 auto *MemIntr = cast<MemIntrinsicSDNode>(Op);
44090 EVT MemVT = MemIntr->getMemoryVT();
44091 if (ExtSizeInBits == MemVT.getStoreSizeInBits()) {
44092 SDLoc DL(Op);
44093 SDValue Ld =
44094 TLO.DAG.getLoad(MemVT, DL, MemIntr->getChain(),
44095 MemIntr->getBasePtr(), MemIntr->getMemOperand());
44096 TLO.DAG.makeEquivalentMemoryOrdering(SDValue(MemIntr, 1),
44097 Ld.getValue(1));
44098 return TLO.CombineTo(Op, insertSubVector(TLO.DAG.getUNDEF(VT), Ld, 0,
44099 TLO.DAG, DL, ExtSizeInBits));
44100 } else if ((ExtSizeInBits % MemVT.getStoreSizeInBits()) == 0) {
44101 SDLoc DL(Op);
44102 EVT BcstVT = EVT::getVectorVT(*TLO.DAG.getContext(), VT.getScalarType(),
44103 ExtSizeInBits / VT.getScalarSizeInBits());
44104 if (SDValue BcstLd =
44105 getBROADCAST_LOAD(Opc, DL, BcstVT, MemVT, MemIntr, 0, TLO.DAG))
44106 return TLO.CombineTo(Op,
44107 insertSubVector(TLO.DAG.getUNDEF(VT), BcstLd, 0,
44108 TLO.DAG, DL, ExtSizeInBits));
44109 }
44110 break;
44111 }
44112 // Byte shifts by immediate.
44113 case X86ISD::VSHLDQ:
44114 case X86ISD::VSRLDQ:
44115 // Shift by uniform.
44116 case X86ISD::VSHL:
44117 case X86ISD::VSRL:
44118 case X86ISD::VSRA:
44119 // Shift by immediate.
44120 case X86ISD::VSHLI:
44121 case X86ISD::VSRLI:
44122 case X86ISD::VSRAI: {
44123 SDLoc DL(Op);
44124 SDValue Ext0 =
44125 extractSubVector(Op.getOperand(0), 0, TLO.DAG, DL, ExtSizeInBits);
44126 SDValue ExtOp =
44127 TLO.DAG.getNode(Opc, DL, Ext0.getValueType(), Ext0, Op.getOperand(1));
44128 SDValue UndefVec = TLO.DAG.getUNDEF(VT);
44129 SDValue Insert =
44130 insertSubVector(UndefVec, ExtOp, 0, TLO.DAG, DL, ExtSizeInBits);
44131 return TLO.CombineTo(Op, Insert);
44132 }
44133 case X86ISD::VPERMI: {
44134 // Simplify 256-bit PERMPD/PERMQ to extract_subvector.
44135 // TODO: This should be done in shuffle combining.
44136 if (VT == MVT::v4f64 || VT == MVT::v4i64) {
44137 SmallVector<int, 8> Mask;
44138 DecodeVPERMMask(NumElts, Op.getConstantOperandVal(1), Mask);
44139 if (isUndefOrEqual(Mask[0], 2) && isUndefOrEqual(Mask[1], 3)) {
44140 SDLoc DL(Op);
44141 SDValue Ext = extractSubVector(Op.getOperand(0), 2, TLO.DAG, DL, 128);
44142 SDValue UndefVec = TLO.DAG.getUNDEF(VT);
44143 SDValue Insert = insertSubVector(UndefVec, Ext, 0, TLO.DAG, DL, 128);
44144 return TLO.CombineTo(Op, Insert);
44145 }
44146 }
44147 // Simplify 512-bit PERMPD/PERMQ to 256-bit variant on lower half.
44148 if (VT == MVT::v8f64 || VT == MVT::v8i64) {
44149 SDLoc DL(Op);
44150 SDValue Ext0 = extractSubVector(Op.getOperand(0), 0, TLO.DAG, DL, 256);
44151 SDValue ExtOp = TLO.DAG.getNode(Opc, DL, Ext0.getValueType(), Ext0,
44152 Op.getOperand(1));
44153 SDValue UndefVec = TLO.DAG.getUNDEF(VT);
44154 SDValue Insert = insertSubVector(UndefVec, ExtOp, 0, TLO.DAG, DL, 256);
44155 return TLO.CombineTo(Op, Insert);
44156 }
44157 break;
44158 }
44159 case X86ISD::VPERMV: {
44160 SmallVector<SDValue, 2> Ops;
44161 SmallVector<int, 16> Mask;
44162 // We can always split v16i32/v16f32 AVX512 to v8i32/v8f32 AVX2 variants.
44163 if ((VT.is256BitVector() || Subtarget.hasVLX() || VT == MVT::v16i32 ||
44164 VT == MVT::v16f32) &&
44165 getTargetShuffleMask(Op, /*AllowSentinelZero=*/false, Ops, Mask)) {
44166 // For lane-crossing shuffles, only split in half in case we're still
44167 // referencing higher elements.
44168 unsigned HalfElts = NumElts / 2;
44169 unsigned HalfSize = SizeInBits / 2;
44170 Mask.resize(HalfElts);
44171 if (all_of(Mask,
44172 [&](int M) { return isUndefOrInRange(M, 0, HalfElts); })) {
44173 MVT HalfVT = VT.getSimpleVT().getHalfNumVectorElementsVT();
44174 SDLoc DL(Op);
44175 SDValue Ext;
44176 SDValue M =
44177 extractSubVector(Op.getOperand(0), 0, TLO.DAG, DL, HalfSize);
44178 SDValue V =
44179 extractSubVector(Op.getOperand(1), 0, TLO.DAG, DL, HalfSize);
44180 // For 128-bit v2X64/v4X32 instructions, use VPERMILPD/VPERMILPS.
44181 if (VT.is512BitVector() || VT.getScalarSizeInBits() <= 16)
44182 Ext = TLO.DAG.getNode(Opc, DL, HalfVT, M, V);
44183 else {
44184 MVT ShufSVT = MVT::getFloatingPointVT(VT.getScalarSizeInBits());
44185 MVT ShufVT = HalfVT.changeVectorElementType(ShufSVT);
44186 Ext = TLO.DAG.getNode(X86ISD::VPERMILPV, DL, ShufVT,
44187 TLO.DAG.getBitcast(ShufVT, V), M);
44188 Ext = TLO.DAG.getBitcast(HalfVT, Ext);
44189 }
44190 SDValue Insert = widenSubVector(Ext, /*ZeroNewElements=*/false,
44191 Subtarget, TLO.DAG, DL, SizeInBits);
44192 return TLO.CombineTo(Op, Insert);
44193 }
44194 }
44195 break;
44196 }
44197 case X86ISD::VPERMV3: {
44198 SmallVector<SDValue, 2> Ops;
44199 SmallVector<int, 64> Mask;
44200 if (Subtarget.hasVLX() &&
44201 getTargetShuffleMask(Op, /*AllowSentinelZero=*/false, Ops, Mask)) {
44202 // For lane-crossing shuffles, only split in half in case we're still
44203 // referencing higher elements.
44204 unsigned HalfElts = NumElts / 2;
44205 unsigned HalfSize = SizeInBits / 2;
44206 Mask.resize(HalfElts);
44207 if (all_of(Mask, [&](int M) {
44208 return isUndefOrInRange(M, 0, HalfElts) ||
44209 isUndefOrInRange(M, NumElts, NumElts + HalfElts);
44210 })) {
44211 // Adjust mask elements for 2nd operand to point to half width.
44212 for (int &M : Mask)
44213 M = (M < NumElts) ? M : (M - HalfElts);
44214 MVT HalfVT = VT.getSimpleVT().getHalfNumVectorElementsVT();
44215 MVT HalfIntVT = HalfVT.changeVectorElementTypeToInteger();
44216 SDLoc DL(Op);
44217 SDValue Ext = TLO.DAG.getNode(
44218 Opc, DL, HalfVT,
44219 extractSubVector(Op.getOperand(0), 0, TLO.DAG, DL, HalfSize),
44220 getConstVector(Mask, HalfIntVT, TLO.DAG, DL, /*IsMask=*/true),
44221 extractSubVector(Op.getOperand(2), 0, TLO.DAG, DL, HalfSize));
44222 SDValue Insert = widenSubVector(Ext, /*ZeroNewElements=*/false,
44223 Subtarget, TLO.DAG, DL, SizeInBits);
44224 return TLO.CombineTo(Op, Insert);
44225 }
44226 }
44227 break;
44228 }
44229 case X86ISD::VPERM2X128: {
44230 // Simplify VPERM2F128/VPERM2I128 to extract_subvector.
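// The low 128-bit lane of the result is selected by imm bits [1:0]
// (bit 1 picks the source operand, bit 0 picks its lower/upper lane) and is
// zeroed if bit 3 is set; since only the low half is demanded here, that
// lane is all we need.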
44231 SDLoc DL(Op);
44232 unsigned LoMask = Op.getConstantOperandVal(2) & 0xF;
44233 if (LoMask & 0x8)
44234 return TLO.CombineTo(
44235 Op, getZeroVector(VT.getSimpleVT(), Subtarget, TLO.DAG, DL));
44236 unsigned EltIdx = (LoMask & 0x1) * (NumElts / 2);
44237 unsigned SrcIdx = (LoMask & 0x2) >> 1;
44238 SDValue ExtOp =
44239 extractSubVector(Op.getOperand(SrcIdx), EltIdx, TLO.DAG, DL, 128);
44240 SDValue UndefVec = TLO.DAG.getUNDEF(VT);
44241 SDValue Insert =
44242 insertSubVector(UndefVec, ExtOp, 0, TLO.DAG, DL, ExtSizeInBits);
44243 return TLO.CombineTo(Op, Insert);
44244 }
44245 // Conversions.
44246 // TODO: Add more CVT opcodes when we have test coverage.
44247 case X86ISD::CVTTP2UI: {
44248 if (!Subtarget.hasVLX())
44249 break;
44250 [[fallthrough]];
44251 }
44252 case X86ISD::CVTTP2SI: {
44253 if (Op.getOperand(0).getValueType().getVectorElementType() == MVT::f16 &&
44254 !Subtarget.hasVLX())
44255 break;
44256 [[fallthrough]];
44257 }
44258 case X86ISD::CVTPH2PS: {
44259 SDLoc DL(Op);
44260 unsigned Scale = SizeInBits / ExtSizeInBits;
44261 SDValue SrcOp = Op.getOperand(0);
44262 MVT SrcVT = SrcOp.getSimpleValueType();
44263 unsigned SrcExtSize =
44264 std::max<unsigned>(SrcVT.getSizeInBits() / Scale, 128);
44265 EVT ExtVT = EVT::getVectorVT(*TLO.DAG.getContext(), VT.getScalarType(),
44266 ExtSizeInBits / VT.getScalarSizeInBits());
44267 SDValue ExtOp = TLO.DAG.getNode(
44268 Opc, DL, ExtVT, extractSubVector(SrcOp, 0, TLO.DAG, DL, SrcExtSize));
44269 SDValue UndefVec = TLO.DAG.getUNDEF(VT);
44270 SDValue Insert =
44271 insertSubVector(UndefVec, ExtOp, 0, TLO.DAG, DL, ExtSizeInBits);
44272 return TLO.CombineTo(Op, Insert);
44273 }
44274 // Zero upper elements.
44275 case X86ISD::VZEXT_MOVL:
44276 // Variable blend.
44277 case X86ISD::BLENDV:
44278 // Target unary shuffles:
44279 case X86ISD::MOVDDUP:
44280 // Target unary shuffles by immediate:
44281 case X86ISD::PSHUFD:
44282 case X86ISD::PSHUFLW:
44283 case X86ISD::PSHUFHW:
44284 case X86ISD::VPERMILPI:
44285 // (Non-Lane Crossing) Target Shuffles.
44286 case X86ISD::VPERMILPV:
44287 case X86ISD::VPERMIL2:
44288 case X86ISD::PSHUFB:
44289 case X86ISD::UNPCKL:
44290 case X86ISD::UNPCKH:
44291 case X86ISD::BLENDI:
44292 // Integer ops.
44293 case X86ISD::PACKSS:
44294 case X86ISD::PACKUS:
44295 case X86ISD::PCMPEQ:
44296 case X86ISD::PCMPGT:
44297 case X86ISD::PMULUDQ:
44298 case X86ISD::PMULDQ:
44299 case X86ISD::VSHLV:
44300 case X86ISD::VSRLV:
44301 case X86ISD::VSRAV:
44302 // Float ops.
44303 case X86ISD::FMAX:
44304 case X86ISD::FMIN:
44305 case X86ISD::FMAXC:
44306 case X86ISD::FMINC:
44307 case X86ISD::FRSQRT:
44308 case X86ISD::FRCP:
44309 // Horizontal Ops.
44310 case X86ISD::HADD:
44311 case X86ISD::HSUB:
44312 case X86ISD::FHADD:
44313 case X86ISD::FHSUB: {
44314 SDLoc DL(Op);
44315 SmallVector<SDValue, 4> Ops;
44316 for (unsigned i = 0, e = Op.getNumOperands(); i != e; ++i) {
44317 SDValue SrcOp = Op.getOperand(i);
44318 EVT SrcVT = SrcOp.getValueType();
44319 assert((!SrcVT.isVector() || SrcVT.getSizeInBits() == SizeInBits) &&
44320 "Unsupported vector size");
44321 Ops.push_back(SrcVT.isVector() ? extractSubVector(SrcOp, 0, TLO.DAG, DL,
44322 ExtSizeInBits)
44323 : SrcOp);
44324 }
44325 MVT ExtVT = VT.getSimpleVT();
44326 ExtVT = MVT::getVectorVT(ExtVT.getScalarType(),
44327 ExtSizeInBits / ExtVT.getScalarSizeInBits());
44328 SDValue ExtOp = TLO.DAG.getNode(Opc, DL, ExtVT, Ops);
44329 SDValue UndefVec = TLO.DAG.getUNDEF(VT);
44330 SDValue Insert =
44331 insertSubVector(UndefVec, ExtOp, 0, TLO.DAG, DL, ExtSizeInBits);
44332 return TLO.CombineTo(Op, Insert);
44333 }
44334 }
44335 }
44336
44337 // For splats, unless we *only* demand the 0'th element, stop attempts at
44338 // simplification here; we aren't going to improve things, and this is
44339 // better than any potential shuffle.
44340 if (!DemandedElts.isOne() && TLO.DAG.isSplatValue(Op, /*AllowUndefs*/false))
44341 return false;
44342
44343 // Get target/faux shuffle mask.
44344 APInt OpUndef, OpZero;
44345 SmallVector<int, 64> OpMask;
44346 SmallVector<SDValue, 2> OpInputs;
44347 if (!getTargetShuffleInputs(Op, DemandedElts, OpInputs, OpMask, OpUndef,
44348 OpZero, TLO.DAG, Depth, false))
44349 return false;
44350
44351 // Shuffle inputs must be the same size as the result.
44352 if (OpMask.size() != (unsigned)NumElts ||
44353 llvm::any_of(OpInputs, [VT](SDValue V) {
44354 return VT.getSizeInBits() != V.getValueSizeInBits() ||
44355 !V.getValueType().isVector();
44356 }))
44357 return false;
44358
44359 KnownZero = OpZero;
44360 KnownUndef = OpUndef;
44361
44362 // Check if shuffle mask can be simplified to undef/zero/identity.
44363 int NumSrcs = OpInputs.size();
44364 for (int i = 0; i != NumElts; ++i)
44365 if (!DemandedElts[i])
44366 OpMask[i] = SM_SentinelUndef;
44367
44368 if (isUndefInRange(OpMask, 0, NumElts)) {
44369 KnownUndef.setAllBits();
44370 return TLO.CombineTo(Op, TLO.DAG.getUNDEF(VT));
44371 }
44372 if (isUndefOrZeroInRange(OpMask, 0, NumElts)) {
44373 KnownZero.setAllBits();
44374 return TLO.CombineTo(
44375 Op, getZeroVector(VT.getSimpleVT(), Subtarget, TLO.DAG, SDLoc(Op)));
44376 }
44377 for (int Src = 0; Src != NumSrcs; ++Src)
44378 if (isSequentialOrUndefInRange(OpMask, 0, NumElts, Src * NumElts))
44379 return TLO.CombineTo(Op, TLO.DAG.getBitcast(VT, OpInputs[Src]));
44380
44381 // Attempt to simplify inputs.
44382 for (int Src = 0; Src != NumSrcs; ++Src) {
44383 // TODO: Support inputs of different types.
44384 if (OpInputs[Src].getValueType() != VT)
44385 continue;
44386
44387 int Lo = Src * NumElts;
44388 APInt SrcElts = APInt::getZero(NumElts);
44389 for (int i = 0; i != NumElts; ++i)
44390 if (DemandedElts[i]) {
44391 int M = OpMask[i] - Lo;
44392 if (0 <= M && M < NumElts)
44393 SrcElts.setBit(M);
44394 }
44395
44396 // TODO - Propagate input undef/zero elts.
44397 APInt SrcUndef, SrcZero;
44398 if (SimplifyDemandedVectorElts(OpInputs[Src], SrcElts, SrcUndef, SrcZero,
44399 TLO, Depth + 1))
44400 return true;
44401 }
44402
44403 // If we don't demand all elements, then attempt to combine to a simpler
44404 // shuffle.
44405 // We need to convert the depth to something combineX86ShufflesRecursively
44406 // can handle - so pretend it's Depth == 0 again, and reduce the max depth
44407 // to match. This prevents combineX86ShuffleChain from returning a
44408 // combined shuffle that's the same as the original root, causing an
44409 // infinite loop.
44410 if (!DemandedElts.isAllOnes()) {
44411 assert(Depth < X86::MaxShuffleCombineDepth && "Depth out of range");
44412
44413 SmallVector<int, 64> DemandedMask(NumElts, SM_SentinelUndef);
44414 for (int i = 0; i != NumElts; ++i)
44415 if (DemandedElts[i])
44416 DemandedMask[i] = i;
44417
44418 SDValue NewShuffle = combineX86ShufflesRecursively(
44419 {Op}, 0, Op.getOpcode(), Op.getSimpleValueType(), DemandedMask, {}, 0,
44420 X86::MaxShuffleCombineDepth - Depth,
44421 /*AllowVariableCrossLaneMask=*/true,
44422 /*AllowVariablePerLaneMask=*/true, isMaskableNode(Op, Subtarget),
44423 TLO.DAG, SDLoc(Op), Subtarget);
44424 if (NewShuffle)
44425 return TLO.CombineTo(Op, NewShuffle);
44426 }
44427
44428 return false;
44429}
44430
44432 SDValue Op, const APInt &OriginalDemandedBits,
44433 const APInt &OriginalDemandedElts, KnownBits &Known, TargetLoweringOpt &TLO,
44434 unsigned Depth) const {
44435 EVT VT = Op.getValueType();
44436 unsigned BitWidth = OriginalDemandedBits.getBitWidth();
44437 unsigned Opc = Op.getOpcode();
44438 switch(Opc) {
44439 case X86ISD::VTRUNC: {
44440 KnownBits KnownOp;
44441 SDValue Src = Op.getOperand(0);
44442 MVT SrcVT = Src.getSimpleValueType();
44443
44444 // Simplify the input, using demanded bit information.
44445 APInt TruncMask = OriginalDemandedBits.zext(SrcVT.getScalarSizeInBits());
44446 APInt DemandedElts = OriginalDemandedElts.trunc(SrcVT.getVectorNumElements());
44447 if (SimplifyDemandedBits(Src, TruncMask, DemandedElts, KnownOp, TLO, Depth + 1))
44448 return true;
44449 break;
44450 }
44451 case X86ISD::PMULDQ:
44452 case X86ISD::PMULUDQ: {
44453 // PMULDQ/PMULUDQ only uses lower 32 bits from each vector element.
44454 KnownBits KnownLHS, KnownRHS;
44455 SDValue LHS = Op.getOperand(0);
44456 SDValue RHS = Op.getOperand(1);
44457
44458 // Don't mask bits on 32-bit AVX512 targets which might lose a broadcast.
44459 // FIXME: Can we bound this better?
44460 APInt DemandedMask = APInt::getLowBitsSet(64, 32);
44461 APInt DemandedMaskLHS = APInt::getAllOnes(64);
44462 APInt DemandedMaskRHS = APInt::getAllOnes(64);
44463
44464 bool Is32BitAVX512 = !Subtarget.is64Bit() && Subtarget.hasAVX512();
44465 if (!Is32BitAVX512 || !TLO.DAG.isSplatValue(LHS))
44466 DemandedMaskLHS = DemandedMask;
44467 if (!Is32BitAVX512 || !TLO.DAG.isSplatValue(RHS))
44468 DemandedMaskRHS = DemandedMask;
44469
44470 if (SimplifyDemandedBits(LHS, DemandedMaskLHS, OriginalDemandedElts,
44471 KnownLHS, TLO, Depth + 1))
44472 return true;
44473 if (SimplifyDemandedBits(RHS, DemandedMaskRHS, OriginalDemandedElts,
44474 KnownRHS, TLO, Depth + 1))
44475 return true;
44476
44477 // PMULUDQ(X,1) -> AND(X,(1<<32)-1) 'getZeroExtendInReg'.
44478 KnownRHS = KnownRHS.trunc(32);
44479 if (Opc == X86ISD::PMULUDQ && KnownRHS.isConstant() &&
44480 KnownRHS.getConstant().isOne()) {
44481 SDLoc DL(Op);
44482 SDValue Mask = TLO.DAG.getConstant(DemandedMask, DL, VT);
44483 return TLO.CombineTo(Op, TLO.DAG.getNode(ISD::AND, DL, VT, LHS, Mask));
44484 }
44485
44486 // Aggressively peek through ops to get at the demanded low bits.
44487 SDValue DemandedLHS = SimplifyMultipleUseDemandedBits(
44488 LHS, DemandedMaskLHS, OriginalDemandedElts, TLO.DAG, Depth + 1);
44489 SDValue DemandedRHS = SimplifyMultipleUseDemandedBits(
44490 RHS, DemandedMaskRHS, OriginalDemandedElts, TLO.DAG, Depth + 1);
44491 if (DemandedLHS || DemandedRHS) {
44492 DemandedLHS = DemandedLHS ? DemandedLHS : LHS;
44493 DemandedRHS = DemandedRHS ? DemandedRHS : RHS;
44494 return TLO.CombineTo(
44495 Op, TLO.DAG.getNode(Opc, SDLoc(Op), VT, DemandedLHS, DemandedRHS));
44496 }
44497 break;
44498 }
44499 case X86ISD::ANDNP: {
44500 KnownBits Known2;
44501 SDValue Op0 = Op.getOperand(0);
44502 SDValue Op1 = Op.getOperand(1);
44503
44504 if (SimplifyDemandedBits(Op1, OriginalDemandedBits, OriginalDemandedElts,
44505 Known, TLO, Depth + 1))
44506 return true;
44507
44508 if (SimplifyDemandedBits(Op0, ~Known.Zero & OriginalDemandedBits,
44509 OriginalDemandedElts, Known2, TLO, Depth + 1))
44510 return true;
44511
44512 // If the RHS is a constant, see if we can simplify it.
44513 if (ShrinkDemandedConstant(Op, ~Known2.One & OriginalDemandedBits,
44514 OriginalDemandedElts, TLO))
44515 return true;
44516
44517 // ANDNP = (~Op0 & Op1);
44518 Known.One &= Known2.Zero;
44519 Known.Zero |= Known2.One;
44520 break;
44521 }
44522 case X86ISD::VSHLI: {
44523 SDValue Op0 = Op.getOperand(0);
44524 SDValue Op1 = Op.getOperand(1);
44525
44526 unsigned ShAmt = Op1->getAsZExtVal();
44527 if (ShAmt >= BitWidth)
44528 break;
44529
44530 APInt DemandedMask = OriginalDemandedBits.lshr(ShAmt);
44531
44532 // If this is ((X >>u C1) << ShAmt), see if we can simplify this into a
44533 // single shift. We can do this if the bottom bits (which are shifted
44534 // out) are never demanded.
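// e.g. (illustrative): (X >>u 4) << 6 --> X << 2, which is valid as long
// as the low 6 bits of the result are not demanded.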
44535 if (Op0.getOpcode() == X86ISD::VSRLI &&
44536 OriginalDemandedBits.countr_zero() >= ShAmt) {
44537 unsigned Shift2Amt = Op0.getConstantOperandVal(1);
44538 if (Shift2Amt < BitWidth) {
44539 int Diff = ShAmt - Shift2Amt;
44540 if (Diff == 0)
44541 return TLO.CombineTo(Op, Op0.getOperand(0));
44542
44543 unsigned NewOpc = Diff < 0 ? X86ISD::VSRLI : X86ISD::VSHLI;
44544 SDValue NewShift = TLO.DAG.getNode(
44545 NewOpc, SDLoc(Op), VT, Op0.getOperand(0),
44546 TLO.DAG.getTargetConstant(std::abs(Diff), SDLoc(Op), MVT::i8));
44547 return TLO.CombineTo(Op, NewShift);
44548 }
44549 }
44550
44551 // If we are only demanding sign bits then we can use the shift source directly.
44552 unsigned NumSignBits =
44553 TLO.DAG.ComputeNumSignBits(Op0, OriginalDemandedElts, Depth + 1);
44554 unsigned UpperDemandedBits = BitWidth - OriginalDemandedBits.countr_zero();
44555 if (NumSignBits > ShAmt && (NumSignBits - ShAmt) >= UpperDemandedBits)
44556 return TLO.CombineTo(Op, Op0);
44557
44558 if (SimplifyDemandedBits(Op0, DemandedMask, OriginalDemandedElts, Known,
44559 TLO, Depth + 1))
44560 return true;
44561
44562 Known <<= ShAmt;
44563
44564 // Low bits known zero.
44565 Known.Zero.setLowBits(ShAmt);
44566
44567 if (!OriginalDemandedBits.isSubsetOf(Known.Zero | Known.One)) {
44568 // Attempt to avoid multi-use ops if we don't need anything from them.
44569 if (SDValue DemandedOp0 = SimplifyMultipleUseDemandedBits(
44570 Op0, DemandedMask, OriginalDemandedElts, TLO.DAG, Depth + 1)) {
44571 SDValue NewOp =
44572 TLO.DAG.getNode(Op.getOpcode(), SDLoc(Op), VT, DemandedOp0, Op1);
44573 return TLO.CombineTo(Op, NewOp);
44574 }
44575 }
44576 return false;
44577 }
44578 case X86ISD::VSRLI: {
44579 SDValue Op0 = Op.getOperand(0);
44580 SDValue Op1 = Op.getOperand(1);
44581
44582 unsigned ShAmt = Op1->getAsZExtVal();
44583 if (ShAmt >= BitWidth)
44584 break;
44585
44586 APInt DemandedMask = OriginalDemandedBits << ShAmt;
44587
44588 if (SimplifyDemandedBits(Op0, DemandedMask, OriginalDemandedElts, Known,
44589 TLO, Depth + 1))
44590 return true;
44591
44592 Known >>= ShAmt;
44593
44594 // High bits known zero.
44595 Known.Zero.setHighBits(ShAmt);
44596
44597 if (!OriginalDemandedBits.isSubsetOf(Known.Zero | Known.One)) {
44598 // Attempt to avoid multi-use ops if we don't need anything from them.
44599 if (SDValue DemandedOp0 = SimplifyMultipleUseDemandedBits(
44600 Op0, DemandedMask, OriginalDemandedElts, TLO.DAG, Depth + 1)) {
44601 SDValue NewOp =
44602 TLO.DAG.getNode(Op.getOpcode(), SDLoc(Op), VT, DemandedOp0, Op1);
44603 return TLO.CombineTo(Op, NewOp);
44604 }
44605 }
44606 return false;
44607 }
44608 case X86ISD::VSRAI: {
44609 SDValue Op0 = Op.getOperand(0);
44610 SDValue Op1 = Op.getOperand(1);
44611
44612 unsigned ShAmt = Op1->getAsZExtVal();
44613 if (ShAmt >= BitWidth)
44614 break;
44615
44616 APInt DemandedMask = OriginalDemandedBits << ShAmt;
44617
44618 // If we just want the sign bit then we don't need to shift it.
44619 if (OriginalDemandedBits.isSignMask())
44620 return TLO.CombineTo(Op, Op0);
44621
44622 // fold (VSRAI (VSHLI X, C1), C1) --> X iff NumSignBits(X) > C1
44623 if (Op0.getOpcode() == X86ISD::VSHLI && Op1 == Op0.getOperand(1)) {
44624 SDValue Op00 = Op0.getOperand(0);
44625 unsigned NumSignBits =
44626 TLO.DAG.ComputeNumSignBits(Op00, OriginalDemandedElts);
44627 if (ShAmt < NumSignBits)
44628 return TLO.CombineTo(Op, Op00);
44629 }
44630
44631 // If any of the demanded bits are produced by the sign extension, we also
44632 // demand the input sign bit.
44633 if (OriginalDemandedBits.countl_zero() < ShAmt)
44634 DemandedMask.setSignBit();
44635
44636 if (SimplifyDemandedBits(Op0, DemandedMask, OriginalDemandedElts, Known,
44637 TLO, Depth + 1))
44638 return true;
44639
44640 Known >>= ShAmt;
44641
44642 // If the input sign bit is known to be zero, or if none of the top bits
44643 // are demanded, turn this into an unsigned shift right.
44644 if (Known.Zero[BitWidth - ShAmt - 1] ||
44645 OriginalDemandedBits.countl_zero() >= ShAmt)
44646 return TLO.CombineTo(
44647 Op, TLO.DAG.getNode(X86ISD::VSRLI, SDLoc(Op), VT, Op0, Op1));
44648
44649 // High bits are known one.
44650 if (Known.One[BitWidth - ShAmt - 1])
44651 Known.One.setHighBits(ShAmt);
44652
44653 if (!OriginalDemandedBits.isSubsetOf(Known.Zero | Known.One)) {
44654 // Attempt to avoid multi-use ops if we don't need anything from them.
44655 if (SDValue DemandedOp0 = SimplifyMultipleUseDemandedBits(
44656 Op0, DemandedMask, OriginalDemandedElts, TLO.DAG, Depth + 1)) {
44657 SDValue NewOp =
44658 TLO.DAG.getNode(Op.getOpcode(), SDLoc(Op), VT, DemandedOp0, Op1);
44659 return TLO.CombineTo(Op, NewOp);
44660 }
44661 }
44662 return false;
44663 }
44664 case X86ISD::BLENDI: {
44665 SDValue LHS = Op.getOperand(0);
44666 SDValue RHS = Op.getOperand(1);
44667 APInt Mask = getBLENDIBlendMask(Op);
44668
44669 APInt DemandedEltsLHS = OriginalDemandedElts & ~Mask;
44670 if (SimplifyDemandedBits(LHS, OriginalDemandedBits, DemandedEltsLHS, Known,
44671 TLO, Depth + 1))
44672 return true;
44673
44674 APInt DemandedEltsRHS = OriginalDemandedElts & Mask;
44675 if (SimplifyDemandedBits(RHS, OriginalDemandedBits, DemandedEltsRHS, Known,
44676 TLO, Depth + 1))
44677 return true;
44678
44679 // Attempt to avoid multi-use ops if we don't need anything from them.
44680 SDValue NewLHS = SimplifyMultipleUseDemandedBits(
44681 LHS, OriginalDemandedBits, DemandedEltsLHS, TLO.DAG, Depth + 1);
44682 SDValue NewRHS = SimplifyMultipleUseDemandedBits(
44683 RHS, OriginalDemandedBits, DemandedEltsRHS, TLO.DAG, Depth + 1);
44684 if (NewLHS || NewRHS) {
44685 NewLHS = NewLHS ? NewLHS : LHS;
44686 NewRHS = NewRHS ? NewRHS : RHS;
44687 return TLO.CombineTo(Op,
44688 TLO.DAG.getNode(Op.getOpcode(), SDLoc(Op), VT,
44689 NewLHS, NewRHS, Op.getOperand(2)));
44690 }
44691 break;
44692 }
44693 case X86ISD::BLENDV: {
44694 SDValue Sel = Op.getOperand(0);
44695 SDValue LHS = Op.getOperand(1);
44696 SDValue RHS = Op.getOperand(2);
44697
44698 APInt SignMask = APInt::getSignMask(BitWidth);
44699 SDValue NewSel = SimplifyMultipleUseDemandedBits(
44700 Sel, SignMask, OriginalDemandedElts, TLO.DAG, Depth + 1);
44701 SDValue NewLHS = SimplifyMultipleUseDemandedBits(
44702 LHS, OriginalDemandedBits, OriginalDemandedElts, TLO.DAG, Depth + 1);
44703 SDValue NewRHS = SimplifyMultipleUseDemandedBits(
44704 RHS, OriginalDemandedBits, OriginalDemandedElts, TLO.DAG, Depth + 1);
44705
44706 if (NewSel || NewLHS || NewRHS) {
44707 NewSel = NewSel ? NewSel : Sel;
44708 NewLHS = NewLHS ? NewLHS : LHS;
44709 NewRHS = NewRHS ? NewRHS : RHS;
44710 return TLO.CombineTo(Op, TLO.DAG.getNode(X86ISD::BLENDV, SDLoc(Op), VT,
44711 NewSel, NewLHS, NewRHS));
44712 }
44713 break;
44714 }
44715 case X86ISD::PEXTRB:
44716 case X86ISD::PEXTRW: {
44717 SDValue Vec = Op.getOperand(0);
44718 auto *CIdx = dyn_cast<ConstantSDNode>(Op.getOperand(1));
44719 MVT VecVT = Vec.getSimpleValueType();
44720 unsigned NumVecElts = VecVT.getVectorNumElements();
44721
44722 if (CIdx && CIdx->getAPIntValue().ult(NumVecElts)) {
44723 unsigned Idx = CIdx->getZExtValue();
44724 unsigned VecBitWidth = VecVT.getScalarSizeInBits();
44725
44726 // If we demand no bits from the vector then we must have demanded
44727 // bits from the implicit zext - simplify to zero.
44728 APInt DemandedVecBits = OriginalDemandedBits.trunc(VecBitWidth);
44729 if (DemandedVecBits == 0)
44730 return TLO.CombineTo(Op, TLO.DAG.getConstant(0, SDLoc(Op), VT));
44731
44732 APInt KnownUndef, KnownZero;
44733 APInt DemandedVecElts = APInt::getOneBitSet(NumVecElts, Idx);
44734 if (SimplifyDemandedVectorElts(Vec, DemandedVecElts, KnownUndef,
44735 KnownZero, TLO, Depth + 1))
44736 return true;
44737
44738 KnownBits KnownVec;
44739 if (SimplifyDemandedBits(Vec, DemandedVecBits, DemandedVecElts,
44740 KnownVec, TLO, Depth + 1))
44741 return true;
44742
44743 if (SDValue V = SimplifyMultipleUseDemandedBits(
44744 Vec, DemandedVecBits, DemandedVecElts, TLO.DAG, Depth + 1))
44745 return TLO.CombineTo(
44746 Op, TLO.DAG.getNode(Opc, SDLoc(Op), VT, V, Op.getOperand(1)));
44747
44748 Known = KnownVec.zext(BitWidth);
44749 return false;
44750 }
44751 break;
44752 }
44753 case X86ISD::PINSRB:
44754 case X86ISD::PINSRW: {
44755 SDValue Vec = Op.getOperand(0);
44756 SDValue Scl = Op.getOperand(1);
44757 auto *CIdx = dyn_cast<ConstantSDNode>(Op.getOperand(2));
44758 MVT VecVT = Vec.getSimpleValueType();
44759
44760 if (CIdx && CIdx->getAPIntValue().ult(VecVT.getVectorNumElements())) {
44761 unsigned Idx = CIdx->getZExtValue();
44762 if (!OriginalDemandedElts[Idx])
44763 return TLO.CombineTo(Op, Vec);
44764
44765 KnownBits KnownVec;
44766 APInt DemandedVecElts(OriginalDemandedElts);
44767 DemandedVecElts.clearBit(Idx);
44768 if (SimplifyDemandedBits(Vec, OriginalDemandedBits, DemandedVecElts,
44769 KnownVec, TLO, Depth + 1))
44770 return true;
44771
44772 KnownBits KnownScl;
44773 unsigned NumSclBits = Scl.getScalarValueSizeInBits();
44774 APInt DemandedSclBits = OriginalDemandedBits.zext(NumSclBits);
44775 if (SimplifyDemandedBits(Scl, DemandedSclBits, KnownScl, TLO, Depth + 1))
44776 return true;
44777
44778 KnownScl = KnownScl.trunc(VecVT.getScalarSizeInBits());
44779 Known = KnownVec.intersectWith(KnownScl);
44780 return false;
44781 }
44782 break;
44783 }
44784 case X86ISD::PACKSS:
44785 // PACKSS saturates to MIN/MAX integer values. So if we just want the
44786 // sign bit then we can just ask for the source operands' sign bits.
44787 // TODO - add known bits handling.
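// e.g. (illustrative, i32->i16): 0x80000001 saturates to 0x8000 and
// 0x00012345 saturates to 0x7FFF, so the packed sign bit always matches
// the source element's sign bit.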
44788 if (OriginalDemandedBits.isSignMask()) {
44789 APInt DemandedLHS, DemandedRHS;
44790 getPackDemandedElts(VT, OriginalDemandedElts, DemandedLHS, DemandedRHS);
44791
44792 KnownBits KnownLHS, KnownRHS;
44793 APInt SignMask = APInt::getSignMask(BitWidth * 2);
44794 if (SimplifyDemandedBits(Op.getOperand(0), SignMask, DemandedLHS,
44795 KnownLHS, TLO, Depth + 1))
44796 return true;
44797 if (SimplifyDemandedBits(Op.getOperand(1), SignMask, DemandedRHS,
44798 KnownRHS, TLO, Depth + 1))
44799 return true;
44800
44801 // Attempt to avoid multi-use ops if we don't need anything from them.
44802 SDValue DemandedOp0 = SimplifyMultipleUseDemandedBits(
44803 Op.getOperand(0), SignMask, DemandedLHS, TLO.DAG, Depth + 1);
44804 SDValue DemandedOp1 = SimplifyMultipleUseDemandedBits(
44805 Op.getOperand(1), SignMask, DemandedRHS, TLO.DAG, Depth + 1);
44806 if (DemandedOp0 || DemandedOp1) {
44807 SDValue Op0 = DemandedOp0 ? DemandedOp0 : Op.getOperand(0);
44808 SDValue Op1 = DemandedOp1 ? DemandedOp1 : Op.getOperand(1);
44809 return TLO.CombineTo(Op, TLO.DAG.getNode(Opc, SDLoc(Op), VT, Op0, Op1));
44810 }
44811 }
44812 // TODO - add general PACKSS/PACKUS SimplifyDemandedBits support.
44813 break;
44814 case X86ISD::VBROADCAST: {
44815 SDValue Src = Op.getOperand(0);
44816 MVT SrcVT = Src.getSimpleValueType();
44817 APInt DemandedElts = APInt::getOneBitSet(
44818 SrcVT.isVector() ? SrcVT.getVectorNumElements() : 1, 0);
44819 if (SimplifyDemandedBits(Src, OriginalDemandedBits, DemandedElts, Known,
44820 TLO, Depth + 1))
44821 return true;
44822 // If we don't need the upper bits, attempt to narrow the broadcast source.
44823 // Don't attempt this on AVX512 as it might affect broadcast folding.
44824 // TODO: Should we attempt this for i32/i16 splats? They tend to be slower.
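// e.g. (illustrative): a VBROADCAST of an i64 scalar where only the low 32
// bits of each element are demanded can instead broadcast the truncated
// i32 value into a vector with twice as many elements and bitcast back.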
44825 if ((BitWidth == 64) && SrcVT.isScalarInteger() && !Subtarget.hasAVX512() &&
44826 OriginalDemandedBits.countl_zero() >= (BitWidth / 2) &&
44827 Src->hasOneUse()) {
44828 MVT NewSrcVT = MVT::getIntegerVT(BitWidth / 2);
44829 SDValue NewSrc =
44830 TLO.DAG.getNode(ISD::TRUNCATE, SDLoc(Src), NewSrcVT, Src);
44831 MVT NewVT = MVT::getVectorVT(NewSrcVT, VT.getVectorNumElements() * 2);
44832 SDValue NewBcst =
44833 TLO.DAG.getNode(X86ISD::VBROADCAST, SDLoc(Op), NewVT, NewSrc);
44834 return TLO.CombineTo(Op, TLO.DAG.getBitcast(VT, NewBcst));
44835 }
44836 break;
44837 }
44838 case X86ISD::PCMPGT:
44839 // icmp sgt(0, R) == ashr(R, BitWidth-1).
44840 // iff we only need the sign bit then we can use R directly.
44841 if (OriginalDemandedBits.isSignMask() &&
44842 ISD::isBuildVectorAllZeros(Op.getOperand(0).getNode()))
44843 return TLO.CombineTo(Op, Op.getOperand(1));
44844 break;
44845 case X86ISD::MOVMSK: {
44846 SDValue Src = Op.getOperand(0);
44847 MVT SrcVT = Src.getSimpleValueType();
44848 unsigned SrcBits = SrcVT.getScalarSizeInBits();
44849 unsigned NumElts = SrcVT.getVectorNumElements();
44850
44851 // If we don't need the sign bits at all just return zero.
44852 if (OriginalDemandedBits.countr_zero() >= NumElts)
44853 return TLO.CombineTo(Op, TLO.DAG.getConstant(0, SDLoc(Op), VT));
44854
44855 // See if we only demand bits from the lower 128-bit vector.
44856 if (SrcVT.is256BitVector() &&
44857 OriginalDemandedBits.getActiveBits() <= (NumElts / 2)) {
44858 SDValue NewSrc = extract128BitVector(Src, 0, TLO.DAG, SDLoc(Src));
44859 return TLO.CombineTo(Op, TLO.DAG.getNode(Opc, SDLoc(Op), VT, NewSrc));
44860 }
44861
44862 // Only demand the vector elements of the sign bits we need.
44863 APInt KnownUndef, KnownZero;
44864 APInt DemandedElts = OriginalDemandedBits.zextOrTrunc(NumElts);
44865 if (SimplifyDemandedVectorElts(Src, DemandedElts, KnownUndef, KnownZero,
44866 TLO, Depth + 1))
44867 return true;
44868
44869 Known.Zero = KnownZero.zext(BitWidth);
44870 Known.Zero.setHighBits(BitWidth - NumElts);
44871
44872 // MOVMSK only uses the MSB from each vector element.
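// E.g. for a v4i32 source, bits [3:0] of the result hold the four element
// sign bits and the remaining result bits are zero (which is why the high
// bits were set to known-zero above).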
44873 KnownBits KnownSrc;
44874 APInt DemandedSrcBits = APInt::getSignMask(SrcBits);
44875 if (SimplifyDemandedBits(Src, DemandedSrcBits, DemandedElts, KnownSrc, TLO,
44876 Depth + 1))
44877 return true;
44878
44879 if (KnownSrc.One[SrcBits - 1])
44880 Known.One.setLowBits(NumElts);
44881 else if (KnownSrc.Zero[SrcBits - 1])
44882 Known.Zero.setLowBits(NumElts);
44883
44884 // Attempt to avoid multi-use ops if we don't need anything from it.
44885 if (SDValue NewSrc = SimplifyMultipleUseDemandedBits(
44886 Src, DemandedSrcBits, DemandedElts, TLO.DAG, Depth + 1))
44887 return TLO.CombineTo(Op, TLO.DAG.getNode(Opc, SDLoc(Op), VT, NewSrc));
44888 return false;
44889 }
44890 case X86ISD::TESTP: {
44891 SDValue Op0 = Op.getOperand(0);
44892 SDValue Op1 = Op.getOperand(1);
44893 MVT OpVT = Op0.getSimpleValueType();
44894 assert((OpVT.getVectorElementType() == MVT::f32 ||
44895 OpVT.getVectorElementType() == MVT::f64) &&
44896 "Illegal vector type for X86ISD::TESTP");
44897
44898 // TESTPS/TESTPD only demand the sign bits of ALL the elements.
44899 KnownBits KnownSrc;
44900 APInt SignMask = APInt::getSignMask(OpVT.getScalarSizeInBits());
44901 bool AssumeSingleUse = (Op0 == Op1) && Op->isOnlyUserOf(Op0.getNode());
44902 return SimplifyDemandedBits(Op0, SignMask, KnownSrc, TLO, Depth + 1,
44903 AssumeSingleUse) ||
44904 SimplifyDemandedBits(Op1, SignMask, KnownSrc, TLO, Depth + 1,
44905 AssumeSingleUse);
44906 }
44907 case X86ISD::CMOV: {
44908 KnownBits Known2;
44909 if (SimplifyDemandedBits(Op.getOperand(1), OriginalDemandedBits,
44910 OriginalDemandedElts, Known2, TLO, Depth + 1))
44911 return true;
44912 if (SimplifyDemandedBits(Op.getOperand(0), OriginalDemandedBits,
44913 OriginalDemandedElts, Known, TLO, Depth + 1))
44914 return true;
44915
44916 // Only known if known in both the LHS and RHS.
44917 Known = Known.intersectWith(Known2);
44918 return false;
44919 }
44920 case X86ISD::BEXTR:
44921 case X86ISD::BEXTRI: {
44922 SDValue Op0 = Op.getOperand(0);
44923 SDValue Op1 = Op.getOperand(1);
44924
44925 // Only bottom 16-bits of the control bits are required.
44926 if (auto *Cst1 = dyn_cast<ConstantSDNode>(Op1)) {
44927 // NOTE: SimplifyDemandedBits won't do this for constants.
44928 uint64_t Val1 = Cst1->getZExtValue();
44929 uint64_t MaskedVal1 = Val1 & 0xFFFF;
44930 if (Opc == X86ISD::BEXTR && MaskedVal1 != Val1) {
44931 SDLoc DL(Op);
44932 return TLO.CombineTo(
44933 Op, TLO.DAG.getNode(X86ISD::BEXTR, DL, VT, Op0,
44934 TLO.DAG.getConstant(MaskedVal1, DL, VT)));
44935 }
44936
44937 unsigned Shift = Cst1->getAPIntValue().extractBitsAsZExtValue(8, 0);
44938 unsigned Length = Cst1->getAPIntValue().extractBitsAsZExtValue(8, 8);
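// The BEXTR control word encodes the start bit in bits [7:0] and the field
// length in bits [15:8]; e.g. a control of 0x0804 extracts 8 bits starting
// at bit 4, i.e. (Src >> 4) & 0xFF.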
44939
44940 // If the length is 0, the result is 0.
44941 if (Length == 0) {
44942 Known.setAllZero();
44943 return false;
44944 }
44945
44946 if ((Shift + Length) <= BitWidth) {
44947 APInt DemandedMask = APInt::getBitsSet(BitWidth, Shift, Shift + Length);
44948 if (SimplifyDemandedBits(Op0, DemandedMask, Known, TLO, Depth + 1))
44949 return true;
44950
44951 Known = Known.extractBits(Length, Shift);
44952 Known = Known.zextOrTrunc(BitWidth);
44953 return false;
44954 }
44955 } else {
44956 assert(Opc == X86ISD::BEXTR && "Unexpected opcode!");
44957 KnownBits Known1;
44958 APInt DemandedMask(APInt::getLowBitsSet(BitWidth, 16));
44959 if (SimplifyDemandedBits(Op1, DemandedMask, Known1, TLO, Depth + 1))
44960 return true;
44961
44962 // If the length is 0, replace with 0.
44963 KnownBits LengthBits = Known1.extractBits(8, 8);
44964 if (LengthBits.isZero())
44965 return TLO.CombineTo(Op, TLO.DAG.getConstant(0, SDLoc(Op), VT));
44966 }
44967
44968 break;
44969 }
44970 case X86ISD::PDEP: {
44971 SDValue Op0 = Op.getOperand(0);
44972 SDValue Op1 = Op.getOperand(1);
44973
44974 unsigned DemandedBitsLZ = OriginalDemandedBits.countl_zero();
44975 APInt LoMask = APInt::getLowBitsSet(BitWidth, BitWidth - DemandedBitsLZ);
44976
44977 // If the demanded bits have leading zeroes, we don't demand those from the
44978 // mask.
44979 if (SimplifyDemandedBits(Op1, LoMask, Known, TLO, Depth + 1))
44980 return true;
44981
44982 // The number of possible 1s in the mask determines the number of LSBs of
44983 // operand 0 used. Undemanded bits from the mask don't matter so filter
44984 // them before counting.
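// E.g. with a mask of 0b1010, PDEP places source bit 0 at result bit 1 and
// source bit 1 at result bit 3, so only the low 2 bits of operand 0 are read.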
44985 KnownBits Known2;
44986 uint64_t Count = (~Known.Zero & LoMask).popcount();
44987 APInt DemandedMask(APInt::getLowBitsSet(BitWidth, Count));
44988 if (SimplifyDemandedBits(Op0, DemandedMask, Known2, TLO, Depth + 1))
44989 return true;
44990
44991 // Zeroes are retained from the mask, but not ones.
44992 Known.One.clearAllBits();
44993 // The result will have at least as many trailing zeros as the non-mask
44994 // operand since bits can only map to the same or higher bit position.
44995 Known.Zero.setLowBits(Known2.countMinTrailingZeros());
44996 return false;
44997 }
44998 case X86ISD::VPMADD52L:
44999 case X86ISD::VPMADD52H: {
45000 KnownBits KnownOp0, KnownOp1, KnownOp2;
45001 SDValue Op0 = Op.getOperand(0);
45002 SDValue Op1 = Op.getOperand(1);
45003 SDValue Op2 = Op.getOperand(2);
45004 // Only demand the lower 52-bits of operands 0 / 1 (and all 64-bits of
45005 // operand 2).
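// Roughly: VPMADD52L computes Op2 + lo52(Op0[51:0] * Op1[51:0]) and
// VPMADD52H computes Op2 + hi52(Op0[51:0] * Op1[51:0]) per 64-bit element,
// which is why the multiplicands are truncated to 52 bits below.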
45006 APInt Low52Bits = APInt::getLowBitsSet(BitWidth, 52);
45007 if (SimplifyDemandedBits(Op0, Low52Bits, OriginalDemandedElts, KnownOp0,
45008 TLO, Depth + 1))
45009 return true;
45010
45011 if (SimplifyDemandedBits(Op1, Low52Bits, OriginalDemandedElts, KnownOp1,
45012 TLO, Depth + 1))
45013 return true;
45014
45015 if (SimplifyDemandedBits(Op2, APInt::getAllOnes(64), OriginalDemandedElts,
45016 KnownOp2, TLO, Depth + 1))
45017 return true;
45018
45019 KnownBits KnownMul;
45020 KnownOp0 = KnownOp0.trunc(52);
45021 KnownOp1 = KnownOp1.trunc(52);
45022 KnownMul = Opc == X86ISD::VPMADD52L ? KnownBits::mul(KnownOp0, KnownOp1)
45023 : KnownBits::mulhu(KnownOp0, KnownOp1);
45024 KnownMul = KnownMul.zext(64);
45025
45026 // lo/hi(X * Y) + Z --> C + Z
45027 if (KnownMul.isConstant()) {
45028 SDLoc DL(Op);
45029 SDValue C = TLO.DAG.getConstant(KnownMul.getConstant(), DL, VT);
45030 return TLO.CombineTo(Op, TLO.DAG.getNode(ISD::ADD, DL, VT, C, Op2));
45031 }
45032
45033 Known = KnownBits::add(KnownMul, KnownOp2);
45034 return false;
45035 }
45036 }
45037
45038 return TargetLowering::SimplifyDemandedBitsForTargetNode(
45039 Op, OriginalDemandedBits, OriginalDemandedElts, Known, TLO, Depth);
45040}
45041
45042 SDValue X86TargetLowering::SimplifyMultipleUseDemandedBitsForTargetNode(
45043 SDValue Op, const APInt &DemandedBits, const APInt &DemandedElts,
45044 SelectionDAG &DAG, unsigned Depth) const {
45045 int NumElts = DemandedElts.getBitWidth();
45046 unsigned Opc = Op.getOpcode();
45047 EVT VT = Op.getValueType();
45048
45049 switch (Opc) {
45050 case X86ISD::PINSRB:
45051 case X86ISD::PINSRW: {
45052 // If we don't demand the inserted element, return the base vector.
45053 SDValue Vec = Op.getOperand(0);
45054 auto *CIdx = dyn_cast<ConstantSDNode>(Op.getOperand(2));
45055 MVT VecVT = Vec.getSimpleValueType();
45056 if (CIdx && CIdx->getAPIntValue().ult(VecVT.getVectorNumElements()) &&
45057 !DemandedElts[CIdx->getZExtValue()])
45058 return Vec;
45059 break;
45060 }
45061 case X86ISD::VSHLI: {
45062 // If we are only demanding sign bits then we can use the shift source
45063 // directly.
45064 SDValue Op0 = Op.getOperand(0);
45065 unsigned ShAmt = Op.getConstantOperandVal(1);
45066 unsigned BitWidth = DemandedBits.getBitWidth();
45067 unsigned NumSignBits = DAG.ComputeNumSignBits(Op0, DemandedElts, Depth + 1);
45068 unsigned UpperDemandedBits = BitWidth - DemandedBits.countr_zero();
45069 if (NumSignBits > ShAmt && (NumSignBits - ShAmt) >= UpperDemandedBits)
45070 return Op0;
45071 break;
45072 }
45073 case X86ISD::VSRAI:
45074 // iff we only need the sign bit then we can use the source directly.
45075 // TODO: generalize where we only demand extended signbits.
45076 if (DemandedBits.isSignMask())
45077 return Op.getOperand(0);
45078 break;
45079 case X86ISD::PCMPGT:
45080 // icmp sgt(0, R) == ashr(R, BitWidth-1).
45081 // iff we only need the sign bit then we can use R directly.
45082 if (DemandedBits.isSignMask() &&
45083 ISD::isBuildVectorAllZeros(Op.getOperand(0).getNode()))
45084 return Op.getOperand(1);
45085 break;
45086 case X86ISD::BLENDV: {
45087 // BLENDV: Cond (MSB) ? LHS : RHS
45088 SDValue Cond = Op.getOperand(0);
45089 SDValue LHS = Op.getOperand(1);
45090 SDValue RHS = Op.getOperand(2);
45091
45092 KnownBits CondKnown = DAG.computeKnownBits(Cond, DemandedElts, Depth + 1);
45093 if (CondKnown.isNegative())
45094 return LHS;
45095 if (CondKnown.isNonNegative())
45096 return RHS;
45097 break;
45098 }
45099 case X86ISD::ANDNP: {
45100 // ANDNP = (~LHS & RHS);
45101 SDValue LHS = Op.getOperand(0);
45102 SDValue RHS = Op.getOperand(1);
45103
45104 KnownBits LHSKnown = DAG.computeKnownBits(LHS, DemandedElts, Depth + 1);
45105 KnownBits RHSKnown = DAG.computeKnownBits(RHS, DemandedElts, Depth + 1);
45106
45107 // If all of the demanded bits are known 0 on LHS and known 0 on RHS, then
45108 // the (inverted) LHS bits cannot contribute to the result of the 'andn' in
45109 // this context, so return RHS.
45110 if (DemandedBits.isSubsetOf(RHSKnown.Zero | LHSKnown.Zero))
45111 return RHS;
45112 break;
45113 }
45114 }
45115
45116 APInt ShuffleUndef, ShuffleZero;
45117 SmallVector<int, 16> ShuffleMask;
45118 SmallVector<SDValue, 16> ShuffleOps;
45119 if (getTargetShuffleInputs(Op, DemandedElts, ShuffleOps, ShuffleMask,
45120 ShuffleUndef, ShuffleZero, DAG, Depth, false)) {
45121 // If all the demanded elts are from one operand and are inline,
45122 // then we can use the operand directly.
45123 int NumOps = ShuffleOps.size();
45124 if (ShuffleMask.size() == (unsigned)NumElts &&
45125 llvm::all_of(ShuffleOps, [VT](SDValue V) {
45126 return VT.getSizeInBits() == V.getValueSizeInBits();
45127 })) {
45128
45129 if (DemandedElts.isSubsetOf(ShuffleUndef))
45130 return DAG.getUNDEF(VT);
45131 if (DemandedElts.isSubsetOf(ShuffleUndef | ShuffleZero))
45132 return getZeroVector(VT.getSimpleVT(), Subtarget, DAG, SDLoc(Op));
45133
45134 // Bitmask that indicates which ops have only been accessed 'inline'.
45135 APInt IdentityOp = APInt::getAllOnes(NumOps);
45136 for (int i = 0; i != NumElts; ++i) {
45137 int M = ShuffleMask[i];
45138 if (!DemandedElts[i] || ShuffleUndef[i])
45139 continue;
45140 int OpIdx = M / NumElts;
45141 int EltIdx = M % NumElts;
45142 if (M < 0 || EltIdx != i) {
45143 IdentityOp.clearAllBits();
45144 break;
45145 }
45146 IdentityOp &= APInt::getOneBitSet(NumOps, OpIdx);
45147 if (IdentityOp == 0)
45148 break;
45149 }
45150 assert((IdentityOp == 0 || IdentityOp.popcount() == 1) &&
45151 "Multiple identity shuffles detected");
45152
45153 if (IdentityOp != 0)
45154 return DAG.getBitcast(VT, ShuffleOps[IdentityOp.countr_zero()]);
45155 }
45156 }
45157
45158 return TargetLowering::SimplifyMultipleUseDemandedBitsForTargetNode(
45159 Op, DemandedBits, DemandedElts, DAG, Depth);
45160}
45161
45162 bool X86TargetLowering::isGuaranteedNotToBeUndefOrPoisonForTargetNode(
45163 SDValue Op, const APInt &DemandedElts, const SelectionDAG &DAG,
45164 bool PoisonOnly, unsigned Depth) const {
45165 unsigned NumElts = DemandedElts.getBitWidth();
45166
45167 switch (Op.getOpcode()) {
45169 case X86ISD::Wrapper:
45170 case X86ISD::WrapperRIP:
45171 return true;
45172 case X86ISD::BLENDI:
45173 case X86ISD::PSHUFB:
45174 case X86ISD::PSHUFD:
45175 case X86ISD::UNPCKL:
45176 case X86ISD::UNPCKH:
45177 case X86ISD::VPERMILPV:
45178 case X86ISD::VPERMILPI:
45179 case X86ISD::VPERMV:
45180 case X86ISD::VPERMV3: {
45181 SmallVector<int, 8> Mask;
45182 SmallVector<SDValue, 2> Ops;
45183 if (getTargetShuffleMask(Op, true, Ops, Mask)) {
45184 SmallVector<APInt, 2> DemandedSrcElts(Ops.size(),
45185 APInt::getZero(NumElts));
45186 for (auto M : enumerate(Mask)) {
45187 if (!DemandedElts[M.index()] || M.value() == SM_SentinelZero)
45188 continue;
45189 if (M.value() == SM_SentinelUndef)
45190 return false;
45191 assert(0 <= M.value() && M.value() < (int)(Ops.size() * NumElts) &&
45192 "Shuffle mask index out of range");
45193 DemandedSrcElts[M.value() / NumElts].setBit(M.value() % NumElts);
45194 }
45195 for (auto Op : enumerate(Ops))
45196 if (!DemandedSrcElts[Op.index()].isZero() &&
45197 !DAG.isGuaranteedNotToBeUndefOrPoison(
45198 Op.value(), DemandedSrcElts[Op.index()], PoisonOnly, Depth + 1))
45199 return false;
45200 return true;
45201 }
45202 break;
45203 }
45204 }
45205 return TargetLowering::isGuaranteedNotToBeUndefOrPoisonForTargetNode(
45206 Op, DemandedElts, DAG, PoisonOnly, Depth);
45207}
45208
45209 bool X86TargetLowering::canCreateUndefOrPoisonForTargetNode(
45210 SDValue Op, const APInt &DemandedElts, const SelectionDAG &DAG,
45211 bool PoisonOnly, bool ConsiderFlags, unsigned Depth) const {
45212
45213 switch (Op.getOpcode()) {
45214 // SSE bit logic.
45215 case X86ISD::FAND:
45216 case X86ISD::FOR:
45217 case X86ISD::FXOR:
45218 case X86ISD::FANDN:
45219 case X86ISD::ANDNP:
45220 case X86ISD::VPTERNLOG:
45221 return false;
45222 // SSE vector insert/extracts use modulo indices.
45223 case X86ISD::PINSRB:
45224 case X86ISD::PINSRW:
45225 case X86ISD::PEXTRB:
45226 case X86ISD::PEXTRW:
45227 return false;
45228 // SSE vector multiplies are either inbounds or saturate.
45229 case X86ISD::VPMADDUBSW:
45230 case X86ISD::VPMADDWD:
45231 return false;
45232 // SSE vector shifts handle out of bounds shift amounts.
45233 case X86ISD::VSHLI:
45234 case X86ISD::VSRLI:
45235 case X86ISD::VSRAI:
45236 return false;
45237 // SSE blends.
45238 case X86ISD::BLENDI:
45239 case X86ISD::BLENDV:
45240 return false;
45241 // SSE target shuffles.
45242 case X86ISD::PSHUFB:
45243 case X86ISD::PSHUFD:
45244 case X86ISD::UNPCKL:
45245 case X86ISD::UNPCKH:
45246 case X86ISD::VPERMILPV:
45247 case X86ISD::VPERMILPI:
45248 case X86ISD::VPERMV:
45249 case X86ISD::VPERMV3:
45250 return false;
45251 // SSE comparisons handle all icmp/fcmp cases.
45252 // TODO: Add CMPM/MM with test coverage.
45253 case X86ISD::CMPP:
45254 case X86ISD::PCMPEQ:
45255 case X86ISD::PCMPGT:
45256 return false;
45257 // SSE signbit extraction.
45258 case X86ISD::MOVMSK:
45259 return false;
45260 // GFNI instructions.
45261 case X86ISD::GF2P8AFFINEINVQB:
45262 case X86ISD::GF2P8AFFINEQB:
45263 case X86ISD::GF2P8MULB:
45264 return false;
45265 case ISD::INTRINSIC_WO_CHAIN:
45266 switch (Op->getConstantOperandVal(0)) {
45267 case Intrinsic::x86_sse2_pmadd_wd:
45268 case Intrinsic::x86_avx2_pmadd_wd:
45269 case Intrinsic::x86_avx512_pmaddw_d_512:
45270 case Intrinsic::x86_ssse3_pmadd_ub_sw_128:
45271 case Intrinsic::x86_avx2_pmadd_ub_sw:
45272 case Intrinsic::x86_avx512_pmaddubs_w_512:
45273 return false;
45274 case Intrinsic::x86_avx512_vpermi2var_d_128:
45275 case Intrinsic::x86_avx512_vpermi2var_d_256:
45276 case Intrinsic::x86_avx512_vpermi2var_d_512:
45277 case Intrinsic::x86_avx512_vpermi2var_hi_128:
45278 case Intrinsic::x86_avx512_vpermi2var_hi_256:
45279 case Intrinsic::x86_avx512_vpermi2var_hi_512:
45280 case Intrinsic::x86_avx512_vpermi2var_pd_128:
45281 case Intrinsic::x86_avx512_vpermi2var_pd_256:
45282 case Intrinsic::x86_avx512_vpermi2var_pd_512:
45283 case Intrinsic::x86_avx512_vpermi2var_ps_128:
45284 case Intrinsic::x86_avx512_vpermi2var_ps_256:
45285 case Intrinsic::x86_avx512_vpermi2var_ps_512:
45286 case Intrinsic::x86_avx512_vpermi2var_q_128:
45287 case Intrinsic::x86_avx512_vpermi2var_q_256:
45288 case Intrinsic::x86_avx512_vpermi2var_q_512:
45289 case Intrinsic::x86_avx512_vpermi2var_qi_128:
45290 case Intrinsic::x86_avx512_vpermi2var_qi_256:
45291 case Intrinsic::x86_avx512_vpermi2var_qi_512:
45292 return false;
45293 }
45294 }
45295 return TargetLowering::canCreateUndefOrPoisonForTargetNode(
45296 Op, DemandedElts, DAG, PoisonOnly, ConsiderFlags, Depth);
45297}
45298
45299 bool X86TargetLowering::isSplatValueForTargetNode(SDValue Op,
45300 const APInt &DemandedElts,
45301 APInt &UndefElts,
45302 const SelectionDAG &DAG,
45303 unsigned Depth) const {
45304 unsigned NumElts = DemandedElts.getBitWidth();
45305 unsigned Opc = Op.getOpcode();
45306
45307 switch (Opc) {
45308 case X86ISD::VBROADCAST:
45309 case X86ISD::VBROADCAST_LOAD:
45310 UndefElts = APInt::getZero(NumElts);
45311 return true;
45312 }
45313
45314 return TargetLowering::isSplatValueForTargetNode(Op, DemandedElts, UndefElts,
45315 DAG, Depth);
45316}
45317
45318// Helper to peek through bitops/trunc/setcc to determine size of source vector.
45319// Allows combineBitcastvxi1 to determine what size vector generated a <X x i1>.
45320static bool checkBitcastSrcVectorSize(SDValue Src, unsigned Size,
45321 bool AllowTruncate, unsigned Depth) {
45322 // Limit recursion.
45323 if (Depth >= SelectionDAG::MaxRecursionDepth)
45324 return false;
45325 switch (Src.getOpcode()) {
45326 case ISD::TRUNCATE:
45327 if (!AllowTruncate)
45328 return false;
45329 [[fallthrough]];
45330 case ISD::SETCC:
45331 return Src.getOperand(0).getValueSizeInBits() == Size;
45332 case ISD::FREEZE:
45333 return checkBitcastSrcVectorSize(Src.getOperand(0), Size, AllowTruncate,
45334 Depth + 1);
45335 case ISD::AND:
45336 case ISD::XOR:
45337 case ISD::OR:
45338 return checkBitcastSrcVectorSize(Src.getOperand(0), Size, AllowTruncate,
45339 Depth + 1) &&
45340 checkBitcastSrcVectorSize(Src.getOperand(1), Size, AllowTruncate,
45341 Depth + 1);
45342 case ISD::SELECT:
45343 case ISD::VSELECT:
45344 return Src.getOperand(0).getScalarValueSizeInBits() == 1 &&
45345 checkBitcastSrcVectorSize(Src.getOperand(1), Size, AllowTruncate,
45346 Depth + 1) &&
45347 checkBitcastSrcVectorSize(Src.getOperand(2), Size, AllowTruncate,
45348 Depth + 1);
45349 case ISD::BUILD_VECTOR:
45350 return ISD::isBuildVectorAllZeros(Src.getNode()) ||
45351 ISD::isBuildVectorAllOnes(Src.getNode());
45352 }
45353 return false;
45354}
45355
45356// Helper to flip between AND/OR/XOR opcodes and their X86ISD FP equivalents.
45357static unsigned getAltBitOpcode(unsigned Opcode) {
45358 switch(Opcode) {
45359 // clang-format off
45360 case ISD::AND: return X86ISD::FAND;
45361 case ISD::OR: return X86ISD::FOR;
45362 case ISD::XOR: return X86ISD::FXOR;
45363 case X86ISD::ANDNP: return X86ISD::FANDN;
45364 // clang-format on
45365 }
45366 llvm_unreachable("Unknown bitwise opcode");
45367}
45368
45369// Helper to adjust v4i32 MOVMSK expansion to work with SSE1-only targets.
45370 static SDValue adjustBitcastSrcVectorSSE1(SelectionDAG &DAG, SDValue Src,
45371 const SDLoc &DL) {
45372 EVT SrcVT = Src.getValueType();
45373 if (SrcVT != MVT::v4i1)
45374 return SDValue();
45375
45376 switch (Src.getOpcode()) {
45377 case ISD::SETCC:
45378 if (Src.getOperand(0).getValueType() == MVT::v4i32 &&
45379 ISD::isBuildVectorAllZeros(Src.getOperand(1).getNode()) &&
45380 cast<CondCodeSDNode>(Src.getOperand(2))->get() == ISD::SETLT) {
45381 SDValue Op0 = Src.getOperand(0);
45382 if (ISD::isNormalLoad(Op0.getNode()))
45383 return DAG.getBitcast(MVT::v4f32, Op0);
45384 if (Op0.getOpcode() == ISD::BITCAST &&
45385 Op0.getOperand(0).getValueType() == MVT::v4f32)
45386 return Op0.getOperand(0);
45387 }
45388 break;
45389 case ISD::AND:
45390 case ISD::XOR:
45391 case ISD::OR: {
45392 SDValue Op0 = adjustBitcastSrcVectorSSE1(DAG, Src.getOperand(0), DL);
45393 SDValue Op1 = adjustBitcastSrcVectorSSE1(DAG, Src.getOperand(1), DL);
45394 if (Op0 && Op1)
45395 return DAG.getNode(getAltBitOpcode(Src.getOpcode()), DL, MVT::v4f32, Op0,
45396 Op1);
45397 break;
45398 }
45399 }
45400 return SDValue();
45401}
45402
45403// Helper to push sign extension of vXi1 SETCC result through bitops.
45404 static SDValue signExtendBitcastSrcVector(SelectionDAG &DAG, EVT SExtVT,
45405 SDValue Src, const SDLoc &DL) {
45406 switch (Src.getOpcode()) {
45407 case ISD::SETCC:
45408 case ISD::FREEZE:
45409 case ISD::TRUNCATE:
45410 case ISD::BUILD_VECTOR:
45411 return DAG.getNode(ISD::SIGN_EXTEND, DL, SExtVT, Src);
45412 case ISD::AND:
45413 case ISD::XOR:
45414 case ISD::OR:
45415 return DAG.getNode(
45416 Src.getOpcode(), DL, SExtVT,
45417 signExtendBitcastSrcVector(DAG, SExtVT, Src.getOperand(0), DL),
45418 signExtendBitcastSrcVector(DAG, SExtVT, Src.getOperand(1), DL));
45419 case ISD::SELECT:
45420 case ISD::VSELECT:
45421 return DAG.getSelect(
45422 DL, SExtVT, Src.getOperand(0),
45423 signExtendBitcastSrcVector(DAG, SExtVT, Src.getOperand(1), DL),
45424 signExtendBitcastSrcVector(DAG, SExtVT, Src.getOperand(2), DL));
45425 }
45426 llvm_unreachable("Unexpected node type for vXi1 sign extension");
45427}
45428
45429// Try to match patterns such as
45430// (i16 bitcast (v16i1 x))
45431// ->
45432 // (i16 movmsk (v16i8 sext (v16i1 x)))
45433// before the illegal vector is scalarized on subtargets that don't have legal
45434// vxi1 types.
45435 static SDValue combineBitcastvxi1(SelectionDAG &DAG, EVT VT, SDValue Src,
45436 const SDLoc &DL,
45437 const X86Subtarget &Subtarget) {
45438 EVT SrcVT = Src.getValueType();
45439 if (!SrcVT.isSimple() || SrcVT.getScalarType() != MVT::i1)
45440 return SDValue();
45441
45442 // Recognize the IR pattern for the movmsk intrinsic under SSE1 before type
45443 // legalization destroys the v4i32 type.
45444 if (Subtarget.hasSSE1() && !Subtarget.hasSSE2()) {
45445 if (SDValue V = adjustBitcastSrcVectorSSE1(DAG, Src, DL)) {
45446 V = DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32,
45447 DAG.getBitcast(MVT::v4f32, V));
45448 return DAG.getZExtOrTrunc(V, DL, VT);
45449 }
45450 }
45451
45452 // If the input is a truncate from v16i8 or v32i8 go ahead and use a
45453 // movmskb even with avx512. This will be better than truncating to vXi1 and
45454 // using a kmov. This can especially help KNL if the input is a v16i8/v32i8
45455 // vpcmpeqb/vpcmpgtb.
45456 bool PreferMovMsk = Src.getOpcode() == ISD::TRUNCATE && Src.hasOneUse() &&
45457 (Src.getOperand(0).getValueType() == MVT::v16i8 ||
45458 Src.getOperand(0).getValueType() == MVT::v32i8 ||
45459 Src.getOperand(0).getValueType() == MVT::v64i8);
45460
45461 // Prefer movmsk for AVX512 for (bitcast (setlt X, 0)) which can be handled
45462 // directly with vpmovmskb/vmovmskps/vmovmskpd.
45463 if (Src.getOpcode() == ISD::SETCC && Src.hasOneUse() &&
45464 cast<CondCodeSDNode>(Src.getOperand(2))->get() == ISD::SETLT &&
45465 ISD::isBuildVectorAllZeros(Src.getOperand(1).getNode())) {
45466 EVT CmpVT = Src.getOperand(0).getValueType();
45467 EVT EltVT = CmpVT.getVectorElementType();
45468 if (CmpVT.getSizeInBits() <= 256 &&
45469 (EltVT == MVT::i8 || EltVT == MVT::i32 || EltVT == MVT::i64))
45470 PreferMovMsk = true;
45471 }
45472
45473 // With AVX512 vxi1 types are legal and we prefer using k-regs.
45474 // MOVMSK is supported in SSE2 or later.
45475 if (!Subtarget.hasSSE2() || (Subtarget.hasAVX512() && !PreferMovMsk))
45476 return SDValue();
45477
45478 // If the upper ops of a concatenation are undef, then try to bitcast the
45479 // lower op and extend.
45480 SmallVector<SDValue, 4> SubSrcOps;
45481 if (collectConcatOps(Src.getNode(), SubSrcOps, DAG) &&
45482 SubSrcOps.size() >= 2) {
45483 SDValue LowerOp = SubSrcOps[0];
45484 ArrayRef<SDValue> UpperOps(std::next(SubSrcOps.begin()), SubSrcOps.end());
45485 if (LowerOp.getOpcode() == ISD::SETCC &&
45486 all_of(UpperOps, [](SDValue Op) { return Op.isUndef(); })) {
45487 EVT SubVT = VT.getIntegerVT(
45488 *DAG.getContext(), LowerOp.getValueType().getVectorMinNumElements());
45489 if (SDValue V = combineBitcastvxi1(DAG, SubVT, LowerOp, DL, Subtarget)) {
45490 EVT IntVT = VT.getIntegerVT(*DAG.getContext(), VT.getSizeInBits());
45491 return DAG.getBitcast(VT, DAG.getNode(ISD::ANY_EXTEND, DL, IntVT, V));
45492 }
45493 }
45494 }
45495
45496 // There are MOVMSK flavors for types v16i8, v32i8, v4f32, v8f32, v4f64 and
45497 // v8f64. So all legal 128-bit and 256-bit vectors are covered except for
45498 // v8i16 and v16i16.
45499 // For these two cases, we can shuffle the upper element bytes to a
45500 // consecutive sequence at the start of the vector and treat the results as
45501 // v16i8 or v32i8, and for v16i8 this is the preferable solution. However,
45502 // for v16i16 this is not the case, because the shuffle is expensive, so we
45503 // avoid sign-extending to this type entirely.
45504 // For example, t0 := (v8i16 sext(v8i1 x)) needs to be shuffled as:
45505 // (v16i8 shuffle <0,2,4,6,8,10,12,14,u,u,...,u> (v16i8 bitcast t0), undef)
45506 MVT SExtVT;
45507 bool PropagateSExt = false;
45508 switch (SrcVT.getSimpleVT().SimpleTy) {
45509 default:
45510 return SDValue();
45511 case MVT::v2i1:
45512 SExtVT = MVT::v2i64;
45513 break;
45514 case MVT::v4i1:
45515 SExtVT = MVT::v4i32;
45516 // For cases such as (i4 bitcast (v4i1 setcc v4i64 v1, v2))
45517 // sign-extend to a 256-bit operation to avoid truncation.
45518 if (Subtarget.hasAVX() &&
45519 checkBitcastSrcVectorSize(Src, 256, Subtarget.hasAVX2(), 0)) {
45520 SExtVT = MVT::v4i64;
45521 PropagateSExt = true;
45522 }
45523 break;
45524 case MVT::v8i1:
45525 SExtVT = MVT::v8i16;
45526 // For cases such as (i8 bitcast (v8i1 setcc v8i32 v1, v2)),
45527 // sign-extend to a 256-bit operation to match the compare.
45528 // If the setcc operand is 128-bit, prefer sign-extending to 128-bit over
45529 // 256-bit because the shuffle is cheaper than sign extending the result of
45530 // the compare.
45531 if (Subtarget.hasAVX() && (checkBitcastSrcVectorSize(Src, 256, true, 0) ||
45532 checkBitcastSrcVectorSize(Src, 512, true, 0))) {
45533 SExtVT = MVT::v8i32;
45534 PropagateSExt = true;
45535 }
45536 break;
45537 case MVT::v16i1:
45538 SExtVT = MVT::v16i8;
45539 // For the case (i16 bitcast (v16i1 setcc v16i16 v1, v2)),
45540 // it is not profitable to sign-extend to 256-bit because this will
45541 // require an extra cross-lane shuffle which is more expensive than
45542 // truncating the result of the compare to 128-bits.
45543 break;
45544 case MVT::v32i1:
45545 SExtVT = MVT::v32i8;
45546 break;
45547 case MVT::v64i1:
45548 // If we have AVX512F but not AVX512BW, the input must be a truncation from
45549 // v64i8 (checked earlier); split the input and make two pmovmskbs.
45550 if (Subtarget.hasAVX512()) {
45551 if (Subtarget.hasBWI())
45552 return SDValue();
45553 SExtVT = MVT::v64i8;
45554 break;
45555 }
45556 // Split if this is a <64 x i8> comparison result.
45557 if (checkBitcastSrcVectorSize(Src, 512, false, 0)) {
45558 SExtVT = MVT::v64i8;
45559 break;
45560 }
45561 return SDValue();
45562 };
45563
45564 SDValue V = PropagateSExt ? signExtendBitcastSrcVector(DAG, SExtVT, Src, DL)
45565 : DAG.getNode(ISD::SIGN_EXTEND, DL, SExtVT, Src);
45566
45567 if (SExtVT == MVT::v16i8 || SExtVT == MVT::v32i8 || SExtVT == MVT::v64i8) {
45568 V = getPMOVMSKB(DL, V, DAG, Subtarget);
45569 } else {
45570 if (SExtVT == MVT::v8i16) {
45571 V = widenSubVector(V, false, Subtarget, DAG, DL, 256);
45572 V = DAG.getNode(ISD::TRUNCATE, DL, MVT::v16i8, V);
45573 }
45574 V = DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, V);
45575 }
45576
45577 EVT IntVT =
45578 EVT::getIntegerVT(*DAG.getContext(), SExtVT.getVectorNumElements());
45579 V = DAG.getZExtOrTrunc(V, DL, IntVT);
45580 return DAG.getBitcast(VT, V);
45581}
45582
45583// Convert a vXi1 constant build vector to the same width scalar integer.
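// E.g. the constant vector <i1 1, i1 0, i1 1, i1 1> becomes the i4 constant
// 0b1101 - element N maps to bit N, and undef elements are treated as 0.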
45584 static SDValue combinevXi1ConstantToInteger(SDValue Op, SelectionDAG &DAG) {
45585 EVT SrcVT = Op.getValueType();
45586 assert(SrcVT.getVectorElementType() == MVT::i1 &&
45587 "Expected a vXi1 vector");
45589 "Expected a constant build vector");
45590
45591 APInt Imm(SrcVT.getVectorNumElements(), 0);
45592 for (unsigned Idx = 0, e = Op.getNumOperands(); Idx < e; ++Idx) {
45593 SDValue In = Op.getOperand(Idx);
45594 if (!In.isUndef() && (In->getAsZExtVal() & 0x1))
45595 Imm.setBit(Idx);
45596 }
45597 EVT IntVT = EVT::getIntegerVT(*DAG.getContext(), Imm.getBitWidth());
45598 return DAG.getConstant(Imm, SDLoc(Op), IntVT);
45599}
45600
45601 static SDValue combineCastedMaskArithmetic(SDNode *N, SelectionDAG &DAG,
45602 TargetLowering::DAGCombinerInfo &DCI,
45603 const X86Subtarget &Subtarget) {
45604 using namespace SDPatternMatch;
45605 assert(N->getOpcode() == ISD::BITCAST && "Expected a bitcast");
45606
45607 if (!DCI.isBeforeLegalizeOps())
45608 return SDValue();
45609
45610 // Only do this if we have k-registers.
45611 if (!Subtarget.hasAVX512())
45612 return SDValue();
45613
45614 EVT DstVT = N->getValueType(0);
45615 SDValue Op = N->getOperand(0);
45616 EVT SrcVT = Op.getValueType();
45617
45618 // Make sure we have a bitcast between mask registers and a scalar type.
45619 if (!(SrcVT.isVector() && SrcVT.getVectorElementType() == MVT::i1 &&
45620 DstVT.isScalarInteger()) &&
45621 !(DstVT.isVector() && DstVT.getVectorElementType() == MVT::i1 &&
45622 SrcVT.isScalarInteger()))
45623 return SDValue();
45624
45625 SDValue LHS, RHS;
45626
45627 // Look for logic ops.
45629 return SDValue();
45630
45631 // If either operand was bitcast from DstVT, then perform logic with DstVT (at
45632 // least one of the getBitcast() will fold away).
45633 if (sd_match(LHS, m_OneUse(m_BitCast(m_SpecificVT(DstVT)))) ||
45634 sd_match(RHS, m_OneUse(m_BitCast(m_SpecificVT(DstVT)))))
45635 return DAG.getNode(Op.getOpcode(), SDLoc(N), DstVT,
45636 DAG.getBitcast(DstVT, LHS), DAG.getBitcast(DstVT, RHS));
45637
45638 // If the RHS is a vXi1 build vector, this is a good reason to flip too.
45639 // Most of these have to move a constant from the scalar domain anyway.
45640 if (ISD::isBuildVectorOfConstantSDNodes(RHS.getNode())) {
45641 RHS = combinevXi1ConstantToInteger(RHS, DAG);
45642 return DAG.getNode(Op.getOpcode(), SDLoc(N), DstVT,
45643 DAG.getBitcast(DstVT, LHS), RHS);
45644 }
45645
45646 return SDValue();
45647}
45648
45649 static SDValue createMMXBuildVector(BuildVectorSDNode *BV, SelectionDAG &DAG,
45650 const X86Subtarget &Subtarget) {
45651 SDLoc DL(BV);
45652 unsigned NumElts = BV->getNumOperands();
45653 SDValue Splat = BV->getSplatValue();
45654
45655 // Build MMX element from integer GPR or SSE float values.
45656 auto CreateMMXElement = [&](SDValue V) {
45657 if (V.isUndef())
45658 return DAG.getUNDEF(MVT::x86mmx);
45659 if (V.getValueType().isFloatingPoint()) {
45660 if (Subtarget.hasSSE1() && !isa<ConstantFPSDNode>(V)) {
45661 V = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v4f32, V);
45662 V = DAG.getBitcast(MVT::v2i64, V);
45663 return DAG.getNode(X86ISD::MOVDQ2Q, DL, MVT::x86mmx, V);
45664 }
45665 V = DAG.getBitcast(MVT::i32, V);
45666 } else {
45667 V = DAG.getAnyExtOrTrunc(V, DL, MVT::i32);
45668 }
45669 return DAG.getNode(X86ISD::MMX_MOVW2D, DL, MVT::x86mmx, V);
45670 };
45671
45672 // Convert build vector ops to MMX data in the bottom elements.
45673 SmallVector<SDValue, 8> Ops;
45674
45675 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
45676
45677 // Broadcast - use (PUNPCKL+)PSHUFW to broadcast single element.
45678 if (Splat) {
45679 if (Splat.isUndef())
45680 return DAG.getUNDEF(MVT::x86mmx);
45681
45682 Splat = CreateMMXElement(Splat);
45683
45684 if (Subtarget.hasSSE1()) {
45685 // Unpack v8i8 to splat i8 elements to lowest 16-bits.
45686 if (NumElts == 8)
45687 Splat = DAG.getNode(
45688 ISD::INTRINSIC_WO_CHAIN, DL, MVT::x86mmx,
45689 DAG.getTargetConstant(Intrinsic::x86_mmx_punpcklbw, DL,
45690 TLI.getPointerTy(DAG.getDataLayout())),
45691 Splat, Splat);
45692
45693 // Use PSHUFW to repeat 16-bit elements.
45694 unsigned ShufMask = (NumElts > 2 ? 0 : 0x44);
45695 return DAG.getNode(
45696 ISD::INTRINSIC_WO_CHAIN, DL, MVT::x86mmx,
45697 DAG.getTargetConstant(Intrinsic::x86_sse_pshuf_w, DL,
45698 TLI.getPointerTy(DAG.getDataLayout())),
45699 Splat, DAG.getTargetConstant(ShufMask, DL, MVT::i8));
45700 }
45701 Ops.append(NumElts, Splat);
45702 } else {
45703 for (unsigned i = 0; i != NumElts; ++i)
45704 Ops.push_back(CreateMMXElement(BV->getOperand(i)));
45705 }
45706
45707 // Use tree of PUNPCKLs to build up general MMX vector.
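// E.g. for 8 x i8 elements: punpcklbw pairs the bytes into 4 values,
// punpcklwd pairs those into 2, and a final punpckldq produces the single
// x86mmx result.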
45708 while (Ops.size() > 1) {
45709 unsigned NumOps = Ops.size();
45710 unsigned IntrinOp =
45711 (NumOps == 2 ? Intrinsic::x86_mmx_punpckldq
45712 : (NumOps == 4 ? Intrinsic::x86_mmx_punpcklwd
45713 : Intrinsic::x86_mmx_punpcklbw));
45714 SDValue Intrin = DAG.getTargetConstant(
45715 IntrinOp, DL, TLI.getPointerTy(DAG.getDataLayout()));
45716 for (unsigned i = 0; i != NumOps; i += 2)
45717 Ops[i / 2] = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, MVT::x86mmx, Intrin,
45718 Ops[i], Ops[i + 1]);
45719 Ops.resize(NumOps / 2);
45720 }
45721
45722 return Ops[0];
45723}
45724
45725// Recursive function that attempts to find if a bool vector node was originally
45726// a vector/float/double that got truncated/extended/bitcast to/from a scalar
45727// integer. If so, replace the scalar ops with bool vector equivalents back down
45728// the chain.
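// For example, (v8i1 bitcast (i8 trunc (i16 bitcast (v16i1 X)))) can become
// (extract_subvector X, 0), keeping the value in a mask register instead of
// round-tripping through a GPR.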
45729 static SDValue combineBitcastToBoolVector(EVT VT, SDValue V, const SDLoc &DL,
45730 SelectionDAG &DAG,
45731 const X86Subtarget &Subtarget,
45732 unsigned Depth = 0) {
45733 if (Depth >= SelectionDAG::MaxRecursionDepth)
45734 return SDValue(); // Limit search depth.
45735
45736 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
45737 unsigned Opc = V.getOpcode();
45738 switch (Opc) {
45739 case ISD::BITCAST: {
45740 // Bitcast from a vector/float/double, we can cheaply bitcast to VT.
45741 SDValue Src = V.getOperand(0);
45742 EVT SrcVT = Src.getValueType();
45743 if (SrcVT.isVector() || SrcVT.isFloatingPoint())
45744 return DAG.getBitcast(VT, Src);
45745 break;
45746 }
45747 case ISD::Constant: {
45748 auto *C = cast<ConstantSDNode>(V);
45749 if (C->isZero())
45750 return DAG.getConstant(0, DL, VT);
45751 if (C->isAllOnes())
45752 return DAG.getAllOnesConstant(DL, VT);
45753 break;
45754 }
45755 case ISD::TRUNCATE: {
45756 // If we find a suitable source, a truncated scalar becomes a subvector.
45757 SDValue Src = V.getOperand(0);
45758 EVT NewSrcVT =
45759 EVT::getVectorVT(*DAG.getContext(), MVT::i1, Src.getValueSizeInBits());
45760 if (TLI.isTypeLegal(NewSrcVT))
45761 if (SDValue N0 = combineBitcastToBoolVector(NewSrcVT, Src, DL, DAG,
45762 Subtarget, Depth + 1))
45763 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, N0,
45764 DAG.getVectorIdxConstant(0, DL));
45765 break;
45766 }
45767 case ISD::ANY_EXTEND:
45768 case ISD::ZERO_EXTEND: {
45769 // If we find a suitable source, an extended scalar becomes a subvector.
45770 SDValue Src = V.getOperand(0);
45771 EVT NewSrcVT = EVT::getVectorVT(*DAG.getContext(), MVT::i1,
45772 Src.getScalarValueSizeInBits());
45773 if (TLI.isTypeLegal(NewSrcVT))
45774 if (SDValue N0 = combineBitcastToBoolVector(NewSrcVT, Src, DL, DAG,
45775 Subtarget, Depth + 1))
45776 return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT,
45777 Opc == ISD::ANY_EXTEND ? DAG.getUNDEF(VT)
45778 : DAG.getConstant(0, DL, VT),
45779 N0, DAG.getVectorIdxConstant(0, DL));
45780 break;
45781 }
45782 case ISD::OR:
45783 case ISD::XOR: {
45784 // If we find suitable sources, we can just move the op to the vector
45785 // domain.
45786 if (SDValue N0 = combineBitcastToBoolVector(VT, V.getOperand(0), DL, DAG,
45787 Subtarget, Depth + 1))
45788 if (SDValue N1 = combineBitcastToBoolVector(VT, V.getOperand(1), DL, DAG,
45789 Subtarget, Depth + 1))
45790 return DAG.getNode(Opc, DL, VT, N0, N1);
45791 break;
45792 }
45793 case ISD::SHL: {
45794 // If we find a suitable source, a SHL becomes a KSHIFTL.
45795 SDValue Src0 = V.getOperand(0);
45796 if ((VT == MVT::v8i1 && !Subtarget.hasDQI()) ||
45797 ((VT == MVT::v32i1 || VT == MVT::v64i1) && !Subtarget.hasBWI()))
45798 break;
45799
45800 if (auto *Amt = dyn_cast<ConstantSDNode>(V.getOperand(1)))
45801 if (SDValue N0 = combineBitcastToBoolVector(VT, Src0, DL, DAG, Subtarget,
45802 Depth + 1))
45803 return DAG.getNode(
45804 X86ISD::KSHIFTL, DL, VT, N0,
45805 DAG.getTargetConstant(Amt->getZExtValue(), DL, MVT::i8));
45806 break;
45807 }
45808 }
45809
45810 // Does the inner bitcast already exist?
45811 if (Depth > 0)
45812 if (SDNode *Alt = DAG.getNodeIfExists(ISD::BITCAST, DAG.getVTList(VT), {V}))
45813 return SDValue(Alt, 0);
45814
45815 return SDValue();
45816}
45817
45818 static SDValue combineBitcast(SDNode *N, SelectionDAG &DAG,
45819 TargetLowering::DAGCombinerInfo &DCI,
45820 const X86Subtarget &Subtarget) {
45821 SDValue N0 = N->getOperand(0);
45822 EVT VT = N->getValueType(0);
45823 EVT SrcVT = N0.getValueType();
45824 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
45825
45826 // Try to match patterns such as
45827 // (i16 bitcast (v16i1 x))
45828 // ->
45829 // (i16 movmsk (v16i8 sext (v16i1 x)))
45830 // before the setcc result is scalarized on subtargets that don't have legal
45831 // vxi1 types.
45832 if (DCI.isBeforeLegalize()) {
45833 SDLoc dl(N);
45834 if (SDValue V = combineBitcastvxi1(DAG, VT, N0, dl, Subtarget))
45835 return V;
45836
45837 // If this is a bitcast between a MVT::v4i1/v2i1 and an illegal integer
45838 // type, widen both sides to avoid a trip through memory.
45839 if ((VT == MVT::v4i1 || VT == MVT::v2i1) && SrcVT.isScalarInteger() &&
45840 Subtarget.hasAVX512()) {
45841 N0 = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i8, N0);
45842 N0 = DAG.getBitcast(MVT::v8i1, N0);
45843 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, N0,
45844 DAG.getVectorIdxConstant(0, dl));
45845 }
45846
45847 // If this is a bitcast between a MVT::v4i1/v2i1 and an illegal integer
45848 // type, widen both sides to avoid a trip through memory.
45849 if ((SrcVT == MVT::v4i1 || SrcVT == MVT::v2i1) && VT.isScalarInteger() &&
45850 Subtarget.hasAVX512()) {
45851 // Use zeros for the widening if we already have some zeroes. This can
45852 // allow SimplifyDemandedBits to remove scalar ANDs that may be down
45853 // stream of this.
45854 // FIXME: It might make sense to detect a concat_vectors with a mix of
45855 // zeroes and undef and turn it into insert_subvector for i1 vectors as
45856 // a separate combine. What we can't do is canonicalize the operands of
45857 // such a concat or we'll get into a loop with SimplifyDemandedBits.
45858 if (N0.getOpcode() == ISD::CONCAT_VECTORS) {
45859 SDValue LastOp = N0.getOperand(N0.getNumOperands() - 1);
45860 if (ISD::isBuildVectorAllZeros(LastOp.getNode())) {
45861 SrcVT = LastOp.getValueType();
45862 unsigned NumConcats = 8 / SrcVT.getVectorNumElements();
45863 SmallVector<SDValue, 4> Ops(N0->ops());
45864 Ops.resize(NumConcats, DAG.getConstant(0, dl, SrcVT));
45865 N0 = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v8i1, Ops);
45866 N0 = DAG.getBitcast(MVT::i8, N0);
45867 return DAG.getNode(ISD::TRUNCATE, dl, VT, N0);
45868 }
45869 }
45870
45871 unsigned NumConcats = 8 / SrcVT.getVectorNumElements();
45872 SmallVector<SDValue, 4> Ops(NumConcats, DAG.getUNDEF(SrcVT));
45873 Ops[0] = N0;
45874 N0 = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v8i1, Ops);
45875 N0 = DAG.getBitcast(MVT::i8, N0);
45876 return DAG.getNode(ISD::TRUNCATE, dl, VT, N0);
45877 }
45878 } else if (DCI.isAfterLegalizeDAG()) {
45879 // If we're bitcasting from iX to vXi1, see if the integer originally
45880 // began as a vXi1 and whether we can remove the bitcast entirely.
45881 if (VT.isVector() && VT.getScalarType() == MVT::i1 &&
45882 SrcVT.isScalarInteger() && TLI.isTypeLegal(VT)) {
45883 if (SDValue V =
45884 combineBitcastToBoolVector(VT, N0, SDLoc(N), DAG, Subtarget))
45885 return V;
45886 }
45887 }
45888
45889 // Look for (i8 (bitcast (v8i1 (extract_subvector (v16i1 X), 0)))) and
45890 // replace with (i8 (trunc (i16 (bitcast (v16i1 X))))). This can occur
45891 // due to insert_subvector legalization on KNL. By promoting the copy to i16
45892 // we can help with known bits propagation from the vXi1 domain to the
45893 // scalar domain.
45894 if (VT == MVT::i8 && SrcVT == MVT::v8i1 && Subtarget.hasAVX512() &&
45895 !Subtarget.hasDQI() && N0.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
45896 N0.getOperand(0).getValueType() == MVT::v16i1 &&
45897 isNullConstant(N0.getOperand(1)))
45898 return DAG.getNode(ISD::TRUNCATE, SDLoc(N), VT,
45899 DAG.getBitcast(MVT::i16, N0.getOperand(0)));
45900
45901 // Canonicalize (bitcast (vbroadcast_load)) so that the output of the bitcast
45902 // and the vbroadcast_load are both integer or both fp. In some cases this
45903 // will remove the bitcast entirely.
45904 if (N0.getOpcode() == X86ISD::VBROADCAST_LOAD && N0.hasOneUse() &&
45905 VT.isFloatingPoint() != SrcVT.isFloatingPoint() && VT.isVector()) {
45906 auto *BCast = cast<MemIntrinsicSDNode>(N0);
45907 unsigned SrcVTSize = SrcVT.getScalarSizeInBits();
45908 unsigned MemSize = BCast->getMemoryVT().getScalarSizeInBits();
45909 // Don't swap i8/i16 since we don't have fp types of that size.
45910 if (MemSize >= 32) {
45911 MVT MemVT = VT.isFloatingPoint() ? MVT::getFloatingPointVT(MemSize)
45912 : MVT::getIntegerVT(MemSize);
45913 MVT LoadVT = VT.isFloatingPoint() ? MVT::getFloatingPointVT(SrcVTSize)
45914 : MVT::getIntegerVT(SrcVTSize);
45915 LoadVT = MVT::getVectorVT(LoadVT, SrcVT.getVectorNumElements());
45916
45917 SDVTList Tys = DAG.getVTList(LoadVT, MVT::Other);
45918 SDValue Ops[] = { BCast->getChain(), BCast->getBasePtr() };
45919 SDValue ResNode =
45920 DAG.getMemIntrinsicNode(X86ISD::VBROADCAST_LOAD, SDLoc(N), Tys, Ops,
45921 MemVT, BCast->getMemOperand());
45922 DAG.ReplaceAllUsesOfValueWith(SDValue(BCast, 1), ResNode.getValue(1));
45923 return DAG.getBitcast(VT, ResNode);
45924 }
45925 }
45926
45927 // Attempt to peek through f16 bitcasted extractions hidden by truncation.
45928 if (VT == MVT::f16 && SrcVT == MVT::i16) {
45929 SDValue Src = peekThroughTruncates(N0);
45930 if (Src.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
45931 Src.getOperand(0).getValueSizeInBits() == 128 &&
45932 isNullConstant(Src.getOperand(1))) {
45933 SDLoc DL(N);
45934 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT,
45935 DAG.getBitcast(MVT::v8f16, Src.getOperand(0)),
45936 DAG.getVectorIdxConstant(0, DL));
45937 }
45938 }
45939
45940 // Since MMX types are special and don't usually play with other vector types,
45941 // it's better to handle them early to be sure we emit efficient code by
45942 // avoiding store-load conversions.
45943 if (VT == MVT::x86mmx) {
45944 // Detect MMX constant vectors.
45945 APInt UndefElts;
45946 SmallVector<APInt, 1> EltBits;
45947 if (getTargetConstantBitsFromNode(N0, 64, UndefElts, EltBits,
45948 /*AllowWholeUndefs*/ true,
45949 /*AllowPartialUndefs*/ true)) {
45950 SDLoc DL(N0);
45951 // Handle zero-extension of i32 with MOVD.
45952 if (EltBits[0].countl_zero() >= 32)
45953 return DAG.getNode(X86ISD::MMX_MOVW2D, DL, VT,
45954 DAG.getConstant(EltBits[0].trunc(32), DL, MVT::i32));
45955 // Else, bitcast to a double.
45956 // TODO - investigate supporting sext 32-bit immediates on x86_64.
45957 APFloat F64(APFloat::IEEEdouble(), EltBits[0]);
45958 return DAG.getBitcast(VT, DAG.getConstantFP(F64, DL, MVT::f64));
45959 }
45960
45961 // Detect bitcasts to x86mmx low word.
45962 if (N0.getOpcode() == ISD::BUILD_VECTOR &&
45963 (SrcVT == MVT::v2i32 || SrcVT == MVT::v4i16 || SrcVT == MVT::v8i8) &&
45964 N0.getOperand(0).getValueType() == SrcVT.getScalarType()) {
45965 bool LowUndef = true, AllUndefOrZero = true;
45966 for (unsigned i = 1, e = SrcVT.getVectorNumElements(); i != e; ++i) {
45967 SDValue Op = N0.getOperand(i);
45968 LowUndef &= Op.isUndef() || (i >= e/2);
45969 AllUndefOrZero &= isNullConstantOrUndef(Op);
45970 }
45971 if (AllUndefOrZero) {
45972 SDValue N00 = N0.getOperand(0);
45973 SDLoc dl(N00);
45974 N00 = LowUndef ? DAG.getAnyExtOrTrunc(N00, dl, MVT::i32)
45975 : DAG.getZExtOrTrunc(N00, dl, MVT::i32);
45976 return DAG.getNode(X86ISD::MMX_MOVW2D, dl, VT, N00);
45977 }
45978 }
45979
45980 // Detect bitcasts of 64-bit build vectors and convert to a
45981 // MMX UNPCK/PSHUFW which takes MMX type inputs with the value in the
45982 // lowest element.
45983 if (N0.getOpcode() == ISD::BUILD_VECTOR &&
45984 (SrcVT == MVT::v2f32 || SrcVT == MVT::v2i32 || SrcVT == MVT::v4i16 ||
45985 SrcVT == MVT::v8i8))
45986 return createMMXBuildVector(cast<BuildVectorSDNode>(N0), DAG, Subtarget);
45987
45988 // Detect bitcasts between element or subvector extraction to x86mmx.
45989 if ((N0.getOpcode() == ISD::EXTRACT_VECTOR_ELT ||
45990 N0.getOpcode() == ISD::EXTRACT_SUBVECTOR) &&
45991 isNullConstant(N0.getOperand(1))) {
45992 SDValue N00 = N0.getOperand(0);
45993 if (N00.getValueType().is128BitVector())
45994 return DAG.getNode(X86ISD::MOVDQ2Q, SDLoc(N00), VT,
45995 DAG.getBitcast(MVT::v2i64, N00));
45996 }
45997
45998 // Detect bitcasts from FP_TO_SINT to x86mmx.
45999 if (SrcVT == MVT::v2i32 && N0.getOpcode() == ISD::FP_TO_SINT) {
46000 SDLoc DL(N0);
46001 SDValue Res = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v4i32, N0,
46002 DAG.getUNDEF(MVT::v2i32));
46003 return DAG.getNode(X86ISD::MOVDQ2Q, DL, VT,
46004 DAG.getBitcast(MVT::v2i64, Res));
46005 }
46006 }
46007
46008 // Try to remove a bitcast of constant vXi1 vector. We have to legalize
46009 // most of these to scalar anyway.
46010 if (Subtarget.hasAVX512() && VT.isScalarInteger() &&
46011 SrcVT.isVector() && SrcVT.getVectorElementType() == MVT::i1 &&
46012 ISD::isBuildVectorOfConstantSDNodes(N0.getNode())) {
46013 return combinevXi1ConstantToInteger(N0, DAG);
46014 }
46015
46016 if (Subtarget.hasAVX512() && SrcVT.isScalarInteger() && VT.isVector() &&
46017 VT.getVectorElementType() == MVT::i1) {
46018 if (auto *C = dyn_cast<ConstantSDNode>(N0)) {
46019 if (C->isAllOnes())
46020 return DAG.getConstant(1, SDLoc(N0), VT);
46021 if (C->isZero())
46022 return DAG.getConstant(0, SDLoc(N0), VT);
46023 }
46024 }
46025
46026 // Look for MOVMSK that is maybe truncated and then bitcasted to vXi1.
46027 // Turn it into a sign bit compare that produces a k-register. This avoids
46028 // a trip through a GPR.
46029 if (Subtarget.hasAVX512() && SrcVT.isScalarInteger() &&
46030 VT.isVector() && VT.getVectorElementType() == MVT::i1 &&
46031 isPowerOf2_32(VT.getVectorNumElements())) {
46032 unsigned NumElts = VT.getVectorNumElements();
46033 SDValue Src = N0;
46034
46035 // Peek through truncate.
46036 if (N0.getOpcode() == ISD::TRUNCATE && N0.hasOneUse())
46037 Src = N0.getOperand(0);
46038
46039 if (Src.getOpcode() == X86ISD::MOVMSK && Src.hasOneUse()) {
46040 SDValue MovmskIn = Src.getOperand(0);
46041 MVT MovmskVT = MovmskIn.getSimpleValueType();
46042 unsigned MovMskElts = MovmskVT.getVectorNumElements();
46043
46044 // We allow extra bits of the movmsk to be used since they are known zero.
46045 // We can't convert a VPMOVMSKB without avx512bw.
46046 if (MovMskElts <= NumElts &&
46047 (Subtarget.hasBWI() || MovmskVT.getVectorElementType() != MVT::i8)) {
46048 EVT IntVT = EVT(MovmskVT).changeVectorElementTypeToInteger();
46049 MovmskIn = DAG.getBitcast(IntVT, MovmskIn);
46050 SDLoc dl(N);
46051 MVT CmpVT = MVT::getVectorVT(MVT::i1, MovMskElts);
46052 SDValue Cmp = DAG.getSetCC(dl, CmpVT, MovmskIn,
46053 DAG.getConstant(0, dl, IntVT), ISD::SETLT);
46054 if (EVT(CmpVT) == VT)
46055 return Cmp;
46056
46057 // Pad with zeroes up to original VT to replace the zeroes that were
46058 // being used from the MOVMSK.
46059 unsigned NumConcats = NumElts / MovMskElts;
46060 SmallVector<SDValue, 4> Ops(NumConcats, DAG.getConstant(0, dl, CmpVT));
46061 Ops[0] = Cmp;
46062 return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, Ops);
46063 }
46064 }
46065 }
46066
46067 // Try to remove bitcasts from input and output of mask arithmetic to
46068 // remove GPR<->K-register crossings.
46069 if (SDValue V = combineCastedMaskArithmetic(N, DAG, DCI, Subtarget))
46070 return V;
46071
46072 // bitcast(v1Ty insert_vector_elt(X, Y, 0)) --> Y
46073 if (N0.getOpcode() == ISD::INSERT_VECTOR_ELT && SrcVT.getScalarType() == VT &&
46074 SrcVT.getVectorNumElements() == 1)
46075 return N0.getOperand(1);
46076
46077 // Convert a bitcasted integer logic operation that has one bitcasted
46078 // floating-point operand into a floating-point logic operation. This may
46079 // create a load of a constant, but that is cheaper than materializing the
46080 // constant in an integer register and transferring it to an SSE register or
46081 // transferring the SSE operand to integer register and back.
46082 unsigned FPOpcode;
46083 switch (N0.getOpcode()) {
46084 // clang-format off
46085 case ISD::AND: FPOpcode = X86ISD::FAND; break;
46086 case ISD::OR: FPOpcode = X86ISD::FOR; break;
46087 case ISD::XOR: FPOpcode = X86ISD::FXOR; break;
46088 default: return SDValue();
46089 // clang-format on
46090 }
46091
46092 // Check if we have a bitcast from another integer type as well.
46093 if (!((Subtarget.hasSSE1() && VT == MVT::f32) ||
46094 (Subtarget.hasSSE2() && VT == MVT::f64) ||
46095 (Subtarget.hasFP16() && VT == MVT::f16) ||
46096 (Subtarget.hasSSE2() && VT.isInteger() && VT.isVector() &&
46097 TLI.isTypeLegal(VT))))
46098 return SDValue();
46099
46100 SDValue LogicOp0 = N0.getOperand(0);
46101 SDValue LogicOp1 = N0.getOperand(1);
46102 SDLoc DL0(N0);
46103
46104 // bitcast(logic(bitcast(X), Y)) --> logic'(X, bitcast(Y))
46105 if (N0.hasOneUse() && LogicOp0.getOpcode() == ISD::BITCAST &&
46106 LogicOp0.hasOneUse() && LogicOp0.getOperand(0).hasOneUse() &&
46107 LogicOp0.getOperand(0).getValueType() == VT &&
46108 !isa<ConstantSDNode>(LogicOp0.getOperand(0))) {
46109 SDValue CastedOp1 = DAG.getBitcast(VT, LogicOp1);
46110 unsigned Opcode = VT.isFloatingPoint() ? FPOpcode : N0.getOpcode();
46111 return DAG.getNode(Opcode, DL0, VT, LogicOp0.getOperand(0), CastedOp1);
46112 }
46113 // bitcast(logic(X, bitcast(Y))) --> logic'(bitcast(X), Y)
46114 if (N0.hasOneUse() && LogicOp1.getOpcode() == ISD::BITCAST &&
46115 LogicOp1.hasOneUse() && LogicOp1.getOperand(0).hasOneUse() &&
46116 LogicOp1.getOperand(0).getValueType() == VT &&
46117 !isa<ConstantSDNode>(LogicOp1.getOperand(0))) {
46118 SDValue CastedOp0 = DAG.getBitcast(VT, LogicOp0);
46119 unsigned Opcode = VT.isFloatingPoint() ? FPOpcode : N0.getOpcode();
46120 return DAG.getNode(Opcode, DL0, VT, LogicOp1.getOperand(0), CastedOp0);
46121 }
46122
46123 return SDValue();
46124}
46125
46126 // (mul (zext a), (sext b))
46127static bool detectExtMul(SelectionDAG &DAG, const SDValue &Mul, SDValue &Op0,
46128 SDValue &Op1) {
46129 Op0 = Mul.getOperand(0);
46130 Op1 = Mul.getOperand(1);
46131
46132 // Operand 1 should be the sign-extended value.
46133 if (Op0.getOpcode() == ISD::SIGN_EXTEND)
46134 std::swap(Op0, Op1);
46135
46136 auto IsFreeTruncation = [](SDValue &Op) -> bool {
46137 if ((Op.getOpcode() == ISD::ZERO_EXTEND ||
46138 Op.getOpcode() == ISD::SIGN_EXTEND) &&
46139 Op.getOperand(0).getScalarValueSizeInBits() <= 8)
46140 return true;
46141
46142 auto *BV = dyn_cast<BuildVectorSDNode>(Op);
46143 return (BV && BV->isConstant());
46144 };
46145
46146 // (dpbusd (zext a), (sext b)). Since the first operand should be an unsigned
46147 // value, we need to check that Op0 is a zero-extended value. Op1 should be a
46148 // signed value, so we just check its sign bits.
46149 if ((IsFreeTruncation(Op0) &&
46150 DAG.computeKnownBits(Op0).countMaxActiveBits() <= 8) &&
46151 (IsFreeTruncation(Op1) && DAG.ComputeMaxSignificantBits(Op1) <= 8))
46152 return true;
46153
46154 return false;
46155}
46156
46157 static SDValue createVPDPBUSD(SelectionDAG &DAG, SDValue LHS, SDValue RHS,
46158 unsigned &LogBias, const SDLoc &DL,
46159 const X86Subtarget &Subtarget) {
46160 // Extend or truncate to MVT::i8 first.
46161 MVT Vi8VT =
46162 MVT::getVectorVT(MVT::i8, LHS.getValueType().getVectorElementCount());
46163 LHS = DAG.getZExtOrTrunc(LHS, DL, Vi8VT);
46164 RHS = DAG.getSExtOrTrunc(RHS, DL, Vi8VT);
46165
46166 // VPDPBUSD(<16 x i32>C, <16 x i8>A, <16 x i8>B). For each dst element
46167 // C[0] = C[0] + A[0]B[0] + A[1]B[1] + A[2]B[2] + A[3]B[3].
46168 // The src A, B element type is i8, but the dst C element type is i32.
46169 // When we calculate the reduction stages we use the vXi8 source type,
46170 // so we need a LogBias of 2 to avoid the 2 extra stages.
46171 LogBias = 2;
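// E.g. reducing 64 i8 products would normally take log2(64) = 6 stages, but
// VPDPBUSD already sums groups of 4 into i32 lanes, saving log2(4) = 2 of
// them.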
46172
46173 unsigned RegSize = std::max(128u, (unsigned)Vi8VT.getSizeInBits());
46174 if (Subtarget.hasVNNI() && !Subtarget.hasVLX())
46175 RegSize = std::max(512u, RegSize);
46176
46177 // "Zero-extend" the i8 vectors. This is not a per-element zext, rather we
46178 // fill in the missing vector elements with 0.
46179 unsigned NumConcat = RegSize / Vi8VT.getSizeInBits();
46180 SmallVector<SDValue, 16> Ops(NumConcat, DAG.getConstant(0, DL, Vi8VT));
46181 Ops[0] = LHS;
46182 MVT ExtendedVT = MVT::getVectorVT(MVT::i8, RegSize / 8);
46183 SDValue DpOp0 = DAG.getNode(ISD::CONCAT_VECTORS, DL, ExtendedVT, Ops);
46184 Ops[0] = RHS;
46185 SDValue DpOp1 = DAG.getNode(ISD::CONCAT_VECTORS, DL, ExtendedVT, Ops);
46186
46187 // Actually build the DotProduct, split as 256/512 bits for
46188 // AVXVNNI/AVX512VNNI.
46189 auto DpBuilder = [](SelectionDAG &DAG, const SDLoc &DL,
46190 ArrayRef<SDValue> Ops) {
46191 MVT VT = MVT::getVectorVT(MVT::i32, Ops[0].getValueSizeInBits() / 32);
46192 return DAG.getNode(X86ISD::VPDPBUSD, DL, VT, Ops);
46193 };
46194 MVT DpVT = MVT::getVectorVT(MVT::i32, RegSize / 32);
46195 SDValue Zero = DAG.getConstant(0, DL, DpVT);
46196
46197 return SplitOpsAndApply(DAG, Subtarget, DL, DpVT, {Zero, DpOp0, DpOp1},
46198 DpBuilder, /*CheckBWI=*/false, Subtarget.hasVNNI());
46199}
46200
46201// Create a PSADBW given two sources representable as zexts of vXi8.
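// PSADBW computes, per 64-bit lane, the sum of absolute differences of eight
// byte pairs; e.g. psadbw(x, 0) yields the horizontal sum of the eight
// unsigned bytes in each lane.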
46202 static SDValue createPSADBW(SelectionDAG &DAG, SDValue N0, SDValue N1,
46203 const SDLoc &DL, const X86Subtarget &Subtarget) {
46204 // Find the appropriate width for the PSADBW.
46205 EVT DstVT = N0.getValueType();
46206 EVT SrcVT = EVT::getVectorVT(*DAG.getContext(), MVT::i8,
46207 DstVT.getVectorElementCount());
46208 unsigned RegSize = std::max(128u, (unsigned)SrcVT.getSizeInBits());
46209
46210 // Widen the vXi8 vectors, padding with zero vector elements.
46211 unsigned NumConcat = RegSize / SrcVT.getSizeInBits();
46212 SmallVector<SDValue, 16> Ops(NumConcat, DAG.getConstant(0, DL, SrcVT));
46213 Ops[0] = DAG.getZExtOrTrunc(N0, DL, SrcVT);
46214 MVT ExtendedVT = MVT::getVectorVT(MVT::i8, RegSize / 8);
46215 SDValue SadOp0 = DAG.getNode(ISD::CONCAT_VECTORS, DL, ExtendedVT, Ops);
46216 Ops[0] = DAG.getZExtOrTrunc(N1, DL, SrcVT);
46217 SDValue SadOp1 = DAG.getNode(ISD::CONCAT_VECTORS, DL, ExtendedVT, Ops);
46218
46219 // Actually build the SAD, split as 128/256/512 bits for SSE/AVX2/AVX512BW.
46220 auto PSADBWBuilder = [](SelectionDAG &DAG, const SDLoc &DL,
46221 ArrayRef<SDValue> Ops) {
46222 MVT VT = MVT::getVectorVT(MVT::i64, Ops[0].getValueSizeInBits() / 64);
46223 return DAG.getNode(X86ISD::PSADBW, DL, VT, Ops);
46224 };
46225 MVT SadVT = MVT::getVectorVT(MVT::i64, RegSize / 64);
46226 return SplitOpsAndApply(DAG, Subtarget, DL, SadVT, {SadOp0, SadOp1},
46227 PSADBWBuilder);
46228}
46229
46230 // Attempt to replace a min/max v8i16/v16i8 horizontal reduction with
46231 // PHMINPOSUW.
46232 static SDValue combineMinMaxReduction(SDNode *Extract, SelectionDAG &DAG,
46233 const X86Subtarget &Subtarget) {
46234 // Bail without SSE41.
46235 if (!Subtarget.hasSSE41())
46236 return SDValue();
46237
46238 EVT ExtractVT = Extract->getValueType(0);
46239 if (ExtractVT != MVT::i16 && ExtractVT != MVT::i8)
46240 return SDValue();
46241
46242 // Check for SMAX/SMIN/UMAX/UMIN horizontal reduction patterns.
46243 ISD::NodeType BinOp;
46244 SDValue Src = DAG.matchBinOpReduction(
46245 Extract, BinOp, {ISD::SMAX, ISD::SMIN, ISD::UMAX, ISD::UMIN}, true);
46246 if (!Src)
46247 return SDValue();
46248
46249 EVT SrcVT = Src.getValueType();
46250 EVT SrcSVT = SrcVT.getScalarType();
46251 if (SrcSVT != ExtractVT || (SrcVT.getSizeInBits() % 128) != 0)
46252 return SDValue();
46253
46254 SDLoc DL(Extract);
46255 SDValue MinPos = Src;
46256
46257 // First, reduce the source down to 128-bit, applying BinOp to lo/hi.
46258 while (SrcVT.getSizeInBits() > 128) {
46259 SDValue Lo, Hi;
46260 std::tie(Lo, Hi) = splitVector(MinPos, DAG, DL);
46261 SrcVT = Lo.getValueType();
46262 MinPos = DAG.getNode(BinOp, DL, SrcVT, Lo, Hi);
46263 }
46264 assert(((SrcVT == MVT::v8i16 && ExtractVT == MVT::i16) ||
46265 (SrcVT == MVT::v16i8 && ExtractVT == MVT::i8)) &&
46266 "Unexpected value type");
46267
46268 // PHMINPOSUW applies to UMIN(v8i16), for SMIN/SMAX/UMAX we must apply a mask
46269 // to flip the value accordingly.
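// E.g. for UMAX we XOR with all-ones so that umin(~x) == ~umax(x); for
// SMAX/SMIN, XORing with the signed max/min constant remaps the signed order
// onto the unsigned order. The same mask is XORed back after the PHMINPOS.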
46270 SDValue Mask;
46271 unsigned MaskEltsBits = ExtractVT.getSizeInBits();
46272 if (BinOp == ISD::SMAX)
46273 Mask = DAG.getConstant(APInt::getSignedMaxValue(MaskEltsBits), DL, SrcVT);
46274 else if (BinOp == ISD::SMIN)
46275 Mask = DAG.getConstant(APInt::getSignedMinValue(MaskEltsBits), DL, SrcVT);
46276 else if (BinOp == ISD::UMAX)
46277 Mask = DAG.getAllOnesConstant(DL, SrcVT);
46278
46279 if (Mask)
46280 MinPos = DAG.getNode(ISD::XOR, DL, SrcVT, Mask, MinPos);
46281
46282 // For v16i8 cases we need to perform UMIN on pairs of byte elements,
46283 // shuffling each upper element down and inserting zeros. This means that the
46284 // v16i8 UMIN will leave the upper element as zero, performing zero-extension
46285 // ready for the PHMINPOS.
46286 if (ExtractVT == MVT::i8) {
46287 SDValue Upper = DAG.getVectorShuffle(
46288 SrcVT, DL, MinPos, DAG.getConstant(0, DL, MVT::v16i8),
46289 {1, 16, 3, 16, 5, 16, 7, 16, 9, 16, 11, 16, 13, 16, 15, 16});
46290 MinPos = DAG.getNode(ISD::UMIN, DL, SrcVT, MinPos, Upper);
46291 }
46292
46293 // Perform the PHMINPOS on a v8i16 vector,
46294 MinPos = DAG.getBitcast(MVT::v8i16, MinPos);
46295 MinPos = DAG.getNode(X86ISD::PHMINPOS, DL, MVT::v8i16, MinPos);
46296 MinPos = DAG.getBitcast(SrcVT, MinPos);
46297
46298 if (Mask)
46299 MinPos = DAG.getNode(ISD::XOR, DL, SrcVT, Mask, MinPos);
46300
46301 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ExtractVT, MinPos,
46302 DAG.getVectorIdxConstant(0, DL));
46303}
46304
46305 // Attempt to replace an all_of/any_of/parity style horizontal reduction with a MOVMSK.
46306 static SDValue combinePredicateReduction(SDNode *Extract, SelectionDAG &DAG,
46307 const X86Subtarget &Subtarget) {
46308 // Bail without SSE2.
46309 if (!Subtarget.hasSSE2())
46310 return SDValue();
46311
46312 EVT ExtractVT = Extract->getValueType(0);
46313 unsigned BitWidth = ExtractVT.getSizeInBits();
46314 if (ExtractVT != MVT::i64 && ExtractVT != MVT::i32 && ExtractVT != MVT::i16 &&
46315 ExtractVT != MVT::i8 && ExtractVT != MVT::i1)
46316 return SDValue();
46317
46318 // Check for OR(any_of)/AND(all_of)/XOR(parity) horizontal reduction patterns.
46319 ISD::NodeType BinOp;
46320 SDValue Match = DAG.matchBinOpReduction(Extract, BinOp, {ISD::OR, ISD::AND});
46321 if (!Match && ExtractVT == MVT::i1)
46322 Match = DAG.matchBinOpReduction(Extract, BinOp, {ISD::XOR});
46323 if (!Match)
46324 return SDValue();
46325
46326 // EXTRACT_VECTOR_ELT can require implicit extension of the vector element
46327 // which we can't support here for now.
46328 if (Match.getScalarValueSizeInBits() != BitWidth)
46329 return SDValue();
46330
46331 SDValue Movmsk;
46332 SDLoc DL(Extract);
46333 EVT MatchVT = Match.getValueType();
46334 unsigned NumElts = MatchVT.getVectorNumElements();
46335 unsigned MaxElts = Subtarget.hasInt256() ? 32 : 16;
46336 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
46337 LLVMContext &Ctx = *DAG.getContext();
46338
46339 if (ExtractVT == MVT::i1) {
46340 // Special case for (pre-legalization) vXi1 reductions.
46341 if (NumElts > 64 || !isPowerOf2_32(NumElts))
46342 return SDValue();
46343 if (Match.getOpcode() == ISD::SETCC) {
46344 ISD::CondCode CC = cast<CondCodeSDNode>(Match.getOperand(2))->get();
46345 if ((BinOp == ISD::AND && CC == ISD::CondCode::SETEQ) ||
46346 (BinOp == ISD::OR && CC == ISD::CondCode::SETNE)) {
46347 // For all_of(setcc(x,y,eq)) - use (iX)x == (iX)y.
46348 // For any_of(setcc(x,y,ne)) - use (iX)x != (iX)y.
46349 X86::CondCode X86CC;
46350 SDValue LHS = DAG.getFreeze(Match.getOperand(0));
46351 SDValue RHS = DAG.getFreeze(Match.getOperand(1));
46352 APInt Mask = APInt::getAllOnes(LHS.getScalarValueSizeInBits());
46353 if (SDValue V = LowerVectorAllEqual(DL, LHS, RHS, CC, Mask, Subtarget,
46354 DAG, X86CC))
46355 return DAG.getNode(ISD::TRUNCATE, DL, ExtractVT,
46356 getSETCC(X86CC, V, DL, DAG));
46357 }
46358 }
46359 if (TLI.isTypeLegal(MatchVT)) {
46360 // If this is a legal AVX512 predicate type then we can just bitcast.
46361 EVT MovmskVT = EVT::getIntegerVT(Ctx, NumElts);
46362 Movmsk = DAG.getBitcast(MovmskVT, Match);
46363 } else {
46364 // Use combineBitcastvxi1 to create the MOVMSK.
46365 while (NumElts > MaxElts) {
46366 SDValue Lo, Hi;
46367 std::tie(Lo, Hi) = DAG.SplitVector(Match, DL);
46368 Match = DAG.getNode(BinOp, DL, Lo.getValueType(), Lo, Hi);
46369 NumElts /= 2;
46370 }
46371 EVT MovmskVT = EVT::getIntegerVT(Ctx, NumElts);
46372 Movmsk = combineBitcastvxi1(DAG, MovmskVT, Match, DL, Subtarget);
46373 }
46374 if (!Movmsk)
46375 return SDValue();
46376 Movmsk = DAG.getZExtOrTrunc(Movmsk, DL, NumElts > 32 ? MVT::i64 : MVT::i32);
46377 } else {
46378 // FIXME: Better handling of k-registers or 512-bit vectors?
46379 unsigned MatchSizeInBits = Match.getValueSizeInBits();
46380 if (!(MatchSizeInBits == 128 ||
46381 (MatchSizeInBits == 256 && Subtarget.hasAVX())))
46382 return SDValue();
46383
46384 // Make sure this isn't a vector of 1 element. The perf win from using
46385 // MOVMSK diminishes with fewer elements in the reduction, but it is
46386 // generally better to get the comparison over to the GPRs as soon as
46387 // possible to reduce the number of vector ops.
46388 if (Match.getValueType().getVectorNumElements() < 2)
46389 return SDValue();
46390
46391 // Check that we are extracting a reduction of all sign bits.
46392 if (DAG.ComputeNumSignBits(Match) != BitWidth)
46393 return SDValue();
46394
46395 if (MatchSizeInBits == 256 && BitWidth < 32 && !Subtarget.hasInt256()) {
46396 SDValue Lo, Hi;
46397 std::tie(Lo, Hi) = DAG.SplitVector(Match, DL);
46398 Match = DAG.getNode(BinOp, DL, Lo.getValueType(), Lo, Hi);
46399 MatchSizeInBits = Match.getValueSizeInBits();
46400 }
46401
46402 // For 32/64 bit comparisons use MOVMSKPS/MOVMSKPD, else PMOVMSKB.
46403 MVT MaskSrcVT;
46404 if (64 == BitWidth || 32 == BitWidth)
46405 MaskSrcVT = MVT::getVectorVT(MVT::getFloatingPointVT(BitWidth),
46406 MatchSizeInBits / BitWidth);
46407 else
46408 MaskSrcVT = MVT::getVectorVT(MVT::i8, MatchSizeInBits / 8);
46409
46410 SDValue BitcastLogicOp = DAG.getBitcast(MaskSrcVT, Match);
46411 Movmsk = getPMOVMSKB(DL, BitcastLogicOp, DAG, Subtarget);
46412 NumElts = MaskSrcVT.getVectorNumElements();
46413 }
46414 assert((NumElts <= 32 || NumElts == 64) &&
46415 "Not expecting more than 64 elements");
46416
46417 MVT CmpVT = NumElts == 64 ? MVT::i64 : MVT::i32;
46418 if (BinOp == ISD::XOR) {
46419 // parity -> (PARITY(MOVMSK X))
46420 SDValue Result = DAG.getNode(ISD::PARITY, DL, CmpVT, Movmsk);
46421 return DAG.getZExtOrTrunc(Result, DL, ExtractVT);
46422 }
46423
46424 SDValue CmpC;
46425 ISD::CondCode CondCode;
46426 if (BinOp == ISD::OR) {
46427 // any_of -> MOVMSK != 0
46428 CmpC = DAG.getConstant(0, DL, CmpVT);
46429 CondCode = ISD::CondCode::SETNE;
46430 } else {
46431 // all_of -> MOVMSK == ((1 << NumElts) - 1)
46432 CmpC = DAG.getConstant(APInt::getLowBitsSet(CmpVT.getSizeInBits(), NumElts),
46433 DL, CmpVT);
46434 CondCode = ISD::CondCode::SETEQ;
46435 }
46436
46437 // The setcc produces an i8 of 0/1, so extend that to the result width and
46438 // negate to get the final 0/-1 mask value.
46439 EVT SetccVT = TLI.getSetCCResultType(DAG.getDataLayout(), Ctx, CmpVT);
46440 SDValue Setcc = DAG.getSetCC(DL, SetccVT, Movmsk, CmpC, CondCode);
46441 SDValue Zext = DAG.getZExtOrTrunc(Setcc, DL, ExtractVT);
46442 return DAG.getNegative(Zext, DL, ExtractVT);
46443}
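// Rough example of the lowering above (exact code depends on types/subtarget):
// an all_of over v4i32 compare results becomes
//   MOVMSKPS xmm -> gpr ; CMP gpr, 0xF ; SETE ; negate to 0/-1
// while an any_of becomes a TEST/SETNE against zero, and a vXi1 parity
// reduction becomes PARITY of the MOVMSK value.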
46444
46445static SDValue combineVPDPBUSDPattern(SDNode *Extract, SelectionDAG &DAG,
46446 const X86Subtarget &Subtarget) {
46447 if (!Subtarget.hasVNNI() && !Subtarget.hasAVXVNNI())
46448 return SDValue();
46449
46450 EVT ExtractVT = Extract->getValueType(0);
46451 // Verify the type we're extracting is i32, as the output element type of
46452 // vpdpbusd is i32.
46453 if (ExtractVT != MVT::i32)
46454 return SDValue();
46455
46456 EVT VT = Extract->getOperand(0).getValueType();
46457 if (!isPowerOf2_32(VT.getVectorNumElements()))
46458 return SDValue();
46459
46460 // Match shuffle + add pyramid.
46461 ISD::NodeType BinOp;
46462 SDValue Root = DAG.matchBinOpReduction(Extract, BinOp, {ISD::ADD});
46463
46464 // We can't combine to vpdpbusd for zext, because each of the 4 multiplies
46465 // done by vpdpbusd compute a signed 16-bit product that will be sign extended
46466 // before adding into the accumulator.
46467 // TODO:
46468 // We also need to verify that the multiply has at least 2x the number of bits
46469 // of the input. We shouldn't match
46470 // (sign_extend (mul (vXi9 (zext (vXi8 X))), (vXi9 (zext (vXi8 Y)))).
46471 // if (Root && (Root.getOpcode() == ISD::SIGN_EXTEND))
46472 // Root = Root.getOperand(0);
46473
46474 // If there was a match, we want Root to be a mul.
46475 if (!Root || Root.getOpcode() != ISD::MUL)
46476 return SDValue();
46477
46478 // Check whether we have an extend and mul pattern
46479 SDValue LHS, RHS;
46480 if (!detectExtMul(DAG, Root, LHS, RHS))
46481 return SDValue();
46482
46483 // Create the dot product instruction.
46484 SDLoc DL(Extract);
46485 unsigned StageBias;
46486 SDValue DP = createVPDPBUSD(DAG, LHS, RHS, StageBias, DL, Subtarget);
46487
46488 // If the original vector was wider than 4 elements, sum over the results
46489 // in the DP vector.
46490 unsigned Stages = Log2_32(VT.getVectorNumElements());
46491 EVT DpVT = DP.getValueType();
46492
46493 if (Stages > StageBias) {
46494 unsigned DpElems = DpVT.getVectorNumElements();
46495
46496 for (unsigned i = Stages - StageBias; i > 0; --i) {
46497 SmallVector<int, 16> Mask(DpElems, -1);
46498 for (unsigned j = 0, MaskEnd = 1 << (i - 1); j < MaskEnd; ++j)
46499 Mask[j] = MaskEnd + j;
46500
46501 SDValue Shuffle =
46502 DAG.getVectorShuffle(DpVT, DL, DP, DAG.getUNDEF(DpVT), Mask);
46503 DP = DAG.getNode(ISD::ADD, DL, DpVT, DP, Shuffle);
46504 }
46505 }
46506
46507 // Return the lowest ExtractSizeInBits bits.
46508 EVT ResVT =
46509 EVT::getVectorVT(*DAG.getContext(), ExtractVT,
46510 DpVT.getSizeInBits() / ExtractVT.getSizeInBits());
46511 DP = DAG.getBitcast(ResVT, DP);
46512 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ExtractVT, DP,
46513 Extract->getOperand(1));
46514}
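// Note (illustrative): VPDPBUSD multiplies unsigned i8 elements of one operand
// with signed i8 elements of the other and accumulates each group of four
// products into an i32 lane. A 16-element i8 dot-product reduction therefore
// becomes one VPDPBUSD over a zero accumulator plus a short shuffle+add tree
// over the remaining i32 lanes (see createVPDPBUSD for the exact construction).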
46515
46516static SDValue combineBasicSADPattern(SDNode *Extract, SelectionDAG &DAG,
46517 const X86Subtarget &Subtarget) {
46518 using namespace SDPatternMatch;
46519
46520 // PSADBW is only supported on SSE2 and up.
46521 if (!Subtarget.hasSSE2())
46522 return SDValue();
46523
46524 EVT ExtractVT = Extract->getValueType(0);
46525 if (ExtractVT != MVT::i8 && ExtractVT != MVT::i16 && ExtractVT != MVT::i32 &&
46526 ExtractVT != MVT::i64)
46527 return SDValue();
46528
46529 EVT VT = Extract->getOperand(0).getValueType();
46530 if (!isPowerOf2_32(VT.getVectorNumElements()))
46531 return SDValue();
46532
46533 // Match shuffle + add pyramid.
46534 ISD::NodeType BinOp;
46535 SDValue Root = DAG.matchBinOpReduction(Extract, BinOp, {ISD::ADD});
46536 if (!Root)
46537 return SDValue();
46538
46539 // The operand is expected to be zero extended from i8.
46540 // In order to convert to i64 and above, additional any/zero/sign
46541 // extend is expected.
46542 // The zero extend from 32 bit has no mathematical effect on the result.
46543 // Also the sign extend is basically a zero extend
46544 // (extends the sign bit which is zero).
46545 // So it is correct to skip the sign/zero extend instruction.
46546 if (Root.getOpcode() == ISD::SIGN_EXTEND ||
46547 Root.getOpcode() == ISD::ZERO_EXTEND ||
46548 Root.getOpcode() == ISD::ANY_EXTEND)
46549 Root = Root.getOperand(0);
46550
46551 // Check whether we have a vXi8 abdu pattern.
46552 // TODO: Just match ISD::ABDU once the DAG is topological sorted.
46553 SDValue Src0, Src1;
46554 if (!sd_match(
46555 Root,
46556 m_AnyOf(
46557 m_SpecificVectorElementVT(
46558 MVT::i8, m_c_BinOp(ISD::ABDU, m_Value(Src0), m_Value(Src1))),
46559 m_SpecificVectorElementVT(
46560 MVT::i8, m_Sub(m_UMax(m_Value(Src0), m_Value(Src1)),
46561 m_UMin(m_Deferred(Src0), m_Deferred(Src1)))),
46562 m_Abs(
46563 m_Sub(m_AllOf(m_Value(Src0),
46564 m_ZExt(m_SpecificVectorElementVT(MVT::i8))),
46565 m_AllOf(m_Value(Src1),
46566 m_ZExt(m_SpecificVectorElementVT(MVT::i8))))))))
46567 return SDValue();
46568
46569 // Create the SAD instruction.
46570 SDLoc DL(Extract);
46571 SDValue SAD = createPSADBW(DAG, Src0, Src1, DL, Subtarget);
46572
46573 // If the original vector was wider than 8 elements, sum over the results
46574 // in the SAD vector.
46575 unsigned Stages = Log2_32(VT.getVectorNumElements());
46576 EVT SadVT = SAD.getValueType();
46577 if (Stages > 3) {
46578 unsigned SadElems = SadVT.getVectorNumElements();
46579
46580 for(unsigned i = Stages - 3; i > 0; --i) {
46581 SmallVector<int, 16> Mask(SadElems, -1);
46582 for(unsigned j = 0, MaskEnd = 1 << (i - 1); j < MaskEnd; ++j)
46583 Mask[j] = MaskEnd + j;
46584
46585 SDValue Shuffle =
46586 DAG.getVectorShuffle(SadVT, DL, SAD, DAG.getUNDEF(SadVT), Mask);
46587 SAD = DAG.getNode(ISD::ADD, DL, SadVT, SAD, Shuffle);
46588 }
46589 }
46590
46591 unsigned ExtractSizeInBits = ExtractVT.getSizeInBits();
46592 // Return the lowest ExtractSizeInBits bits.
46593 EVT ResVT = EVT::getVectorVT(*DAG.getContext(), ExtractVT,
46594 SadVT.getSizeInBits() / ExtractSizeInBits);
46595 SAD = DAG.getBitcast(ResVT, SAD);
46596 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ExtractVT, SAD,
46597 Extract->getOperand(1));
46598}
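// Note (illustrative): PSADBW sums the absolute differences of eight byte
// pairs into the low 16 bits of each i64 lane. So a v16i8 sum-of-absolute-
// differences reduction becomes a single PSADBW followed by one shuffle+add to
// combine the two i64 partial sums; wider sources simply need more shuffle+add
// stages, as computed from Log2_32(NumElts) above.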
46599
46600// If this extract is from a loaded vector value and will be used as an
46601// integer, that requires a potentially expensive XMM -> GPR transfer.
46602// Additionally, if we can convert to a scalar integer load, that will likely
46603// be folded into a subsequent integer op.
46604// Note: SrcVec might not have a VecVT type, but it must be the same size.
46605// Note: Unlike the related fold for this in DAGCombiner, this is not limited
46606// to a single-use of the loaded vector. For the reasons above, we
46607// expect this to be profitable even if it creates an extra load.
46608static SDValue
46609combineExtractFromVectorLoad(SDNode *N, EVT VecVT, SDValue SrcVec, uint64_t Idx,
46610 const SDLoc &dl, SelectionDAG &DAG,
46611 TargetLowering::DAGCombinerInfo &DCI) {
46612 assert(N->getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
46613 "Only EXTRACT_VECTOR_ELT supported so far");
46614
46615 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
46616 EVT VT = N->getValueType(0);
46617
46618 bool LikelyUsedAsVector = any_of(N->users(), [](SDNode *Use) {
46619 return Use->getOpcode() == ISD::STORE ||
46620 Use->getOpcode() == ISD::INSERT_VECTOR_ELT ||
46621 Use->getOpcode() == ISD::SCALAR_TO_VECTOR;
46622 });
46623
46624 auto *LoadVec = dyn_cast<LoadSDNode>(SrcVec);
46625 if (LoadVec && ISD::isNormalLoad(LoadVec) && VT.isInteger() &&
46626 VecVT.getVectorElementType() == VT &&
46627 VecVT.getSizeInBits() == SrcVec.getValueSizeInBits() &&
46628 DCI.isAfterLegalizeDAG() && !LikelyUsedAsVector && LoadVec->isSimple()) {
46629 SDValue NewPtr = TLI.getVectorElementPointer(
46630 DAG, LoadVec->getBasePtr(), VecVT, DAG.getVectorIdxConstant(Idx, dl));
46631 unsigned PtrOff = VT.getSizeInBits() * Idx / 8;
46632 MachinePointerInfo MPI = LoadVec->getPointerInfo().getWithOffset(PtrOff);
46633 Align Alignment = commonAlignment(LoadVec->getAlign(), PtrOff);
46634 SDValue Load =
46635 DAG.getLoad(VT, dl, LoadVec->getChain(), NewPtr, MPI, Alignment,
46636 LoadVec->getMemOperand()->getFlags(), LoadVec->getAAInfo());
46637 DAG.makeEquivalentMemoryOrdering(LoadVec, Load);
46638 return Load;
46639 }
46640
46641 return SDValue();
46642}
46643
46644// Attempt to peek through a target shuffle and extract the scalar from the
46645// source.
46646static SDValue combineExtractWithShuffle(SDNode *N, SelectionDAG &DAG,
46647 TargetLowering::DAGCombinerInfo &DCI,
46648 const X86Subtarget &Subtarget) {
46649 if (DCI.isBeforeLegalizeOps())
46650 return SDValue();
46651
46652 SDLoc dl(N);
46653 SDValue Src = N->getOperand(0);
46654 SDValue Idx = N->getOperand(1);
46655
46656 EVT VT = N->getValueType(0);
46657 EVT SrcVT = Src.getValueType();
46658 EVT SrcSVT = SrcVT.getVectorElementType();
46659 unsigned SrcEltBits = SrcSVT.getSizeInBits();
46660 unsigned NumSrcElts = SrcVT.getVectorNumElements();
46661
46662 // Don't attempt this for boolean mask vectors or unknown extraction indices.
46663 if (SrcSVT == MVT::i1 || !isa<ConstantSDNode>(Idx))
46664 return SDValue();
46665
46666 const APInt &IdxC = N->getConstantOperandAPInt(1);
46667 if (IdxC.uge(NumSrcElts))
46668 return SDValue();
46669
46670 SDValue SrcBC = peekThroughBitcasts(Src);
46671
46672 // Handle extract(bitcast(broadcast(scalar_value))).
46673 if (X86ISD::VBROADCAST == SrcBC.getOpcode()) {
46674 SDValue SrcOp = SrcBC.getOperand(0);
46675 EVT SrcOpVT = SrcOp.getValueType();
46676 if (SrcOpVT.isScalarInteger() && VT.isInteger() &&
46677 (SrcOpVT.getSizeInBits() % SrcEltBits) == 0) {
46678 unsigned Scale = SrcOpVT.getSizeInBits() / SrcEltBits;
46679 unsigned Offset = IdxC.urem(Scale) * SrcEltBits;
46680 // TODO support non-zero offsets.
46681 if (Offset == 0) {
46682 SrcOp = DAG.getZExtOrTrunc(SrcOp, dl, SrcVT.getScalarType());
46683 SrcOp = DAG.getZExtOrTrunc(SrcOp, dl, VT);
46684 return SrcOp;
46685 }
46686 }
46687 }
46688
46689 // If we're extracting a single element from a broadcast load and there are
46690 // no other users, just create a single load.
46691 if (SrcBC.getOpcode() == X86ISD::VBROADCAST_LOAD &&
46692 SrcBC.hasOneUse()) {
46693 auto *MemIntr = cast<MemIntrinsicSDNode>(SrcBC);
46694 unsigned SrcBCWidth = SrcBC.getScalarValueSizeInBits();
46695 if (MemIntr->getMemoryVT().getSizeInBits() == SrcBCWidth &&
46696 VT.getSizeInBits() == SrcBCWidth && SrcEltBits == SrcBCWidth) {
46697 SDValue Load =
46698 DAG.getLoad(VT, dl, MemIntr->getChain(), MemIntr->getBasePtr(),
46699 MemIntr->getPointerInfo(), MemIntr->getBaseAlign(),
46700 MemIntr->getMemOperand()->getFlags());
46701 DAG.ReplaceAllUsesOfValueWith(SDValue(MemIntr, 1), Load.getValue(1));
46702 return Load;
46703 }
46704 }
46705
46706 // Handle extract(bitcast(scalar_to_vector(scalar_value))) for integers.
46707 // TODO: Move to DAGCombine?
46708 if (SrcBC.getOpcode() == ISD::SCALAR_TO_VECTOR && VT.isInteger() &&
46709 SrcBC.getValueType().isInteger() &&
46710 (SrcBC.getScalarValueSizeInBits() % SrcEltBits) == 0 &&
46711 SrcBC.getScalarValueSizeInBits() ==
46712 SrcBC.getOperand(0).getValueSizeInBits()) {
46713 unsigned Scale = SrcBC.getScalarValueSizeInBits() / SrcEltBits;
46714 if (IdxC.ult(Scale)) {
46715 unsigned Offset = IdxC.getZExtValue() * SrcVT.getScalarSizeInBits();
46716 SDValue Scl = SrcBC.getOperand(0);
46717 EVT SclVT = Scl.getValueType();
46718 if (Offset) {
46719 Scl = DAG.getNode(ISD::SRL, dl, SclVT, Scl,
46720 DAG.getShiftAmountConstant(Offset, SclVT, dl));
46721 }
46722 Scl = DAG.getZExtOrTrunc(Scl, dl, SrcVT.getScalarType());
46723 Scl = DAG.getZExtOrTrunc(Scl, dl, VT);
46724 return Scl;
46725 }
46726 }
46727
46728 // Handle extract(truncate(x)) for 0'th index.
46729 // TODO: Treat this as a faux shuffle?
46730 // TODO: When can we use this for general indices?
46731 if (ISD::TRUNCATE == Src.getOpcode() && IdxC == 0 &&
46732 (SrcVT.getSizeInBits() % 128) == 0) {
46733 Src = extract128BitVector(Src.getOperand(0), 0, DAG, dl);
46734 MVT ExtractVT = MVT::getVectorVT(SrcSVT.getSimpleVT(), 128 / SrcEltBits);
46735 return DAG.getNode(N->getOpcode(), dl, VT, DAG.getBitcast(ExtractVT, Src),
46736 Idx);
46737 }
46738
46739 // We can only legally extract other elements from 128-bit vectors and in
46740 // certain circumstances, depending on SSE-level.
46741 // TODO: Investigate float/double extraction if it will be just stored.
46742 auto GetLegalExtract = [&Subtarget, &DAG, &dl](SDValue Vec, EVT VecVT,
46743 unsigned Idx) {
46744 EVT VecSVT = VecVT.getScalarType();
46745 if ((VecVT.is256BitVector() || VecVT.is512BitVector()) &&
46746 (VecSVT == MVT::i8 || VecSVT == MVT::i16 || VecSVT == MVT::i32 ||
46747 VecSVT == MVT::i64)) {
46748 unsigned EltSizeInBits = VecSVT.getSizeInBits();
46749 unsigned NumEltsPerLane = 128 / EltSizeInBits;
46750 unsigned LaneOffset = (Idx & ~(NumEltsPerLane - 1)) * EltSizeInBits;
46751 unsigned LaneIdx = LaneOffset / Vec.getScalarValueSizeInBits();
46752 VecVT = EVT::getVectorVT(*DAG.getContext(), VecSVT, NumEltsPerLane);
46753 Vec = extract128BitVector(Vec, LaneIdx, DAG, dl);
46754 Idx &= (NumEltsPerLane - 1);
46755 }
46756 if ((VecVT == MVT::v4i32 || VecVT == MVT::v2i64) &&
46757 ((Idx == 0 && Subtarget.hasSSE2()) || Subtarget.hasSSE41())) {
46758 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VecVT.getScalarType(),
46759 DAG.getBitcast(VecVT, Vec),
46760 DAG.getVectorIdxConstant(Idx, dl));
46761 }
46762 if ((VecVT == MVT::v8i16 && Subtarget.hasSSE2()) ||
46763 (VecVT == MVT::v16i8 && Subtarget.hasSSE41())) {
46764 unsigned OpCode = (VecVT == MVT::v8i16 ? X86ISD::PEXTRW : X86ISD::PEXTRB);
46765 return DAG.getNode(OpCode, dl, MVT::i32, DAG.getBitcast(VecVT, Vec),
46766 DAG.getTargetConstant(Idx, dl, MVT::i8));
46767 }
46768 return SDValue();
46769 };
46770
46771 // Resolve the target shuffle inputs and mask.
46772 SmallVector<int, 16> Mask;
46773 SmallVector<SDValue, 2> Ops;
46774 if (!getTargetShuffleInputs(SrcBC, Ops, Mask, DAG))
46775 return SDValue();
46776
46777 // Shuffle inputs must be the same size as the result.
46778 if (llvm::any_of(Ops, [SrcVT](SDValue Op) {
46779 return SrcVT.getSizeInBits() != Op.getValueSizeInBits();
46780 }))
46781 return SDValue();
46782
46783 // Attempt to narrow/widen the shuffle mask to the correct size.
46784 if (Mask.size() != NumSrcElts) {
46785 if ((NumSrcElts % Mask.size()) == 0) {
46786 SmallVector<int, 16> ScaledMask;
46787 int Scale = NumSrcElts / Mask.size();
46788 narrowShuffleMaskElts(Scale, Mask, ScaledMask);
46789 Mask = std::move(ScaledMask);
46790 } else if ((Mask.size() % NumSrcElts) == 0) {
46791 // Simplify Mask based on demanded element.
46792 int ExtractIdx = (int)IdxC.getZExtValue();
46793 int Scale = Mask.size() / NumSrcElts;
46794 int Lo = Scale * ExtractIdx;
46795 int Hi = Scale * (ExtractIdx + 1);
46796 for (int i = 0, e = (int)Mask.size(); i != e; ++i)
46797 if (i < Lo || Hi <= i)
46798 Mask[i] = SM_SentinelUndef;
46799
46800 SmallVector<int, 16> WidenedMask;
46801 while (Mask.size() > NumSrcElts &&
46802 canWidenShuffleElements(Mask, WidenedMask))
46803 Mask = std::move(WidenedMask);
46804 }
46805 }
46806
46807 // If narrowing/widening failed, see if we can extract+zero-extend.
46808 int ExtractIdx;
46809 EVT ExtractVT;
46810 if (Mask.size() == NumSrcElts) {
46811 ExtractIdx = Mask[IdxC.getZExtValue()];
46812 ExtractVT = SrcVT;
46813 } else {
46814 unsigned Scale = Mask.size() / NumSrcElts;
46815 if ((Mask.size() % NumSrcElts) != 0 || SrcVT.isFloatingPoint())
46816 return SDValue();
46817 unsigned ScaledIdx = Scale * IdxC.getZExtValue();
46818 if (!isUndefOrZeroInRange(Mask, ScaledIdx + 1, Scale - 1))
46819 return SDValue();
46820 ExtractIdx = Mask[ScaledIdx];
46821 EVT ExtractSVT = EVT::getIntegerVT(*DAG.getContext(), SrcEltBits / Scale);
46822 ExtractVT = EVT::getVectorVT(*DAG.getContext(), ExtractSVT, Mask.size());
46823 assert(SrcVT.getSizeInBits() == ExtractVT.getSizeInBits() &&
46824 "Failed to widen vector type");
46825 }
46826
46827 // If the shuffle source element is undef/zero then we can just accept it.
46828 if (ExtractIdx == SM_SentinelUndef)
46829 return DAG.getUNDEF(VT);
46830
46831 if (ExtractIdx == SM_SentinelZero)
46832 return VT.isFloatingPoint() ? DAG.getConstantFP(0.0, dl, VT)
46833 : DAG.getConstant(0, dl, VT);
46834
46835 SDValue SrcOp = Ops[ExtractIdx / Mask.size()];
46836 ExtractIdx = ExtractIdx % Mask.size();
46837 if (SDValue V = GetLegalExtract(SrcOp, ExtractVT, ExtractIdx))
46838 return DAG.getZExtOrTrunc(V, dl, VT);
46839
46840 if (N->getOpcode() == ISD::EXTRACT_VECTOR_ELT && ExtractVT == SrcVT)
46841 if (SDValue V = combineExtractFromVectorLoad(
46842 N, SrcVT, peekThroughBitcasts(SrcOp), ExtractIdx, dl, DAG, DCI))
46843 return V;
46844
46845 return SDValue();
46846}
46847
46848/// Extracting a scalar FP value from vector element 0 is free, so extract each
46849/// operand first, then perform the math as a scalar op.
46850static SDValue scalarizeExtEltFP(SDNode *ExtElt, SelectionDAG &DAG,
46851 const X86Subtarget &Subtarget,
46852 TargetLowering::DAGCombinerInfo &DCI) {
46853 assert(ExtElt->getOpcode() == ISD::EXTRACT_VECTOR_ELT && "Expected extract");
46854 SDValue Vec = ExtElt->getOperand(0);
46855 SDValue Index = ExtElt->getOperand(1);
46856 EVT VT = ExtElt->getValueType(0);
46857 EVT VecVT = Vec.getValueType();
46858
46859 // TODO: If this is a unary/expensive/expand op, allow extraction from a
46860 // non-zero element because the shuffle+scalar op will be cheaper?
46861 if (!Vec.hasOneUse() || !isNullConstant(Index) || VecVT.getScalarType() != VT)
46862 return SDValue();
46863
46864 // Vector FP compares don't fit the pattern of FP math ops (propagate, not
46865 // extract, the condition code), so deal with those as a special-case.
46866 if (Vec.getOpcode() == ISD::SETCC && VT == MVT::i1) {
46867 EVT OpVT = Vec.getOperand(0).getValueType().getScalarType();
46868 if (OpVT != MVT::f32 && OpVT != MVT::f64)
46869 return SDValue();
46870
46871 // extract (setcc X, Y, CC), 0 --> setcc (extract X, 0), (extract Y, 0), CC
46872 SDLoc DL(ExtElt);
46873 SDValue Ext0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, OpVT,
46874 Vec.getOperand(0), Index);
46875 SDValue Ext1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, OpVT,
46876 Vec.getOperand(1), Index);
46877 return DAG.getNode(Vec.getOpcode(), DL, VT, Ext0, Ext1, Vec.getOperand(2));
46878 }
46879
46880 if (!(VT == MVT::f16 && Subtarget.hasFP16()) && VT != MVT::f32 &&
46881 VT != MVT::f64)
46882 return SDValue();
46883
46884 // Vector FP selects don't fit the pattern of FP math ops (because the
46885 // condition has a different type and we have to change the opcode), so deal
46886 // with those here.
46887 // FIXME: This is restricted to pre type legalization. If we loosen this we
46888 // need to convert vector bool to a scalar bool.
46889 if (DCI.isBeforeLegalize() && Vec.getOpcode() == ISD::VSELECT &&
46890 Vec.getOperand(0).getOpcode() == ISD::SETCC &&
46891 Vec.getOperand(0).getOperand(0).getValueType() == VecVT &&
46892 Vec.getOperand(0).getValueType().getScalarType() == MVT::i1) {
46893 // ext (sel Cond, X, Y), 0 --> sel (ext Cond, 0), (ext X, 0), (ext Y, 0)
46894 SDLoc DL(ExtElt);
46895 SDValue Ext0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL,
46896 Vec.getOperand(0).getValueType().getScalarType(),
46897 Vec.getOperand(0), Index);
46898 SDValue Ext1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT,
46899 Vec.getOperand(1), Index);
46900 SDValue Ext2 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT,
46901 Vec.getOperand(2), Index);
46902 return DAG.getNode(ISD::SELECT, DL, VT, Ext0, Ext1, Ext2);
46903 }
46904
46905 // TODO: This switch could include FNEG and the x86-specific FP logic ops
46906 // (FAND, FANDN, FOR, FXOR). But that may require enhancements to avoid
46907 // missed load folding and fma+fneg combining.
46908 switch (Vec.getOpcode()) {
46909 case ISD::FMA: // Begin 3 operands
46910 case ISD::FMAD:
46911 case ISD::FADD: // Begin 2 operands
46912 case ISD::FSUB:
46913 case ISD::FMUL:
46914 case ISD::FDIV:
46915 case ISD::FREM:
46916 case ISD::FCOPYSIGN:
46917 case ISD::FMINNUM:
46918 case ISD::FMAXNUM:
46919 case ISD::FMINNUM_IEEE:
46920 case ISD::FMAXNUM_IEEE:
46921 case ISD::FMAXIMUM:
46922 case ISD::FMINIMUM:
46923 case ISD::FMAXIMUMNUM:
46924 case ISD::FMINIMUMNUM:
46925 case X86ISD::FMAX:
46926 case X86ISD::FMIN:
46927 case ISD::FABS: // Begin 1 operand
46928 case ISD::FSQRT:
46929 case ISD::FRINT:
46930 case ISD::FCEIL:
46931 case ISD::FTRUNC:
46932 case ISD::FNEARBYINT:
46933 case ISD::FROUNDEVEN:
46934 case ISD::FROUND:
46935 case ISD::FFLOOR:
46936 case X86ISD::FRCP:
46937 case X86ISD::FRSQRT: {
46938 // extract (fp X, Y, ...), 0 --> fp (extract X, 0), (extract Y, 0), ...
46939 SDLoc DL(ExtElt);
46940 SmallVector<SDValue, 4> ExtOps;
46941 for (SDValue Op : Vec->ops())
46942 ExtOps.push_back(DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, Op, Index));
46943 return DAG.getNode(Vec.getOpcode(), DL, VT, ExtOps);
46944 }
46945 default:
46946 return SDValue();
46947 }
46948 llvm_unreachable("All opcodes should return within switch");
46949}
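// For example (assuming lane 0 is free to extract, as it is on x86):
//   extractelement (fadd v4f32 X, Y), 0
//     --> fadd f32 (extractelement X, 0), (extractelement Y, 0)
// which lets a full-width vector op become a single scalar ADDSS when only
// element 0 of the result is actually used.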
46950
46951/// Try to convert a vector reduction sequence composed of binops and shuffles
46952/// into horizontal ops.
46953static SDValue combineArithReduction(SDNode *ExtElt, SelectionDAG &DAG,
46954 const X86Subtarget &Subtarget) {
46955 assert(ExtElt->getOpcode() == ISD::EXTRACT_VECTOR_ELT && "Unexpected caller");
46956
46957 // We need at least SSE2 to do anything here.
46958 if (!Subtarget.hasSSE2())
46959 return SDValue();
46960
46961 ISD::NodeType Opc;
46962 SDValue Rdx = DAG.matchBinOpReduction(ExtElt, Opc,
46963 {ISD::ADD, ISD::MUL, ISD::FADD}, true);
46964 if (!Rdx)
46965 return SDValue();
46966
46967 SDValue Index = ExtElt->getOperand(1);
46968 assert(isNullConstant(Index) &&
46969 "Reduction doesn't end in an extract from index 0");
46970
46971 EVT VT = ExtElt->getValueType(0);
46972 EVT VecVT = Rdx.getValueType();
46973 if (VecVT.getScalarType() != VT)
46974 return SDValue();
46975
46976 SDLoc DL(ExtElt);
46977 unsigned NumElts = VecVT.getVectorNumElements();
46978 unsigned EltSizeInBits = VecVT.getScalarSizeInBits();
46979
46980 // Extend v4i8/v8i8 vector to v16i8, with undef upper 64-bits.
46981 auto WidenToV16I8 = [&](SDValue V, bool ZeroExtend) {
46982 if (V.getValueType() == MVT::v4i8) {
46983 if (ZeroExtend && Subtarget.hasSSE41()) {
46984 V = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, MVT::v4i32,
46985 DAG.getConstant(0, DL, MVT::v4i32),
46986 DAG.getBitcast(MVT::i32, V),
46987 DAG.getVectorIdxConstant(0, DL));
46988 return DAG.getBitcast(MVT::v16i8, V);
46989 }
46990 V = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v8i8, V,
46991 ZeroExtend ? DAG.getConstant(0, DL, MVT::v4i8)
46992 : DAG.getUNDEF(MVT::v4i8));
46993 }
46994 return DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v16i8, V,
46995 DAG.getUNDEF(MVT::v8i8));
46996 };
46997
46998 // vXi8 mul reduction - promote to vXi16 mul reduction.
46999 if (Opc == ISD::MUL) {
47000 if (VT != MVT::i8 || NumElts < 4 || !isPowerOf2_32(NumElts))
47001 return SDValue();
47002 if (VecVT.getSizeInBits() >= 128) {
47003 EVT WideVT = EVT::getVectorVT(*DAG.getContext(), MVT::i16, NumElts / 2);
47004 SDValue Lo = getUnpackl(DAG, DL, VecVT, Rdx, DAG.getUNDEF(VecVT));
47005 SDValue Hi = getUnpackh(DAG, DL, VecVT, Rdx, DAG.getUNDEF(VecVT));
47006 Lo = DAG.getBitcast(WideVT, Lo);
47007 Hi = DAG.getBitcast(WideVT, Hi);
47008 Rdx = DAG.getNode(Opc, DL, WideVT, Lo, Hi);
47009 while (Rdx.getValueSizeInBits() > 128) {
47010 std::tie(Lo, Hi) = splitVector(Rdx, DAG, DL);
47011 Rdx = DAG.getNode(Opc, DL, Lo.getValueType(), Lo, Hi);
47012 }
47013 } else {
47014 Rdx = WidenToV16I8(Rdx, false);
47015 Rdx = getUnpackl(DAG, DL, MVT::v16i8, Rdx, DAG.getUNDEF(MVT::v16i8));
47016 Rdx = DAG.getBitcast(MVT::v8i16, Rdx);
47017 }
47018 if (NumElts >= 8)
47019 Rdx = DAG.getNode(Opc, DL, MVT::v8i16, Rdx,
47020 DAG.getVectorShuffle(MVT::v8i16, DL, Rdx, Rdx,
47021 {4, 5, 6, 7, -1, -1, -1, -1}));
47022 Rdx = DAG.getNode(Opc, DL, MVT::v8i16, Rdx,
47023 DAG.getVectorShuffle(MVT::v8i16, DL, Rdx, Rdx,
47024 {2, 3, -1, -1, -1, -1, -1, -1}));
47025 Rdx = DAG.getNode(Opc, DL, MVT::v8i16, Rdx,
47026 DAG.getVectorShuffle(MVT::v8i16, DL, Rdx, Rdx,
47027 {1, -1, -1, -1, -1, -1, -1, -1}));
47028 Rdx = DAG.getBitcast(MVT::v16i8, Rdx);
47029 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, Rdx, Index);
47030 }
47031
47032 // vXi8 add reduction - sub 128-bit vector.
47033 if (VecVT == MVT::v4i8 || VecVT == MVT::v8i8) {
47034 Rdx = WidenToV16I8(Rdx, true);
47035 Rdx = DAG.getNode(X86ISD::PSADBW, DL, MVT::v2i64, Rdx,
47036 DAG.getConstant(0, DL, MVT::v16i8));
47037 Rdx = DAG.getBitcast(MVT::v16i8, Rdx);
47038 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, Rdx, Index);
47039 }
47040
47041 // Must be a >=128-bit vector with pow2 elements.
47042 if ((VecVT.getSizeInBits() % 128) != 0 || !isPowerOf2_32(NumElts))
47043 return SDValue();
47044
47045 // vXi8 add reduction - sum lo/hi halves then use PSADBW.
47046 if (VT == MVT::i8) {
47047 while (Rdx.getValueSizeInBits() > 128) {
47048 SDValue Lo, Hi;
47049 std::tie(Lo, Hi) = splitVector(Rdx, DAG, DL);
47050 VecVT = Lo.getValueType();
47051 Rdx = DAG.getNode(ISD::ADD, DL, VecVT, Lo, Hi);
47052 }
47053 assert(VecVT == MVT::v16i8 && "v16i8 reduction expected");
47054
47055 SDValue Hi = DAG.getVectorShuffle(
47056 MVT::v16i8, DL, Rdx, Rdx,
47057 {8, 9, 10, 11, 12, 13, 14, 15, -1, -1, -1, -1, -1, -1, -1, -1});
47058 Rdx = DAG.getNode(ISD::ADD, DL, MVT::v16i8, Rdx, Hi);
47059 Rdx = DAG.getNode(X86ISD::PSADBW, DL, MVT::v2i64, Rdx,
47060 getZeroVector(MVT::v16i8, Subtarget, DAG, DL));
47061 Rdx = DAG.getBitcast(MVT::v16i8, Rdx);
47062 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, Rdx, Index);
47063 }
47064
47065 // See if we can use vXi8 PSADBW add reduction for larger zext types.
47066 // If the source vector values are 0-255, then we can use PSADBW to
47067 // sum+zext v8i8 subvectors to vXi64, then perform the reduction.
47068 // TODO: See if it's worth avoiding vXi16/i32 truncations?
47069 if (Opc == ISD::ADD && NumElts >= 4 && EltSizeInBits >= 16 &&
47070 DAG.computeKnownBits(Rdx).getMaxValue().ule(255) &&
47071 (EltSizeInBits == 16 || Rdx.getOpcode() == ISD::ZERO_EXTEND ||
47072 Subtarget.hasAVX512())) {
47073 if (Rdx.getValueType() == MVT::v8i16) {
47074 Rdx = DAG.getNode(X86ISD::PACKUS, DL, MVT::v16i8, Rdx,
47075 DAG.getUNDEF(MVT::v8i16));
47076 } else {
47077 EVT ByteVT = VecVT.changeVectorElementType(MVT::i8);
47078 Rdx = DAG.getNode(ISD::TRUNCATE, DL, ByteVT, Rdx);
47079 if (ByteVT.getSizeInBits() < 128)
47080 Rdx = WidenToV16I8(Rdx, true);
47081 }
47082
47083 // Build the PSADBW, split as 128/256/512 bits for SSE/AVX2/AVX512BW.
47084 auto PSADBWBuilder = [](SelectionDAG &DAG, const SDLoc &DL,
47085 ArrayRef<SDValue> Ops) {
47086 MVT VT = MVT::getVectorVT(MVT::i64, Ops[0].getValueSizeInBits() / 64);
47087 SDValue Zero = DAG.getConstant(0, DL, Ops[0].getValueType());
47088 return DAG.getNode(X86ISD::PSADBW, DL, VT, Ops[0], Zero);
47089 };
47090 MVT SadVT = MVT::getVectorVT(MVT::i64, Rdx.getValueSizeInBits() / 64);
47091 Rdx = SplitOpsAndApply(DAG, Subtarget, DL, SadVT, {Rdx}, PSADBWBuilder);
47092
47093 // TODO: We could truncate to vXi16/vXi32 before performing the reduction.
47094 while (Rdx.getValueSizeInBits() > 128) {
47095 SDValue Lo, Hi;
47096 std::tie(Lo, Hi) = splitVector(Rdx, DAG, DL);
47097 VecVT = Lo.getValueType();
47098 Rdx = DAG.getNode(ISD::ADD, DL, VecVT, Lo, Hi);
47099 }
47100 assert(Rdx.getValueType() == MVT::v2i64 && "v2i64 reduction expected");
47101
47102 if (NumElts > 8) {
47103 SDValue RdxHi = DAG.getVectorShuffle(MVT::v2i64, DL, Rdx, Rdx, {1, -1});
47104 Rdx = DAG.getNode(ISD::ADD, DL, MVT::v2i64, Rdx, RdxHi);
47105 }
47106
47107 VecVT = MVT::getVectorVT(VT.getSimpleVT(), 128 / VT.getSizeInBits());
47108 Rdx = DAG.getBitcast(VecVT, Rdx);
47109 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, Rdx, Index);
47110 }
47111
47112 // Only use (F)HADD opcodes if they aren't microcoded or we are minimizing codesize.
47113 if (!shouldUseHorizontalOp(true, DAG, Subtarget))
47114 return SDValue();
47115
47116 unsigned HorizOpcode = Opc == ISD::ADD ? X86ISD::HADD : X86ISD::FHADD;
47117
47118 // 256-bit horizontal instructions operate on 128-bit chunks rather than
47119 // across the whole vector, so we need an extract + hop preliminary stage.
47120 // This is the only step where the operands of the hop are not the same value.
47121 // TODO: We could extend this to handle 512-bit or even longer vectors.
47122 if (((VecVT == MVT::v16i16 || VecVT == MVT::v8i32) && Subtarget.hasSSSE3()) ||
47123 ((VecVT == MVT::v8f32 || VecVT == MVT::v4f64) && Subtarget.hasSSE3())) {
47124 unsigned NumElts = VecVT.getVectorNumElements();
47125 SDValue Hi = extract128BitVector(Rdx, NumElts / 2, DAG, DL);
47126 SDValue Lo = extract128BitVector(Rdx, 0, DAG, DL);
47127 Rdx = DAG.getNode(HorizOpcode, DL, Lo.getValueType(), Hi, Lo);
47128 VecVT = Rdx.getValueType();
47129 }
47130 if (!((VecVT == MVT::v8i16 || VecVT == MVT::v4i32) && Subtarget.hasSSSE3()) &&
47131 !((VecVT == MVT::v4f32 || VecVT == MVT::v2f64) && Subtarget.hasSSE3()))
47132 return SDValue();
47133
47134 // extract (add (shuf X), X), 0 --> extract (hadd X, X), 0
47135 unsigned ReductionSteps = Log2_32(VecVT.getVectorNumElements());
47136 for (unsigned i = 0; i != ReductionSteps; ++i)
47137 Rdx = DAG.getNode(HorizOpcode, DL, VecVT, Rdx, Rdx);
47138
47139 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, Rdx, Index);
47140}
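// Illustrative shapes for the horizontal-op path above (only taken when
// shouldUseHorizontalOp says the hops are profitable):
//   v4i32 add reduction  -> PHADDD x,x ; PHADDD x,x ; extract lane 0
//   v8f32 fadd reduction -> extract high 128b ; HADDPS hi,lo ; 2 x HADDPS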
47141
47142/// Detect vector gather/scatter index generation and convert it from being a
47143/// bunch of shuffles and extracts into a somewhat faster sequence.
47144/// For i686, the best sequence is apparently storing the value and loading
47145/// scalars back, while for x64 we should use 64-bit extracts and shifts.
47146static SDValue combineExtractVectorElt(SDNode *N, SelectionDAG &DAG,
47147 TargetLowering::DAGCombinerInfo &DCI,
47148 const X86Subtarget &Subtarget) {
47149 if (SDValue NewOp = combineExtractWithShuffle(N, DAG, DCI, Subtarget))
47150 return NewOp;
47151
47152 SDValue InputVector = N->getOperand(0);
47153 SDValue EltIdx = N->getOperand(1);
47154 auto *CIdx = dyn_cast<ConstantSDNode>(EltIdx);
47155
47156 EVT SrcVT = InputVector.getValueType();
47157 EVT VT = N->getValueType(0);
47158 SDLoc dl(InputVector);
47159 bool IsPextr = N->getOpcode() != ISD::EXTRACT_VECTOR_ELT;
47160 unsigned NumSrcElts = SrcVT.getVectorNumElements();
47161 unsigned NumEltBits = VT.getScalarSizeInBits();
47162 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
47163
47164 if (CIdx && CIdx->getAPIntValue().uge(NumSrcElts))
47165 return IsPextr ? DAG.getConstant(0, dl, VT) : DAG.getUNDEF(VT);
47166
47167 // Integer Constant Folding.
47168 if (CIdx && VT.isInteger()) {
47169 APInt UndefVecElts;
47170 SmallVector<APInt, 16> EltBits;
47171 unsigned VecEltBitWidth = SrcVT.getScalarSizeInBits();
47172 if (getTargetConstantBitsFromNode(InputVector, VecEltBitWidth, UndefVecElts,
47173 EltBits, /*AllowWholeUndefs*/ true,
47174 /*AllowPartialUndefs*/ false)) {
47175 uint64_t Idx = CIdx->getZExtValue();
47176 if (UndefVecElts[Idx])
47177 return IsPextr ? DAG.getConstant(0, dl, VT) : DAG.getUNDEF(VT);
47178 return DAG.getConstant(EltBits[Idx].zext(NumEltBits), dl, VT);
47179 }
47180
47181 // Convert extract_element(bitcast(<X x i1>) -> bitcast(extract_subvector()).
47182 // Improves lowering of bool masks on rust which splits them into byte array.
47183 if (InputVector.getOpcode() == ISD::BITCAST && (NumEltBits % 8) == 0) {
47184 SDValue Src = peekThroughBitcasts(InputVector);
47185 if (Src.getValueType().getScalarType() == MVT::i1 &&
47186 TLI.isTypeLegal(Src.getValueType())) {
47187 MVT SubVT = MVT::getVectorVT(MVT::i1, NumEltBits);
47188 SDValue Sub = DAG.getNode(
47189 ISD::EXTRACT_SUBVECTOR, dl, SubVT, Src,
47190 DAG.getVectorIdxConstant(CIdx->getZExtValue() * NumEltBits, dl));
47191 return DAG.getBitcast(VT, Sub);
47192 }
47193 }
47194 }
47195
47196 if (IsPextr) {
47197 if (TLI.SimplifyDemandedBits(SDValue(N, 0), APInt::getAllOnes(NumEltBits),
47198 DCI))
47199 return SDValue(N, 0);
47200
47201 // PEXTR*(PINSR*(v, s, c), c) -> s (with implicit zext handling).
47202 if ((InputVector.getOpcode() == X86ISD::PINSRB ||
47203 InputVector.getOpcode() == X86ISD::PINSRW) &&
47204 InputVector.getOperand(2) == EltIdx) {
47205 assert(SrcVT == InputVector.getOperand(0).getValueType() &&
47206 "Vector type mismatch");
47207 SDValue Scl = InputVector.getOperand(1);
47208 Scl = DAG.getNode(ISD::TRUNCATE, dl, SrcVT.getScalarType(), Scl);
47209 return DAG.getZExtOrTrunc(Scl, dl, VT);
47210 }
47211
47212 // TODO - Remove this once we can handle the implicit zero-extension of
47213 // X86ISD::PEXTRW/X86ISD::PEXTRB in combinePredicateReduction and
47214 // combineBasicSADPattern.
47215 return SDValue();
47216 }
47217
47218 // Detect mmx extraction of all bits as a i64. It works better as a bitcast.
47219 if (VT == MVT::i64 && SrcVT == MVT::v1i64 &&
47220 InputVector.getOpcode() == ISD::BITCAST &&
47221 InputVector.getOperand(0).getValueType() == MVT::x86mmx &&
47222 isNullConstant(EltIdx) && InputVector.hasOneUse())
47223 return DAG.getBitcast(VT, InputVector);
47224
47225 // Detect mmx to i32 conversion through a v2i32 elt extract.
47226 if (VT == MVT::i32 && SrcVT == MVT::v2i32 &&
47227 InputVector.getOpcode() == ISD::BITCAST &&
47228 InputVector.getOperand(0).getValueType() == MVT::x86mmx &&
47229 isNullConstant(EltIdx) && InputVector.hasOneUse())
47230 return DAG.getNode(X86ISD::MMX_MOVD2W, dl, MVT::i32,
47231 InputVector.getOperand(0));
47232
47233 // Check whether this extract is the root of a sum of absolute differences
47234 // pattern. This has to be done here because we really want it to happen
47235 // pre-legalization.
47236 if (SDValue SAD = combineBasicSADPattern(N, DAG, Subtarget))
47237 return SAD;
47238
47239 if (SDValue VPDPBUSD = combineVPDPBUSDPattern(N, DAG, Subtarget))
47240 return VPDPBUSD;
47241
47242 // Attempt to replace an all_of/any_of horizontal reduction with a MOVMSK.
47243 if (SDValue Cmp = combinePredicateReduction(N, DAG, Subtarget))
47244 return Cmp;
47245
47246 // Attempt to replace min/max v8i16/v16i8 reductions with PHMINPOSUW.
47247 if (SDValue MinMax = combineMinMaxReduction(N, DAG, Subtarget))
47248 return MinMax;
47249
47250 // Attempt to optimize ADD/FADD/MUL reductions with HADD, promotion etc..
47251 if (SDValue V = combineArithReduction(N, DAG, Subtarget))
47252 return V;
47253
47254 if (SDValue V = scalarizeExtEltFP(N, DAG, Subtarget, DCI))
47255 return V;
47256
47257 if (CIdx)
47258 if (SDValue V = combineExtractFromVectorLoad(
47259 N, InputVector.getValueType(), InputVector, CIdx->getZExtValue(),
47260 dl, DAG, DCI))
47261 return V;
47262
47263 // Attempt to extract a i1 element by using MOVMSK to extract the signbits
47264 // and then testing the relevant element.
47265 //
47266 // Note that we only combine extracts on the *same* result number, i.e.
47267 // t0 = merge_values a0, a1, a2, a3
47268 // i1 = extract_vector_elt t0, Constant:i64<2>
47269 // i1 = extract_vector_elt t0, Constant:i64<3>
47270 // but not
47271 // i1 = extract_vector_elt t0:1, Constant:i64<2>
47272 // since the latter would need its own MOVMSK.
47273 if (SrcVT.getScalarType() == MVT::i1) {
47274 bool IsVar = !CIdx;
47275 SmallVector<SDNode *, 16> BoolExtracts;
47276 unsigned ResNo = InputVector.getResNo();
47277 auto IsBoolExtract = [&BoolExtracts, &ResNo, &IsVar](SDNode *Use) {
47278 if (Use->getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
47279 Use->getOperand(0).getResNo() == ResNo &&
47280 Use->getValueType(0) == MVT::i1) {
47281 BoolExtracts.push_back(Use);
47282 IsVar |= !isa<ConstantSDNode>(Use->getOperand(1));
47283 return true;
47284 }
47285 return false;
47286 };
47287 // TODO: Can we drop the oneuse check for constant extracts?
47288 if (all_of(InputVector->users(), IsBoolExtract) &&
47289 (IsVar || BoolExtracts.size() > 1)) {
47290 EVT BCVT = EVT::getIntegerVT(*DAG.getContext(), NumSrcElts);
47291 if (SDValue BC =
47292 combineBitcastvxi1(DAG, BCVT, InputVector, dl, Subtarget)) {
47293 for (SDNode *Use : BoolExtracts) {
47294 // extractelement vXi1 X, MaskIdx --> ((movmsk X) & Mask) == Mask
47295 // Mask = 1 << MaskIdx
47296 SDValue MaskIdx = DAG.getZExtOrTrunc(Use->getOperand(1), dl, MVT::i8);
47297 SDValue MaskBit = DAG.getConstant(1, dl, BCVT);
47298 SDValue Mask = DAG.getNode(ISD::SHL, dl, BCVT, MaskBit, MaskIdx);
47299 SDValue Res = DAG.getNode(ISD::AND, dl, BCVT, BC, Mask);
47300 Res = DAG.getSetCC(dl, MVT::i1, Res, Mask, ISD::SETEQ);
47301 DCI.CombineTo(Use, Res);
47302 }
47303 return SDValue(N, 0);
47304 }
47305 }
47306 }
47307
47308 // Attempt to fold extract(trunc(x),c) -> trunc(extract(x,c)).
47309 if (CIdx && InputVector.getOpcode() == ISD::TRUNCATE) {
47310 SDValue TruncSrc = InputVector.getOperand(0);
47311 EVT TruncSVT = TruncSrc.getValueType().getScalarType();
47312 if (DCI.isBeforeLegalize() && TLI.isTypeLegal(TruncSVT)) {
47313 SDValue NewExt =
47314 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, TruncSVT, TruncSrc, EltIdx);
47315 return DAG.getAnyExtOrTrunc(NewExt, dl, VT);
47316 }
47317 }
47318
47319 return SDValue();
47320}
47321
47322// Convert (vXiY *ext(vXi1 bitcast(iX))) to extend_in_reg(broadcast(iX)).
47323// This is more or less the reverse of combineBitcastvxi1.
47324static SDValue combineToExtendBoolVectorInReg(
47325 unsigned Opcode, const SDLoc &DL, EVT VT, SDValue N0, SelectionDAG &DAG,
47326 TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget) {
47327 if (Opcode != ISD::SIGN_EXTEND && Opcode != ISD::ZERO_EXTEND &&
47328 Opcode != ISD::ANY_EXTEND)
47329 return SDValue();
47330 if (!DCI.isBeforeLegalizeOps())
47331 return SDValue();
47332 if (!Subtarget.hasSSE2() || Subtarget.hasAVX512())
47333 return SDValue();
47334
47335 EVT SVT = VT.getScalarType();
47336 EVT InSVT = N0.getValueType().getScalarType();
47337 unsigned EltSizeInBits = SVT.getSizeInBits();
47338
47339 // Input type must be extending a bool vector (bit-casted from a scalar
47340 // integer) to legal integer types.
47341 if (!VT.isVector())
47342 return SDValue();
47343 if (SVT != MVT::i64 && SVT != MVT::i32 && SVT != MVT::i16 && SVT != MVT::i8)
47344 return SDValue();
47345 if (InSVT != MVT::i1 || N0.getOpcode() != ISD::BITCAST)
47346 return SDValue();
47347
47348 SDValue N00 = N0.getOperand(0);
47349 EVT SclVT = N00.getValueType();
47350 if (!SclVT.isScalarInteger())
47351 return SDValue();
47352
47353 SDValue Vec;
47354 SmallVector<int> ShuffleMask;
47355 unsigned NumElts = VT.getVectorNumElements();
47356 assert(NumElts == SclVT.getSizeInBits() && "Unexpected bool vector size");
47357
47358 // Broadcast the scalar integer to the vector elements.
47359 if (NumElts > EltSizeInBits) {
47360 // If the scalar integer is greater than the vector element size, then we
47361 // must split it down into sub-sections for broadcasting. For example:
47362 // i16 -> v16i8 (i16 -> v8i16 -> v16i8) with 2 sub-sections.
47363 // i32 -> v32i8 (i32 -> v8i32 -> v32i8) with 4 sub-sections.
47364 assert((NumElts % EltSizeInBits) == 0 && "Unexpected integer scale");
47365 unsigned Scale = NumElts / EltSizeInBits;
47366 EVT BroadcastVT = EVT::getVectorVT(*DAG.getContext(), SclVT, EltSizeInBits);
47367 bool UseBroadcast = Subtarget.hasInt256() &&
47368 (!BroadcastVT.is128BitVector() || isa<LoadSDNode>(N00));
47369 Vec = UseBroadcast
47370 ? DAG.getSplat(BroadcastVT, DL, N00)
47371 : DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, BroadcastVT, N00);
47372 Vec = DAG.getBitcast(VT, Vec);
47373
47374 for (unsigned i = 0; i != Scale; ++i) {
47375 int Offset = UseBroadcast ? (i * EltSizeInBits) : 0;
47376 ShuffleMask.append(EltSizeInBits, i + Offset);
47377 }
47378 Vec = DAG.getVectorShuffle(VT, DL, Vec, Vec, ShuffleMask);
47379 } else if (Subtarget.hasAVX2() && NumElts < EltSizeInBits &&
47380 (SclVT == MVT::i8 || SclVT == MVT::i16 || SclVT == MVT::i32)) {
47381 // If we have register broadcast instructions, use the scalar size as the
47382 // element type for the shuffle. Then cast to the wider element type. The
47383 // widened bits won't be used, and this might allow the use of a broadcast
47384 // load.
47385 assert((EltSizeInBits % NumElts) == 0 && "Unexpected integer scale");
47386 EVT BroadcastVT = EVT::getVectorVT(*DAG.getContext(), SclVT,
47387 (NumElts * EltSizeInBits) / NumElts);
47388 Vec = DAG.getBitcast(VT, DAG.getSplat(BroadcastVT, DL, N00));
47389 } else {
47390 // For smaller scalar integers, we can simply any-extend it to the vector
47391 // element size (we don't care about the upper bits) and broadcast it to all
47392 // elements.
47393 Vec = DAG.getSplat(VT, DL, DAG.getAnyExtOrTrunc(N00, DL, SVT));
47394 }
47395
47396 // Now, mask the relevant bit in each element.
47397 SmallVector<SDValue, 32> Bits;
47398 for (unsigned i = 0; i != NumElts; ++i) {
47399 int BitIdx = (i % EltSizeInBits);
47400 APInt Bit = APInt::getBitsSet(EltSizeInBits, BitIdx, BitIdx + 1);
47401 Bits.push_back(DAG.getConstant(Bit, DL, SVT));
47402 }
47403 SDValue BitMask = DAG.getBuildVector(VT, DL, Bits);
47404 Vec = DAG.getNode(ISD::AND, DL, VT, Vec, BitMask);
47405
47406 // Compare against the bitmask and extend the result.
47407 EVT CCVT = EVT::getVectorVT(*DAG.getContext(), MVT::i1, NumElts);
47408 Vec = DAG.getSetCC(DL, CCVT, Vec, BitMask, ISD::SETEQ);
47409 Vec = DAG.getSExtOrTrunc(Vec, DL, VT);
47410
47411 // For SEXT, this is now done, otherwise shift the result down for
47412 // zero-extension.
47413 if (Opcode == ISD::SIGN_EXTEND)
47414 return Vec;
47415 return DAG.getNode(ISD::SRL, DL, VT, Vec,
47416 DAG.getConstant(EltSizeInBits - 1, DL, VT));
47417}
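// For example (sketch): sext (v8i16 (bitcast (i8 X))) becomes
//   broadcast X to all 8 lanes,
//   AND with <1,2,4,8,16,32,64,128>,
//   SETEQ against the same constants, then sign-extend the i1 results;
// the zero-extend form is the same followed by a logical shift right by 15 to
// turn the 0/-1 lanes into 0/1.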
47418
47419/// If both arms of a vector select are concatenated vectors, split the select,
47420/// and concatenate the result to eliminate a wide (256-bit) vector instruction:
47421/// vselect Cond, (concat T0, T1), (concat F0, F1) -->
47422/// concat (vselect (split Cond), T0, F0), (vselect (split Cond), T1, F1)
47423static SDValue narrowVectorSelect(SDNode *N, SelectionDAG &DAG, const SDLoc &DL,
47424 const X86Subtarget &Subtarget) {
47425 unsigned Opcode = N->getOpcode();
47426 if (Opcode != X86ISD::BLENDV && Opcode != ISD::VSELECT)
47427 return SDValue();
47428
47429 // TODO: Split 512-bit vectors too?
47430 EVT VT = N->getValueType(0);
47431 if (!VT.is256BitVector())
47432 return SDValue();
47433
47434 // TODO: Split as long as any 2 of the 3 operands are concatenated?
47435 SDValue Cond = N->getOperand(0);
47436 SDValue TVal = N->getOperand(1);
47437 SDValue FVal = N->getOperand(2);
47438 if (!TVal.hasOneUse() || !FVal.hasOneUse() ||
47439 !isFreeToSplitVector(TVal, DAG) || !isFreeToSplitVector(FVal, DAG))
47440 return SDValue();
47441
47442 auto makeBlend = [Opcode](SelectionDAG &DAG, const SDLoc &DL,
47443 ArrayRef<SDValue> Ops) {
47444 return DAG.getNode(Opcode, DL, Ops[1].getValueType(), Ops);
47445 };
47446 return SplitOpsAndApply(DAG, Subtarget, DL, VT, {Cond, TVal, FVal}, makeBlend,
47447 /*CheckBWI*/ false);
47448}
47449
47450static SDValue combineSelectOfTwoConstants(SDNode *N, SelectionDAG &DAG,
47451 const SDLoc &DL) {
47452 SDValue Cond = N->getOperand(0);
47453 SDValue LHS = N->getOperand(1);
47454 SDValue RHS = N->getOperand(2);
47455
47456 auto *TrueC = dyn_cast<ConstantSDNode>(LHS);
47457 auto *FalseC = dyn_cast<ConstantSDNode>(RHS);
47458 if (!TrueC || !FalseC)
47459 return SDValue();
47460
47461 // Don't do this for crazy integer types.
47462 EVT VT = N->getValueType(0);
47463 if (!DAG.getTargetLoweringInfo().isTypeLegal(VT))
47464 return SDValue();
47465
47466 // We're going to use the condition bit in math or logic ops. We could allow
47467 // this with a wider condition value (post-legalization it becomes an i8),
47468 // but if nothing is creating selects that late, it doesn't matter.
47469 if (Cond.getValueType() != MVT::i1)
47470 return SDValue();
47471
47472 // A power-of-2 multiply is just a shift. LEA also cheaply handles multiply by
47473 // 3, 5, or 9 with i32/i64, so those get transformed too.
47474 // TODO: For constants that overflow or do not differ by power-of-2 or small
47475 // multiplier, convert to 'and' + 'add'.
47476 const APInt &TrueVal = TrueC->getAPIntValue();
47477 const APInt &FalseVal = FalseC->getAPIntValue();
47478
47479 // We have a more efficient lowering for "(X == 0) ? Y : -1" using SBB.
47480 if ((TrueVal.isAllOnes() || FalseVal.isAllOnes()) &&
47481 Cond.getOpcode() == ISD::SETCC && isNullConstant(Cond.getOperand(1))) {
47482 ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get();
47483 if (CC == ISD::SETEQ || CC == ISD::SETNE)
47484 return SDValue();
47485 }
47486
47487 bool OV;
47488 APInt Diff = TrueVal.ssub_ov(FalseVal, OV);
47489 if (OV)
47490 return SDValue();
47491
47492 APInt AbsDiff = Diff.abs();
47493 if (AbsDiff.isPowerOf2() ||
47494 ((VT == MVT::i32 || VT == MVT::i64) &&
47495 (AbsDiff == 3 || AbsDiff == 5 || AbsDiff == 9))) {
47496
47497 // We need a positive multiplier constant for shift/LEA codegen. The 'not'
47498 // of the condition can usually be folded into a compare predicate, but even
47499 // without that, the sequence should be cheaper than a CMOV alternative.
47500 if (TrueVal.slt(FalseVal)) {
47501 Cond = DAG.getNOT(DL, Cond, MVT::i1);
47502 std::swap(TrueC, FalseC);
47503 }
47504
47505 // select Cond, TC, FC --> (zext(Cond) * (TC - FC)) + FC
47506 SDValue R = DAG.getNode(ISD::ZERO_EXTEND, DL, VT, Cond);
47507
47508 // Multiply condition by the difference if non-one.
47509 if (!AbsDiff.isOne())
47510 R = DAG.getNode(ISD::MUL, DL, VT, R, DAG.getConstant(AbsDiff, DL, VT));
47511
47512 // Add the base if non-zero.
47513 if (!FalseC->isZero())
47514 R = DAG.getNode(ISD::ADD, DL, VT, R, SDValue(FalseC, 0));
47515
47516 return R;
47517 }
47518
47519 return SDValue();
47520}
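// Worked example: select i1 %c, i32 7, i32 3 has AbsDiff = 4 (a power of 2),
// so it becomes (zext %c) * 4 + 3, which lowers to a shift or LEA plus an add
// instead of a CMOV. Differences of 3, 5 or 9 are kept too, since LEA can
// scale by those factors.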
47521
47522/// If this is a *dynamic* select (non-constant condition) and we can match
47523/// this node with one of the variable blend instructions, restructure the
47524/// condition so that blends can use the high (sign) bit of each element.
47525/// This function will also call SimplifyDemandedBits on already created
47526/// BLENDV to perform additional simplifications.
47527static SDValue combineVSelectToBLENDV(SDNode *N, SelectionDAG &DAG,
47528 const SDLoc &DL,
47529 TargetLowering::DAGCombinerInfo &DCI,
47530 const X86Subtarget &Subtarget) {
47531 SDValue Cond = N->getOperand(0);
47532 if ((N->getOpcode() != ISD::VSELECT &&
47533 N->getOpcode() != X86ISD::BLENDV) ||
47534 ISD::isBuildVectorOfConstantSDNodes(Cond.getNode()))
47535 return SDValue();
47536
47537 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
47538 unsigned BitWidth = Cond.getScalarValueSizeInBits();
47539 EVT VT = N->getValueType(0);
47540
47541 // We can only handle the cases where VSELECT is directly legal on the
47542 // subtarget. We custom lower VSELECT nodes with constant conditions and
47543 // this makes it hard to see whether a dynamic VSELECT will correctly
47544 // lower, so we both check the operation's status and explicitly handle the
47545 // cases where a *dynamic* blend will fail even though a constant-condition
47546 // blend could be custom lowered.
47547 // FIXME: We should find a better way to handle this class of problems.
47548 // Potentially, we should combine constant-condition vselect nodes
47549 // pre-legalization into shuffles and not mark as many types as custom
47550 // lowered.
47551 if (!TLI.isOperationLegalOrCustom(ISD::VSELECT, VT))
47552 return SDValue();
47553 // FIXME: We don't support i16-element blends currently. We could and
47554 // should support them by making *all* the bits in the condition be set
47555 // rather than just the high bit and using an i8-element blend.
47556 if (VT.getVectorElementType() == MVT::i16)
47557 return SDValue();
47558 // Dynamic blending was only available from SSE4.1 onward.
47559 if (VT.is128BitVector() && !Subtarget.hasSSE41())
47560 return SDValue();
47561 // Byte blends are only available in AVX2
47562 if (VT == MVT::v32i8 && !Subtarget.hasAVX2())
47563 return SDValue();
47564 // There are no 512-bit blend instructions that use sign bits.
47565 if (VT.is512BitVector())
47566 return SDValue();
47567
47568 // Don't optimize before the condition has been transformed to a legal type
47569 // and don't ever optimize vector selects that map to AVX512 mask-registers.
47570 if (BitWidth < 8 || BitWidth > 64)
47571 return SDValue();
47572
47573 auto OnlyUsedAsSelectCond = [](SDValue Cond) {
47574 for (SDUse &Use : Cond->uses())
47575 if ((Use.getUser()->getOpcode() != ISD::VSELECT &&
47576 Use.getUser()->getOpcode() != X86ISD::BLENDV) ||
47577 Use.getOperandNo() != 0)
47578 return false;
47579
47580 return true;
47581 };
47582
47583 APInt DemandedBits(APInt::getSignMask(BitWidth));
47584
47585 if (OnlyUsedAsSelectCond(Cond)) {
47586 KnownBits Known;
47587 TargetLowering::TargetLoweringOpt TLO(DAG, !DCI.isBeforeLegalize(),
47588 !DCI.isBeforeLegalizeOps());
47589 if (!TLI.SimplifyDemandedBits(Cond, DemandedBits, Known, TLO, 0, true))
47590 return SDValue();
47591
47592 // If we changed the computation somewhere in the DAG, this change will
47593 // affect all users of Cond. Update all the nodes so that we do not use
47594 // the generic VSELECT anymore. Otherwise, we may perform wrong
47595 // optimizations as we messed with the actual expectation for the vector
47596 // boolean values.
47597 for (SDNode *U : Cond->users()) {
47598 if (U->getOpcode() == X86ISD::BLENDV)
47599 continue;
47600
47601 SDValue SB = DAG.getNode(X86ISD::BLENDV, SDLoc(U), U->getValueType(0),
47602 Cond, U->getOperand(1), U->getOperand(2));
47603 DAG.ReplaceAllUsesOfValueWith(SDValue(U, 0), SB);
47604 DCI.AddToWorklist(U);
47605 }
47606 DCI.CommitTargetLoweringOpt(TLO);
47607 return SDValue(N, 0);
47608 }
47609
47610 // Otherwise we can still at least try to simplify multiple use bits.
47611 if (SDValue V = TLI.SimplifyMultipleUseDemandedBits(Cond, DemandedBits, DAG))
47612 return DAG.getNode(X86ISD::BLENDV, DL, N->getValueType(0), V,
47613 N->getOperand(1), N->getOperand(2));
47614
47615 return SDValue();
47616}
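// Note: the variable blends used here (PBLENDVB/BLENDVPS/BLENDVPD) test only
// the most significant bit of each condition element, which is why the code
// above asks SimplifyDemandedBits for just the per-element sign bit of Cond.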
47617
47618// Try to match:
47619// (or (and (M, (sub 0, X)), (pandn M, X)))
47620// which is a special case of:
47621// (select M, (sub 0, X), X)
47622// Per:
47623// http://graphics.stanford.edu/~seander/bithacks.html#ConditionalNegate
47624// We know that, if fNegate is 0 or 1:
47625// (fNegate ? -v : v) == ((v ^ -fNegate) + fNegate)
47626//
47627// Here, we have a mask, M (all 1s or 0), and, similarly, we know that:
47628// ((M & 1) ? -X : X) == ((X ^ -(M & 1)) + (M & 1))
47629// ( M ? -X : X) == ((X ^ M ) + (M & 1))
47630// This lets us transform our vselect to:
47631// (add (xor X, M), (and M, 1))
47632// And further to:
47633// (sub (xor X, M), M)
47634static SDValue combineLogicBlendIntoConditionalNegate(
47635 EVT VT, SDValue Mask, SDValue X, SDValue Y, const SDLoc &DL,
47636 SelectionDAG &DAG, const X86Subtarget &Subtarget) {
47637 using namespace SDPatternMatch;
47638 EVT MaskVT = Mask.getValueType();
47639 assert(MaskVT.isInteger() &&
47640 DAG.ComputeNumSignBits(Mask) == MaskVT.getScalarSizeInBits() &&
47641 "Mask must be zero/all-bits");
47642
47643 if (X.getValueType() != MaskVT || Y.getValueType() != MaskVT ||
47644 !DAG.getTargetLoweringInfo().isOperationLegal(ISD::SUB, MaskVT))
47645 return SDValue();
47646
47647 SDValue V;
47648 if (!sd_match(Y, m_Neg(m_AllOf(m_Specific(X), m_Value(V)))) &&
47649 !sd_match(X, m_Neg(m_AllOf(m_Specific(Y), m_Value(V)))))
47650 return SDValue();
47651
47652 SDValue SubOp1 = DAG.getNode(ISD::XOR, DL, MaskVT, V, Mask);
47653 SDValue SubOp2 = Mask;
47654
47655 // If the negate was on the false side of the select, then
47656 // the operands of the SUB need to be swapped. PR 27251.
47657 // This is because the pattern being matched above is
47658 // (vselect M, (sub (0, X), X) -> (sub (xor X, M), M)
47659 // but if the pattern matched was
47660 // (vselect M, X, (sub (0, X))), that is really negation of the pattern
47661 // above, -(vselect M, (sub 0, X), X), and therefore the replacement
47662 // pattern also needs to be a negation of the replacement pattern above.
47663 // And -(sub X, Y) is just sub (Y, X), so swapping the operands of the
47664 // sub accomplishes the negation of the replacement pattern.
47665 if (V == Y)
47666 std::swap(SubOp1, SubOp2);
47667
47668 SDValue Res = DAG.getNode(ISD::SUB, DL, MaskVT, SubOp1, SubOp2);
47669 return DAG.getBitcast(VT, Res);
47670}
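// Sanity check of the identity above: with M == all-ones, (X ^ M) - M is
// (~X) + 1 == -X, and with M == 0 it is simply X, matching select(M, -X, X).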
47671
47672static SDValue commuteSelect(SDNode *N, SelectionDAG &DAG, const SDLoc &DL,
47673 const X86Subtarget &Subtarget) {
47674 using namespace SDPatternMatch;
47675 if (!Subtarget.hasAVX512())
47676 return SDValue();
47677
47678 ISD::CondCode CC;
47679 SDValue Cond, X, Y, LHS, RHS;
47680 if (!sd_match(N, m_Select(m_OneUse(m_AllOf(m_Value(Cond),
47681 m_SetCC(m_Value(X), m_Value(Y),
47682 m_CondCode(CC)))),
47683 m_Value(LHS), m_Value(RHS))))
47684 return SDValue();
47685
47686 if (canCombineAsMaskOperation(LHS, Subtarget) ||
47687 !canCombineAsMaskOperation(RHS, Subtarget))
47688 return SDValue();
47689
47690 // Commute LHS and RHS to create opportunity to select mask instruction.
47691 // (vselect M, L, R) -> (vselect ~M, R, L)
47692 ISD::CondCode NewCC = ISD::getSetCCInverse(CC, X.getValueType());
47693 Cond = DAG.getSetCC(SDLoc(Cond), Cond.getValueType(), X, Y, NewCC);
47694 return DAG.getSelect(DL, LHS.getValueType(), Cond, RHS, LHS);
47695}
47696
47697/// Do target-specific dag combines on SELECT and VSELECT nodes.
47698static SDValue combineSelect(SDNode *N, SelectionDAG &DAG,
47699 TargetLowering::DAGCombinerInfo &DCI,
47700 const X86Subtarget &Subtarget) {
47701 SDLoc DL(N);
47702 SDValue Cond = N->getOperand(0);
47703 SDValue LHS = N->getOperand(1);
47704 SDValue RHS = N->getOperand(2);
47705
47706 // Try simplification again because we use this function to optimize
47707 // BLENDV nodes that are not handled by the generic combiner.
47708 if (SDValue V = DAG.simplifySelect(Cond, LHS, RHS))
47709 return V;
47710
47711 // When AVX512 is available, the lhs operand of a select instruction can be
47712 // folded with a mask instruction, while the rhs operand can't. Commute the
47713 // lhs and rhs of the select instruction to create the opportunity of
47714 // folding.
47715 if (SDValue V = commuteSelect(N, DAG, DL, Subtarget))
47716 return V;
47717
47718 EVT VT = LHS.getValueType();
47719 EVT CondVT = Cond.getValueType();
47720 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
47721 bool CondConstantVector = ISD::isBuildVectorOfConstantSDNodes(Cond.getNode());
47722
47723 // Attempt to combine (select M, (sub 0, X), X) -> (sub (xor X, M), M).
47724 // Limit this to cases of non-constant masks that createShuffleMaskFromVSELECT
47725 // can't catch, plus vXi8 cases where we'd likely end up with BLENDV.
47726 if (CondVT.isVector() && CondVT.isInteger() &&
47727 CondVT.getScalarSizeInBits() == VT.getScalarSizeInBits() &&
47728 (!CondConstantVector || CondVT.getScalarType() == MVT::i8) &&
47731 DL, DAG, Subtarget))
47732 return V;
47733
47734 if (N->getOpcode() == ISD::VSELECT || N->getOpcode() == X86ISD::BLENDV) {
47735 SmallVector<int, 64> CondMask;
47736 if (createShuffleMaskFromVSELECT(CondMask, Cond,
47737 N->getOpcode() == X86ISD::BLENDV)) {
47738 // Convert vselects with constant condition into shuffles.
47739 if (DCI.isBeforeLegalizeOps())
47740 return DAG.getVectorShuffle(VT, DL, LHS, RHS, CondMask);
47741
47742 // fold vselect(cond, pshufb(x), pshufb(y)) -> or (pshufb(x), pshufb(y))
47743 // by forcing the unselected elements to zero.
47744 // TODO: Can we handle more shuffles with this?
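// Rough illustration (sketch): if the constant condition picks, say, the even
// bytes from the LHS PSHUFB and the odd bytes from the RHS PSHUFB, the loop
// below forces the unselected lanes of each shuffle control to 0x80 (zero the
// byte), so OR'ing the two PSHUFB results reproduces the blend without BLENDV.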
47745 if (LHS.hasOneUse() && RHS.hasOneUse()) {
47746 SmallVector<SDValue, 1> LHSOps, RHSOps;
47747 SmallVector<int, 64> LHSMask, RHSMask, ByteMask;
47748 SDValue LHSShuf = peekThroughOneUseBitcasts(LHS);
47749 SDValue RHSShuf = peekThroughOneUseBitcasts(RHS);
47750 if (LHSShuf.getOpcode() == X86ISD::PSHUFB &&
47751 RHSShuf.getOpcode() == X86ISD::PSHUFB &&
47752 scaleShuffleMaskElts(VT.getSizeInBits() / 8, CondMask, ByteMask) &&
47753 getTargetShuffleMask(LHSShuf, true, LHSOps, LHSMask) &&
47754 getTargetShuffleMask(RHSShuf, true, RHSOps, RHSMask)) {
47755 assert(ByteMask.size() == LHSMask.size() &&
47756 ByteMask.size() == RHSMask.size() && "Shuffle mask mismatch");
47757 for (auto [I, M] : enumerate(ByteMask)) {
47758 // getConstVector sets negative shuffle mask values as undef, so
47759 // ensure we hardcode SM_SentinelZero values to zero (0x80).
47760 if (M < (int)ByteMask.size()) {
47761 LHSMask[I] = isUndefOrZero(LHSMask[I]) ? 0x80 : LHSMask[I];
47762 RHSMask[I] = 0x80;
47763 } else {
47764 LHSMask[I] = 0x80;
47765 RHSMask[I] = isUndefOrZero(RHSMask[I]) ? 0x80 : RHSMask[I];
47766 }
47767 }
47768 MVT ByteVT = LHSShuf.getSimpleValueType();
47769 LHS = DAG.getNode(X86ISD::PSHUFB, DL, ByteVT, LHSOps[0],
47770 getConstVector(LHSMask, ByteVT, DAG, DL, true));
47771 RHS = DAG.getNode(X86ISD::PSHUFB, DL, ByteVT, RHSOps[0],
47772 getConstVector(RHSMask, ByteVT, DAG, DL, true));
47773 return DAG.getBitcast(VT, DAG.getNode(ISD::OR, DL, ByteVT, LHS, RHS));
47774 }
47775 }
47776
47777 // Attempt to combine as shuffle.
47778 SDValue Op(N, 0);
47779 if (SDValue Res = combineX86ShufflesRecursively(Op, DAG, Subtarget))
47780 return Res;
47781 }
47782 }
47783
47784 // If we have SSE[12] support, try to form min/max nodes. SSE min/max
47785 // instructions match the semantics of the common C idiom x<y?x:y but not
47786 // x<=y?x:y, because of how they handle negative zero (which can be
47787 // ignored in unsafe-math mode).
47788 // We also try to create v2f32 min/max nodes, which we later widen to v4f32.
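// Background note (hedged): MINSS/MINPS and MAXSS/MAXPS implement
// "x < y ? x : y" / "x > y ? x : y" exactly, returning the second operand on
// NaN inputs and for (+0.0, -0.0) pairs, which is why the cases below only
// fold when signed zeros are ignorable or NaNs are provably absent, sometimes
// after swapping the operands.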
47789 if ((Cond.getOpcode() == ISD::SETCC ||
47790 Cond.getOpcode() == ISD::STRICT_FSETCCS) &&
47791 VT.isFloatingPoint() && VT != MVT::f80 && VT != MVT::f128 &&
47792 !isSoftF16(VT, Subtarget) && (TLI.isTypeLegal(VT) || VT == MVT::v2f32) &&
47793 ((VT != MVT::v8f16 && VT != MVT::v16f16) || Subtarget.hasVLX()) &&
47794 (Subtarget.hasSSE2() ||
47795 (Subtarget.hasSSE1() && VT.getScalarType() == MVT::f32))) {
47796 bool IsStrict = Cond->isStrictFPOpcode();
47797 ISD::CondCode CC =
47798 cast<CondCodeSDNode>(Cond.getOperand(IsStrict ? 3 : 2))->get();
47799 SDValue Op0 = Cond.getOperand(IsStrict ? 1 : 0);
47800 SDValue Op1 = Cond.getOperand(IsStrict ? 2 : 1);
47801
47802 unsigned Opcode = 0;
47803 // Check for x CC y ? x : y.
47804 if (DAG.isEqualTo(LHS, Op0) && DAG.isEqualTo(RHS, Op1)) {
47805 switch (CC) {
47806 default: break;
47807 case ISD::SETULT:
47808 // Converting this to a min would handle NaNs incorrectly, and swapping
47809 // the operands would cause it to handle comparisons between positive
47810 // and negative zero incorrectly.
47811 if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS)) {
47812 if (!DAG.getTarget().Options.NoSignedZerosFPMath &&
47813 !(DAG.isKnownNeverZeroFloat(LHS) ||
47814 DAG.isKnownNeverZeroFloat(RHS)))
47815 break;
47816 std::swap(LHS, RHS);
47817 }
47818 Opcode = X86ISD::FMIN;
47819 break;
47820 case ISD::SETOLE:
47821 // Converting this to a min would handle comparisons between positive
47822 // and negative zero incorrectly.
47823 if (!DAG.getTarget().Options.NoSignedZerosFPMath &&
47824 !DAG.isKnownNeverZeroFloat(LHS) && !DAG.isKnownNeverZeroFloat(RHS))
47825 break;
47826 Opcode = X86ISD::FMIN;
47827 break;
47828 case ISD::SETULE:
47829 // Converting this to a min would handle both negative zeros and NaNs
47830 // incorrectly, but we can swap the operands to fix both.
47831 std::swap(LHS, RHS);
47832 [[fallthrough]];
47833 case ISD::SETOLT:
47834 case ISD::SETLT:
47835 case ISD::SETLE:
47836 Opcode = X86ISD::FMIN;
47837 break;
47838
47839 case ISD::SETOGE:
47840 // Converting this to a max would handle comparisons between positive
47841 // and negative zero incorrectly.
47842 if (!DAG.getTarget().Options.NoSignedZerosFPMath &&
47843 !DAG.isKnownNeverZeroFloat(LHS) && !DAG.isKnownNeverZeroFloat(RHS))
47844 break;
47845 Opcode = X86ISD::FMAX;
47846 break;
47847 case ISD::SETUGT:
47848 // Converting this to a max would handle NaNs incorrectly, and swapping
47849 // the operands would cause it to handle comparisons between positive
47850 // and negative zero incorrectly.
47851 if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS)) {
47852 if (!DAG.getTarget().Options.NoSignedZerosFPMath &&
47853 !(DAG.isKnownNeverZeroFloat(LHS) ||
47854 DAG.isKnownNeverZeroFloat(RHS)))
47855 break;
47856 std::swap(LHS, RHS);
47857 }
47858 Opcode = X86ISD::FMAX;
47859 break;
47860 case ISD::SETUGE:
47861 // Converting this to a max would handle both negative zeros and NaNs
47862 // incorrectly, but we can swap the operands to fix both.
47863 std::swap(LHS, RHS);
47864 [[fallthrough]];
47865 case ISD::SETOGT:
47866 case ISD::SETGT:
47867 case ISD::SETGE:
47868 Opcode = X86ISD::FMAX;
47869 break;
47870 }
47871 // Check for x CC y ? y : x -- a min/max with reversed arms.
47872 } else if (DAG.isEqualTo(LHS, Op1) && DAG.isEqualTo(RHS, Op0)) {
47873 switch (CC) {
47874 default: break;
47875 case ISD::SETOGE:
47876 // Converting this to a min would handle comparisons between positive
47877 // and negative zero incorrectly, and swapping the operands would
47878 // cause it to handle NaNs incorrectly.
47879 if (!DAG.getTarget().Options.NoSignedZerosFPMath &&
47880 !(DAG.isKnownNeverZeroFloat(LHS) ||
47881 DAG.isKnownNeverZeroFloat(RHS))) {
47882 if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS))
47883 break;
47884 std::swap(LHS, RHS);
47885 }
47886 Opcode = X86ISD::FMIN;
47887 break;
47888 case ISD::SETUGT:
47889 // Converting this to a min would handle NaNs incorrectly.
47890 if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS))
47891 break;
47892 Opcode = X86ISD::FMIN;
47893 break;
47894 case ISD::SETUGE:
47895 // Converting this to a min would handle both negative zeros and NaNs
47896 // incorrectly, but we can swap the operands to fix both.
47897 std::swap(LHS, RHS);
47898 [[fallthrough]];
47899 case ISD::SETOGT:
47900 case ISD::SETGT:
47901 case ISD::SETGE:
47902 Opcode = X86ISD::FMIN;
47903 break;
47904
47905 case ISD::SETULT:
47906 // Converting this to a max would handle NaNs incorrectly.
47907 if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS))
47908 break;
47909 Opcode = X86ISD::FMAX;
47910 break;
47911 case ISD::SETOLE:
47912 // Converting this to a max would handle comparisons between positive
47913 // and negative zero incorrectly, and swapping the operands would
47914 // cause it to handle NaNs incorrectly.
47915 if (!DAG.getTarget().Options.NoSignedZerosFPMath &&
47916 !DAG.isKnownNeverZeroFloat(LHS) &&
47917 !DAG.isKnownNeverZeroFloat(RHS)) {
47918 if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS))
47919 break;
47920 std::swap(LHS, RHS);
47921 }
47922 Opcode = X86ISD::FMAX;
47923 break;
47924 case ISD::SETULE:
47925 // Converting this to a max would handle both negative zeros and NaNs
47926 // incorrectly, but we can swap the operands to fix both.
47927 std::swap(LHS, RHS);
47928 [[fallthrough]];
47929 case ISD::SETOLT:
47930 case ISD::SETLT:
47931 case ISD::SETLE:
47932 Opcode = X86ISD::FMAX;
47933 break;
47934 }
47935 }
47936
47937 if (Opcode) {
47938 if (IsStrict) {
47939 SDValue Ret = DAG.getNode(Opcode == X86ISD::FMIN ? X86ISD::STRICT_FMIN
47940 : X86ISD::STRICT_FMAX,
47941 DL, {N->getValueType(0), MVT::Other},
47942 {Cond.getOperand(0), LHS, RHS});
47943 DAG.ReplaceAllUsesOfValueWith(Cond.getValue(1), Ret.getValue(1));
47944 return Ret;
47945 }
47946 return DAG.getNode(Opcode, DL, N->getValueType(0), LHS, RHS);
47947 }
47948 }
47949
47950 // Some mask scalar intrinsics rely on checking if only one bit is set
47951 // and implement it in C code like this:
47952 // A[0] = (U & 1) ? A[0] : W[0];
47953 // This creates some redundant instructions that break pattern matching.
47954 // fold (select (setcc (and X, 1), 0, seteq), Y, Z) -> (select (and X, 1), Z, Y)
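// Sketch of the intent: for "A[0] = (U & 1) ? A[0] : W[0]" the IR compares
// (and U, 1) against 0 with SETEQ, which swaps the arms; selecting directly
// on (and U, 1), truncated to i8, lets the scalar mask-move patterns match.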
47955 if (Subtarget.hasAVX512() && N->getOpcode() == ISD::SELECT &&
47956 Cond.getOpcode() == ISD::SETCC && (VT == MVT::f32 || VT == MVT::f64)) {
47957 ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get();
47958 SDValue AndNode = Cond.getOperand(0);
47959 if (AndNode.getOpcode() == ISD::AND && CC == ISD::SETEQ &&
47960 isNullConstant(Cond.getOperand(1)) &&
47961 isOneConstant(AndNode.getOperand(1))) {
47962 // LHS and RHS swapped due to
47963 // setcc outputting 1 when AND resulted in 0 and vice versa.
47964 AndNode = DAG.getZExtOrTrunc(AndNode, DL, MVT::i8);
47965 return DAG.getNode(ISD::SELECT, DL, VT, AndNode, RHS, LHS);
47966 }
47967 }
47968
47969 // v16i8 (select v16i1, v16i8, v16i8) does not have a proper
47970 // lowering on KNL. In this case we convert it to
47971 // v16i8 (select v16i8, v16i8, v16i8) and use an AVX instruction.
47972 // The same applies to all vectors of i8 and i16 without BWI.
47973 // Make sure we extend these even before type legalization gets a chance to
47974 // split wide vectors.
47975 // Since SKX these selects have a proper lowering.
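// Hedged example: a v32i8 select with a v32i1 condition on an AVX-512F-only
// (KNL-class) target would otherwise be split by type legalization; extending
// the condition to v32i8 up front keeps it lowerable as a single VPBLENDVB.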
47976 if (Subtarget.hasAVX512() && !Subtarget.hasBWI() && CondVT.isVector() &&
47977 CondVT.getVectorElementType() == MVT::i1 &&
47978 (VT.getVectorElementType() == MVT::i8 ||
47979 VT.getVectorElementType() == MVT::i16)) {
47980 Cond = DAG.getNode(ISD::SIGN_EXTEND, DL, VT, Cond);
47981 return DAG.getNode(N->getOpcode(), DL, VT, Cond, LHS, RHS);
47982 }
47983
47984 // AVX512 - Extend select to merge with target shuffle.
47985 // select(mask, extract_subvector(shuffle(x)), y) -->
47986 // extract_subvector(select(widen(mask), shuffle(x), widen(y)))
47987 // TODO - support non target shuffles as well with canCombineAsMaskOperation.
47988 if (Subtarget.hasAVX512() && CondVT.isVector() &&
47989 CondVT.getVectorElementType() == MVT::i1) {
47990 auto SelectableOp = [&TLI](SDValue Op, SDValue Alt) {
47991 return Op.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
47992 isTargetShuffle(Op.getOperand(0).getOpcode()) &&
47993 isNullConstant(Op.getOperand(1)) &&
47994 TLI.isTypeLegal(Op.getOperand(0).getValueType()) &&
47995 Op.hasOneUse() && Op.getOperand(0).hasOneUse() &&
47996 (Op.getOperand(0).getOpcode() != X86ISD::VPERMV3 ||
47997 ISD::isBuildVectorAllZeros(Alt.getNode()));
47998 };
47999
48000 bool SelectableLHS = SelectableOp(LHS, RHS);
48001 bool SelectableRHS = SelectableOp(RHS, LHS);
48002 if (SelectableLHS || SelectableRHS) {
48003 EVT SrcVT = SelectableLHS ? LHS.getOperand(0).getValueType()
48004 : RHS.getOperand(0).getValueType();
48005 EVT SrcCondVT = SrcVT.changeVectorElementType(MVT::i1);
48006 LHS = insertSubVector(DAG.getUNDEF(SrcVT), LHS, 0, DAG, DL,
48007 VT.getSizeInBits());
48008 RHS = insertSubVector(DAG.getUNDEF(SrcVT), RHS, 0, DAG, DL,
48009 VT.getSizeInBits());
48010 Cond = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, SrcCondVT,
48011 DAG.getUNDEF(SrcCondVT), Cond,
48012 DAG.getVectorIdxConstant(0, DL));
48013 SDValue Res = DAG.getSelect(DL, SrcVT, Cond, LHS, RHS);
48014 return extractSubVector(Res, 0, DAG, DL, VT.getSizeInBits());
48015 }
48016 }
48017
48018 if (SDValue V = combineSelectOfTwoConstants(N, DAG, DL))
48019 return V;
48020
48021 if (N->getOpcode() == ISD::SELECT && Cond.getOpcode() == ISD::SETCC &&
48022 Cond.hasOneUse()) {
48023 EVT CondVT = Cond.getValueType();
48024 SDValue Cond0 = Cond.getOperand(0);
48025 SDValue Cond1 = Cond.getOperand(1);
48026 ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get();
48027
48028 // Canonicalize min/max:
48029 // (x > 0) ? x : 0 -> (x >= 0) ? x : 0
48030 // (x < -1) ? x : -1 -> (x <= -1) ? x : -1
48031 // This allows use of COND_S / COND_NS (see TranslateX86CC) which eliminates
48032 // the need for an extra compare against zero. e.g.
48033 // (a - b) > 0 ? (a - b) : 0 -> (a - b) >= 0 ? (a - b) : 0
48034 // subl %esi, %edi
48035 // testl %edi, %edi
48036 // movl $0, %eax
48037 // cmovgl %edi, %eax
48038 // =>
48039 // xorl %eax, %eax
48040 // subl %esi, %edi
48041 // cmovsl %eax, %edi
48042 //
48043 // We can also canonicalize
48044 // (x s> 1) ? x : 1 -> (x s>= 1) ? x : 1 -> (x s> 0) ? x : 1
48045 // (x u> 1) ? x : 1 -> (x u>= 1) ? x : 1 -> (x != 0) ? x : 1
48046 // This allows the use of a test instruction for the compare.
48047 if (LHS == Cond0 && RHS == Cond1) {
48048 if ((CC == ISD::SETGT && (isNullConstant(RHS) || isOneConstant(RHS))) ||
48049 (CC == ISD::SETLT && isAllOnesConstant(RHS))) {
48050 ISD::CondCode NewCC = CC == ISD::SETGT ? ISD::SETGE : ISD::SETLE;
48051 Cond = DAG.getSetCC(SDLoc(Cond), CondVT, Cond0, Cond1, NewCC);
48052 return DAG.getSelect(DL, VT, Cond, LHS, RHS);
48053 }
48054 if (CC == ISD::SETUGT && isOneConstant(RHS)) {
48055 ISD::CondCode NewCC = ISD::SETUGE;
48056 Cond = DAG.getSetCC(SDLoc(Cond), CondVT, Cond0, Cond1, NewCC);
48057 return DAG.getSelect(DL, VT, Cond, LHS, RHS);
48058 }
48059 }
48060
48061 // Similar to DAGCombine's select(or(CC0,CC1),X,Y) fold but for legal types.
48062 // fold eq + gt/lt nested selects into ge/le selects
48063 // select (cmpeq Cond0, Cond1), LHS, (select (cmpugt Cond0, Cond1), LHS, Y)
48064 // --> (select (cmpuge Cond0, Cond1), LHS, Y)
48065 // select (cmpslt Cond0, Cond1), LHS, (select (cmpeq Cond0, Cond1), LHS, Y)
48066 // --> (select (cmpsle Cond0, Cond1), LHS, Y)
48067 // .. etc ..
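// Concrete instance of the fold above: "x == y ? a : (x u> y ? a : b)"
// collapses to "x u>= y ? a : b", saving a compare and a select.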
48068 if (RHS.getOpcode() == ISD::SELECT && RHS.getOperand(1) == LHS &&
48069 RHS.getOperand(0).getOpcode() == ISD::SETCC) {
48070 SDValue InnerSetCC = RHS.getOperand(0);
48071 ISD::CondCode InnerCC =
48072 cast<CondCodeSDNode>(InnerSetCC.getOperand(2))->get();
48073 if ((CC == ISD::SETEQ || InnerCC == ISD::SETEQ) &&
48074 Cond0 == InnerSetCC.getOperand(0) &&
48075 Cond1 == InnerSetCC.getOperand(1)) {
48076 ISD::CondCode NewCC;
48077 switch (CC == ISD::SETEQ ? InnerCC : CC) {
48078 // clang-format off
48079 case ISD::SETGT: NewCC = ISD::SETGE; break;
48080 case ISD::SETLT: NewCC = ISD::SETLE; break;
48081 case ISD::SETUGT: NewCC = ISD::SETUGE; break;
48082 case ISD::SETULT: NewCC = ISD::SETULE; break;
48083 default: NewCC = ISD::SETCC_INVALID; break;
48084 // clang-format on
48085 }
48086 if (NewCC != ISD::SETCC_INVALID) {
48087 Cond = DAG.getSetCC(DL, CondVT, Cond0, Cond1, NewCC);
48088 return DAG.getSelect(DL, VT, Cond, LHS, RHS.getOperand(2));
48089 }
48090 }
48091 }
48092 }
48093
48094 // Check if the first operand is all zeros and Cond type is vXi1.
48095 // If this is an avx512 target we can improve the use of zero masking by
48096 // swapping the operands and inverting the condition.
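// Hedged example: (vselect k, zeroinitializer, x) becomes
// (vselect (not k), x, zeroinitializer), which maps onto AVX-512 zero-masking
// ({z}) instead of a blend against an explicitly materialized zero vector.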
48097 if (N->getOpcode() == ISD::VSELECT && Cond.hasOneUse() &&
48098 Subtarget.hasAVX512() && CondVT.getVectorElementType() == MVT::i1 &&
48099 ISD::isBuildVectorAllZeros(LHS.getNode()) &&
48100 !ISD::isBuildVectorAllZeros(RHS.getNode())) {
48101 // Invert the cond to not(cond) : xor(op,allones)=not(op)
48102 SDValue CondNew = DAG.getNOT(DL, Cond, CondVT);
48103 // Vselect cond, op1, op2 = Vselect not(cond), op2, op1
48104 return DAG.getSelect(DL, VT, CondNew, RHS, LHS);
48105 }
48106
48107 // Attempt to convert a (vXi1 bitcast(iX Cond)) selection mask before it might
48108 // get split by legalization.
48109 if (N->getOpcode() == ISD::VSELECT && Cond.getOpcode() == ISD::BITCAST &&
48110 CondVT.getVectorElementType() == MVT::i1 &&
48111 TLI.isTypeLegal(VT.getScalarType())) {
48112 EVT ExtCondVT = VT.changeVectorElementTypeToInteger();
48113 if (SDValue ExtCond = combineToExtendBoolVectorInReg(
48114 ISD::SIGN_EXTEND, DL, ExtCondVT, Cond, DAG, DCI, Subtarget)) {
48115 ExtCond = DAG.getNode(ISD::TRUNCATE, DL, CondVT, ExtCond);
48116 return DAG.getSelect(DL, VT, ExtCond, LHS, RHS);
48117 }
48118 }
48119
48120 // Exploits AVX2 VSHLV/VSRLV instructions for efficient unsigned vector
48121 // shifts with out-of-bounds shift amounts.
48122
48123 // Unlike general shift instructions (SHL/SRL), which are undefined for shift
48124 // amounts of the element bitwidth or more, AVX2's VSHLV/VSRLV handle such
48125 // amounts explicitly: any lane whose shift amount is greater than or equal to
48126 // the element bitwidth is written as zero, for both left and right unsigned
48127 // shifts. This matches the select-with-zero patterns matched below.
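// Hedged sketch for v8i32: select(icmp_ult(amt, 32), shl(x, amt), 0) matches
// the VPSLLVD semantics exactly (lanes with amt >= 32 produce zero), so the
// compare and select disappear; the analogous SRL case maps onto VPSRLVD.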
48128 if (N->getOpcode() == ISD::VSELECT) {
48129 using namespace llvm::SDPatternMatch;
48130 // fold select(icmp_ult(amt,BW),shl(x,amt),0) -> avx2 psllv(x,amt)
48131 // fold select(icmp_ult(amt,BW),srl(x,amt),0) -> avx2 psrlv(x,amt)
48132 if ((LHS.getOpcode() == ISD::SRL || LHS.getOpcode() == ISD::SHL) &&
48133 supportedVectorVarShift(VT, Subtarget, LHS.getOpcode()) &&
48135 sd_match(Cond, m_SetCC(m_Specific(LHS.getOperand(1)),
48138 return DAG.getNode(LHS.getOpcode() == ISD::SRL ? X86ISD::VSRLV
48139 : X86ISD::VSHLV,
48140 DL, VT, LHS.getOperand(0), LHS.getOperand(1));
48141 }
48142 // fold select(icmp_uge(amt,BW),0,shl(x,amt)) -> avx2 psllv(x,amt)
48143 // fold select(icmp_uge(amt,BW),0,srl(x,amt)) -> avx2 psrlv(x,amt)
48144 if ((RHS.getOpcode() == ISD::SRL || RHS.getOpcode() == ISD::SHL) &&
48145 supportedVectorVarShift(VT, Subtarget, RHS.getOpcode()) &&
48147 sd_match(Cond, m_SetCC(m_Specific(RHS.getOperand(1)),
48150 return DAG.getNode(RHS.getOpcode() == ISD::SRL ? X86ISD::VSRLV
48151 : X86ISD::VSHLV,
48152 DL, VT, RHS.getOperand(0), RHS.getOperand(1));
48153 }
48154 }
48155
48156 // Early exit check
48157 if (!TLI.isTypeLegal(VT) || isSoftF16(VT, Subtarget))
48158 return SDValue();
48159
48160 if (SDValue V = combineVSelectToBLENDV(N, DAG, DL, DCI, Subtarget))
48161 return V;
48162
48163 if (SDValue V = narrowVectorSelect(N, DAG, DL, Subtarget))
48164 return V;
48165
48166 // select(~Cond, X, Y) -> select(Cond, Y, X)
48167 if (CondVT.getScalarType() != MVT::i1) {
48168 if (SDValue CondNot = IsNOT(Cond, DAG))
48169 return DAG.getNode(N->getOpcode(), DL, VT,
48170 DAG.getBitcast(CondVT, CondNot), RHS, LHS);
48171
48172 // select(pcmpeq(and(X,Pow2),0),A,B) -> select(pcmpeq(and(X,Pow2),Pow2),B,A)
48173 if (Cond.getOpcode() == X86ISD::PCMPEQ &&
48174 Cond.getOperand(0).getOpcode() == ISD::AND &&
48175 ISD::isBuildVectorAllZeros(Cond.getOperand(1).getNode()) &&
48176 isConstantPowerOf2(Cond.getOperand(0).getOperand(1),
48177 Cond.getScalarValueSizeInBits(),
48178 /*AllowUndefs=*/true) &&
48179 Cond.hasOneUse()) {
48180 Cond = DAG.getNode(X86ISD::PCMPEQ, DL, CondVT, Cond.getOperand(0),
48181 Cond.getOperand(0).getOperand(1));
48182 return DAG.getNode(N->getOpcode(), DL, VT, Cond, RHS, LHS);
48183 }
48184
48185 // pcmpgt(X, -1) -> pcmpgt(0, X) to help select/blendv just use the
48186 // signbit.
48187 if (Cond.getOpcode() == X86ISD::PCMPGT &&
48188 ISD::isBuildVectorAllOnes(Cond.getOperand(1).getNode()) &&
48189 Cond.hasOneUse()) {
48190 Cond = DAG.getNode(X86ISD::PCMPGT, DL, CondVT,
48191 DAG.getConstant(0, DL, CondVT), Cond.getOperand(0));
48192 return DAG.getNode(N->getOpcode(), DL, VT, Cond, RHS, LHS);
48193 }
48194 }
48195
48196 // Try to optimize vXi1 selects if both operands are either all constants or
48197 // bitcasts from scalar integer type. In that case we can convert the operands
48198 // to integer and use an integer select which will be converted to a CMOV.
48199 // We need to take a little bit of care to avoid creating an i64 type after
48200 // type legalization.
48201 if (N->getOpcode() == ISD::SELECT && VT.isVector() &&
48202 VT.getVectorElementType() == MVT::i1 &&
48203 (DCI.isBeforeLegalize() || (VT != MVT::v64i1 || Subtarget.is64Bit()))) {
48204 EVT IntVT = EVT::getIntegerVT(*DAG.getContext(), VT.getVectorNumElements());
48205 if (DCI.isBeforeLegalize() || TLI.isTypeLegal(IntVT)) {
48206 bool LHSIsConst = ISD::isBuildVectorOfConstantSDNodes(LHS.getNode());
48207 bool RHSIsConst = ISD::isBuildVectorOfConstantSDNodes(RHS.getNode());
48208
48209 if ((LHSIsConst || (LHS.getOpcode() == ISD::BITCAST &&
48210 LHS.getOperand(0).getValueType() == IntVT)) &&
48211 (RHSIsConst || (RHS.getOpcode() == ISD::BITCAST &&
48212 RHS.getOperand(0).getValueType() == IntVT))) {
48213 if (LHSIsConst)
48214 LHS = combinevXi1ConstantToInteger(LHS, DAG);
48215 else
48216 LHS = LHS.getOperand(0);
48217
48218 if (RHSIsConst)
48219 RHS = combinevXi1ConstantToInteger(RHS, DAG);
48220 else
48221 RHS = RHS.getOperand(0);
48222
48223 SDValue Select = DAG.getSelect(DL, IntVT, Cond, LHS, RHS);
48224 return DAG.getBitcast(VT, Select);
48225 }
48226 }
48227 }
48228
48229 // If this is "((X & C) == 0) ? Y : Z" and C is a constant mask vector of
48230 // single bits, then invert the predicate and swap the select operands.
48231 // This can lower using a vector shift bit-hack rather than mask and compare.
48232 if (DCI.isBeforeLegalize() && !Subtarget.hasAVX512() &&
48233 N->getOpcode() == ISD::VSELECT && Cond.getOpcode() == ISD::SETCC &&
48234 Cond.hasOneUse() && CondVT.getVectorElementType() == MVT::i1 &&
48235 Cond.getOperand(0).getOpcode() == ISD::AND &&
48236 isNullOrNullSplat(Cond.getOperand(1)) &&
48237 cast<CondCodeSDNode>(Cond.getOperand(2))->get() == ISD::SETEQ &&
48238 Cond.getOperand(0).getValueType() == VT) {
48239 // The 'and' mask must be composed of power-of-2 constants.
48240 SDValue And = Cond.getOperand(0);
48241 auto *C = isConstOrConstSplat(And.getOperand(1));
48242 if (C && C->getAPIntValue().isPowerOf2()) {
48243 // vselect (X & C == 0), LHS, RHS --> vselect (X & C != 0), RHS, LHS
48244 SDValue NotCond =
48245 DAG.getSetCC(DL, CondVT, And, Cond.getOperand(1), ISD::SETNE);
48246 return DAG.getSelect(DL, VT, NotCond, RHS, LHS);
48247 }
48248
48249 // If we have a non-splat but still powers-of-2 mask, AVX1 can use pmulld
48250 // and AVX2 can use vpsllv{dq}. 8-bit lacks a proper shift or multiply.
48251 // 16-bit lacks a proper blendv.
48252 unsigned EltBitWidth = VT.getScalarSizeInBits();
48253 bool CanShiftBlend =
48254 TLI.isTypeLegal(VT) && ((Subtarget.hasAVX() && EltBitWidth == 32) ||
48255 (Subtarget.hasAVX2() && EltBitWidth == 64) ||
48256 (Subtarget.hasXOP()));
48257 if (CanShiftBlend &&
48258 ISD::matchUnaryPredicate(And.getOperand(1), [](ConstantSDNode *C) {
48259 return C->getAPIntValue().isPowerOf2();
48260 })) {
48261 // Create a left-shift constant to get the mask bits over to the sign-bit.
48262 SDValue Mask = And.getOperand(1);
48263 SmallVector<int, 32> ShlVals;
48264 for (unsigned i = 0, e = VT.getVectorNumElements(); i != e; ++i) {
48265 auto *MaskVal = cast<ConstantSDNode>(Mask.getOperand(i));
48266 ShlVals.push_back(EltBitWidth - 1 -
48267 MaskVal->getAPIntValue().exactLogBase2());
48268 }
48269 // vsel ((X & C) == 0), LHS, RHS --> vsel ((shl X, C') < 0), RHS, LHS
48270 SDValue ShlAmt = getConstVector(ShlVals, VT.getSimpleVT(), DAG, DL);
48271 SDValue Shl = DAG.getNode(ISD::SHL, DL, VT, And.getOperand(0), ShlAmt);
48272 SDValue NewCond =
48273 DAG.getSetCC(DL, CondVT, Shl, Cond.getOperand(1), ISD::SETLT);
48274 return DAG.getSelect(DL, VT, NewCond, RHS, LHS);
48275 }
48276 }
48277
48278 return SDValue();
48279}
48280
48281/// Combine:
48282/// (brcond/cmov/setcc .., (cmp (atomic_load_add x, 1), 0), COND_S)
48283/// to:
48284/// (brcond/cmov/setcc .., (LADD x, 1), COND_LE)
48285/// i.e., reusing the EFLAGS produced by the LOCKed instruction.
48286/// Note that this is only legal for some op/cc combinations.
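/// Hedged example: "if (atomic_fetch_add(&x, 1) < 0)" tests the loaded value
/// with COND_S; reusing the flags of the LOCK ADD means the post-increment
/// value is tested instead, hence the adjusted COND_LE.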
48287 static SDValue combineSetCCAtomicArith(SDValue Cmp, X86::CondCode &CC,
48288 SelectionDAG &DAG,
48289 const X86Subtarget &Subtarget) {
48290 // This combine only operates on CMP-like nodes.
48291 if (!(Cmp.getOpcode() == X86ISD::CMP ||
48292 (Cmp.getOpcode() == X86ISD::SUB && !Cmp->hasAnyUseOfValue(0))))
48293 return SDValue();
48294
48295 // Can't replace the cmp if it has more uses than the one we're looking at.
48296 // FIXME: We would like to be able to handle this, but would need to make sure
48297 // all uses were updated.
48298 if (!Cmp.hasOneUse())
48299 return SDValue();
48300
48301 // This only applies to variations of the common case:
48302 // (icmp slt x, 0) -> (icmp sle (add x, 1), 0)
48303 // (icmp sge x, 0) -> (icmp sgt (add x, 1), 0)
48304 // (icmp sle x, 0) -> (icmp slt (sub x, 1), 0)
48305 // (icmp sgt x, 0) -> (icmp sge (sub x, 1), 0)
48306 // Using the proper condcodes (see below), overflow is checked for.
48307
48308 // FIXME: We can generalize both constraints:
48309 // - XOR/OR/AND (if they were made to survive AtomicExpand)
48310 // - LHS != 1
48311 // if the result is compared.
48312
48313 SDValue CmpLHS = Cmp.getOperand(0);
48314 SDValue CmpRHS = Cmp.getOperand(1);
48315 EVT CmpVT = CmpLHS.getValueType();
48316
48317 if (!CmpLHS.hasOneUse())
48318 return SDValue();
48319
48320 unsigned Opc = CmpLHS.getOpcode();
48321 if (Opc != ISD::ATOMIC_LOAD_ADD && Opc != ISD::ATOMIC_LOAD_SUB)
48322 return SDValue();
48323
48324 SDValue OpRHS = CmpLHS.getOperand(2);
48325 auto *OpRHSC = dyn_cast<ConstantSDNode>(OpRHS);
48326 if (!OpRHSC)
48327 return SDValue();
48328
48329 APInt Addend = OpRHSC->getAPIntValue();
48330 if (Opc == ISD::ATOMIC_LOAD_SUB)
48331 Addend = -Addend;
48332
48333 auto *CmpRHSC = dyn_cast<ConstantSDNode>(CmpRHS);
48334 if (!CmpRHSC)
48335 return SDValue();
48336
48337 APInt Comparison = CmpRHSC->getAPIntValue();
48338 APInt NegAddend = -Addend;
48339
48340 // See if we can adjust the CC to make the comparison match the negated
48341 // addend.
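// Hedged example: a compare of the value loaded by atomic_fetch_add(p, -2)
// using "u> 1" (COND_A) is equivalent to "u>= 2" (COND_AE), and 2 is the
// negated addend, so the flags of a locked "sub $2" can be reused directly.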
48342 if (Comparison != NegAddend) {
48343 APInt IncComparison = Comparison + 1;
48344 if (IncComparison == NegAddend) {
48345 if (CC == X86::COND_A && !Comparison.isMaxValue()) {
48346 Comparison = IncComparison;
48347 CC = X86::COND_AE;
48348 } else if (CC == X86::COND_LE && !Comparison.isMaxSignedValue()) {
48349 Comparison = IncComparison;
48350 CC = X86::COND_L;
48351 }
48352 }
48353 APInt DecComparison = Comparison - 1;
48354 if (DecComparison == NegAddend) {
48355 if (CC == X86::COND_AE && !Comparison.isMinValue()) {
48356 Comparison = DecComparison;
48357 CC = X86::COND_A;
48358 } else if (CC == X86::COND_L && !Comparison.isMinSignedValue()) {
48359 Comparison = DecComparison;
48360 CC = X86::COND_LE;
48361 }
48362 }
48363 }
48364
48365 // If the addend is the negation of the comparison value, then we can do
48366 // a full comparison by emitting the atomic arithmetic as a locked sub.
48367 if (Comparison == NegAddend) {
48368 // The CC is fine, but we need to rewrite the LHS of the comparison as an
48369 // atomic sub.
48370 auto *AN = cast<AtomicSDNode>(CmpLHS.getNode());
48371 auto AtomicSub = DAG.getAtomic(
48372 ISD::ATOMIC_LOAD_SUB, SDLoc(CmpLHS), CmpVT,
48373 /*Chain*/ CmpLHS.getOperand(0), /*LHS*/ CmpLHS.getOperand(1),
48374 /*RHS*/ DAG.getConstant(NegAddend, SDLoc(CmpRHS), CmpVT),
48375 AN->getMemOperand());
48376 auto LockOp = lowerAtomicArithWithLOCK(AtomicSub, DAG, Subtarget);
48377 DAG.ReplaceAllUsesOfValueWith(CmpLHS.getValue(0), DAG.getUNDEF(CmpVT));
48378 DAG.ReplaceAllUsesOfValueWith(CmpLHS.getValue(1), LockOp.getValue(1));
48379 return LockOp;
48380 }
48381
48382 // We can handle comparisons with zero in a number of cases by manipulating
48383 // the CC used.
48384 if (!Comparison.isZero())
48385 return SDValue();
48386
48387 if (CC == X86::COND_S && Addend == 1)
48388 CC = X86::COND_LE;
48389 else if (CC == X86::COND_NS && Addend == 1)
48390 CC = X86::COND_G;
48391 else if (CC == X86::COND_G && Addend == -1)
48392 CC = X86::COND_GE;
48393 else if (CC == X86::COND_LE && Addend == -1)
48394 CC = X86::COND_L;
48395 else
48396 return SDValue();
48397
48398 SDValue LockOp = lowerAtomicArithWithLOCK(CmpLHS, DAG, Subtarget);
48399 DAG.ReplaceAllUsesOfValueWith(CmpLHS.getValue(0), DAG.getUNDEF(CmpVT));
48400 DAG.ReplaceAllUsesOfValueWith(CmpLHS.getValue(1), LockOp.getValue(1));
48401 return LockOp;
48402}
48403
48404// Check whether we're just testing the signbit, and whether we can simplify
48405// this by tracking where the signbit came from.
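// Hedged example: COND_S on CMP(SRA(X, 31), 0) depends only on the sign bit
// of X, so it can be rewritten as a TEST of the sign-bit mask against X with
// COND_NE.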
48406 static SDValue checkSignTestSetCCCombine(SDValue Cmp, X86::CondCode &CC,
48407 SelectionDAG &DAG) {
48408 if (CC != X86::COND_S && CC != X86::COND_NS)
48409 return SDValue();
48410
48411 if (!Cmp.hasOneUse())
48412 return SDValue();
48413
48414 SDValue Src;
48415 if (Cmp.getOpcode() == X86ISD::CMP) {
48416 // CMP(X,0) -> signbit test
48417 if (!isNullConstant(Cmp.getOperand(1)))
48418 return SDValue();
48419 Src = Cmp.getOperand(0);
48420 // Peek through a SRA node as we just need the signbit.
48421 // TODO: Remove one use limit once sdiv-fix regressions are fixed.
48422 // TODO: Use SimplifyDemandedBits instead of just SRA?
48423 if (Src.getOpcode() != ISD::SRA || !Src.hasOneUse())
48424 return SDValue();
48425 Src = Src.getOperand(0);
48426 } else if (Cmp.getOpcode() == X86ISD::OR) {
48427 // OR(X,Y) -> see if only one operand contributes to the signbit.
48428 // TODO: XOR(X,Y) -> see if only one operand contributes to the signbit.
48429 if (DAG.SignBitIsZero(Cmp.getOperand(0)))
48430 Src = Cmp.getOperand(1);
48431 else if (DAG.SignBitIsZero(Cmp.getOperand(1)))
48432 Src = Cmp.getOperand(0);
48433 else
48434 return SDValue();
48435 } else {
48436 return SDValue();
48437 }
48438
48439 // Replace with a TEST on the MSB.
48440 SDLoc DL(Cmp);
48441 MVT SrcVT = Src.getSimpleValueType();
48442 APInt BitMask = APInt::getSignMask(SrcVT.getScalarSizeInBits());
48443
48444 // If Src came from a SIGN_EXTEND_INREG or SHL (probably from an expanded
48445 // SIGN_EXTEND_INREG), then peek through and adjust the TEST bit.
48446 if (Src.getOpcode() == ISD::SHL) {
48447 if (std::optional<unsigned> ShiftAmt = DAG.getValidShiftAmount(Src)) {
48448 Src = Src.getOperand(0);
48449 BitMask.lshrInPlace(*ShiftAmt);
48450 }
48451 } else if (Src.getOpcode() == ISD::SIGN_EXTEND_INREG) {
48452 EVT ExtVT = cast<VTSDNode>(Src.getOperand(1))->getVT();
48453 Src = Src.getOperand(0);
48454 BitMask.lshrInPlace(BitMask.getBitWidth() - ExtVT.getScalarSizeInBits());
48455 }
48456
48457 SDValue Mask = DAG.getNode(ISD::AND, DL, SrcVT, Src,
48458 DAG.getConstant(BitMask, DL, SrcVT));
48459 CC = CC == X86::COND_S ? X86::COND_NE : X86::COND_E;
48460 return DAG.getNode(X86ISD::CMP, DL, MVT::i32, Mask,
48461 DAG.getConstant(0, DL, SrcVT));
48462}
48463
48464// Check whether a boolean test is testing a boolean value generated by
48465// X86ISD::SETCC. If so, return the operand of that SETCC and proper condition
48466// code.
48467//
48468// Simplify the following patterns:
48469// (Op (CMP (SETCC Cond EFLAGS) 1) EQ) or
48470// (Op (CMP (SETCC Cond EFLAGS) 0) NEQ)
48471// to (Op EFLAGS Cond)
48472//
48473// (Op (CMP (SETCC Cond EFLAGS) 0) EQ) or
48474// (Op (CMP (SETCC Cond EFLAGS) 1) NEQ)
48475// to (Op EFLAGS !Cond)
48476//
48477// where Op could be BRCOND or CMOV.
48478//
48479 static SDValue checkBoolTestSetCCCombine(SDValue Cmp, X86::CondCode &CC) {
48480 // This combine only operates on CMP-like nodes.
48481 if (!(Cmp.getOpcode() == X86ISD::CMP ||
48482 (Cmp.getOpcode() == X86ISD::SUB && !Cmp->hasAnyUseOfValue(0))))
48483 return SDValue();
48484
48485 // Quit if not used as a boolean value.
48486 if (CC != X86::COND_E && CC != X86::COND_NE)
48487 return SDValue();
48488
48489 // Check CMP operands. One of them should be 0 or 1 and the other should be
48490 // an SetCC or extended from it.
48491 SDValue Op1 = Cmp.getOperand(0);
48492 SDValue Op2 = Cmp.getOperand(1);
48493
48494 SDValue SetCC;
48495 const ConstantSDNode* C = nullptr;
48496 bool needOppositeCond = (CC == X86::COND_E);
48497 bool checkAgainstTrue = false; // Is it a comparison against 1?
48498
48499 if ((C = dyn_cast<ConstantSDNode>(Op1)))
48500 SetCC = Op2;
48501 else if ((C = dyn_cast<ConstantSDNode>(Op2)))
48502 SetCC = Op1;
48503 else // Quit if neither operand is a constant.
48504 return SDValue();
48505
48506 if (C->getZExtValue() == 1) {
48507 needOppositeCond = !needOppositeCond;
48508 checkAgainstTrue = true;
48509 } else if (C->getZExtValue() != 0)
48510 // Quit if the constant is neither 0 nor 1.
48511 return SDValue();
48512
48513 bool truncatedToBoolWithAnd = false;
48514 // Skip (zext $x), (trunc $x), or (and $x, 1) node.
48515 while (SetCC.getOpcode() == ISD::ZERO_EXTEND ||
48516 SetCC.getOpcode() == ISD::TRUNCATE ||
48517 SetCC.getOpcode() == ISD::AND) {
48518 if (SetCC.getOpcode() == ISD::AND) {
48519 int OpIdx = -1;
48520 if (isOneConstant(SetCC.getOperand(0)))
48521 OpIdx = 1;
48522 if (isOneConstant(SetCC.getOperand(1)))
48523 OpIdx = 0;
48524 if (OpIdx < 0)
48525 break;
48526 SetCC = SetCC.getOperand(OpIdx);
48527 truncatedToBoolWithAnd = true;
48528 } else
48529 SetCC = SetCC.getOperand(0);
48530 }
48531
48532 switch (SetCC.getOpcode()) {
48533 case X86ISD::SETCC_CARRY:
48534 // Since SETCC_CARRY gives output based on R = CF ? ~0 : 0, it's unsafe to
48535 // simplify it if the result of SETCC_CARRY is not canonicalized to 0 or 1,
48536 // i.e. it's a comparison against true but the result of SETCC_CARRY is not
48537 // truncated to i1 using 'and'.
48538 if (checkAgainstTrue && !truncatedToBoolWithAnd)
48539 break;
48541 "Invalid use of SETCC_CARRY!");
48542 [[fallthrough]];
48543 case X86ISD::SETCC:
48544 // Set the condition code or opposite one if necessary.
48545 CC = X86::CondCode(SetCC.getConstantOperandVal(0));
48546 if (needOppositeCond)
48547 CC = X86::GetOppositeBranchCondition(CC);
48548 return SetCC.getOperand(1);
48549 case X86ISD::CMOV: {
48550 // Check whether false/true value has canonical one, i.e. 0 or 1.
48551 auto *FVal = dyn_cast<ConstantSDNode>(SetCC.getOperand(0));
48552 auto *TVal = dyn_cast<ConstantSDNode>(SetCC.getOperand(1));
48553 // Quit if true value is not a constant.
48554 if (!TVal)
48555 return SDValue();
48556 // Quit if false value is not a constant.
48557 if (!FVal) {
48558 SDValue Op = SetCC.getOperand(0);
48559 // Skip 'zext' or 'trunc' node.
48560 if (Op.getOpcode() == ISD::ZERO_EXTEND ||
48561 Op.getOpcode() == ISD::TRUNCATE)
48562 Op = Op.getOperand(0);
48563 // A special case for rdrand/rdseed, where 0 is set if false cond is
48564 // found.
48565 if ((Op.getOpcode() != X86ISD::RDRAND &&
48566 Op.getOpcode() != X86ISD::RDSEED) || Op.getResNo() != 0)
48567 return SDValue();
48568 }
48569 // Quit if false value is not the constant 0 or 1.
48570 bool FValIsFalse = true;
48571 if (FVal && FVal->getZExtValue() != 0) {
48572 if (FVal->getZExtValue() != 1)
48573 return SDValue();
48574 // If FVal is 1, opposite cond is needed.
48575 needOppositeCond = !needOppositeCond;
48576 FValIsFalse = false;
48577 }
48578 // Quit if TVal is not the constant opposite of FVal.
48579 if (FValIsFalse && TVal->getZExtValue() != 1)
48580 return SDValue();
48581 if (!FValIsFalse && TVal->getZExtValue() != 0)
48582 return SDValue();
48583 CC = X86::CondCode(SetCC.getConstantOperandVal(2));
48584 if (needOppositeCond)
48585 CC = X86::GetOppositeBranchCondition(CC);
48586 return SetCC.getOperand(3);
48587 }
48588 }
48589
48590 return SDValue();
48591}
48592
48593/// Check whether Cond is an AND/OR of SETCCs off of the same EFLAGS.
48594/// Match:
48595/// (X86or (X86setcc) (X86setcc))
48596/// (X86cmp (and (X86setcc) (X86setcc)), 0)
48597 static bool checkBoolTestAndOrSetCCCombine(SDValue Cond, X86::CondCode &CC0,
48598 X86::CondCode &CC1, SDValue &Flags,
48599 bool &isAnd) {
48600 if (Cond->getOpcode() == X86ISD::CMP) {
48601 if (!isNullConstant(Cond->getOperand(1)))
48602 return false;
48603
48604 Cond = Cond->getOperand(0);
48605 }
48606
48607 isAnd = false;
48608
48609 SDValue SetCC0, SetCC1;
48610 switch (Cond->getOpcode()) {
48611 default: return false;
48612 case ISD::AND:
48613 case X86ISD::AND:
48614 isAnd = true;
48615 [[fallthrough]];
48616 case ISD::OR:
48617 case X86ISD::OR:
48618 SetCC0 = Cond->getOperand(0);
48619 SetCC1 = Cond->getOperand(1);
48620 break;
48621 };
48622
48623 // Make sure we have SETCC nodes, using the same flags value.
48624 if (SetCC0.getOpcode() != X86ISD::SETCC ||
48625 SetCC1.getOpcode() != X86ISD::SETCC ||
48626 SetCC0->getOperand(1) != SetCC1->getOperand(1))
48627 return false;
48628
48629 CC0 = (X86::CondCode)SetCC0->getConstantOperandVal(0);
48630 CC1 = (X86::CondCode)SetCC1->getConstantOperandVal(0);
48631 Flags = SetCC0->getOperand(1);
48632 return true;
48633}
48634
48635// When legalizing carry, we create carries via add X, -1
48636// If that comes from an actual carry, via setcc, we use the
48637// carry directly.
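// Hedged note: with X restricted to 0 or 1 (a setcc result), "add X, -1" sets
// CF exactly when X was 1, so the original carry flag can feed ADC/SBB
// directly instead of being rematerialized.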
48638 static SDValue combineCarryThroughADD(SDValue EFLAGS, SelectionDAG &DAG) {
48639 if (EFLAGS.getOpcode() == X86ISD::ADD) {
48640 if (isAllOnesConstant(EFLAGS.getOperand(1))) {
48641 bool FoundAndLSB = false;
48642 SDValue Carry = EFLAGS.getOperand(0);
48643 while (Carry.getOpcode() == ISD::TRUNCATE ||
48644 Carry.getOpcode() == ISD::ZERO_EXTEND ||
48645 (Carry.getOpcode() == ISD::AND &&
48646 isOneConstant(Carry.getOperand(1)))) {
48647 FoundAndLSB |= Carry.getOpcode() == ISD::AND;
48648 Carry = Carry.getOperand(0);
48649 }
48650 if (Carry.getOpcode() == X86ISD::SETCC ||
48651 Carry.getOpcode() == X86ISD::SETCC_CARRY) {
48652 // TODO: Merge this code with equivalent in combineAddOrSubToADCOrSBB?
48653 uint64_t CarryCC = Carry.getConstantOperandVal(0);
48654 SDValue CarryOp1 = Carry.getOperand(1);
48655 if (CarryCC == X86::COND_B)
48656 return CarryOp1;
48657 if (CarryCC == X86::COND_A) {
48658 // Try to convert COND_A into COND_B in an attempt to facilitate
48659 // materializing "setb reg".
48660 //
48661 // Do not flip "e > c", where "c" is a constant, because Cmp
48662 // instruction cannot take an immediate as its first operand.
48663 //
48664 if (CarryOp1.getOpcode() == X86ISD::SUB &&
48665 CarryOp1.getNode()->hasOneUse() &&
48666 CarryOp1.getValueType().isInteger() &&
48667 !isa<ConstantSDNode>(CarryOp1.getOperand(1))) {
48668 SDValue SubCommute =
48669 DAG.getNode(X86ISD::SUB, SDLoc(CarryOp1), CarryOp1->getVTList(),
48670 CarryOp1.getOperand(1), CarryOp1.getOperand(0));
48671 return SDValue(SubCommute.getNode(), CarryOp1.getResNo());
48672 }
48673 }
48674 // If this is a check of the z flag of an add with 1, switch to the
48675 // C flag.
48676 if (CarryCC == X86::COND_E &&
48677 CarryOp1.getOpcode() == X86ISD::ADD &&
48678 isOneConstant(CarryOp1.getOperand(1)))
48679 return CarryOp1;
48680 } else if (FoundAndLSB) {
48681 SDLoc DL(Carry);
48682 SDValue BitNo = DAG.getConstant(0, DL, Carry.getValueType());
48683 if (Carry.getOpcode() == ISD::SRL) {
48684 BitNo = Carry.getOperand(1);
48685 Carry = Carry.getOperand(0);
48686 }
48687 return getBT(Carry, BitNo, DL, DAG);
48688 }
48689 }
48690 }
48691
48692 return SDValue();
48693}
48694
48695/// If we are inverting an PTEST/TESTP operand, attempt to adjust the CC
48696/// to avoid the inversion.
48697 static SDValue combinePTESTCC(SDValue EFLAGS, X86::CondCode &CC,
48698 SelectionDAG &DAG,
48699 const X86Subtarget &Subtarget) {
48700 // TODO: Handle X86ISD::KTEST/X86ISD::KORTEST.
48701 if (EFLAGS.getOpcode() != X86ISD::PTEST &&
48702 EFLAGS.getOpcode() != X86ISD::TESTP)
48703 return SDValue();
48704
48705 // PTEST/TESTP sets EFLAGS as:
48706 // TESTZ: ZF = (Op0 & Op1) == 0
48707 // TESTC: CF = (~Op0 & Op1) == 0
48708 // TESTNZC: ZF == 0 && CF == 0
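// Hedged derivation for the CC table below: for TEST*(~X, Y) the carry flag
// is (~~X & Y) == 0, i.e. (X & Y) == 0, which is exactly the zero flag of
// TEST*(X, Y); hence the testc <-> testz swap (COND_B <-> COND_E and their
// negations), while testnzc is symmetric and keeps its condition.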
48709 MVT VT = EFLAGS.getSimpleValueType();
48710 SDValue Op0 = EFLAGS.getOperand(0);
48711 SDValue Op1 = EFLAGS.getOperand(1);
48712 MVT OpVT = Op0.getSimpleValueType();
48713 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
48714
48715 // TEST*(~X,Y) == TEST*(X,Y)
48716 if (SDValue NotOp0 = IsNOT(Op0, DAG)) {
48717 X86::CondCode InvCC;
48718 switch (CC) {
48719 case X86::COND_B:
48720 // testc -> testz.
48721 InvCC = X86::COND_E;
48722 break;
48723 case X86::COND_AE:
48724 // !testc -> !testz.
48725 InvCC = X86::COND_NE;
48726 break;
48727 case X86::COND_E:
48728 // testz -> testc.
48729 InvCC = X86::COND_B;
48730 break;
48731 case X86::COND_NE:
48732 // !testz -> !testc.
48733 InvCC = X86::COND_AE;
48734 break;
48735 case X86::COND_A:
48736 case X86::COND_BE:
48737 // testnzc -> testnzc (no change).
48738 InvCC = CC;
48739 break;
48740 default:
48741 InvCC = X86::COND_INVALID;
48742 break;
48743 }
48744
48745 if (InvCC != X86::COND_INVALID) {
48746 CC = InvCC;
48747 return DAG.getNode(EFLAGS.getOpcode(), SDLoc(EFLAGS), VT,
48748 DAG.getBitcast(OpVT, NotOp0), Op1);
48749 }
48750 }
48751
48752 if (CC == X86::COND_B || CC == X86::COND_AE) {
48753 // TESTC(X,~X) == TESTC(X,-1)
48754 if (SDValue NotOp1 = IsNOT(Op1, DAG)) {
48755 if (peekThroughBitcasts(NotOp1) == peekThroughBitcasts(Op0)) {
48756 SDLoc DL(EFLAGS);
48757 return DAG.getNode(
48758 EFLAGS.getOpcode(), DL, VT, DAG.getBitcast(OpVT, NotOp1),
48759 DAG.getBitcast(OpVT,
48760 DAG.getAllOnesConstant(DL, NotOp1.getValueType())));
48761 }
48762 }
48763 // PTESTC(PCMPEQ(X,0),-1) == PTESTZ(X,X)
48764 if (EFLAGS.getOpcode() == X86ISD::PTEST &&
48765 ISD::isBuildVectorAllOnes(Op1.getNode())) {
48766 SDValue BC0 = peekThroughBitcasts(Op0);
48767 if (BC0.getOpcode() == X86ISD::PCMPEQ &&
48768 ISD::isBuildVectorAllZeros(BC0.getOperand(1).getNode())) {
48769 SDLoc DL(EFLAGS);
48770 CC = (CC == X86::COND_B ? X86::COND_E : X86::COND_NE);
48771 SDValue X = DAG.getBitcast(OpVT, BC0.getOperand(0));
48772 return DAG.getNode(EFLAGS.getOpcode(), DL, VT, X, X);
48773 }
48774 }
48775 }
48776
48777 if (CC == X86::COND_E || CC == X86::COND_NE) {
48778 // TESTZ(X,~Y) == TESTC(Y,X)
48779 if (SDValue NotOp1 = IsNOT(Op1, DAG)) {
48780 CC = (CC == X86::COND_E ? X86::COND_B : X86::COND_AE);
48781 return DAG.getNode(EFLAGS.getOpcode(), SDLoc(EFLAGS), VT,
48782 DAG.getBitcast(OpVT, NotOp1), Op0);
48783 }
48784
48785 if (Op0 == Op1) {
48786 SDValue BC = peekThroughBitcasts(Op0);
48787 EVT BCVT = BC.getValueType();
48788
48789 // TESTZ(AND(X,Y),AND(X,Y)) == TESTZ(X,Y)
48790 if (BC.getOpcode() == ISD::AND || BC.getOpcode() == X86ISD::FAND) {
48791 return DAG.getNode(EFLAGS.getOpcode(), SDLoc(EFLAGS), VT,
48792 DAG.getBitcast(OpVT, BC.getOperand(0)),
48793 DAG.getBitcast(OpVT, BC.getOperand(1)));
48794 }
48795
48796 // TESTZ(AND(~X,Y),AND(~X,Y)) == TESTC(X,Y)
48797 if (BC.getOpcode() == X86ISD::ANDNP || BC.getOpcode() == X86ISD::FANDN) {
48798 CC = (CC == X86::COND_E ? X86::COND_B : X86::COND_AE);
48799 return DAG.getNode(EFLAGS.getOpcode(), SDLoc(EFLAGS), VT,
48800 DAG.getBitcast(OpVT, BC.getOperand(0)),
48801 DAG.getBitcast(OpVT, BC.getOperand(1)));
48802 }
48803
48804 // If every element is an all-sign value, see if we can use TESTP/MOVMSK
48805 // to more efficiently extract the sign bits and compare that.
48806 // TODO: Handle TESTC with comparison inversion.
48807 // TODO: Can we remove SimplifyMultipleUseDemandedBits and rely on
48808 // TESTP/MOVMSK combines to make sure its never worse than PTEST?
48809 if (BCVT.isVector() && TLI.isTypeLegal(BCVT)) {
48810 unsigned EltBits = BCVT.getScalarSizeInBits();
48811 if (DAG.ComputeNumSignBits(BC) == EltBits) {
48812 assert(VT == MVT::i32 && "Expected i32 EFLAGS comparison result");
48813 APInt SignMask = APInt::getSignMask(EltBits);
48814 if (SDValue Res =
48815 TLI.SimplifyMultipleUseDemandedBits(BC, SignMask, DAG)) {
48816 // For vXi16 cases we need to use pmovmskb and extract every other
48817 // sign bit.
48818 SDLoc DL(EFLAGS);
48819 if ((EltBits == 32 || EltBits == 64) && Subtarget.hasAVX()) {
48820 MVT FloatSVT = MVT::getFloatingPointVT(EltBits);
48821 MVT FloatVT =
48822 MVT::getVectorVT(FloatSVT, OpVT.getSizeInBits() / EltBits);
48823 Res = DAG.getBitcast(FloatVT, Res);
48824 return DAG.getNode(X86ISD::TESTP, SDLoc(EFLAGS), VT, Res, Res);
48825 } else if (EltBits == 16) {
48826 MVT MovmskVT = BCVT.is128BitVector() ? MVT::v16i8 : MVT::v32i8;
48827 Res = DAG.getBitcast(MovmskVT, Res);
48828 Res = getPMOVMSKB(DL, Res, DAG, Subtarget);
48829 Res = DAG.getNode(ISD::AND, DL, MVT::i32, Res,
48830 DAG.getConstant(0xAAAAAAAA, DL, MVT::i32));
48831 } else {
48832 Res = getPMOVMSKB(DL, Res, DAG, Subtarget);
48833 }
48834 return DAG.getNode(X86ISD::CMP, DL, MVT::i32, Res,
48835 DAG.getConstant(0, DL, MVT::i32));
48836 }
48837 }
48838 }
48839 }
48840
48841 // TESTZ(-1,X) == TESTZ(X,X)
48842 if (ISD::isBuildVectorAllOnes(Op0.getNode()))
48843 return DAG.getNode(EFLAGS.getOpcode(), SDLoc(EFLAGS), VT, Op1, Op1);
48844
48845 // TESTZ(X,-1) == TESTZ(X,X)
48846 if (ISD::isBuildVectorAllOnes(Op1.getNode()))
48847 return DAG.getNode(EFLAGS.getOpcode(), SDLoc(EFLAGS), VT, Op0, Op0);
48848
48849 // TESTZ(OR(LO(X),HI(X)),OR(LO(Y),HI(Y))) -> TESTZ(X,Y)
48850 // TODO: Add COND_NE handling?
48851 if (CC == X86::COND_E && OpVT.is128BitVector() && Subtarget.hasAVX()) {
48852 SDValue Src0 = peekThroughBitcasts(Op0);
48853 SDValue Src1 = peekThroughBitcasts(Op1);
48854 if (Src0.getOpcode() == ISD::OR && Src1.getOpcode() == ISD::OR) {
48855 Src0 = getSplitVectorSrc(peekThroughBitcasts(Src0.getOperand(0)),
48856 peekThroughBitcasts(Src0.getOperand(1)), true);
48857 Src1 = getSplitVectorSrc(peekThroughBitcasts(Src1.getOperand(0)),
48858 peekThroughBitcasts(Src1.getOperand(1)), true);
48859 if (Src0 && Src1) {
48860 MVT OpVT2 = OpVT.getDoubleNumVectorElementsVT();
48861 return DAG.getNode(EFLAGS.getOpcode(), SDLoc(EFLAGS), VT,
48862 DAG.getBitcast(OpVT2, Src0),
48863 DAG.getBitcast(OpVT2, Src1));
48864 }
48865 }
48866 }
48867 }
48868
48869 return SDValue();
48870}
48871
48872// Attempt to simplify the MOVMSK input based on the comparison type.
48873 static SDValue combineSetCCMOVMSK(SDValue EFLAGS, X86::CondCode &CC,
48874 SelectionDAG &DAG,
48875 const X86Subtarget &Subtarget) {
48876 // Handle eq/ne against zero (any_of).
48877 // Handle eq/ne against -1 (all_of).
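// Concrete instance: MOVMSK(PCMPEQ(X, Y)) == -1 asks "are all lanes equal?",
// which the folds below re-express as PTESTZ(XOR(X, Y), XOR(X, Y)); the
// any_of forms compare the mask against 0 instead.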
48878 if (!(CC == X86::COND_E || CC == X86::COND_NE))
48879 return SDValue();
48880 if (EFLAGS.getValueType() != MVT::i32)
48881 return SDValue();
48882 unsigned CmpOpcode = EFLAGS.getOpcode();
48883 if (CmpOpcode != X86ISD::CMP && CmpOpcode != X86ISD::SUB)
48884 return SDValue();
48885 auto *CmpConstant = dyn_cast<ConstantSDNode>(EFLAGS.getOperand(1));
48886 if (!CmpConstant)
48887 return SDValue();
48888 const APInt &CmpVal = CmpConstant->getAPIntValue();
48889
48890 SDValue CmpOp = EFLAGS.getOperand(0);
48891 unsigned CmpBits = CmpOp.getValueSizeInBits();
48892 assert(CmpBits == CmpVal.getBitWidth() && "Value size mismatch");
48893
48894 // Peek through any truncate.
48895 if (CmpOp.getOpcode() == ISD::TRUNCATE)
48896 CmpOp = CmpOp.getOperand(0);
48897
48898 // Bail if we don't find a MOVMSK.
48899 if (CmpOp.getOpcode() != X86ISD::MOVMSK)
48900 return SDValue();
48901
48902 SDValue Vec = CmpOp.getOperand(0);
48903 MVT VecVT = Vec.getSimpleValueType();
48904 assert((VecVT.is128BitVector() || VecVT.is256BitVector()) &&
48905 "Unexpected MOVMSK operand");
48906 unsigned NumElts = VecVT.getVectorNumElements();
48907 unsigned NumEltBits = VecVT.getScalarSizeInBits();
48908
48909 bool IsAnyOf = CmpOpcode == X86ISD::CMP && CmpVal.isZero();
48910 bool IsAllOf = (CmpOpcode == X86ISD::SUB || CmpOpcode == X86ISD::CMP) &&
48911 NumElts <= CmpBits && CmpVal.isMask(NumElts);
48912 if (!IsAnyOf && !IsAllOf)
48913 return SDValue();
48914
48915 // TODO: Check for more cases that could be combined here.
48916 // We use the CMP's use count to decide whether to combine. Currently only
48917 // the "MOVMSK(CONCAT(..))" and "MOVMSK(PCMPEQ(..))" folds below are known
48918 // to benefit from requiring a single use.
48919 bool IsOneUse = CmpOp.getNode()->hasOneUse();
48920
48921 // See if we can peek through to a vector with a wider element type, if the
48922 // signbits extend down to all the sub-elements as well.
48923 // Calling MOVMSK with the wider type, avoiding the bitcast, helps expose
48924 // potential SimplifyDemandedBits/Elts cases.
48925 // If we looked through a truncate that discards bits, we can't do this
48926 // transform.
48927 // FIXME: We could do this transform for truncates that discarded bits by
48928 // inserting an AND mask between the new MOVMSK and the CMP.
48929 if (Vec.getOpcode() == ISD::BITCAST && NumElts <= CmpBits) {
48930 SDValue BC = peekThroughBitcasts(Vec);
48931 MVT BCVT = BC.getSimpleValueType();
48932 unsigned BCNumElts = BCVT.getVectorNumElements();
48933 unsigned BCNumEltBits = BCVT.getScalarSizeInBits();
48934 if ((BCNumEltBits == 32 || BCNumEltBits == 64) &&
48935 BCNumEltBits > NumEltBits &&
48936 DAG.ComputeNumSignBits(BC) > (BCNumEltBits - NumEltBits)) {
48937 SDLoc DL(EFLAGS);
48938 APInt CmpMask = APInt::getLowBitsSet(32, IsAnyOf ? 0 : BCNumElts);
48939 return DAG.getNode(X86ISD::CMP, DL, MVT::i32,
48940 DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, BC),
48941 DAG.getConstant(CmpMask, DL, MVT::i32));
48942 }
48943 }
48944
48945 // MOVMSK(CONCAT(X,Y)) == 0 -> MOVMSK(OR(X,Y)).
48946 // MOVMSK(CONCAT(X,Y)) != 0 -> MOVMSK(OR(X,Y)).
48947 // MOVMSK(CONCAT(X,Y)) == -1 -> MOVMSK(AND(X,Y)).
48948 // MOVMSK(CONCAT(X,Y)) != -1 -> MOVMSK(AND(X,Y)).
48949 if (VecVT.is256BitVector() && NumElts <= CmpBits && IsOneUse) {
48950 SmallVector<SDValue> Ops;
48951 if (collectConcatOps(peekThroughBitcasts(Vec).getNode(), Ops, DAG) &&
48952 Ops.size() == 2) {
48953 SDLoc DL(EFLAGS);
48954 EVT SubVT = Ops[0].getValueType().changeTypeToInteger();
48955 APInt CmpMask = APInt::getLowBitsSet(32, IsAnyOf ? 0 : NumElts / 2);
48956 SDValue V = DAG.getNode(IsAnyOf ? ISD::OR : ISD::AND, DL, SubVT,
48957 DAG.getBitcast(SubVT, Ops[0]),
48958 DAG.getBitcast(SubVT, Ops[1]));
48959 V = DAG.getBitcast(VecVT.getHalfNumVectorElementsVT(), V);
48960 return DAG.getNode(X86ISD::CMP, DL, MVT::i32,
48961 DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, V),
48962 DAG.getConstant(CmpMask, DL, MVT::i32));
48963 }
48964 }
48965
48966 // MOVMSK(PCMPEQ(X,0)) == -1 -> PTESTZ(X,X).
48967 // MOVMSK(PCMPEQ(X,0)) != -1 -> !PTESTZ(X,X).
48968 // MOVMSK(PCMPEQ(X,Y)) == -1 -> PTESTZ(XOR(X,Y),XOR(X,Y)).
48969 // MOVMSK(PCMPEQ(X,Y)) != -1 -> !PTESTZ(XOR(X,Y),XOR(X,Y)).
48970 if (IsAllOf && Subtarget.hasSSE41() && IsOneUse) {
48971 MVT TestVT = VecVT.is128BitVector() ? MVT::v2i64 : MVT::v4i64;
48972 SDValue BC = peekThroughBitcasts(Vec);
48973 // Ensure MOVMSK was testing every signbit of BC.
48974 if (BC.getValueType().getVectorNumElements() <= NumElts) {
48975 if (BC.getOpcode() == X86ISD::PCMPEQ) {
48976 SDValue V = DAG.getNode(ISD::XOR, SDLoc(BC), BC.getValueType(),
48977 BC.getOperand(0), BC.getOperand(1));
48978 V = DAG.getBitcast(TestVT, V);
48979 return DAG.getNode(X86ISD::PTEST, SDLoc(EFLAGS), MVT::i32, V, V);
48980 }
48981 // Check for 256-bit split vector cases.
48982 if (BC.getOpcode() == ISD::AND &&
48983 BC.getOperand(0).getOpcode() == X86ISD::PCMPEQ &&
48984 BC.getOperand(1).getOpcode() == X86ISD::PCMPEQ) {
48985 SDValue LHS = BC.getOperand(0);
48986 SDValue RHS = BC.getOperand(1);
48987 LHS = DAG.getNode(ISD::XOR, SDLoc(LHS), LHS.getValueType(),
48988 LHS.getOperand(0), LHS.getOperand(1));
48989 RHS = DAG.getNode(ISD::XOR, SDLoc(RHS), RHS.getValueType(),
48990 RHS.getOperand(0), RHS.getOperand(1));
48991 LHS = DAG.getBitcast(TestVT, LHS);
48992 RHS = DAG.getBitcast(TestVT, RHS);
48993 SDValue V = DAG.getNode(ISD::OR, SDLoc(EFLAGS), TestVT, LHS, RHS);
48994 return DAG.getNode(X86ISD::PTEST, SDLoc(EFLAGS), MVT::i32, V, V);
48995 }
48996 }
48997 }
48998
48999 // See if we can avoid a PACKSS by calling MOVMSK on the sources.
49000 // For vXi16 cases we can use a v2Xi8 PMOVMSKB. We must mask out
49001 // sign bits prior to the comparison with zero unless we know that
49002 // the vXi16 splats the sign bit down to the lower i8 half.
49003 // TODO: Handle all_of patterns.
49004 if (Vec.getOpcode() == X86ISD::PACKSS && VecVT == MVT::v16i8) {
49005 SDValue VecOp0 = Vec.getOperand(0);
49006 SDValue VecOp1 = Vec.getOperand(1);
49007 bool SignExt0 = DAG.ComputeNumSignBits(VecOp0) > 8;
49008 bool SignExt1 = DAG.ComputeNumSignBits(VecOp1) > 8;
49009 // PMOVMSKB(PACKSSBW(X, undef)) -> PMOVMSKB(BITCAST_v16i8(X)) & 0xAAAA.
49010 if (IsAnyOf && CmpBits == 8 && VecOp1.isUndef()) {
49011 SDLoc DL(EFLAGS);
49012 SDValue Result = DAG.getBitcast(MVT::v16i8, VecOp0);
49013 Result = DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, Result);
49014 Result = DAG.getZExtOrTrunc(Result, DL, MVT::i16);
49015 if (!SignExt0) {
49016 Result = DAG.getNode(ISD::AND, DL, MVT::i16, Result,
49017 DAG.getConstant(0xAAAA, DL, MVT::i16));
49018 }
49019 return DAG.getNode(X86ISD::CMP, DL, MVT::i32, Result,
49020 DAG.getConstant(0, DL, MVT::i16));
49021 }
49022 // PMOVMSKB(PACKSSBW(LO(X), HI(X)))
49023 // -> PMOVMSKB(BITCAST_v32i8(X)) & 0xAAAAAAAA.
49024 if (CmpBits >= 16 && Subtarget.hasInt256() &&
49025 (IsAnyOf || (SignExt0 && SignExt1))) {
49026 if (SDValue Src = getSplitVectorSrc(VecOp0, VecOp1, true)) {
49027 SDLoc DL(EFLAGS);
49028 SDValue Result = peekThroughBitcasts(Src);
49029 if (IsAllOf && Result.getOpcode() == X86ISD::PCMPEQ &&
49030 Result.getValueType().getVectorNumElements() <= NumElts) {
49031 SDValue V = DAG.getNode(ISD::XOR, DL, Result.getValueType(),
49032 Result.getOperand(0), Result.getOperand(1));
49033 V = DAG.getBitcast(MVT::v4i64, V);
49034 return DAG.getNode(X86ISD::PTEST, SDLoc(EFLAGS), MVT::i32, V, V);
49035 }
49036 Result = DAG.getBitcast(MVT::v32i8, Result);
49037 Result = DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, Result);
49038 unsigned CmpMask = IsAnyOf ? 0 : 0xFFFFFFFF;
49039 if (!SignExt0 || !SignExt1) {
49040 assert(IsAnyOf &&
49041 "Only perform v16i16 signmasks for any_of patterns");
49042 Result = DAG.getNode(ISD::AND, DL, MVT::i32, Result,
49043 DAG.getConstant(0xAAAAAAAA, DL, MVT::i32));
49044 }
49045 return DAG.getNode(X86ISD::CMP, DL, MVT::i32, Result,
49046 DAG.getConstant(CmpMask, DL, MVT::i32));
49047 }
49048 }
49049 }
49050
49051 // MOVMSK(SHUFFLE(X,u)) -> MOVMSK(X) iff every element is referenced.
49052 // Since we peek through a bitcast, we need to be careful if the base vector
49053 // type has smaller elements than the MOVMSK type. In that case, even if
49054 // all the elements are demanded by the shuffle mask, only the "high"
49055 // elements which have highbits that align with highbits in the MOVMSK vec
49056 // elements are actually demanded. A simplification of spurious operations
49057 // on the "low" elements takes place during other simplifications.
49058 //
49059 // For example:
49060 // MOVMSK64(BITCAST(SHUF32 X, (1,0,3,2))): all elements are demanded, yet
49061 // the swap changes which bits land in the i64 sign positions, so the result
49062 // can change.
49063 // To address this, we check that we can scale the shuffle mask to the MOVMSK
49064 // element width (this will ensure the "high" elements match). It's slightly
49065 // conservative, but fine for an edge case fold.
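// Hedged example: for a v2i64 MOVMSK over a bitcast v4i32 shuffle <1,0,3,2>,
// every i32 element is referenced, yet the i64 sign bits now come from the
// former low halves; the mask cannot be scaled to 2 elements, so the fold is
// correctly rejected.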
49066 SmallVector<int, 32> ShuffleMask;
49067 SmallVector<SDValue, 2> ShuffleInputs;
49068 if (NumElts <= CmpBits &&
49069 getTargetShuffleInputs(peekThroughBitcasts(Vec), ShuffleInputs,
49070 ShuffleMask, DAG) &&
49071 ShuffleInputs.size() == 1 && isCompletePermute(ShuffleMask) &&
49072 ShuffleInputs[0].getValueSizeInBits() == VecVT.getSizeInBits() &&
49073 canScaleShuffleElements(ShuffleMask, NumElts)) {
49074 SDLoc DL(EFLAGS);
49075 SDValue Result = DAG.getBitcast(VecVT, ShuffleInputs[0]);
49076 Result = DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, Result);
49077 Result =
49078 DAG.getZExtOrTrunc(Result, DL, EFLAGS.getOperand(0).getValueType());
49079 return DAG.getNode(X86ISD::CMP, DL, MVT::i32, Result, EFLAGS.getOperand(1));
49080 }
49081
49082 // MOVMSKPS(V) !=/== 0 -> TESTPS(V,V)
49083 // MOVMSKPD(V) !=/== 0 -> TESTPD(V,V)
49084 // MOVMSKPS(V) !=/== -1 -> TESTPS(V,V)
49085 // MOVMSKPD(V) !=/== -1 -> TESTPD(V,V)
49086 // iff every element is referenced.
49087 if (NumElts <= CmpBits && Subtarget.hasAVX() &&
49088 !Subtarget.preferMovmskOverVTest() && IsOneUse &&
49089 (NumEltBits == 32 || NumEltBits == 64)) {
49090 SDLoc DL(EFLAGS);
49091 MVT FloatSVT = MVT::getFloatingPointVT(NumEltBits);
49092 MVT FloatVT = MVT::getVectorVT(FloatSVT, NumElts);
49093 MVT IntVT = FloatVT.changeVectorElementTypeToInteger();
49094 SDValue LHS = Vec;
49095 SDValue RHS = IsAnyOf ? Vec : DAG.getAllOnesConstant(DL, IntVT);
49096 CC = IsAnyOf ? CC : (CC == X86::COND_E ? X86::COND_B : X86::COND_AE);
49097 return DAG.getNode(X86ISD::TESTP, DL, MVT::i32,
49098 DAG.getBitcast(FloatVT, LHS),
49099 DAG.getBitcast(FloatVT, RHS));
49100 }
49101
49102 return SDValue();
49103}
49104
49105/// Optimize an EFLAGS definition used according to the condition code \p CC
49106/// into a simpler EFLAGS value, potentially returning a new \p CC and replacing
49107/// uses of chain values.
49108 static SDValue combineSetCCEFLAGS(SDValue EFLAGS, X86::CondCode &CC,
49109 SelectionDAG &DAG,
49110 const X86Subtarget &Subtarget) {
49111 if (CC == X86::COND_B)
49112 if (SDValue Flags = combineCarryThroughADD(EFLAGS, DAG))
49113 return Flags;
49114
49115 if (SDValue R = checkSignTestSetCCCombine(EFLAGS, CC, DAG))
49116 return R;
49117
49118 if (SDValue R = checkBoolTestSetCCCombine(EFLAGS, CC))
49119 return R;
49120
49121 if (SDValue R = combinePTESTCC(EFLAGS, CC, DAG, Subtarget))
49122 return R;
49123
49124 if (SDValue R = combineSetCCMOVMSK(EFLAGS, CC, DAG, Subtarget))
49125 return R;
49126
49127 return combineSetCCAtomicArith(EFLAGS, CC, DAG, Subtarget);
49128}
49129
49130/// Optimize X86ISD::CMOV [LHS, RHS, CONDCODE (e.g. X86::COND_NE), CONDVAL]
49131 static SDValue combineCMov(SDNode *N, SelectionDAG &DAG,
49132 TargetLowering::DAGCombinerInfo &DCI,
49133 const X86Subtarget &Subtarget) {
49134 SDLoc DL(N);
49135 EVT VT = N->getValueType(0);
49136 SDValue FalseOp = N->getOperand(0);
49137 SDValue TrueOp = N->getOperand(1);
49138 X86::CondCode CC = (X86::CondCode)N->getConstantOperandVal(2);
49139 SDValue Cond = N->getOperand(3);
49140
49141 // cmov X, X, ?, ? --> X
49142 if (TrueOp == FalseOp)
49143 return TrueOp;
49144
49145 // Try to simplify the EFLAGS and condition code operands.
49146 // We can't always do this as FCMOV only supports a subset of X86 cond.
49147 if (SDValue Flags = combineSetCCEFLAGS(Cond, CC, DAG, Subtarget)) {
49148 if (!(FalseOp.getValueType() == MVT::f80 ||
49149 (FalseOp.getValueType() == MVT::f64 && !Subtarget.hasSSE2()) ||
49150 (FalseOp.getValueType() == MVT::f32 && !Subtarget.hasSSE1())) ||
49151 !Subtarget.canUseCMOV() || hasFPCMov(CC)) {
49152 SDValue Ops[] = {FalseOp, TrueOp, DAG.getTargetConstant(CC, DL, MVT::i8),
49153 Flags};
49154 return DAG.getNode(X86ISD::CMOV, DL, VT, Ops);
49155 }
49156 }
49157
49158 // If this is a select between two integer constants, try to do some
49159 // optimizations. Note that the operands are ordered the opposite of SELECT
49160 // operands.
49161 if (ConstantSDNode *TrueC = dyn_cast<ConstantSDNode>(TrueOp)) {
49162 if (ConstantSDNode *FalseC = dyn_cast<ConstantSDNode>(FalseOp)) {
49163 // Canonicalize the TrueC/FalseC values so that TrueC (the true value) is
49164 // larger than FalseC (the false value).
49165 if (TrueC->getAPIntValue().ult(FalseC->getAPIntValue())) {
49167 std::swap(TrueC, FalseC);
49168 std::swap(TrueOp, FalseOp);
49169 }
49170
49171 // Optimize C ? 8 : 0 -> zext(setcc(C)) << 3. Likewise for any pow2/0.
49172 // This is efficient for any integer data type (including i8/i16) and
49173 // shift amount.
49174 if (FalseC->getAPIntValue() == 0 && TrueC->getAPIntValue().isPowerOf2()) {
49175 Cond = getSETCC(CC, Cond, DL, DAG);
49176
49177 // Zero extend the condition if needed.
49178 Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, TrueC->getValueType(0), Cond);
49179
49180 unsigned ShAmt = TrueC->getAPIntValue().logBase2();
49181 Cond = DAG.getNode(ISD::SHL, DL, Cond.getValueType(), Cond,
49182 DAG.getConstant(ShAmt, DL, MVT::i8));
49183 return Cond;
49184 }
49185
49186 // Optimize Cond ? cst+1 : cst -> zext(setcc(C)) + cst. This is efficient
49187 // for any integer data type, including i8/i16.
49188 if (FalseC->getAPIntValue()+1 == TrueC->getAPIntValue()) {
49189 Cond = getSETCC(CC, Cond, DL, DAG);
49190
49191 // Zero extend the condition if needed.
49193 FalseC->getValueType(0), Cond);
49194 Cond = DAG.getNode(ISD::ADD, DL, Cond.getValueType(), Cond,
49195 SDValue(FalseC, 0));
49196 return Cond;
49197 }
49198
49199 // Optimize cases that will turn into an LEA instruction. This requires
49200 // an i32 or i64 and an efficient multiplier (1, 2, 3, 4, 5, 8, 9).
49201 if (VT == MVT::i32 || VT == MVT::i64) {
49202 APInt Diff = TrueC->getAPIntValue() - FalseC->getAPIntValue();
49203 assert(Diff.getBitWidth() == VT.getSizeInBits() &&
49204 "Implicit constant truncation");
49205
49206 bool isFastMultiplier = false;
49207 if (Diff.ult(10)) {
49208 switch (Diff.getZExtValue()) {
49209 default: break;
49210 case 1: // result = add base, cond
49211 case 2: // result = lea base( , cond*2)
49212 case 3: // result = lea base(cond, cond*2)
49213 case 4: // result = lea base( , cond*4)
49214 case 5: // result = lea base(cond, cond*4)
49215 case 8: // result = lea base( , cond*8)
49216 case 9: // result = lea base(cond, cond*8)
49217 isFastMultiplier = true;
49218 break;
49219 }
49220 }
49221
49222 if (isFastMultiplier) {
49223 Cond = getSETCC(CC, Cond, DL ,DAG);
49224 // Zero extend the condition if needed.
49225 Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, FalseC->getValueType(0),
49226 Cond);
49227 // Scale the condition by the difference.
49228 if (Diff != 1)
49229 Cond = DAG.getNode(ISD::MUL, DL, Cond.getValueType(), Cond,
49230 DAG.getConstant(Diff, DL, Cond.getValueType()));
49231
49232 // Add the base if non-zero.
49233 if (FalseC->getAPIntValue() != 0)
49234 Cond = DAG.getNode(ISD::ADD, DL, Cond.getValueType(), Cond,
49235 SDValue(FalseC, 0));
49236 return Cond;
49237 }
49238 }
49239 }
49240 }
49241
49242 // Handle these cases:
49243 // (select (x != c), e, c) -> (select (x != c), e, x),
49244 // (select (x == c), c, e) -> (select (x == c), x, e)
49245 // where c is an integer constant, and the "select" is the combination
49246 // of CMOV and CMP.
49247 //
49248 // The rationale for this change is that a conditional move from a constant
49249 // needs two instructions, whereas a conditional move from a register needs
49250 // only one instruction.
49251 //
49252 // CAVEAT: By replacing a constant with a symbolic value, it may obscure
49253 // some instruction-combining opportunities. This opt needs to be
49254 // postponed as late as possible.
49255 //
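// For example, with x compared against 7:
//   (select (x != 7), e, 7) -> (select (x != 7), e, x)
// On the path where the CMOV picks the "7" operand we know x == 7, so the
// register holding x can be reused and the immediate never needs its own mov.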
49256 if (!DCI.isBeforeLegalize() && !DCI.isBeforeLegalizeOps()) {
49257 // the DCI.xxxx conditions are provided to postpone the optimization as
49258 // late as possible.
49259
49260 ConstantSDNode *CmpAgainst = nullptr;
49261 if ((Cond.getOpcode() == X86ISD::CMP || Cond.getOpcode() == X86ISD::SUB) &&
49262 (CmpAgainst = dyn_cast<ConstantSDNode>(Cond.getOperand(1))) &&
49263 !isa<ConstantSDNode>(Cond.getOperand(0))) {
49264
49265 if (CC == X86::COND_NE &&
49266 CmpAgainst == dyn_cast<ConstantSDNode>(FalseOp)) {
49268 std::swap(TrueOp, FalseOp);
49269 }
49270
49271 if (CC == X86::COND_E && CmpAgainst == dyn_cast<ConstantSDNode>(TrueOp)) {
49272 SDValue Ops[] = {FalseOp, Cond.getOperand(0),
49273 DAG.getTargetConstant(CC, DL, MVT::i8), Cond};
49274 return DAG.getNode(X86ISD::CMOV, DL, VT, Ops);
49275 }
49276 }
49277 }
49278
49279 // Transform:
49280 //
49281 // (cmov 1 T (uge T 2))
49282 //
49283 // to:
49284 //
49285 // (adc T 0 (sub T 1))
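// Sanity check of the transform: SUB T, 1 sets CF only when T == 0, and
// ADC T, 0 then yields T + CF. So T == 0 gives 0 + 1 = 1, T == 1 gives
// 1 + 0 = 1, and T >= 2 gives T, matching the original cmov in every case.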
49286 if (CC == X86::COND_AE && isOneConstant(FalseOp) &&
49287 Cond.getOpcode() == X86ISD::SUB && Cond->hasOneUse()) {
49288 SDValue Cond0 = Cond.getOperand(0);
49289 if (Cond0.getOpcode() == ISD::TRUNCATE)
49290 Cond0 = Cond0.getOperand(0);
49291 auto *Sub1C = dyn_cast<ConstantSDNode>(Cond.getOperand(1));
49292 if (Cond0 == TrueOp && Sub1C && Sub1C->getZExtValue() == 2) {
49293 EVT CondVT = Cond->getValueType(0);
49294 // Subtract 1 and generate a carry.
49295 SDValue NewSub =
49296 DAG.getNode(X86ISD::SUB, DL, Cond->getVTList(), Cond.getOperand(0),
49297 DAG.getConstant(1, DL, CondVT));
49298 SDValue EFLAGS(NewSub.getNode(), 1);
49299 return DAG.getNode(X86ISD::ADC, DL, DAG.getVTList(VT, MVT::i32), TrueOp,
49300 DAG.getConstant(0, DL, VT), EFLAGS);
49301 }
49302 }
49303
49304 // Fold and/or of setcc's to double CMOV:
49305 // (CMOV F, T, ((cc1 | cc2) != 0)) -> (CMOV (CMOV F, T, cc1), T, cc2)
49306 // (CMOV F, T, ((cc1 & cc2) != 0)) -> (CMOV (CMOV T, F, !cc1), F, !cc2)
49307 //
49308 // This combine lets us generate:
49309 // cmovcc1 (jcc1 if we don't have CMOV)
49310 // cmovcc2 (same)
49311 // instead of:
49312 // setcc1
49313 // setcc2
49314 // and/or
49315 // cmovne (jne if we don't have CMOV)
49316 // When we can't use the CMOV instruction, it might increase branch
49317 // mispredicts.
49318 // When we can use CMOV, or when there is no mispredict, this improves
49319 // throughput and reduces register pressure.
49320 //
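// For the OR form: if cc1 holds the inner CMOV already picks T and the outer
// CMOV passes it through; if only cc2 holds the outer CMOV picks T; if neither
// holds both fall through to F. The AND form is the same after De Morgan,
// which is why the operands and condition codes are inverted below.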
49321 if (CC == X86::COND_NE) {
49322 SDValue Flags;
49323 X86::CondCode CC0, CC1;
49324 bool isAndSetCC;
49325 if (checkBoolTestAndOrSetCCCombine(Cond, CC0, CC1, Flags, isAndSetCC)) {
49326 if (isAndSetCC) {
49327 std::swap(FalseOp, TrueOp);
49330 }
49331
49332 SDValue LOps[] = {FalseOp, TrueOp,
49333 DAG.getTargetConstant(CC0, DL, MVT::i8), Flags};
49334 SDValue LCMOV = DAG.getNode(X86ISD::CMOV, DL, VT, LOps);
49335 SDValue Ops[] = {LCMOV, TrueOp, DAG.getTargetConstant(CC1, DL, MVT::i8),
49336 Flags};
49337 SDValue CMOV = DAG.getNode(X86ISD::CMOV, DL, VT, Ops);
49338 return CMOV;
49339 }
49340 }
49341
49342 // Fold (CMOV C1, (ADD (CTTZ X), C2), (X != 0)) ->
49343 // (ADD (CMOV C1-C2, (CTTZ X), (X != 0)), C2)
49344 // Or (CMOV (ADD (CTTZ X), C2), C1, (X == 0)) ->
49345 // (ADD (CMOV (CTTZ X), C1-C2, (X == 0)), C2)
49346 // Or (CMOV (BSR ?, X), Y, (X == 0)) -> (BSR Y, X)
49347 // TODO: Or (CMOV (BSF ?, X), Y, (X == 0)) -> (BSF Y, X)
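// The rewrite preserves the value: the original selects cttz(X)+C2 when X != 0
// and C1 otherwise, while the new form selects cttz(X) or C1-C2 and then adds
// C2 back, yielding cttz(X)+C2 or C1 respectively.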
49348 if ((CC == X86::COND_NE || CC == X86::COND_E) &&
49349 Cond.getOpcode() == X86ISD::CMP && isNullConstant(Cond.getOperand(1))) {
49350 SDValue Add = TrueOp;
49351 SDValue Const = FalseOp;
49352 // Canonicalize the condition code for easier matching and output.
49353 if (CC == X86::COND_E)
49354 std::swap(Add, Const);
49355
49356 // TODO: Add BSF support, but this requires changes to the "REP BSF" CTTZ hack.
49357 if (Subtarget.hasBitScanPassThrough() && Add.getOpcode() == X86ISD::BSR &&
49358 Add.getResNo() == 0 && Add.hasOneUse() &&
49359 Add.getOperand(1) == Cond.getOperand(0)) {
49360 return DAG.getNode(Add.getOpcode(), DL, Add->getVTList(), Const,
49361 Add.getOperand(1));
49362 }
49363
49364 // We might have replaced the constant in the cmov with the LHS of the
49365 // compare. If so change it to the RHS of the compare.
49366 if (Const == Cond.getOperand(0))
49367 Const = Cond.getOperand(1);
49368
49369 // Ok, now make sure that Add is (add (cttz X), C2) and Const is a constant.
49370 if (isa<ConstantSDNode>(Const) && Add.getOpcode() == ISD::ADD &&
49371 Add.hasOneUse() && isa<ConstantSDNode>(Add.getOperand(1)) &&
49372 (Add.getOperand(0).getOpcode() == ISD::CTTZ_ZERO_UNDEF ||
49373 Add.getOperand(0).getOpcode() == ISD::CTTZ) &&
49374 Add.getOperand(0).getOperand(0) == Cond.getOperand(0)) {
49375 // This should constant fold.
49376 SDValue Diff = DAG.getNode(ISD::SUB, DL, VT, Const, Add.getOperand(1));
49377 SDValue CMov =
49378 DAG.getNode(X86ISD::CMOV, DL, VT, Diff, Add.getOperand(0),
49379 DAG.getTargetConstant(X86::COND_NE, DL, MVT::i8), Cond);
49380 return DAG.getNode(ISD::ADD, DL, VT, CMov, Add.getOperand(1));
49381 }
49382 }
49383
49384 return SDValue();
49385}
49386
49387/// Different mul shrinking modes.
49389
49391 EVT VT = N->getOperand(0).getValueType();
49392 if (VT.getScalarSizeInBits() != 32)
49393 return false;
49394
49395 assert(N->getNumOperands() == 2 && "NumOperands of Mul are 2");
49396 unsigned SignBits[2] = {1, 1};
49397 bool IsPositive[2] = {false, false};
49398 for (unsigned i = 0; i < 2; i++) {
49399 SDValue Opd = N->getOperand(i);
49400
49401 SignBits[i] = DAG.ComputeNumSignBits(Opd);
49402 IsPositive[i] = DAG.SignBitIsZero(Opd);
49403 }
49404
49405 bool AllPositive = IsPositive[0] && IsPositive[1];
49406 unsigned MinSignBits = std::min(SignBits[0], SignBits[1]);
49407 // When ranges are from -128 ~ 127, use MULS8 mode.
49408 if (MinSignBits >= 25)
49410 // When ranges are from 0 ~ 255, use MULU8 mode.
49411 else if (AllPositive && MinSignBits >= 24)
49413 // When ranges are from -32768 ~ 32767, use MULS16 mode.
49414 else if (MinSignBits >= 17)
49416 // When ranges are from 0 ~ 65535, use MULU16 mode.
49417 else if (AllPositive && MinSignBits >= 16)
49419 else
49420 return false;
49421 return true;
49422}
49423
49424/// When the operands of vector mul are extended from smaller size values,
49425 /// like i8 and i16, the type of the mul may be shrunk to generate more
49426/// efficient code. Two typical patterns are handled:
49427/// Pattern1:
49428/// %2 = sext/zext <N x i8> %1 to <N x i32>
49429/// %4 = sext/zext <N x i8> %3 to <N x i32>
49430 /// or %4 = build_vector <N x i32> %C1, ..., %CN (%C1..%CN are constants)
49431/// %5 = mul <N x i32> %2, %4
49432///
49433/// Pattern2:
49434/// %2 = zext/sext <N x i16> %1 to <N x i32>
49435/// %4 = zext/sext <N x i16> %3 to <N x i32>
49436/// or %4 = build_vector <N x i32> %C1, ..., %CN (%C1..%CN are constants)
49437/// %5 = mul <N x i32> %2, %4
49438///
49439/// There are four mul shrinking modes:
49440/// If %2 == sext32(trunc8(%2)), i.e., the scalar value range of %2 is
49441 /// -128 to 127, and the scalar value range of %4 is also -128 to 127,
49442/// generate pmullw+sext32 for it (MULS8 mode).
49443/// If %2 == zext32(trunc8(%2)), i.e., the scalar value range of %2 is
49444/// 0 to 255, and the scalar value range of %4 is also 0 to 255,
49445/// generate pmullw+zext32 for it (MULU8 mode).
49446/// If %2 == sext32(trunc16(%2)), i.e., the scalar value range of %2 is
49447/// -32768 to 32767, and the scalar value range of %4 is also -32768 to 32767,
49448/// generate pmullw+pmulhw for it (MULS16 mode).
49449/// If %2 == zext32(trunc16(%2)), i.e., the scalar value range of %2 is
49450/// 0 to 65535, and the scalar value range of %4 is also 0 to 65535,
49451/// generate pmullw+pmulhuw for it (MULU16 mode).
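/// For example, for %5 = mul <8 x i32> %2, %4 where both operands are
/// zero-extended from <8 x i16>, MULU16 mode truncates the operands back to
/// <8 x i16>, computes pmullw and pmulhuw, and interleaves the low and high
/// halves to rebuild the full 32-bit products.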
49453 const X86Subtarget &Subtarget) {
49454 // Check for legality
49455 // pmullw/pmulhw require SSE2.
49456 if (!Subtarget.hasSSE2())
49457 return SDValue();
49458
49459 // Check for profitability
49460 // pmulld is available since SSE4.1. It is better to use pmulld
49461 // instead of pmullw+pmulhw, except for subtargets where pmulld is slower than
49462 // the expansion.
49463 bool OptForMinSize = DAG.getMachineFunction().getFunction().hasMinSize();
49464 if (Subtarget.hasSSE41() && (OptForMinSize || !Subtarget.isPMULLDSlow()))
49465 return SDValue();
49466
49468 if (!canReduceVMulWidth(N, DAG, Mode))
49469 return SDValue();
49470
49471 SDValue N0 = N->getOperand(0);
49472 SDValue N1 = N->getOperand(1);
49473 EVT VT = N->getOperand(0).getValueType();
49474 unsigned NumElts = VT.getVectorNumElements();
49475 if ((NumElts % 2) != 0)
49476 return SDValue();
49477
49478 EVT ReducedVT = EVT::getVectorVT(*DAG.getContext(), MVT::i16, NumElts);
49479
49480 // Shrink the operands of mul.
49481 SDValue NewN0 = DAG.getNode(ISD::TRUNCATE, DL, ReducedVT, N0);
49482 SDValue NewN1 = DAG.getNode(ISD::TRUNCATE, DL, ReducedVT, N1);
49483
49484 // Generate the lower part of mul: pmullw. For MULU8/MULS8, only the
49485 // lower part is needed.
49486 SDValue MulLo = DAG.getNode(ISD::MUL, DL, ReducedVT, NewN0, NewN1);
49490 DL, VT, MulLo);
49491
49492 EVT ResVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32, NumElts / 2);
49493 // Generate the higher part of mul: pmulhw/pmulhuw. For MULU16/MULS16,
49494 // the higher part is also needed.
49495 SDValue MulHi =
49497 ReducedVT, NewN0, NewN1);
49498
49499 // Repack the lower part and higher part result of mul into a wider
49500 // result.
49501 // Generate shuffle functioning as punpcklwd.
49502 SmallVector<int, 16> ShuffleMask(NumElts);
49503 for (unsigned i = 0, e = NumElts / 2; i < e; i++) {
49504 ShuffleMask[2 * i] = i;
49505 ShuffleMask[2 * i + 1] = i + NumElts;
49506 }
49507 SDValue ResLo =
49508 DAG.getVectorShuffle(ReducedVT, DL, MulLo, MulHi, ShuffleMask);
49509 ResLo = DAG.getBitcast(ResVT, ResLo);
49510 // Generate shuffle functioning as punpckhwd.
49511 for (unsigned i = 0, e = NumElts / 2; i < e; i++) {
49512 ShuffleMask[2 * i] = i + NumElts / 2;
49513 ShuffleMask[2 * i + 1] = i + NumElts * 3 / 2;
49514 }
49515 SDValue ResHi =
49516 DAG.getVectorShuffle(ReducedVT, DL, MulLo, MulHi, ShuffleMask);
49517 ResHi = DAG.getBitcast(ResVT, ResHi);
49518 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, ResLo, ResHi);
49519}
49520
49522 EVT VT, const SDLoc &DL) {
49523
49524 auto combineMulShlAddOrSub = [&](int Mult, int Shift, bool isAdd) {
49525 SDValue Result = DAG.getNode(X86ISD::MUL_IMM, DL, VT, N->getOperand(0),
49526 DAG.getConstant(Mult, DL, VT));
49527 Result = DAG.getNode(ISD::SHL, DL, VT, Result,
49528 DAG.getConstant(Shift, DL, MVT::i8));
49529 Result = DAG.getNode(isAdd ? ISD::ADD : ISD::SUB, DL, VT, Result,
49530 N->getOperand(0));
49531 return Result;
49532 };
49533
49534 auto combineMulMulAddOrSub = [&](int Mul1, int Mul2, bool isAdd) {
49535 SDValue Result = DAG.getNode(X86ISD::MUL_IMM, DL, VT, N->getOperand(0),
49536 DAG.getConstant(Mul1, DL, VT));
49537 Result = DAG.getNode(X86ISD::MUL_IMM, DL, VT, Result,
49538 DAG.getConstant(Mul2, DL, VT));
49539 Result = DAG.getNode(isAdd ? ISD::ADD : ISD::SUB, DL, VT, Result,
49540 N->getOperand(0));
49541 return Result;
49542 };
49543
49544 switch (MulAmt) {
49545 default:
49546 break;
49547 case 11:
49548 // mul x, 11 => add ((shl (mul x, 5), 1), x)
49549 return combineMulShlAddOrSub(5, 1, /*isAdd*/ true);
49550 case 21:
49551 // mul x, 21 => add ((shl (mul x, 5), 2), x)
49552 return combineMulShlAddOrSub(5, 2, /*isAdd*/ true);
49553 case 41:
49554 // mul x, 41 => add ((shl (mul x, 5), 3), x)
49555 return combineMulShlAddOrSub(5, 3, /*isAdd*/ true);
49556 case 22:
49557 // mul x, 22 => add (add ((shl (mul x, 5), 2), x), x)
49558 return DAG.getNode(ISD::ADD, DL, VT, N->getOperand(0),
49559 combineMulShlAddOrSub(5, 2, /*isAdd*/ true));
49560 case 19:
49561 // mul x, 19 => add ((shl (mul x, 9), 1), x)
49562 return combineMulShlAddOrSub(9, 1, /*isAdd*/ true);
49563 case 37:
49564 // mul x, 37 => add ((shl (mul x, 9), 2), x)
49565 return combineMulShlAddOrSub(9, 2, /*isAdd*/ true);
49566 case 73:
49567 // mul x, 73 => add ((shl (mul x, 9), 3), x)
49568 return combineMulShlAddOrSub(9, 3, /*isAdd*/ true);
49569 case 13:
49570 // mul x, 13 => add ((shl (mul x, 3), 2), x)
49571 return combineMulShlAddOrSub(3, 2, /*isAdd*/ true);
49572 case 23:
49573 // mul x, 23 => sub ((shl (mul x, 3), 3), x)
49574 return combineMulShlAddOrSub(3, 3, /*isAdd*/ false);
49575 case 26:
49576 // mul x, 26 => add ((mul (mul x, 5), 5), x)
49577 return combineMulMulAddOrSub(5, 5, /*isAdd*/ true);
49578 case 28:
49579 // mul x, 28 => add ((mul (mul x, 9), 3), x)
49580 return combineMulMulAddOrSub(9, 3, /*isAdd*/ true);
49581 case 29:
49582 // mul x, 29 => add (add ((mul (mul x, 9), 3), x), x)
49583 return DAG.getNode(ISD::ADD, DL, VT, N->getOperand(0),
49584 combineMulMulAddOrSub(9, 3, /*isAdd*/ true));
49585 }
49586
49587 // Another trick. If this is a power of 2 plus 2/4/8, we can use a shift
49588 // followed by a single LEA.
49589 // First check if this is a sum of two powers of 2 because that's easy. Then
49590 // count how many trailing zeros there are up to the first set bit.
49591 // TODO: We can do this even without LEA at a cost of two shifts and an add.
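// For example, MulAmt == 20 (16 + 4): ScaleShift == 2 and ShiftAmt == 4, so we
// emit (x << 4) + (x << 2); the small shift is expected to fold into the LEA
// scale.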
49592 if (isPowerOf2_64(MulAmt & (MulAmt - 1))) {
49593 unsigned ScaleShift = llvm::countr_zero(MulAmt);
49594 if (ScaleShift >= 1 && ScaleShift < 4) {
49595 unsigned ShiftAmt = Log2_64((MulAmt & (MulAmt - 1)));
49596 SDValue Shift1 = DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0),
49597 DAG.getConstant(ShiftAmt, DL, MVT::i8));
49598 SDValue Shift2 = DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0),
49599 DAG.getConstant(ScaleShift, DL, MVT::i8));
49600 return DAG.getNode(ISD::ADD, DL, VT, Shift1, Shift2);
49601 }
49602 }
49603
49604 return SDValue();
49605}
49606
49607 // If the upper 17 bits of either element are zero and the other element's
49608 // upper bits are all zero/sign bits, then we can use PMADDWD, which is always
49609 // at least as quick as PMULLD, except on KNL.
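// Viewed as vXi16, PMADDWD computes lo16(a)*lo16(b) + hi16(a)*hi16(b) per
// i32 lane; the checks below make the hi16*hi16 term zero and keep both
// operands within 16 significant bits, so the result equals the i32 multiply.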
49611 SelectionDAG &DAG,
49612 const X86Subtarget &Subtarget) {
49613 if (!Subtarget.hasSSE2())
49614 return SDValue();
49615
49616 if (Subtarget.isPMADDWDSlow())
49617 return SDValue();
49618
49619 EVT VT = N->getValueType(0);
49620
49621 // Only support vXi32 vectors.
49622 if (!VT.isVector() || VT.getVectorElementType() != MVT::i32)
49623 return SDValue();
49624
49625 // Make sure the type is legal or can split/widen to a legal type.
49626 // With AVX512 but without BWI, we would need to split v32i16.
49627 unsigned NumElts = VT.getVectorNumElements();
49628 if (NumElts == 1 || !isPowerOf2_32(NumElts))
49629 return SDValue();
49630
49631 // With AVX512 but without BWI, we would need to split v32i16.
49632 if (32 <= (2 * NumElts) && Subtarget.hasAVX512() && !Subtarget.hasBWI())
49633 return SDValue();
49634
49635 SDValue N0 = N->getOperand(0);
49636 SDValue N1 = N->getOperand(1);
49637
49638 // If we are zero/sign extending two steps without SSE4.1, it's better to
49639 // reduce the vmul width instead.
49640 if (!Subtarget.hasSSE41() &&
49641 (((N0.getOpcode() == ISD::ZERO_EXTEND &&
49642 N0.getOperand(0).getScalarValueSizeInBits() <= 8) &&
49643 (N1.getOpcode() == ISD::ZERO_EXTEND &&
49644 N1.getOperand(0).getScalarValueSizeInBits() <= 8)) ||
49645 ((N0.getOpcode() == ISD::SIGN_EXTEND &&
49646 N0.getOperand(0).getScalarValueSizeInBits() <= 8) &&
49647 (N1.getOpcode() == ISD::SIGN_EXTEND &&
49648 N1.getOperand(0).getScalarValueSizeInBits() <= 8))))
49649 return SDValue();
49650
49651 // If we are sign extending a wide vector without SSE4.1, it's better to reduce
49652 // the vmul width instead.
49653 if (!Subtarget.hasSSE41() &&
49654 (N0.getOpcode() == ISD::SIGN_EXTEND &&
49655 N0.getOperand(0).getValueSizeInBits() > 128) &&
49656 (N1.getOpcode() == ISD::SIGN_EXTEND &&
49657 N1.getOperand(0).getValueSizeInBits() > 128))
49658 return SDValue();
49659
49660 // Sign bits must extend down to the lowest i16.
49661 if (DAG.ComputeMaxSignificantBits(N1) > 16 ||
49662 DAG.ComputeMaxSignificantBits(N0) > 16)
49663 return SDValue();
49664
49665 // At least one of the elements must be zero in the upper 17 bits, or can be
49666 // safely made zero without altering the final result.
49667 auto GetZeroableOp = [&](SDValue Op) {
49668 APInt Mask17 = APInt::getHighBitsSet(32, 17);
49669 if (DAG.MaskedValueIsZero(Op, Mask17))
49670 return Op;
49671 // Mask off the upper 16 bits of sign-extended constants.
49673 return DAG.getNode(ISD::AND, DL, VT, Op, DAG.getConstant(0xFFFF, DL, VT));
49674 if (Op.getOpcode() == ISD::SIGN_EXTEND && N->isOnlyUserOf(Op.getNode())) {
49675 SDValue Src = Op.getOperand(0);
49676 // Convert sext(vXi16) to zext(vXi16).
49677 if (Src.getScalarValueSizeInBits() == 16 && VT.getSizeInBits() <= 128)
49678 return DAG.getNode(ISD::ZERO_EXTEND, DL, VT, Src);
49679 // Convert sext(vXi8) to zext(vXi16 sext(vXi8)) on pre-SSE41 targets
49680 // which will expand the extension.
49681 if (Src.getScalarValueSizeInBits() < 16 && !Subtarget.hasSSE41()) {
49682 EVT ExtVT = VT.changeVectorElementType(MVT::i16);
49683 Src = DAG.getNode(ISD::SIGN_EXTEND, DL, ExtVT, Src);
49684 return DAG.getNode(ISD::ZERO_EXTEND, DL, VT, Src);
49685 }
49686 }
49687 // Convert SIGN_EXTEND_VECTOR_INREG to ZERO_EXTEND_VECTOR_INREG.
49688 if (Op.getOpcode() == ISD::SIGN_EXTEND_VECTOR_INREG &&
49689 N->isOnlyUserOf(Op.getNode())) {
49690 SDValue Src = Op.getOperand(0);
49691 if (Src.getScalarValueSizeInBits() == 16)
49692 return DAG.getNode(ISD::ZERO_EXTEND_VECTOR_INREG, DL, VT, Src);
49693 }
49694 // Convert VSRAI(Op, 16) to VSRLI(Op, 16).
49695 if (Op.getOpcode() == X86ISD::VSRAI && Op.getConstantOperandVal(1) == 16 &&
49696 N->isOnlyUserOf(Op.getNode())) {
49697 return DAG.getNode(X86ISD::VSRLI, DL, VT, Op.getOperand(0),
49698 Op.getOperand(1));
49699 }
49700 return SDValue();
49701 };
49702 SDValue ZeroN0 = GetZeroableOp(N0);
49703 SDValue ZeroN1 = GetZeroableOp(N1);
49704 if (!ZeroN0 && !ZeroN1)
49705 return SDValue();
49706 N0 = ZeroN0 ? ZeroN0 : N0;
49707 N1 = ZeroN1 ? ZeroN1 : N1;
49708
49709 // Use SplitOpsAndApply to handle AVX splitting.
49710 auto PMADDWDBuilder = [](SelectionDAG &DAG, const SDLoc &DL,
49712 MVT ResVT = MVT::getVectorVT(MVT::i32, Ops[0].getValueSizeInBits() / 32);
49713 MVT OpVT = MVT::getVectorVT(MVT::i16, Ops[0].getValueSizeInBits() / 16);
49714 return DAG.getNode(X86ISD::VPMADDWD, DL, ResVT,
49715 DAG.getBitcast(OpVT, Ops[0]),
49716 DAG.getBitcast(OpVT, Ops[1]));
49717 };
49718 return SplitOpsAndApply(DAG, Subtarget, DL, VT, {N0, N1}, PMADDWDBuilder);
49719}
49720
49722 const X86Subtarget &Subtarget) {
49723 if (!Subtarget.hasSSE2())
49724 return SDValue();
49725
49726 EVT VT = N->getValueType(0);
49727
49728 // Only support vXi64 vectors.
49729 if (!VT.isVector() || VT.getVectorElementType() != MVT::i64 ||
49730 VT.getVectorNumElements() < 2 ||
49732 return SDValue();
49733
49734 SDValue N0 = N->getOperand(0);
49735 SDValue N1 = N->getOperand(1);
49736
49737 // PMULDQ returns the 64-bit result of the signed multiplication of the lower
49738 // 32 bits. We can lower with this if the sign bits stretch that far.
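// For example, if both v2i64 operands are sign extensions from i32, the full
// 64-bit product is determined by the low 32 bits of each operand, which is
// exactly what PMULDQ computes.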
49739 if (Subtarget.hasSSE41() && DAG.ComputeNumSignBits(N0) > 32 &&
49740 DAG.ComputeNumSignBits(N1) > 32) {
49741 auto PMULDQBuilder = [](SelectionDAG &DAG, const SDLoc &DL,
49743 return DAG.getNode(X86ISD::PMULDQ, DL, Ops[0].getValueType(), Ops);
49744 };
49745 return SplitOpsAndApply(DAG, Subtarget, DL, VT, {N0, N1}, PMULDQBuilder,
49746 /*CheckBWI*/ false);
49747 }
49748
49749 // If the upper bits are zero we can use a single pmuludq.
49750 APInt Mask = APInt::getHighBitsSet(64, 32);
49751 if (DAG.MaskedValueIsZero(N0, Mask) && DAG.MaskedValueIsZero(N1, Mask)) {
49752 auto PMULUDQBuilder = [](SelectionDAG &DAG, const SDLoc &DL,
49754 return DAG.getNode(X86ISD::PMULUDQ, DL, Ops[0].getValueType(), Ops);
49755 };
49756 return SplitOpsAndApply(DAG, Subtarget, DL, VT, {N0, N1}, PMULUDQBuilder,
49757 /*CheckBWI*/ false);
49758 }
49759
49760 return SDValue();
49761}
49762
49765 const X86Subtarget &Subtarget) {
49766 EVT VT = N->getValueType(0);
49767 SDLoc DL(N);
49768
49769 if (SDValue V = combineMulToPMADDWD(N, DL, DAG, Subtarget))
49770 return V;
49771
49772 if (SDValue V = combineMulToPMULDQ(N, DL, DAG, Subtarget))
49773 return V;
49774
49775 if (DCI.isBeforeLegalize() && VT.isVector())
49776 return reduceVMULWidth(N, DL, DAG, Subtarget);
49777
49778 if (VT != MVT::i64 && VT != MVT::i32 &&
49779 (!VT.isVector() || !VT.isSimple() || !VT.isInteger()))
49780 return SDValue();
49781
49782 KnownBits Known1 = DAG.computeKnownBits(N->getOperand(1));
49783 if (!Known1.isConstant())
49784 return SDValue();
49785
49786 const APInt &C = Known1.getConstant();
49787 if (C.isZero())
49788 return DAG.getConstant(0, DL, VT);
49789
49790 if (C.isAllOnes())
49791 return DAG.getNegative(N->getOperand(0), DL, VT);
49792
49793 if (isPowerOf2_64(C.getZExtValue()))
49794 return SDValue();
49795
49796 // Optimize a single multiply by a constant into two operations in order to
49797 // implement it with two cheaper instructions, e.g. LEA + SHL, LEA + LEA.
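// For example, mul x, 45 decomposes into (x * 9) * 5 (two LEAs), and
// mul x, 40 into a multiply by 5 combined with a shift by 3 (LEA + SHL).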
49799 return SDValue();
49800
49801 // An imul is usually smaller than the alternative sequence.
49803 return SDValue();
49804
49805 if (DCI.isBeforeLegalize() || DCI.isCalledByLegalizer())
49806 return SDValue();
49807
49808 int64_t SignMulAmt = C.getSExtValue();
49809 assert(SignMulAmt != INT64_MIN && "Int min should have been handled!");
49810 uint64_t AbsMulAmt = SignMulAmt < 0 ? -SignMulAmt : SignMulAmt;
49811
49812 SDValue NewMul = SDValue();
49813 if (VT == MVT::i64 || VT == MVT::i32) {
49814 if (AbsMulAmt == 3 || AbsMulAmt == 5 || AbsMulAmt == 9) {
49815 NewMul = DAG.getNode(X86ISD::MUL_IMM, DL, VT, N->getOperand(0),
49816 DAG.getConstant(AbsMulAmt, DL, VT));
49817 if (SignMulAmt < 0)
49818 NewMul = DAG.getNegative(NewMul, DL, VT);
49819
49820 return NewMul;
49821 }
49822
49823 uint64_t MulAmt1 = 0;
49824 uint64_t MulAmt2 = 0;
49825 if ((AbsMulAmt % 9) == 0) {
49826 MulAmt1 = 9;
49827 MulAmt2 = AbsMulAmt / 9;
49828 } else if ((AbsMulAmt % 5) == 0) {
49829 MulAmt1 = 5;
49830 MulAmt2 = AbsMulAmt / 5;
49831 } else if ((AbsMulAmt % 3) == 0) {
49832 MulAmt1 = 3;
49833 MulAmt2 = AbsMulAmt / 3;
49834 }
49835
49836 // For negative multiply amounts, only allow MulAmt2 to be a power of 2.
49837 if (MulAmt2 &&
49838 (isPowerOf2_64(MulAmt2) ||
49839 (SignMulAmt >= 0 && (MulAmt2 == 3 || MulAmt2 == 5 || MulAmt2 == 9)))) {
49840
49841 if (isPowerOf2_64(MulAmt2) && !(SignMulAmt >= 0 && N->hasOneUse() &&
49842 N->user_begin()->getOpcode() == ISD::ADD))
49843 // If the second multiplier is a power of 2, issue it first. We want the
49844 // multiply by 3, 5, or 9 to be folded into the addressing mode unless the lone
49845 // use is an add. Only do this for positive multiply amounts since the
49846 // negate would prevent it from being used as an address mode anyway.
49847 std::swap(MulAmt1, MulAmt2);
49848
49849 if (isPowerOf2_64(MulAmt1))
49850 NewMul = DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0),
49851 DAG.getConstant(Log2_64(MulAmt1), DL, MVT::i8));
49852 else
49853 NewMul = DAG.getNode(X86ISD::MUL_IMM, DL, VT, N->getOperand(0),
49854 DAG.getConstant(MulAmt1, DL, VT));
49855
49856 if (isPowerOf2_64(MulAmt2))
49857 NewMul = DAG.getNode(ISD::SHL, DL, VT, NewMul,
49858 DAG.getConstant(Log2_64(MulAmt2), DL, MVT::i8));
49859 else
49860 NewMul = DAG.getNode(X86ISD::MUL_IMM, DL, VT, NewMul,
49861 DAG.getConstant(MulAmt2, DL, VT));
49862
49863 // Negate the result.
49864 if (SignMulAmt < 0)
49865 NewMul = DAG.getNegative(NewMul, DL, VT);
49866 } else if (!Subtarget.slowLEA())
49867 NewMul = combineMulSpecial(C.getZExtValue(), N, DAG, VT, DL);
49868 }
49869 if (!NewMul) {
49870 EVT ShiftVT = VT.isVector() ? VT : MVT::i8;
49871 if (isPowerOf2_64(AbsMulAmt - 1)) {
49872 // (mul x, 2^N + 1) => (add (shl x, N), x)
49873 NewMul = DAG.getNode(
49874 ISD::ADD, DL, VT, N->getOperand(0),
49875 DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0),
49876 DAG.getConstant(Log2_64(AbsMulAmt - 1), DL, ShiftVT)));
49877 if (SignMulAmt < 0)
49878 NewMul = DAG.getNegative(NewMul, DL, VT);
49879 } else if (isPowerOf2_64(AbsMulAmt + 1)) {
49880 // (mul x, 2^N - 1) => (sub (shl x, N), x)
49881 NewMul =
49882 DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0),
49883 DAG.getConstant(Log2_64(AbsMulAmt + 1), DL, ShiftVT));
49884 // To negate, reverse the operands of the subtract.
49885 if (SignMulAmt < 0)
49886 NewMul = DAG.getNode(ISD::SUB, DL, VT, N->getOperand(0), NewMul);
49887 else
49888 NewMul = DAG.getNode(ISD::SUB, DL, VT, NewMul, N->getOperand(0));
49889 } else if (SignMulAmt >= 0 && isPowerOf2_64(AbsMulAmt - 2) &&
49890 (!VT.isVector() || Subtarget.fastImmVectorShift())) {
49891 // (mul x, 2^N + 2) => (add (shl x, N), (add x, x))
49892 NewMul =
49893 DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0),
49894 DAG.getConstant(Log2_64(AbsMulAmt - 2), DL, ShiftVT));
49895 NewMul = DAG.getNode(
49896 ISD::ADD, DL, VT, NewMul,
49897 DAG.getNode(ISD::ADD, DL, VT, N->getOperand(0), N->getOperand(0)));
49898 } else if (SignMulAmt >= 0 && isPowerOf2_64(AbsMulAmt + 2) &&
49899 (!VT.isVector() || Subtarget.fastImmVectorShift())) {
49900 // (mul x, 2^N - 2) => (sub (shl x, N), (add x, x))
49901 NewMul =
49902 DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0),
49903 DAG.getConstant(Log2_64(AbsMulAmt + 2), DL, ShiftVT));
49904 NewMul = DAG.getNode(
49905 ISD::SUB, DL, VT, NewMul,
49906 DAG.getNode(ISD::ADD, DL, VT, N->getOperand(0), N->getOperand(0)));
49907 } else if (SignMulAmt >= 0 && VT.isVector() &&
49908 Subtarget.fastImmVectorShift()) {
49909 uint64_t AbsMulAmtLowBit = AbsMulAmt & (-AbsMulAmt);
49910 uint64_t ShiftAmt1;
49911 std::optional<unsigned> Opc;
49912 if (isPowerOf2_64(AbsMulAmt - AbsMulAmtLowBit)) {
49913 ShiftAmt1 = AbsMulAmt - AbsMulAmtLowBit;
49914 Opc = ISD::ADD;
49915 } else if (isPowerOf2_64(AbsMulAmt + AbsMulAmtLowBit)) {
49916 ShiftAmt1 = AbsMulAmt + AbsMulAmtLowBit;
49917 Opc = ISD::SUB;
49918 }
49919
49920 if (Opc) {
49921 SDValue Shift1 =
49922 DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0),
49923 DAG.getConstant(Log2_64(ShiftAmt1), DL, ShiftVT));
49924 SDValue Shift2 =
49925 DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0),
49926 DAG.getConstant(Log2_64(AbsMulAmtLowBit), DL, ShiftVT));
49927 NewMul = DAG.getNode(*Opc, DL, VT, Shift1, Shift2);
49928 }
49929 }
49930 }
49931
49932 return NewMul;
49933}
49934
49935// Try to form a MULHU or MULHS node by looking for
49936// (srl (mul ext, ext), 16)
49937// TODO: This is X86 specific because we want to be able to handle wide types
49938// before type legalization. But we can only do it if the vector will be
49939// legalized via widening/splitting. Type legalization can't handle promotion
49940// of a MULHU/MULHS. There isn't a way to convey this to the generic DAG
49941// combiner.
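// For example, (srl (mul (zext v8i16 X), (zext v8i16 Y)), 16) becomes
// (zext (mulhu X, Y)); the sign-extended / sra variant becomes mulhs instead.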
49943 const SDLoc &DL,
49944 const X86Subtarget &Subtarget) {
49945 using namespace SDPatternMatch;
49946 assert((N->getOpcode() == ISD::SRL || N->getOpcode() == ISD::SRA) &&
49947 "SRL or SRA node is required here!");
49948
49949 if (!Subtarget.hasSSE2())
49950 return SDValue();
49951
49952 // Input type should be at least vXi32.
49953 EVT VT = N->getValueType(0);
49954 if (!VT.isVector() || VT.getVectorElementType().getSizeInBits() < 32)
49955 return SDValue();
49956
49957 // The operation must be a multiply shifted right by 16.
49958 SDValue LHS, RHS;
49959 if (!sd_match(N->getOperand(1), m_SpecificInt(16)) ||
49960 !sd_match(N->getOperand(0), m_OneUse(m_Mul(m_Value(LHS), m_Value(RHS)))))
49961 return SDValue();
49962
49963 unsigned ExtOpc = LHS.getOpcode();
49964 if ((ExtOpc != ISD::SIGN_EXTEND && ExtOpc != ISD::ZERO_EXTEND) ||
49965 RHS.getOpcode() != ExtOpc)
49966 return SDValue();
49967
49968 // Peek through the extends.
49969 LHS = LHS.getOperand(0);
49970 RHS = RHS.getOperand(0);
49971
49972 // Ensure the input types match.
49973 EVT MulVT = LHS.getValueType();
49974 if (MulVT.getVectorElementType() != MVT::i16 || RHS.getValueType() != MulVT)
49975 return SDValue();
49976
49977 unsigned Opc = ExtOpc == ISD::SIGN_EXTEND ? ISD::MULHS : ISD::MULHU;
49978 SDValue Mulh = DAG.getNode(Opc, DL, MulVT, LHS, RHS);
49979
49980 ExtOpc = N->getOpcode() == ISD::SRA ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
49981 return DAG.getNode(ExtOpc, DL, VT, Mulh);
49982}
49983
49985 const X86Subtarget &Subtarget) {
49986 using namespace llvm::SDPatternMatch;
49987 SDValue N0 = N->getOperand(0);
49988 SDValue N1 = N->getOperand(1);
49990 EVT VT = N0.getValueType();
49991 unsigned EltSizeInBits = VT.getScalarSizeInBits();
49992 SDLoc DL(N);
49993
49994 // Exploits AVX2 VSHLV/VSRLV instructions for efficient unsigned vector shifts
49995 // with out-of-bounds clamping.
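// This is safe because the AVX2 variable logical shifts already produce 0 for
// shift amounts >= the element width, so a vselect that forces those lanes to
// zero is redundant and can be dropped.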
49996 if (N0.getOpcode() == ISD::VSELECT &&
49997 supportedVectorVarShift(VT, Subtarget, ISD::SHL)) {
49998 SDValue Cond = N0.getOperand(0);
49999 SDValue N00 = N0.getOperand(1);
50000 SDValue N01 = N0.getOperand(2);
50001 // fold shl(select(icmp_ult(amt,BW),x,0),amt) -> avx2 psllv(x,amt)
50003 sd_match(Cond, m_SetCC(m_Specific(N1), m_SpecificInt(EltSizeInBits),
50005 return DAG.getNode(X86ISD::VSHLV, DL, VT, N00, N1);
50006 }
50007 // fold shl(select(icmp_uge(amt,BW),0,x),amt) -> avx2 psllv(x,amt)
50009 sd_match(Cond, m_SetCC(m_Specific(N1), m_SpecificInt(EltSizeInBits),
50011 return DAG.getNode(X86ISD::VSHLV, DL, VT, N01, N1);
50012 }
50013 }
50014
50015 // fold (shl (and (setcc_c), c1), c2) -> (and setcc_c, (c1 << c2))
50016 // since the result of setcc_c is all zero's or all ones.
50017 if (VT.isInteger() && !VT.isVector() &&
50018 N1C && N0.getOpcode() == ISD::AND &&
50019 N0.getOperand(1).getOpcode() == ISD::Constant) {
50020 SDValue N00 = N0.getOperand(0);
50021 APInt Mask = N0.getConstantOperandAPInt(1);
50022 Mask <<= N1C->getAPIntValue();
50023 bool MaskOK = false;
50024 // We can handle cases concerning bit-widening nodes containing setcc_c if
50025 // we carefully interrogate the mask to make sure the transform preserves
50026 // the semantics.
50027 // The transform is not safe if the result of C1 << C2 exceeds the bitwidth
50028 // of the underlying setcc_c operation if the setcc_c was zero extended.
50029 // Consider the following example:
50030 // zext(setcc_c) -> i32 0x0000FFFF
50031 // c1 -> i32 0x0000FFFF
50032 // c2 -> i32 0x00000001
50033 // (shl (and (setcc_c), c1), c2) -> i32 0x0001FFFE
50034 // (and setcc_c, (c1 << c2)) -> i32 0x0000FFFE
50035 if (N00.getOpcode() == X86ISD::SETCC_CARRY) {
50036 MaskOK = true;
50037 } else if (N00.getOpcode() == ISD::SIGN_EXTEND &&
50039 MaskOK = true;
50040 } else if ((N00.getOpcode() == ISD::ZERO_EXTEND ||
50041 N00.getOpcode() == ISD::ANY_EXTEND) &&
50043 MaskOK = Mask.isIntN(N00.getOperand(0).getValueSizeInBits());
50044 }
50045 if (MaskOK && Mask != 0)
50046 return DAG.getNode(ISD::AND, DL, VT, N00, DAG.getConstant(Mask, DL, VT));
50047 }
50048
50049 return SDValue();
50050}
50051
50053 const X86Subtarget &Subtarget) {
50054 using namespace llvm::SDPatternMatch;
50055 SDValue N0 = N->getOperand(0);
50056 SDValue N1 = N->getOperand(1);
50057 EVT VT = N0.getValueType();
50058 unsigned Size = VT.getSizeInBits();
50059 SDLoc DL(N);
50060
50061 if (SDValue V = combineShiftToPMULH(N, DAG, DL, Subtarget))
50062 return V;
50063
50064 // fold sra(x,umin(amt,bw-1)) -> avx2 psrav(x,amt)
50065 if (supportedVectorVarShift(VT, Subtarget, ISD::SRA)) {
50066 SDValue ShrAmtVal;
50067 if (sd_match(N1, m_UMin(m_Value(ShrAmtVal),
50069 return DAG.getNode(X86ISD::VSRAV, DL, VT, N0, ShrAmtVal);
50070 }
50071
50072 // fold (SRA (SHL X, ShlConst), SraConst)
50073 // into (SHL (sext_in_reg X), ShlConst - SraConst)
50074 // or (sext_in_reg X)
50075 // or (SRA (sext_in_reg X), SraConst - ShlConst)
50076 // depending on relation between SraConst and ShlConst.
50077 // We only do this if (Size - ShlConst) is equal to 8, 16 or 32. That allows
50078 // us to do the sext_in_reg from the corresponding bit.
50079
50080 // sexts on X86 are MOVs (movsx). The MOVs have the same code size as the
50081 // SHIFTs above (only a SHIFT by 1 has a smaller encoding).
50082 // However, the MOVs have two advantages over a SHIFT:
50083 // 1. MOVs can write to a register that differs from the source.
50084 // 2. MOVs accept memory operands.
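// For example, with Size == 32: (sra (shl X, 24), 26) becomes
// (sra (sext_in_reg X, i8), 2), since ShlConst == 24 selects the i8
// sext_in_reg and SraConst - ShlConst == 2 is the remaining shift.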
50085
50086 if (VT.isVector() || N1.getOpcode() != ISD::Constant ||
50087 N0.getOpcode() != ISD::SHL || !N0.hasOneUse() ||
50089 return SDValue();
50090
50091 SDValue N00 = N0.getOperand(0);
50092 SDValue N01 = N0.getOperand(1);
50093 APInt ShlConst = N01->getAsAPIntVal();
50094 APInt SraConst = N1->getAsAPIntVal();
50095 EVT CVT = N1.getValueType();
50096
50097 if (CVT != N01.getValueType())
50098 return SDValue();
50099 if (SraConst.isNegative())
50100 return SDValue();
50101
50102 for (MVT SVT : { MVT::i8, MVT::i16, MVT::i32 }) {
50103 unsigned ShiftSize = SVT.getSizeInBits();
50104 // Only deal with (Size - ShlConst) being equal to 8, 16 or 32.
50105 if (ShiftSize >= Size || ShlConst != Size - ShiftSize)
50106 continue;
50107 SDValue NN =
50108 DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, VT, N00, DAG.getValueType(SVT));
50109 if (SraConst.eq(ShlConst))
50110 return NN;
50111 if (SraConst.ult(ShlConst))
50112 return DAG.getNode(ISD::SHL, DL, VT, NN,
50113 DAG.getConstant(ShlConst - SraConst, DL, CVT));
50114 return DAG.getNode(ISD::SRA, DL, VT, NN,
50115 DAG.getConstant(SraConst - ShlConst, DL, CVT));
50116 }
50117 return SDValue();
50118}
50119
50122 const X86Subtarget &Subtarget) {
50123 using namespace llvm::SDPatternMatch;
50124 SDValue N0 = N->getOperand(0);
50125 SDValue N1 = N->getOperand(1);
50126 EVT VT = N0.getValueType();
50127 unsigned EltSizeInBits = VT.getScalarSizeInBits();
50128 SDLoc DL(N);
50129
50130 if (SDValue V = combineShiftToPMULH(N, DAG, DL, Subtarget))
50131 return V;
50132
50133 // Exploits AVX2 VSHLV/VSRLV instructions for efficient unsigned vector shifts
50134 // with out-of-bounds clamping.
50135 if (N0.getOpcode() == ISD::VSELECT &&
50136 supportedVectorVarShift(VT, Subtarget, ISD::SRL)) {
50137 SDValue Cond = N0.getOperand(0);
50138 SDValue N00 = N0.getOperand(1);
50139 SDValue N01 = N0.getOperand(2);
50140 // fold srl(select(icmp_ult(amt,BW),x,0),amt) -> avx2 psrlv(x,amt)
50142 sd_match(Cond, m_SetCC(m_Specific(N1), m_SpecificInt(EltSizeInBits),
50144 return DAG.getNode(X86ISD::VSRLV, DL, VT, N00, N1);
50145 }
50146 // fold srl(select(icmp_uge(amt,BW),0,x),amt) -> avx2 psrlv(x,amt)
50148 sd_match(Cond, m_SetCC(m_Specific(N1), m_SpecificInt(EltSizeInBits),
50150 return DAG.getNode(X86ISD::VSRLV, DL, VT, N01, N1);
50151 }
50152 }
50153
50154 // Only do this on the last DAG combine as it can interfere with other
50155 // combines.
50156 if (!DCI.isAfterLegalizeDAG())
50157 return SDValue();
50158
50159 // Try to improve a sequence of srl (and X, C1), C2 by inverting the order.
50160 // TODO: This is a generic DAG combine that became an x86-only combine to
50161 // avoid shortcomings in other folds such as bswap, bit-test ('bt'), and
50162 // and-not ('andn').
50163 if (N0.getOpcode() != ISD::AND || !N0.hasOneUse())
50164 return SDValue();
50165
50166 auto *ShiftC = dyn_cast<ConstantSDNode>(N1);
50167 auto *AndC = dyn_cast<ConstantSDNode>(N0.getOperand(1));
50168 if (!ShiftC || !AndC)
50169 return SDValue();
50170
50171 // If we can shrink the constant mask below 8 bits or 32 bits, then this
50172 // transform should reduce code size. It may also enable secondary transforms
50173 // from improved known-bits analysis or instruction selection.
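// For example, (srl (and X, 0xFF000), 12) becomes (and (srl X, 12), 0xFF):
// the 20-bit mask shrinks to an 8-bit immediate once it is shifted right.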
50174 APInt MaskVal = AndC->getAPIntValue();
50175
50176 // If this can be matched by a zero extend, don't optimize.
50177 if (MaskVal.isMask()) {
50178 unsigned TO = MaskVal.countr_one();
50179 if (TO >= 8 && isPowerOf2_32(TO))
50180 return SDValue();
50181 }
50182
50183 APInt NewMaskVal = MaskVal.lshr(ShiftC->getAPIntValue());
50184 unsigned OldMaskSize = MaskVal.getSignificantBits();
50185 unsigned NewMaskSize = NewMaskVal.getSignificantBits();
50186 if ((OldMaskSize > 8 && NewMaskSize <= 8) ||
50187 (OldMaskSize > 32 && NewMaskSize <= 32)) {
50188 // srl (and X, AndC), ShiftC --> and (srl X, ShiftC), (AndC >> ShiftC)
50189 SDValue NewMask = DAG.getConstant(NewMaskVal, DL, VT);
50190 SDValue NewShift = DAG.getNode(ISD::SRL, DL, VT, N0.getOperand(0), N1);
50191 return DAG.getNode(ISD::AND, DL, VT, NewShift, NewMask);
50192 }
50193 return SDValue();
50194}
50195
50197 const X86Subtarget &Subtarget) {
50198 unsigned Opcode = N->getOpcode();
50199 assert(isHorizOp(Opcode) && "Unexpected hadd/hsub/pack opcode");
50200
50201 SDLoc DL(N);
50202 EVT VT = N->getValueType(0);
50203 SDValue N0 = N->getOperand(0);
50204 SDValue N1 = N->getOperand(1);
50205 EVT SrcVT = N0.getValueType();
50206
50207 SDValue BC0 =
50208 N->isOnlyUserOf(N0.getNode()) ? peekThroughOneUseBitcasts(N0) : N0;
50209 SDValue BC1 =
50210 N->isOnlyUserOf(N1.getNode()) ? peekThroughOneUseBitcasts(N1) : N1;
50211
50212 // Attempt to fold HOP(LOSUBVECTOR(SHUFFLE(X)),HISUBVECTOR(SHUFFLE(X)))
50213 // to SHUFFLE(HOP(LOSUBVECTOR(X),HISUBVECTOR(X))). This is mainly for
50214 // truncation trees that help us avoid lane-crossing shuffles.
50215 // TODO: There's a lot more we can do for PACK/HADD style shuffle combines.
50216 // TODO: We don't handle vXf64 shuffles yet.
50217 if (VT.is128BitVector() && SrcVT.getScalarSizeInBits() <= 32) {
50218 if (SDValue BCSrc = getSplitVectorSrc(BC0, BC1, false)) {
50220 SmallVector<int> ShuffleMask, ScaledMask;
50221 SDValue Vec = peekThroughBitcasts(BCSrc);
50222 if (getTargetShuffleInputs(Vec, ShuffleOps, ShuffleMask, DAG)) {
50224 // To keep the HOP LHS/RHS coherency, we must be able to scale the unary
50225 // shuffle to a v4X64 width - we can probably relax this in the future.
50226 if (!isAnyZero(ShuffleMask) && ShuffleOps.size() == 1 &&
50227 ShuffleOps[0].getValueType().is256BitVector() &&
50228 scaleShuffleElements(ShuffleMask, 4, ScaledMask)) {
50229 SDValue Lo, Hi;
50230 MVT ShufVT = VT.isFloatingPoint() ? MVT::v4f32 : MVT::v4i32;
50231 std::tie(Lo, Hi) = DAG.SplitVector(ShuffleOps[0], DL);
50232 Lo = DAG.getBitcast(SrcVT, Lo);
50233 Hi = DAG.getBitcast(SrcVT, Hi);
50234 SDValue Res = DAG.getNode(Opcode, DL, VT, Lo, Hi);
50235 Res = DAG.getBitcast(ShufVT, Res);
50236 Res = DAG.getVectorShuffle(ShufVT, DL, Res, Res, ScaledMask);
50237 return DAG.getBitcast(VT, Res);
50238 }
50239 }
50240 }
50241 }
50242
50243 // Attempt to fold HOP(SHUFFLE(X,Y),SHUFFLE(Z,W)) -> SHUFFLE(HOP()).
50244 if (VT.is128BitVector() && SrcVT.getScalarSizeInBits() <= 32) {
50245 // If either/both ops are a shuffle that can scale to v2x64,
50246 // then see if we can perform this as a v4x32 post shuffle.
50247 SmallVector<SDValue> Ops0, Ops1;
50248 SmallVector<int> Mask0, Mask1, ScaledMask0, ScaledMask1;
50249 bool IsShuf0 =
50250 getTargetShuffleInputs(BC0, Ops0, Mask0, DAG) && !isAnyZero(Mask0) &&
50251 scaleShuffleElements(Mask0, 2, ScaledMask0) &&
50252 all_of(Ops0, [](SDValue Op) { return Op.getValueSizeInBits() == 128; });
50253 bool IsShuf1 =
50254 getTargetShuffleInputs(BC1, Ops1, Mask1, DAG) && !isAnyZero(Mask1) &&
50255 scaleShuffleElements(Mask1, 2, ScaledMask1) &&
50256 all_of(Ops1, [](SDValue Op) { return Op.getValueSizeInBits() == 128; });
50257 if (IsShuf0 || IsShuf1) {
50258 if (!IsShuf0) {
50259 Ops0.assign({BC0});
50260 ScaledMask0.assign({0, 1});
50261 }
50262 if (!IsShuf1) {
50263 Ops1.assign({BC1});
50264 ScaledMask1.assign({0, 1});
50265 }
50266
50267 SDValue LHS, RHS;
50268 int PostShuffle[4] = {-1, -1, -1, -1};
50269 auto FindShuffleOpAndIdx = [&](int M, int &Idx, ArrayRef<SDValue> Ops) {
50270 if (M < 0)
50271 return true;
50272 Idx = M % 2;
50273 SDValue Src = Ops[M / 2];
50274 if (!LHS || LHS == Src) {
50275 LHS = Src;
50276 return true;
50277 }
50278 if (!RHS || RHS == Src) {
50279 Idx += 2;
50280 RHS = Src;
50281 return true;
50282 }
50283 return false;
50284 };
50285 if (FindShuffleOpAndIdx(ScaledMask0[0], PostShuffle[0], Ops0) &&
50286 FindShuffleOpAndIdx(ScaledMask0[1], PostShuffle[1], Ops0) &&
50287 FindShuffleOpAndIdx(ScaledMask1[0], PostShuffle[2], Ops1) &&
50288 FindShuffleOpAndIdx(ScaledMask1[1], PostShuffle[3], Ops1)) {
50289 LHS = DAG.getBitcast(SrcVT, LHS);
50290 RHS = DAG.getBitcast(SrcVT, RHS ? RHS : LHS);
50291 MVT ShufVT = VT.isFloatingPoint() ? MVT::v4f32 : MVT::v4i32;
50292 SDValue Res = DAG.getNode(Opcode, DL, VT, LHS, RHS);
50293 Res = DAG.getBitcast(ShufVT, Res);
50294 Res = DAG.getVectorShuffle(ShufVT, DL, Res, Res, PostShuffle);
50295 return DAG.getBitcast(VT, Res);
50296 }
50297 }
50298 }
50299
50300 // Attempt to fold HOP(SHUFFLE(X,Y),SHUFFLE(X,Y)) -> SHUFFLE(HOP(X,Y)).
50301 if (VT.is256BitVector() && Subtarget.hasInt256()) {
50302 SmallVector<int> Mask0, Mask1;
50303 SmallVector<SDValue> Ops0, Ops1;
50304 SmallVector<int, 2> ScaledMask0, ScaledMask1;
50305 if (getTargetShuffleInputs(BC0, Ops0, Mask0, DAG) && !isAnyZero(Mask0) &&
50306 getTargetShuffleInputs(BC1, Ops1, Mask1, DAG) && !isAnyZero(Mask1) &&
50307 !Ops0.empty() && !Ops1.empty() &&
50308 all_of(Ops0,
50309 [](SDValue Op) { return Op.getValueType().is256BitVector(); }) &&
50310 all_of(Ops1,
50311 [](SDValue Op) { return Op.getValueType().is256BitVector(); }) &&
50312 scaleShuffleElements(Mask0, 2, ScaledMask0) &&
50313 scaleShuffleElements(Mask1, 2, ScaledMask1)) {
50314 SDValue Op00 = peekThroughBitcasts(Ops0.front());
50315 SDValue Op10 = peekThroughBitcasts(Ops1.front());
50316 SDValue Op01 = peekThroughBitcasts(Ops0.back());
50317 SDValue Op11 = peekThroughBitcasts(Ops1.back());
50318 if ((Op00 == Op11) && (Op01 == Op10)) {
50319 std::swap(Op10, Op11);
50321 }
50322 if ((Op00 == Op10) && (Op01 == Op11)) {
50323 const int Map[4] = {0, 2, 1, 3};
50324 SmallVector<int, 4> ShuffleMask(
50325 {Map[ScaledMask0[0]], Map[ScaledMask1[0]], Map[ScaledMask0[1]],
50326 Map[ScaledMask1[1]]});
50327 MVT ShufVT = VT.isFloatingPoint() ? MVT::v4f64 : MVT::v4i64;
50328 SDValue Res = DAG.getNode(Opcode, DL, VT, DAG.getBitcast(SrcVT, Op00),
50329 DAG.getBitcast(SrcVT, Op01));
50330 Res = DAG.getBitcast(ShufVT, Res);
50331 Res = DAG.getVectorShuffle(ShufVT, DL, Res, Res, ShuffleMask);
50332 return DAG.getBitcast(VT, Res);
50333 }
50334 }
50335 }
50336
50337 return SDValue();
50338}
50339
50342 const X86Subtarget &Subtarget) {
50343 unsigned Opcode = N->getOpcode();
50344 assert((X86ISD::PACKSS == Opcode || X86ISD::PACKUS == Opcode) &&
50345 "Unexpected pack opcode");
50346
50347 EVT VT = N->getValueType(0);
50348 SDValue N0 = N->getOperand(0);
50349 SDValue N1 = N->getOperand(1);
50350 unsigned NumDstElts = VT.getVectorNumElements();
50351 unsigned DstBitsPerElt = VT.getScalarSizeInBits();
50352 unsigned SrcBitsPerElt = 2 * DstBitsPerElt;
50353 assert(N0.getScalarValueSizeInBits() == SrcBitsPerElt &&
50354 N1.getScalarValueSizeInBits() == SrcBitsPerElt &&
50355 "Unexpected PACKSS/PACKUS input type");
50356
50357 bool IsSigned = (X86ISD::PACKSS == Opcode);
50358
50359 // Constant Folding.
50360 APInt UndefElts0, UndefElts1;
50361 SmallVector<APInt, 32> EltBits0, EltBits1;
50362 if ((N0.isUndef() || N->isOnlyUserOf(N0.getNode())) &&
50363 (N1.isUndef() || N->isOnlyUserOf(N1.getNode())) &&
50364 getTargetConstantBitsFromNode(N0, SrcBitsPerElt, UndefElts0, EltBits0,
50365 /*AllowWholeUndefs*/ true,
50366 /*AllowPartialUndefs*/ true) &&
50367 getTargetConstantBitsFromNode(N1, SrcBitsPerElt, UndefElts1, EltBits1,
50368 /*AllowWholeUndefs*/ true,
50369 /*AllowPartialUndefs*/ true)) {
50370 unsigned NumLanes = VT.getSizeInBits() / 128;
50371 unsigned NumSrcElts = NumDstElts / 2;
50372 unsigned NumDstEltsPerLane = NumDstElts / NumLanes;
50373 unsigned NumSrcEltsPerLane = NumSrcElts / NumLanes;
50374
50375 APInt Undefs(NumDstElts, 0);
50376 SmallVector<APInt, 32> Bits(NumDstElts, APInt::getZero(DstBitsPerElt));
50377 for (unsigned Lane = 0; Lane != NumLanes; ++Lane) {
50378 for (unsigned Elt = 0; Elt != NumDstEltsPerLane; ++Elt) {
50379 unsigned SrcIdx = Lane * NumSrcEltsPerLane + Elt % NumSrcEltsPerLane;
50380 auto &UndefElts = (Elt >= NumSrcEltsPerLane ? UndefElts1 : UndefElts0);
50381 auto &EltBits = (Elt >= NumSrcEltsPerLane ? EltBits1 : EltBits0);
50382
50383 if (UndefElts[SrcIdx]) {
50384 Undefs.setBit(Lane * NumDstEltsPerLane + Elt);
50385 continue;
50386 }
50387
50388 APInt &Val = EltBits[SrcIdx];
50389 if (IsSigned) {
50390 // PACKSS: Truncate signed value with signed saturation.
50391 // Source values less than dst minint are saturated to minint.
50392 // Source values greater than dst maxint are saturated to maxint.
50393 Val = Val.truncSSat(DstBitsPerElt);
50394 } else {
50395 // PACKUS: Truncate signed value with unsigned saturation.
50396 // Source values less than zero are saturated to zero.
50397 // Source values greater than dst maxuint are saturated to maxuint.
50398 // NOTE: This is different from APInt::truncUSat.
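// For example, an i16 source value of -1 (0xFFFF) saturates to 0 here, whereas
// APInt::truncUSat, which interprets the value as unsigned, would clamp it to
// 0xFF.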
50399 if (Val.isIntN(DstBitsPerElt))
50400 Val = Val.trunc(DstBitsPerElt);
50401 else if (Val.isNegative())
50402 Val = APInt::getZero(DstBitsPerElt);
50403 else
50404 Val = APInt::getAllOnes(DstBitsPerElt);
50405 }
50406 Bits[Lane * NumDstEltsPerLane + Elt] = Val;
50407 }
50408 }
50409
50410 return getConstVector(Bits, Undefs, VT.getSimpleVT(), DAG, SDLoc(N));
50411 }
50412
50413 // Try to fold PACK(SHUFFLE(),SHUFFLE()) -> SHUFFLE(PACK()).
50414 if (SDValue V = combineHorizOpWithShuffle(N, DAG, Subtarget))
50415 return V;
50416
50417 // Try to fold PACKSS(NOT(X),NOT(Y)) -> NOT(PACKSS(X,Y)).
50418 // Currently limit this to allsignbits cases only.
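// With all-sign-bit inputs every element is either 0 or -1, so PACKSS merely
// narrows those values and NOT commutes freely through the pack.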
50419 if (IsSigned &&
50420 (N0.isUndef() || DAG.ComputeNumSignBits(N0) == SrcBitsPerElt) &&
50421 (N1.isUndef() || DAG.ComputeNumSignBits(N1) == SrcBitsPerElt)) {
50422 SDValue Not0 = N0.isUndef() ? N0 : IsNOT(N0, DAG);
50423 SDValue Not1 = N1.isUndef() ? N1 : IsNOT(N1, DAG);
50424 if (Not0 && Not1) {
50425 SDLoc DL(N);
50426 MVT SrcVT = N0.getSimpleValueType();
50427 SDValue Pack =
50428 DAG.getNode(X86ISD::PACKSS, DL, VT, DAG.getBitcast(SrcVT, Not0),
50429 DAG.getBitcast(SrcVT, Not1));
50430 return DAG.getNOT(DL, Pack, VT);
50431 }
50432 }
50433
50434 // Try to combine a PACKUSWB/PACKSSWB implemented truncate with a regular
50435 // truncate to create a larger truncate.
50436 if (Subtarget.hasAVX512() &&
50437 N0.getOpcode() == ISD::TRUNCATE && N1.isUndef() && VT == MVT::v16i8 &&
50438 N0.getOperand(0).getValueType() == MVT::v8i32) {
50439 if ((IsSigned && DAG.ComputeNumSignBits(N0) > 8) ||
50440 (!IsSigned &&
50441 DAG.MaskedValueIsZero(N0, APInt::getHighBitsSet(16, 8)))) {
50442 if (Subtarget.hasVLX())
50443 return DAG.getNode(X86ISD::VTRUNC, SDLoc(N), VT, N0.getOperand(0));
50444
50445 // Widen input to v16i32 so we can truncate that.
50446 SDLoc dl(N);
50447 SDValue Concat = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v16i32,
50448 N0.getOperand(0), DAG.getUNDEF(MVT::v8i32));
50449 return DAG.getNode(ISD::TRUNCATE, SDLoc(N), VT, Concat);
50450 }
50451 }
50452
50453 // Try to fold PACK(EXTEND(X),EXTEND(Y)) -> CONCAT(X,Y) subvectors.
50454 if (VT.is128BitVector()) {
50455 unsigned ExtOpc = IsSigned ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
50456 SDValue Src0, Src1;
50457 if (N0.getOpcode() == ExtOpc &&
50459 N0.getOperand(0).getScalarValueSizeInBits() == DstBitsPerElt) {
50460 Src0 = N0.getOperand(0);
50461 }
50462 if (N1.getOpcode() == ExtOpc &&
50464 N1.getOperand(0).getScalarValueSizeInBits() == DstBitsPerElt) {
50465 Src1 = N1.getOperand(0);
50466 }
50467 if ((Src0 || N0.isUndef()) && (Src1 || N1.isUndef())) {
50468 assert((Src0 || Src1) && "Found PACK(UNDEF,UNDEF)");
50469 Src0 = Src0 ? Src0 : DAG.getUNDEF(Src1.getValueType());
50470 Src1 = Src1 ? Src1 : DAG.getUNDEF(Src0.getValueType());
50471 return DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(N), VT, Src0, Src1);
50472 }
50473
50474 // Try again with pack(*_extend_vector_inreg, undef).
50475 unsigned VecInRegOpc = IsSigned ? ISD::SIGN_EXTEND_VECTOR_INREG
50477 if (N0.getOpcode() == VecInRegOpc && N1.isUndef() &&
50478 N0.getOperand(0).getScalarValueSizeInBits() < DstBitsPerElt)
50479 return getEXTEND_VECTOR_INREG(ExtOpc, SDLoc(N), VT, N0.getOperand(0),
50480 DAG);
50481 }
50482
50483 // Attempt to combine as shuffle.
50484 SDValue Op(N, 0);
50485 if (SDValue Res = combineX86ShufflesRecursively(Op, DAG, Subtarget))
50486 return Res;
50487
50488 return SDValue();
50489}
50490
50493 const X86Subtarget &Subtarget) {
50494 assert((X86ISD::HADD == N->getOpcode() || X86ISD::FHADD == N->getOpcode() ||
50495 X86ISD::HSUB == N->getOpcode() || X86ISD::FHSUB == N->getOpcode()) &&
50496 "Unexpected horizontal add/sub opcode");
50497
50498 if (!shouldUseHorizontalOp(true, DAG, Subtarget)) {
50499 MVT VT = N->getSimpleValueType(0);
50500 SDValue LHS = N->getOperand(0);
50501 SDValue RHS = N->getOperand(1);
50502
50503 // HOP(HOP'(X,X),HOP'(Y,Y)) -> HOP(PERMUTE(HOP'(X,Y)),PERMUTE(HOP'(X,Y))).
50504 if (LHS != RHS && LHS.getOpcode() == N->getOpcode() &&
50505 LHS.getOpcode() == RHS.getOpcode() &&
50506 LHS.getValueType() == RHS.getValueType() &&
50507 N->isOnlyUserOf(LHS.getNode()) && N->isOnlyUserOf(RHS.getNode())) {
50508 SDValue LHS0 = LHS.getOperand(0);
50509 SDValue LHS1 = LHS.getOperand(1);
50510 SDValue RHS0 = RHS.getOperand(0);
50511 SDValue RHS1 = RHS.getOperand(1);
50512 if ((LHS0 == LHS1 || LHS0.isUndef() || LHS1.isUndef()) &&
50513 (RHS0 == RHS1 || RHS0.isUndef() || RHS1.isUndef())) {
50514 SDLoc DL(N);
50515 SDValue Res = DAG.getNode(LHS.getOpcode(), DL, LHS.getValueType(),
50516 LHS0.isUndef() ? LHS1 : LHS0,
50517 RHS0.isUndef() ? RHS1 : RHS0);
50518 MVT ShufVT = MVT::getVectorVT(MVT::i32, VT.getSizeInBits() / 32);
50519 Res = DAG.getBitcast(ShufVT, Res);
50520 SDValue NewLHS =
50521 DAG.getNode(X86ISD::PSHUFD, DL, ShufVT, Res,
50522 getV4X86ShuffleImm8ForMask({0, 1, 0, 1}, DL, DAG));
50523 SDValue NewRHS =
50524 DAG.getNode(X86ISD::PSHUFD, DL, ShufVT, Res,
50525 getV4X86ShuffleImm8ForMask({2, 3, 2, 3}, DL, DAG));
50526 return DAG.getNode(N->getOpcode(), DL, VT, DAG.getBitcast(VT, NewLHS),
50527 DAG.getBitcast(VT, NewRHS));
50528 }
50529 }
50530 }
50531
50532 // Try to fold HOP(SHUFFLE(),SHUFFLE()) -> SHUFFLE(HOP()).
50533 if (SDValue V = combineHorizOpWithShuffle(N, DAG, Subtarget))
50534 return V;
50535
50536 return SDValue();
50537}
50538
50541 const X86Subtarget &Subtarget) {
50542 assert((X86ISD::VSHL == N->getOpcode() || X86ISD::VSRA == N->getOpcode() ||
50543 X86ISD::VSRL == N->getOpcode()) &&
50544 "Unexpected shift opcode");
50545 EVT VT = N->getValueType(0);
50546 SDValue N0 = N->getOperand(0);
50547 SDValue N1 = N->getOperand(1);
50548
50549 // Shift zero -> zero.
50551 return DAG.getConstant(0, SDLoc(N), VT);
50552
50553 // Detect constant shift amounts.
50554 APInt UndefElts;
50555 SmallVector<APInt, 32> EltBits;
50556 if (getTargetConstantBitsFromNode(N1, 64, UndefElts, EltBits,
50557 /*AllowWholeUndefs*/ true,
50558 /*AllowPartialUndefs*/ false)) {
50559 unsigned X86Opc = getTargetVShiftUniformOpcode(N->getOpcode(), false);
50560 return getTargetVShiftByConstNode(X86Opc, SDLoc(N), VT.getSimpleVT(), N0,
50561 EltBits[0].getZExtValue(), DAG);
50562 }
50563
50564 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
50565 APInt DemandedElts = APInt::getAllOnes(VT.getVectorNumElements());
50566 if (TLI.SimplifyDemandedVectorElts(SDValue(N, 0), DemandedElts, DCI))
50567 return SDValue(N, 0);
50568
50569 return SDValue();
50570}
50571
50574 const X86Subtarget &Subtarget) {
50575 unsigned Opcode = N->getOpcode();
50576 assert((X86ISD::VSHLI == Opcode || X86ISD::VSRAI == Opcode ||
50577 X86ISD::VSRLI == Opcode) &&
50578 "Unexpected shift opcode");
50579 bool LogicalShift = X86ISD::VSHLI == Opcode || X86ISD::VSRLI == Opcode;
50580 EVT VT = N->getValueType(0);
50581 SDValue N0 = N->getOperand(0);
50582 SDValue N1 = N->getOperand(1);
50583 unsigned NumBitsPerElt = VT.getScalarSizeInBits();
50584 assert(VT == N0.getValueType() && (NumBitsPerElt % 8) == 0 &&
50585 "Unexpected value type");
50586 assert(N1.getValueType() == MVT::i8 && "Unexpected shift amount type");
50587
50588 // (shift undef, X) -> 0
50589 if (N0.isUndef())
50590 return DAG.getConstant(0, SDLoc(N), VT);
50591
50592 // Out of range logical bit shifts are guaranteed to be zero.
50593 // Out of range arithmetic bit shifts splat the sign bit.
50594 unsigned ShiftVal = N->getConstantOperandVal(1);
50595 if (ShiftVal >= NumBitsPerElt) {
50596 if (LogicalShift)
50597 return DAG.getConstant(0, SDLoc(N), VT);
50598 ShiftVal = NumBitsPerElt - 1;
50599 }
50600
50601 // (shift X, 0) -> X
50602 if (!ShiftVal)
50603 return N0;
50604
50605 // (shift 0, C) -> 0
50607 // N0 is all zeros or undef. We guarantee that the bits shifted into the
50608 // result are all zeros, not undef.
50609 return DAG.getConstant(0, SDLoc(N), VT);
50610
50611 // (VSRAI -1, C) -> -1
50612 if (!LogicalShift && ISD::isBuildVectorAllOnes(N0.getNode()))
50613 // N0 is all ones or undef. We guarantee that the bits shifted into the
50614 // result are all ones, not undef.
50615 return DAG.getAllOnesConstant(SDLoc(N), VT);
50616
50617 auto MergeShifts = [&](SDValue X, uint64_t Amt0, uint64_t Amt1) {
50618 unsigned NewShiftVal = Amt0 + Amt1;
50619 if (NewShiftVal >= NumBitsPerElt) {
50620 // Out of range logical bit shifts are guaranteed to be zero.
50621 // Out of range arithmetic bit shifts splat the sign bit.
50622 if (LogicalShift)
50623 return DAG.getConstant(0, SDLoc(N), VT);
50624 NewShiftVal = NumBitsPerElt - 1;
50625 }
50626 return DAG.getNode(Opcode, SDLoc(N), VT, N0.getOperand(0),
50627 DAG.getTargetConstant(NewShiftVal, SDLoc(N), MVT::i8));
50628 };
50629
50630 // (shift (shift X, C2), C1) -> (shift X, (C1 + C2))
50631 if (Opcode == N0.getOpcode())
50632 return MergeShifts(N0.getOperand(0), ShiftVal, N0.getConstantOperandVal(1));
50633
50634 // (shl (add X, X), C) -> (shl X, (C + 1))
50635 if (Opcode == X86ISD::VSHLI && N0.getOpcode() == ISD::ADD &&
50636 N0.getOperand(0) == N0.getOperand(1))
50637 return MergeShifts(N0.getOperand(0), ShiftVal, 1);
50638
50639 // We can decode 'whole byte' logical bit shifts as shuffles.
50640 if (LogicalShift && (ShiftVal % 8) == 0) {
50641 SDValue Op(N, 0);
50642 if (SDValue Res = combineX86ShufflesRecursively(Op, DAG, Subtarget))
50643 return Res;
50644 }
50645
50646 // Attempt to detect an expanded vXi64 SIGN_EXTEND_INREG vXi1 pattern, and
50647 // convert to a splatted v2Xi32 SIGN_EXTEND_INREG pattern:
50648 // psrad(pshufd(psllq(X,63),1,1,3,3),31) ->
50649 // pshufd(psrad(pslld(X,31),31),0,0,2,2).
50650 if (Opcode == X86ISD::VSRAI && NumBitsPerElt == 32 && ShiftVal == 31 &&
50651 N0.getOpcode() == X86ISD::PSHUFD &&
50652 N0.getConstantOperandVal(1) == getV4X86ShuffleImm({1, 1, 3, 3}) &&
50653 N0->hasOneUse()) {
50654 SDValue BC = peekThroughOneUseBitcasts(N0.getOperand(0));
50655 if (BC.getOpcode() == X86ISD::VSHLI &&
50656 BC.getScalarValueSizeInBits() == 64 &&
50657 BC.getConstantOperandVal(1) == 63) {
50658 SDLoc DL(N);
50659 SDValue Src = BC.getOperand(0);
50660 Src = DAG.getBitcast(VT, Src);
50661 Src = DAG.getNode(X86ISD::PSHUFD, DL, VT, Src,
50662 getV4X86ShuffleImm8ForMask({0, 0, 2, 2}, DL, DAG));
50663 Src = DAG.getNode(X86ISD::VSHLI, DL, VT, Src, N1);
50664 Src = DAG.getNode(X86ISD::VSRAI, DL, VT, Src, N1);
50665 return Src;
50666 }
50667 }
50668
50669 auto TryConstantFold = [&](SDValue V) {
50670 APInt UndefElts;
50671 SmallVector<APInt, 32> EltBits;
50672 if (!getTargetConstantBitsFromNode(V, NumBitsPerElt, UndefElts, EltBits,
50673 /*AllowWholeUndefs*/ true,
50674 /*AllowPartialUndefs*/ true))
50675 return SDValue();
50676 assert(EltBits.size() == VT.getVectorNumElements() &&
50677 "Unexpected shift value type");
50678 // Undef elements need to fold to 0. It's possible SimplifyDemandedBits
50679 // created an undef input due to no input bits being demanded, but user
50680 // still expects 0 in other bits.
50681 for (unsigned i = 0, e = EltBits.size(); i != e; ++i) {
50682 APInt &Elt = EltBits[i];
50683 if (UndefElts[i])
50684 Elt = 0;
50685 else if (X86ISD::VSHLI == Opcode)
50686 Elt <<= ShiftVal;
50687 else if (X86ISD::VSRAI == Opcode)
50688 Elt.ashrInPlace(ShiftVal);
50689 else
50690 Elt.lshrInPlace(ShiftVal);
50691 }
50692 // Reset undef elements since they were zeroed above.
50693 UndefElts = 0;
50694 return getConstVector(EltBits, UndefElts, VT.getSimpleVT(), DAG, SDLoc(N));
50695 };
50696
50697 // Constant Folding.
50698 if (N->isOnlyUserOf(N0.getNode())) {
50699 if (SDValue C = TryConstantFold(N0))
50700 return C;
50701
50702 // Fold (shift (logic X, C2), C1) -> (logic (shift X, C1), (shift C2, C1))
50703 // Don't break NOT patterns.
50704 SDValue BC = peekThroughOneUseBitcasts(N0);
50705 if (ISD::isBitwiseLogicOp(BC.getOpcode()) &&
50706 BC->isOnlyUserOf(BC.getOperand(1).getNode()) &&
50707 !ISD::isBuildVectorAllOnes(BC.getOperand(1).getNode())) {
50708 if (SDValue RHS = TryConstantFold(BC.getOperand(1))) {
50709 SDLoc DL(N);
50710 SDValue LHS = DAG.getNode(Opcode, DL, VT,
50711 DAG.getBitcast(VT, BC.getOperand(0)), N1);
50712 return DAG.getNode(BC.getOpcode(), DL, VT, LHS, RHS);
50713 }
50714 }
50715 }
50716
50717 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
50718 if (TLI.SimplifyDemandedBits(SDValue(N, 0), APInt::getAllOnes(NumBitsPerElt),
50719 DCI))
50720 return SDValue(N, 0);
50721
50722 return SDValue();
50723}
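// Editor's illustrative sketch (not part of this file): scalar checks for the
// identities used above, assuming the <cstdint> types from the headers already
// included. Out-of-range logical shifts fold to zero, adjacent shifts merge,
// and (add X, X) feeding a left shift is the same as shifting one step more.
static constexpr uint32_t modelSrlClamped(uint32_t X, unsigned Amt) {
  return Amt >= 32 ? 0u : X >> Amt;
}
static_assert(modelSrlClamped(modelSrlClamped(0xF0000000u, 3), 2) ==
                  modelSrlClamped(0xF0000000u, 3 + 2),
              "(shift (shift X, C2), C1) == (shift X, (C1 + C2))");
static_assert(modelSrlClamped(0xF0000000u, 40) == 0u,
              "out-of-range logical shift is zero");
static_assert((uint32_t(0x01020304u + 0x01020304u) << 3) ==
                  (uint32_t(0x01020304u) << (3 + 1)),
              "(shl (add X, X), C) == (shl X, (C + 1))");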
50724
50725 static SDValue combineVectorInsert(SDNode *N, SelectionDAG &DAG,
50726 TargetLowering::DAGCombinerInfo &DCI,
50727 const X86Subtarget &Subtarget) {
50728 EVT VT = N->getValueType(0);
50729 unsigned Opcode = N->getOpcode();
50730 assert(((Opcode == X86ISD::PINSRB && VT == MVT::v16i8) ||
50731 (Opcode == X86ISD::PINSRW && VT == MVT::v8i16) ||
50732 Opcode == ISD::INSERT_VECTOR_ELT) &&
50733 "Unexpected vector insertion");
50734
50735 SDValue Vec = N->getOperand(0);
50736 SDValue Scl = N->getOperand(1);
50737 SDValue Idx = N->getOperand(2);
50738
50739 // Fold insert_vector_elt(undef, elt, 0) --> scalar_to_vector(elt).
50740 if (Opcode == ISD::INSERT_VECTOR_ELT && Vec.isUndef() && isNullConstant(Idx))
50741 return DAG.getNode(ISD::SCALAR_TO_VECTOR, SDLoc(N), VT, Scl);
50742
50743 if (Opcode == X86ISD::PINSRB || Opcode == X86ISD::PINSRW) {
50744 unsigned NumBitsPerElt = VT.getScalarSizeInBits();
50745 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
50746 if (TLI.SimplifyDemandedBits(SDValue(N, 0),
50747 APInt::getAllOnes(NumBitsPerElt), DCI))
50748 return SDValue(N, 0);
50749 }
50750
50751 // Attempt to combine insertion patterns to a shuffle.
50752 if (VT.isSimple() && DCI.isAfterLegalizeDAG()) {
50753 SDValue Op(N, 0);
50754 if (SDValue Res = combineX86ShufflesRecursively(Op, DAG, Subtarget))
50755 return Res;
50756 }
50757
50758 return SDValue();
50759}
50760
50761/// Recognize the distinctive (AND (setcc ...) (setcc ..)) where both setccs
50762/// reference the same FP CMP, and rewrite for CMPEQSS and friends. Likewise for
50763/// OR -> CMPNEQSS.
50764 static SDValue combineCompareEqual(SDNode *N, SelectionDAG &DAG,
50765 TargetLowering::DAGCombinerInfo &DCI,
50766 const X86Subtarget &Subtarget) {
50767 unsigned opcode;
50768
50769 // SSE1 supports CMP{eq|ne}SS, and SSE2 added CMP{eq|ne}SD, but
50770 // we're requiring SSE2 for both.
50771 if (Subtarget.hasSSE2() && isAndOrOfSetCCs(SDValue(N, 0U), opcode)) {
50772 SDValue N0 = N->getOperand(0);
50773 SDValue N1 = N->getOperand(1);
50774 SDValue CMP0 = N0.getOperand(1);
50775 SDValue CMP1 = N1.getOperand(1);
50776 SDLoc DL(N);
50777
50778 // The SETCCs should both refer to the same CMP.
50779 if (CMP0.getOpcode() != X86ISD::FCMP || CMP0 != CMP1)
50780 return SDValue();
50781
50782 SDValue CMP00 = CMP0->getOperand(0);
50783 SDValue CMP01 = CMP0->getOperand(1);
50784 EVT VT = CMP00.getValueType();
50785
50786 if (VT == MVT::f32 || VT == MVT::f64 ||
50787 (VT == MVT::f16 && Subtarget.hasFP16())) {
50788 bool ExpectingFlags = false;
50789 // Check for any users that want flags:
50790 for (const SDNode *U : N->users()) {
50791 if (ExpectingFlags)
50792 break;
50793
50794 switch (U->getOpcode()) {
50795 default:
50796 case ISD::BR_CC:
50797 case ISD::BRCOND:
50798 case ISD::SELECT:
50799 ExpectingFlags = true;
50800 break;
50801 case ISD::CopyToReg:
50802 case ISD::SIGN_EXTEND:
50803 case ISD::ZERO_EXTEND:
50804 case ISD::ANY_EXTEND:
50805 break;
50806 }
50807 }
50808
50809 if (!ExpectingFlags) {
50810 enum X86::CondCode cc0 = (enum X86::CondCode)N0.getConstantOperandVal(0);
50811 enum X86::CondCode cc1 = (enum X86::CondCode)N1.getConstantOperandVal(0);
50812
50813 if (cc1 == X86::COND_E || cc1 == X86::COND_NE) {
50814 X86::CondCode tmp = cc0;
50815 cc0 = cc1;
50816 cc1 = tmp;
50817 }
50818
50819 if ((cc0 == X86::COND_E && cc1 == X86::COND_NP) ||
50820 (cc0 == X86::COND_NE && cc1 == X86::COND_P)) {
50821 // FIXME: need symbolic constants for these magic numbers.
50822 // See X86ATTInstPrinter.cpp:printSSECC().
50823 unsigned x86cc = (cc0 == X86::COND_E) ? 0 : 4;
50824 if (Subtarget.hasAVX512()) {
50825 SDValue FSetCC =
50826 DAG.getNode(X86ISD::FSETCCM, DL, MVT::v1i1, CMP00, CMP01,
50827 DAG.getTargetConstant(x86cc, DL, MVT::i8));
50828 // Need to fill with zeros to ensure the bitcast will produce zeroes
50829 // for the upper bits. An EXTRACT_ELEMENT here wouldn't guarantee that.
50830 SDValue Ins = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, MVT::v16i1,
50831 DAG.getConstant(0, DL, MVT::v16i1),
50832 FSetCC, DAG.getVectorIdxConstant(0, DL));
50833 return DAG.getZExtOrTrunc(DAG.getBitcast(MVT::i16, Ins), DL,
50834 N->getSimpleValueType(0));
50835 }
50836 SDValue OnesOrZeroesF =
50837 DAG.getNode(X86ISD::FSETCC, DL, CMP00.getValueType(), CMP00,
50838 CMP01, DAG.getTargetConstant(x86cc, DL, MVT::i8));
50839
50840 bool is64BitFP = (CMP00.getValueType() == MVT::f64);
50841 MVT IntVT = is64BitFP ? MVT::i64 : MVT::i32;
50842
50843 if (is64BitFP && !Subtarget.is64Bit()) {
50844 // On a 32-bit target, we cannot bitcast the 64-bit float to a
50845 // 64-bit integer, since that's not a legal type. Since
50846 // OnesOrZeroesF is all ones or all zeroes, we don't need all the
50847 // bits, but can do this little dance to extract the lowest 32 bits
50848 // and work with those going forward.
50849 SDValue Vector64 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL,
50850 MVT::v2f64, OnesOrZeroesF);
50851 SDValue Vector32 = DAG.getBitcast(MVT::v4f32, Vector64);
50852 OnesOrZeroesF =
50853 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, Vector32,
50854 DAG.getVectorIdxConstant(0, DL));
50855 IntVT = MVT::i32;
50856 }
50857
50858 SDValue OnesOrZeroesI = DAG.getBitcast(IntVT, OnesOrZeroesF);
50859 SDValue ANDed = DAG.getNode(ISD::AND, DL, IntVT, OnesOrZeroesI,
50860 DAG.getConstant(1, DL, IntVT));
50861 SDValue OneBitOfTruth = DAG.getNode(ISD::TRUNCATE, DL, MVT::i8,
50862 ANDed);
50863 return OneBitOfTruth;
50864 }
50865 }
50866 }
50867 }
50868 return SDValue();
50869}
50870
50871/// Try to fold: (and (xor X, -1), Y) -> (andnp X, Y).
50872 static SDValue combineAndNotIntoANDNP(SDNode *N, const SDLoc &DL,
50873 SelectionDAG &DAG) {
50874 assert(N->getOpcode() == ISD::AND && "Unexpected opcode combine into ANDNP");
50875
50876 MVT VT = N->getSimpleValueType(0);
50877 if (!VT.is128BitVector() && !VT.is256BitVector() && !VT.is512BitVector())
50878 return SDValue();
50879
50880 SDValue X, Y;
50881 SDValue N0 = N->getOperand(0);
50882 SDValue N1 = N->getOperand(1);
50883
50884 if (SDValue Not = IsNOT(N0, DAG)) {
50885 X = Not;
50886 Y = N1;
50887 } else if (SDValue Not = IsNOT(N1, DAG)) {
50888 X = Not;
50889 Y = N0;
50890 } else
50891 return SDValue();
50892
50893 X = DAG.getBitcast(VT, X);
50894 Y = DAG.getBitcast(VT, Y);
50895 return DAG.getNode(X86ISD::ANDNP, DL, VT, X, Y);
50896}
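// Editor's illustrative sketch (not part of this file): ANDNP computes
// (~A) & B, so rewriting (and (xor X, -1), Y) as ANDNP(X, Y) restates the same
// bits. <cstdint> assumed; the constants are arbitrary demo values.
static_assert(((0x0Fu ^ 0xFFFFFFFFu) & 0x3Cu) == (~0x0Fu & 0x3Cu),
              "(and (xor X, -1), Y) == andnp(X, Y)");
static_assert((~0x0Fu & 0x3Cu) == 0x30u, "concrete value check");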
50897
50898/// Try to fold:
50899/// and (vector_shuffle<Z,...,Z>
50900/// (insert_vector_elt undef, (xor X, -1), Z), undef), Y
50901/// ->
50902/// andnp (vector_shuffle<Z,...,Z>
50903/// (insert_vector_elt undef, X, Z), undef), Y
50904 static SDValue combineAndShuffleNot(SDNode *N, SelectionDAG &DAG,
50905 const X86Subtarget &Subtarget) {
50906 assert(N->getOpcode() == ISD::AND && "Unexpected opcode combine into ANDNP");
50907
50908 EVT VT = N->getValueType(0);
50909 // Do not split 256 and 512 bit vectors with SSE2 as they overwrite original
50910 // value and require extra moves.
50911 if (!((VT.is128BitVector() && Subtarget.hasSSE2()) ||
50912 ((VT.is256BitVector() || VT.is512BitVector()) && Subtarget.hasAVX())))
50913 return SDValue();
50914
50915 auto GetNot = [&DAG](SDValue V) {
50916 auto *SVN = dyn_cast<ShuffleVectorSDNode>(peekThroughOneUseBitcasts(V));
50917 // TODO: SVN->hasOneUse() is a strong condition. It can be relaxed if all
50918 // end-users are ISD::AND including cases
50919 // (and(extract_vector_element(SVN), Y)).
50920 if (!SVN || !SVN->hasOneUse() || !SVN->isSplat() ||
50921 !SVN->getOperand(1).isUndef()) {
50922 return SDValue();
50923 }
50924 SDValue IVEN = SVN->getOperand(0);
50925 if (IVEN.getOpcode() != ISD::INSERT_VECTOR_ELT ||
50926 !IVEN.getOperand(0).isUndef() || !IVEN.hasOneUse())
50927 return SDValue();
50928 if (!isa<ConstantSDNode>(IVEN.getOperand(2)) ||
50929 IVEN.getConstantOperandAPInt(2) != SVN->getSplatIndex())
50930 return SDValue();
50931 SDValue Src = IVEN.getOperand(1);
50932 if (SDValue Not = IsNOT(Src, DAG)) {
50933 SDValue NotSrc = DAG.getBitcast(Src.getValueType(), Not);
50934 SDValue NotIVEN =
50935 DAG.getNode(ISD::INSERT_VECTOR_ELT, SDLoc(IVEN), IVEN.getValueType(),
50936 IVEN.getOperand(0), NotSrc, IVEN.getOperand(2));
50937 return DAG.getVectorShuffle(SVN->getValueType(0), SDLoc(SVN), NotIVEN,
50938 SVN->getOperand(1), SVN->getMask());
50939 }
50940 return SDValue();
50941 };
50942
50943 SDValue X, Y;
50944 SDValue N0 = N->getOperand(0);
50945 SDValue N1 = N->getOperand(1);
50946 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
50947
50948 if (SDValue Not = GetNot(N0)) {
50949 X = Not;
50950 Y = N1;
50951 } else if (SDValue Not = GetNot(N1)) {
50952 X = Not;
50953 Y = N0;
50954 } else
50955 return SDValue();
50956
50957 X = DAG.getBitcast(VT, X);
50958 Y = DAG.getBitcast(VT, Y);
50959 SDLoc DL(N);
50960
50961 // We do not split for SSE at all, but we need to split vectors for AVX1 and
50962 // AVX2.
50963 if (!Subtarget.useAVX512Regs() && VT.is512BitVector() &&
50964 TLI.isTypeLegal(VT.getHalfNumVectorElementsVT(*DAG.getContext()))) {
50965 SDValue LoX, HiX;
50966 std::tie(LoX, HiX) = splitVector(X, DAG, DL);
50967 SDValue LoY, HiY;
50968 std::tie(LoY, HiY) = splitVector(Y, DAG, DL);
50969 EVT SplitVT = LoX.getValueType();
50970 SDValue LoV = DAG.getNode(X86ISD::ANDNP, DL, SplitVT, {LoX, LoY});
50971 SDValue HiV = DAG.getNode(X86ISD::ANDNP, DL, SplitVT, {HiX, HiY});
50972 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, {LoV, HiV});
50973 }
50974
50975 if (TLI.isTypeLegal(VT))
50976 return DAG.getNode(X86ISD::ANDNP, DL, VT, {X, Y});
50977
50978 return SDValue();
50979}
50980
50981// Try to widen AND, OR and XOR nodes to VT in order to remove casts around
50982// logical operations, like in the example below.
50983// or (and (truncate x, truncate y)),
50984// (xor (truncate z, build_vector (constants)))
50985// Given a target type \p VT, we generate
50986// or (and x, y), (xor z, zext(build_vector (constants)))
50987 // given x, y and z are of type \p VT. We can do so if each operand is either a
50988 // truncate from VT, something that can be recursively promoted, or (for the
50989 // second operand) a vector of constants or an existing extension we can extend further.
50990 static SDValue PromoteMaskArithmetic(SDValue N, const SDLoc &DL, EVT VT,
50991 SelectionDAG &DAG,
50992 const X86Subtarget &Subtarget,
50993 unsigned Depth) {
50994 // Limit recursion to avoid excessive compile times.
50995 if (Depth >= SelectionDAG::MaxRecursionDepth)
50996 return SDValue();
50997
50998 if (!ISD::isBitwiseLogicOp(N.getOpcode()))
50999 return SDValue();
51000
51001 SDValue N0 = N.getOperand(0);
51002 SDValue N1 = N.getOperand(1);
51003
51004 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
51005 if (!TLI.isOperationLegalOrPromote(N.getOpcode(), VT))
51006 return SDValue();
51007
51008 if (SDValue NN0 =
51009 PromoteMaskArithmetic(N0, DL, VT, DAG, Subtarget, Depth + 1))
51010 N0 = NN0;
51011 else {
51012 // The left side has to be a 'trunc'.
51013 bool LHSTrunc = N0.getOpcode() == ISD::TRUNCATE &&
51014 N0.getOperand(0).getValueType() == VT;
51015 if (LHSTrunc)
51016 N0 = N0.getOperand(0);
51017 else
51018 return SDValue();
51019 }
51020
51021 if (SDValue NN1 =
51022 PromoteMaskArithmetic(N1, DL, VT, DAG, Subtarget, Depth + 1))
51023 N1 = NN1;
51024 else {
51025 // The right side has to be a 'trunc', a (foldable) constant or an
51026 // existing extension we can extend further.
51027 bool RHSTrunc = N1.getOpcode() == ISD::TRUNCATE &&
51028 N1.getOperand(0).getValueType() == VT;
51029 if (RHSTrunc)
51030 N1 = N1.getOperand(0);
51031 else if (ISD::isExtVecInRegOpcode(N1.getOpcode()) && VT.is256BitVector() &&
51032 Subtarget.hasInt256() && N1.hasOneUse())
51033 N1 = DAG.getNode(N1.getOpcode(), DL, VT, N1.getOperand(0));
51034 else if (SDValue Cst =
51035 DAG.FoldConstantArithmetic(ISD::ZERO_EXTEND, DL, VT, {N1}))
51036 N1 = Cst;
51037 else
51038 return SDValue();
51039 }
51040
51041 return DAG.getNode(N.getOpcode(), DL, VT, N0, N1);
51042}
51043
51044// On AVX/AVX2 the type v8i1 is legalized to v8i16, which is an XMM sized
51045// register. In most cases we actually compare or select YMM-sized registers
51046// and mixing the two types creates horrible code. This method optimizes
51047// some of the transition sequences.
51048// Even with AVX-512 this is still useful for removing casts around logical
51049// operations on vXi1 mask types.
51050 static SDValue PromoteMaskArithmetic(SDValue N, const SDLoc &DL,
51051 SelectionDAG &DAG,
51052 const X86Subtarget &Subtarget) {
51053 EVT VT = N.getValueType();
51054 assert(VT.isVector() && "Expected vector type");
51055 assert((N.getOpcode() == ISD::ANY_EXTEND ||
51056 N.getOpcode() == ISD::ZERO_EXTEND ||
51057 N.getOpcode() == ISD::SIGN_EXTEND) && "Invalid Node");
51058
51059 SDValue Narrow = N.getOperand(0);
51060 EVT NarrowVT = Narrow.getValueType();
51061
51062 // Generate the wide operation.
51063 SDValue Op = PromoteMaskArithmetic(Narrow, DL, VT, DAG, Subtarget, 0);
51064 if (!Op)
51065 return SDValue();
51066 switch (N.getOpcode()) {
51067 default: llvm_unreachable("Unexpected opcode");
51068 case ISD::ANY_EXTEND:
51069 return Op;
51070 case ISD::ZERO_EXTEND:
51071 return DAG.getZeroExtendInReg(Op, DL, NarrowVT);
51072 case ISD::SIGN_EXTEND:
51073 return DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, VT,
51074 Op, DAG.getValueType(NarrowVT));
51075 }
51076}
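// Editor's illustrative sketch (not part of this file): bitwise logic commutes
// with truncation, which is what makes it safe to perform the logic in the
// wide type and narrow afterwards. <cstdint> assumed; values are arbitrary.
static_assert(uint8_t(0x1234u & 0xFF0Fu) ==
                  (uint8_t(0x1234u) & uint8_t(0xFF0Fu)),
              "trunc(x & y) == trunc(x) & trunc(y)");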
51077
51078static unsigned convertIntLogicToFPLogicOpcode(unsigned Opcode) {
51079 unsigned FPOpcode;
51080 switch (Opcode) {
51081 // clang-format off
51082 default: llvm_unreachable("Unexpected input node for FP logic conversion");
51083 case ISD::AND: FPOpcode = X86ISD::FAND; break;
51084 case ISD::OR: FPOpcode = X86ISD::FOR; break;
51085 case ISD::XOR: FPOpcode = X86ISD::FXOR; break;
51086 // clang-format on
51087 }
51088 return FPOpcode;
51089}
51090
51091/// If both input operands of a logic op are being cast from floating-point
51092/// types or FP compares, try to convert this into a floating-point logic node
51093/// to avoid unnecessary moves from SSE to integer registers.
51094static SDValue convertIntLogicToFPLogic(unsigned Opc, const SDLoc &DL, EVT VT,
51095 SDValue N0, SDValue N1,
51096 SelectionDAG &DAG,
51097 TargetLowering::DAGCombinerInfo &DCI,
51098 const X86Subtarget &Subtarget) {
51099 assert((Opc == ISD::OR || Opc == ISD::AND || Opc == ISD::XOR) &&
51100 "Unexpected bit opcode");
51101
51102 if (!((N0.getOpcode() == ISD::BITCAST && N1.getOpcode() == ISD::BITCAST) ||
51103 (N0.getOpcode() == ISD::SETCC && N1.getOpcode() == ISD::SETCC)))
51104 return SDValue();
51105
51106 SDValue N00 = N0.getOperand(0);
51107 SDValue N10 = N1.getOperand(0);
51108 EVT N00Type = N00.getValueType();
51109 EVT N10Type = N10.getValueType();
51110
51111 // Ensure that both types are the same and are legal scalar fp types.
51112 if (N00Type != N10Type || !((Subtarget.hasSSE1() && N00Type == MVT::f32) ||
51113 (Subtarget.hasSSE2() && N00Type == MVT::f64) ||
51114 (Subtarget.hasFP16() && N00Type == MVT::f16)))
51115 return SDValue();
51116
51117 if (N0.getOpcode() == ISD::BITCAST && !DCI.isBeforeLegalizeOps()) {
51118 unsigned FPOpcode = convertIntLogicToFPLogicOpcode(Opc);
51119 SDValue FPLogic = DAG.getNode(FPOpcode, DL, N00Type, N00, N10);
51120 return DAG.getBitcast(VT, FPLogic);
51121 }
51122
51123 if (VT != MVT::i1 || N0.getOpcode() != ISD::SETCC || !N0.hasOneUse() ||
51124 !N1.hasOneUse())
51125 return SDValue();
51126
51127 ISD::CondCode CC0 = cast<CondCodeSDNode>(N0.getOperand(2))->get();
51128 ISD::CondCode CC1 = cast<CondCodeSDNode>(N1.getOperand(2))->get();
51129
51130 // The vector ISA for FP predicates is incomplete before AVX, so converting
51131 // COMIS* to CMPS* may not be a win before AVX.
51132 if (!Subtarget.hasAVX() &&
51133 !(cheapX86FSETCC_SSE(CC0) && cheapX86FSETCC_SSE(CC1)))
51134 return SDValue();
51135
51136 // Convert scalar FP compares and logic to vector compares (COMIS* to CMPS*)
51137 // and vector logic:
51138 // logic (setcc N00, N01), (setcc N10, N11) -->
51139 // extelt (logic (setcc (s2v N00), (s2v N01)), setcc (s2v N10), (s2v N11))), 0
51140 unsigned NumElts = 128 / N00Type.getSizeInBits();
51141 EVT VecVT = EVT::getVectorVT(*DAG.getContext(), N00Type, NumElts);
51142 EVT BoolVecVT = EVT::getVectorVT(*DAG.getContext(), MVT::i1, NumElts);
51143 SDValue ZeroIndex = DAG.getVectorIdxConstant(0, DL);
51144 SDValue N01 = N0.getOperand(1);
51145 SDValue N11 = N1.getOperand(1);
51146 SDValue Vec00 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecVT, N00);
51147 SDValue Vec01 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecVT, N01);
51148 SDValue Vec10 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecVT, N10);
51149 SDValue Vec11 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecVT, N11);
51150 SDValue Setcc0 = DAG.getSetCC(DL, BoolVecVT, Vec00, Vec01, CC0);
51151 SDValue Setcc1 = DAG.getSetCC(DL, BoolVecVT, Vec10, Vec11, CC1);
51152 SDValue Logic = DAG.getNode(Opc, DL, BoolVecVT, Setcc0, Setcc1);
51153 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, Logic, ZeroIndex);
51154}
51155
51156// Attempt to fold BITOP(MOVMSK(X),MOVMSK(Y)) -> MOVMSK(BITOP(X,Y))
51157// to reduce XMM->GPR traffic.
51158static SDValue combineBitOpWithMOVMSK(unsigned Opc, const SDLoc &DL, SDValue N0,
51159 SDValue N1, SelectionDAG &DAG) {
51160 assert((Opc == ISD::OR || Opc == ISD::AND || Opc == ISD::XOR) &&
51161 "Unexpected bit opcode");
51162
51163 // Both operands must be single use MOVMSK.
51164 if (N0.getOpcode() != X86ISD::MOVMSK || !N0.hasOneUse() ||
51165 N1.getOpcode() != X86ISD::MOVMSK || !N1.hasOneUse())
51166 return SDValue();
51167
51168 SDValue Vec0 = N0.getOperand(0);
51169 SDValue Vec1 = N1.getOperand(0);
51170 EVT VecVT0 = Vec0.getValueType();
51171 EVT VecVT1 = Vec1.getValueType();
51172
51173 // Both MOVMSK operands must be from vectors of the same size and same element
51174 // size, but it's OK for an fp/int difference.
51175 if (VecVT0.getSizeInBits() != VecVT1.getSizeInBits() ||
51176 VecVT0.getScalarSizeInBits() != VecVT1.getScalarSizeInBits())
51177 return SDValue();
51178
51179 unsigned VecOpc =
51180 VecVT0.isFloatingPoint() ? convertIntLogicToFPLogicOpcode(Opc) : Opc;
51181 SDValue Result =
51182 DAG.getNode(VecOpc, DL, VecVT0, Vec0, DAG.getBitcast(VecVT0, Vec1));
51183 return DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, Result);
51184}
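// Editor's illustrative sketch (not part of this file): MOVMSK gathers the
// per-lane sign bits, and gathering sign bits commutes with bitwise ops, so
// BITOP(MOVMSK(X),MOVMSK(Y)) == MOVMSK(BITOP(X,Y)). Two-lane scalar model;
// names hypothetical, <cstdint> assumed.
static constexpr unsigned modelMovmsk2(uint32_t L0, uint32_t L1) {
  return (L0 >> 31) | ((L1 >> 31) << 1);
}
static_assert((modelMovmsk2(0x80000000u, 0x00000001u) &
               modelMovmsk2(0xFFFFFFFFu, 0x80000000u)) ==
                  modelMovmsk2(0x80000000u & 0xFFFFFFFFu,
                               0x00000001u & 0x80000000u),
              "and(movmsk(x), movmsk(y)) == movmsk(and(x, y))");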
51185
51186// Attempt to fold BITOP(SHIFT(X,Z),SHIFT(Y,Z)) -> SHIFT(BITOP(X,Y),Z).
51187// NOTE: This is a very limited case of what SimplifyUsingDistributiveLaws
51188// handles in InstCombine.
51189static SDValue combineBitOpWithShift(unsigned Opc, const SDLoc &DL, EVT VT,
51190 SDValue N0, SDValue N1,
51191 SelectionDAG &DAG) {
51192 assert((Opc == ISD::OR || Opc == ISD::AND || Opc == ISD::XOR) &&
51193 "Unexpected bit opcode");
51194
51195 // Both operands must be single use.
51196 if (!N0.hasOneUse() || !N1.hasOneUse())
51197 return SDValue();
51198
51199 // Search for matching shifts.
51200 SDValue BC0 = peekThroughOneUseBitcasts(N0);
51201 SDValue BC1 = peekThroughOneUseBitcasts(N1);
51202
51203 unsigned BCOpc = BC0.getOpcode();
51204 EVT BCVT = BC0.getValueType();
51205 if (BCOpc != BC1->getOpcode() || BCVT != BC1.getValueType())
51206 return SDValue();
51207
51208 switch (BCOpc) {
51209 case X86ISD::VSHLI:
51210 case X86ISD::VSRLI:
51211 case X86ISD::VSRAI: {
51212 if (BC0.getOperand(1) != BC1.getOperand(1))
51213 return SDValue();
51214 SDValue BitOp =
51215 DAG.getNode(Opc, DL, BCVT, BC0.getOperand(0), BC1.getOperand(0));
51216 SDValue Shift = DAG.getNode(BCOpc, DL, BCVT, BitOp, BC0.getOperand(1));
51217 return DAG.getBitcast(VT, Shift);
51218 }
51219 }
51220
51221 return SDValue();
51222}
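// Editor's illustrative sketch (not part of this file): for matching shift
// kinds and amounts, bitwise ops distribute over the shift, i.e.
// (X >> C) & (Y >> C) == (X & Y) >> C, which is the rewrite above in scalar
// form. <cstdint> assumed.
static_assert(((0xF0u >> 4) & (0xCCu >> 4)) == ((0xF0u & 0xCCu) >> 4),
              "BITOP(SHIFT(X,C),SHIFT(Y,C)) == SHIFT(BITOP(X,Y),C)");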
51223
51224// Attempt to fold:
51225// BITOP(PACKSS(X,Z),PACKSS(Y,W)) --> PACKSS(BITOP(X,Y),BITOP(Z,W)).
51226 // TODO: Add PACKUS handling.
51227static SDValue combineBitOpWithPACK(unsigned Opc, const SDLoc &DL, EVT VT,
51228 SDValue N0, SDValue N1, SelectionDAG &DAG) {
51229 assert((Opc == ISD::OR || Opc == ISD::AND || Opc == ISD::XOR) &&
51230 "Unexpected bit opcode");
51231
51232 // Both operands must be single use.
51233 if (!N0.hasOneUse() || !N1.hasOneUse())
51234 return SDValue();
51235
51236 // Search for matching packs.
51237 N0 = peekThroughOneUseBitcasts(N0);
51238 N1 = peekThroughOneUseBitcasts(N1);
51239
51240 if (N0.getOpcode() != X86ISD::PACKSS || N1.getOpcode() != X86ISD::PACKSS)
51241 return SDValue();
51242
51243 MVT DstVT = N0.getSimpleValueType();
51244 if (DstVT != N1.getSimpleValueType())
51245 return SDValue();
51246
51247 MVT SrcVT = N0.getOperand(0).getSimpleValueType();
51248 unsigned NumSrcBits = SrcVT.getScalarSizeInBits();
51249
51250 // Limit to allsignbits packing.
51251 if (DAG.ComputeNumSignBits(N0.getOperand(0)) != NumSrcBits ||
51252 DAG.ComputeNumSignBits(N0.getOperand(1)) != NumSrcBits ||
51253 DAG.ComputeNumSignBits(N1.getOperand(0)) != NumSrcBits ||
51254 DAG.ComputeNumSignBits(N1.getOperand(1)) != NumSrcBits)
51255 return SDValue();
51256
51257 SDValue LHS = DAG.getNode(Opc, DL, SrcVT, N0.getOperand(0), N1.getOperand(0));
51258 SDValue RHS = DAG.getNode(Opc, DL, SrcVT, N0.getOperand(1), N1.getOperand(1));
51259 return DAG.getBitcast(VT, DAG.getNode(X86ISD::PACKSS, DL, DstVT, LHS, RHS));
51260}
51261
51262/// If this is a zero/all-bits result that is bitwise-anded with a low bits
51263/// mask. (Mask == 1 for the x86 lowering of a SETCC + ZEXT), replace the 'and'
51264/// with a shift-right to eliminate loading the vector constant mask value.
51265 static SDValue combineAndMaskToShift(SDNode *N, const SDLoc &DL,
51266 SelectionDAG &DAG,
51267 const X86Subtarget &Subtarget) {
51268 SDValue Op0 = peekThroughBitcasts(N->getOperand(0));
51269 SDValue Op1 = peekThroughBitcasts(N->getOperand(1));
51270 EVT VT = Op0.getValueType();
51271 if (VT != Op1.getValueType() || !VT.isSimple() || !VT.isInteger())
51272 return SDValue();
51273
51274 // Try to convert an "is positive" signbit masking operation into arithmetic
51275 // shift and "andn". This saves a materialization of a -1 vector constant.
51276 // The "is negative" variant should be handled more generally because it only
51277 // requires "and" rather than "andn":
51278 // and (pcmpgt X, -1), Y --> pandn (vsrai X, BitWidth - 1), Y
51279 //
51280 // This is limited to the original type to avoid producing even more bitcasts.
51281 // If the bitcasts can't be eliminated, then it is unlikely that this fold
51282 // will be profitable.
51283 if (N->getValueType(0) == VT &&
51284 supportedVectorShiftWithImm(VT, Subtarget, ISD::SRA)) {
51285 SDValue X, Y;
51286 if (Op1.getOpcode() == X86ISD::PCMPGT &&
51287 isAllOnesOrAllOnesSplat(Op1.getOperand(1)) && Op1.hasOneUse()) {
51288 X = Op1.getOperand(0);
51289 Y = Op0;
51290 } else if (Op0.getOpcode() == X86ISD::PCMPGT &&
51291 isAllOnesOrAllOnesSplat(Op0.getOperand(1)) && Op0.hasOneUse()) {
51292 X = Op0.getOperand(0);
51293 Y = Op1;
51294 }
51295 if (X && Y) {
51296 SDValue Sra =
51297 getTargetVShiftByConstNode(X86ISD::VSRAI, DL, VT.getSimpleVT(), X,
51298 VT.getScalarSizeInBits() - 1, DAG);
51299 return DAG.getNode(X86ISD::ANDNP, DL, VT, Sra, Y);
51300 }
51301 }
51302
51303 APInt SplatVal;
51304 if (!X86::isConstantSplat(Op1, SplatVal, false) || !SplatVal.isMask())
51305 return SDValue();
51306
51307 // Don't prevent creation of ANDN.
51308 if (isBitwiseNot(Op0))
51309 return SDValue();
51310
51311 if (!supportedVectorShiftWithImm(VT, Subtarget, ISD::SRL))
51312 return SDValue();
51313
51314 unsigned EltBitWidth = VT.getScalarSizeInBits();
51315 if (EltBitWidth != DAG.ComputeNumSignBits(Op0))
51316 return SDValue();
51317
51318 unsigned ShiftVal = SplatVal.countr_one();
51319 SDValue ShAmt = DAG.getTargetConstant(EltBitWidth - ShiftVal, DL, MVT::i8);
51320 SDValue Shift = DAG.getNode(X86ISD::VSRLI, DL, VT, Op0, ShAmt);
51321 return DAG.getBitcast(N->getValueType(0), Shift);
51322}
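// Editor's illustrative sketch (not part of this file): when X is known to be
// all-zeros or all-ones per element (ComputeNumSignBits == element width),
// masking with a low-bits mask of K ones equals a logical shift right by
// (width - K), which avoids materializing the mask constant. <cstdint> assumed.
static_assert((0xFFFFFFFFu & 0x1Fu) == (0xFFFFFFFFu >> (32 - 5)) &&
                  (0x00000000u & 0x1Fu) == (0x00000000u >> (32 - 5)),
              "all-ones/all-zeros & low-mask(K) == lshr by (width - K)");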
51323
51324// Get the index node from the lowered DAG of a GEP IR instruction with one
51325// indexing dimension.
51326 static SDValue getIndexFromUnindexedLoad(LoadSDNode *Ld) {
51327 if (Ld->isIndexed())
51328 return SDValue();
51329
51330 SDValue Base = Ld->getBasePtr();
51331 if (Base.getOpcode() != ISD::ADD)
51332 return SDValue();
51333
51334 SDValue ShiftedIndex = Base.getOperand(0);
51335 if (ShiftedIndex.getOpcode() != ISD::SHL)
51336 return SDValue();
51337
51338 return ShiftedIndex.getOperand(0);
51339}
51340
51341static bool hasBZHI(const X86Subtarget &Subtarget, MVT VT) {
51342 return Subtarget.hasBMI2() &&
51343 (VT == MVT::i32 || (VT == MVT::i64 && Subtarget.is64Bit()));
51344}
51345
51346/// Folds (and X, (or Y, ~Z)) --> (and X, ~(and ~Y, Z))
51347/// This undoes the inverse fold performed in InstCombine
51348 static SDValue combineAndNotOrIntoAndNotAnd(SDNode *N, const SDLoc &DL,
51349 SelectionDAG &DAG) {
51350 using namespace llvm::SDPatternMatch;
51351 MVT VT = N->getSimpleValueType(0);
51352 if (!DAG.getTargetLoweringInfo().hasAndNot(SDValue(N, 0)))
51353 return SDValue();
51354
51355 SDValue X, Y, Z;
51356 if (sd_match(N, m_And(m_Value(X),
51357 m_OneUse(m_Or(m_Value(Y), m_Not(m_Value(Z))))))) {
51358 // Don't fold if Y or Z are constants to prevent infinite loops.
51359 if (!DAG.isConstantIntBuildVectorOrConstantInt(Y) &&
51360 !DAG.isConstantIntBuildVectorOrConstantInt(Z))
51361 return DAG.getNode(
51362 ISD::AND, DL, VT, X,
51363 DAG.getNOT(
51364 DL, DAG.getNode(ISD::AND, DL, VT, DAG.getNOT(DL, Y, VT), Z), VT));
51365 }
51366
51367 return SDValue();
51368}
51369
51370 // This function recognizes cases where the X86 bzhi instruction can replace an
51371 // 'and-load' sequence.
51372// In case of loading integer value from an array of constants which is defined
51373// as follows:
51374//
51375// int array[SIZE] = {0x0, 0x1, 0x3, 0x7, 0xF ..., 2^(SIZE-1) - 1}
51376//
51377// then applying a bitwise and on the result with another input.
51378// It's equivalent to performing bzhi (zero high bits) on the input, with the
51379// same index of the load.
51380 static SDValue combineAndLoadToBZHI(SDNode *Node, SelectionDAG &DAG,
51381 const X86Subtarget &Subtarget) {
51382 MVT VT = Node->getSimpleValueType(0);
51383 SDLoc dl(Node);
51384
51385 // Check if subtarget has BZHI instruction for the node's type
51386 if (!hasBZHI(Subtarget, VT))
51387 return SDValue();
51388
51389 // Try matching the pattern for both operands.
51390 for (unsigned i = 0; i < 2; i++) {
51391 // continue if the operand is not a load instruction
51392 auto *Ld = dyn_cast<LoadSDNode>(Node->getOperand(i));
51393 if (!Ld)
51394 continue;
51395 const Value *MemOp = Ld->getMemOperand()->getValue();
51396 if (!MemOp)
51397 continue;
51398 // Get the Node which indexes into the array.
51399 SDValue Index = getIndexFromUnindexedLoad(Ld);
51400 if (!Index)
51401 continue;
51402
51403 if (auto *GEP = dyn_cast<GetElementPtrInst>(MemOp)) {
51404 if (auto *GV = dyn_cast<GlobalVariable>(GEP->getOperand(0))) {
51405 if (GV->isConstant() && GV->hasDefinitiveInitializer()) {
51406 Constant *Init = GV->getInitializer();
51407 Type *Ty = Init->getType();
51408 if (!Ty->isArrayTy() ||
51409 !Ty->getArrayElementType()->isIntegerTy() ||
51410 Ty->getArrayElementType()->getScalarSizeInBits() !=
51411 VT.getSizeInBits() ||
51412 Ty->getArrayNumElements() >
51413 Ty->getArrayElementType()->getScalarSizeInBits())
51414 continue;
51415
51416 // Check if the array's constant elements are suitable to our case.
51417 uint64_t ArrayElementCount = Init->getType()->getArrayNumElements();
51418 bool ConstantsMatch = true;
51419 for (uint64_t j = 0; j < ArrayElementCount; j++) {
51420 auto *Elem = cast<ConstantInt>(Init->getAggregateElement(j));
51421 if (Elem->getZExtValue() != (((uint64_t)1 << j) - 1)) {
51422 ConstantsMatch = false;
51423 break;
51424 }
51425 }
51426 if (!ConstantsMatch)
51427 continue;
51428
51429 // Do the transformation (For 32-bit type):
51430 // -> (and (load arr[idx]), inp)
51431 // <- (and (srl 0xFFFFFFFF, (sub 32, idx)))
51432 // that will be replaced with one bzhi instruction.
51433 SDValue Inp = Node->getOperand(i == 0 ? 1 : 0);
51434 SDValue SizeC = DAG.getConstant(VT.getSizeInBits(), dl, MVT::i32);
51435
51436 Index = DAG.getZExtOrTrunc(Index, dl, MVT::i32);
51437 SDValue Sub = DAG.getNode(ISD::SUB, dl, MVT::i32, SizeC, Index);
51438 Sub = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, Sub);
51439
51440 SDValue AllOnes = DAG.getAllOnesConstant(dl, VT);
51441 SDValue LShr = DAG.getNode(ISD::SRL, dl, VT, AllOnes, Sub);
51442 return DAG.getNode(ISD::AND, dl, VT, Inp, LShr);
51443 }
51444 }
51445 }
51446 }
51447 return SDValue();
51448}
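// Editor's illustrative sketch (not part of this file): the table the combine
// looks for stores ((1 << i) - 1), so "load mask, then AND" is the same as
// zeroing the input's bits from position i upwards; the replacement computes
// that mask as all-ones >> (width - i), which later selects as BZHI. <cstdint>
// assumed; the index stays in 1..31 so the shifts are in range in this model.
static constexpr uint32_t modelMaskTableEntry(unsigned I) {
  return (uint32_t(1) << I) - 1u; // array[i] == 2^i - 1
}
static_assert(modelMaskTableEntry(5) == (0xFFFFFFFFu >> (32 - 5)),
              "table entry equals the shifted all-ones mask");
static_assert((0xDEADBEEFu & modelMaskTableEntry(8)) ==
                  (0xDEADBEEFu & (0xFFFFFFFFu >> (32 - 8))),
              "and-with-table-load == and-with-shifted-all-ones (bzhi)");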
51449
51450// Look for (and (bitcast (vXi1 (concat_vectors (vYi1 setcc), undef,))), C)
51451// Where C is a mask containing the same number of bits as the setcc and
51452// where the setcc will freely 0 upper bits of k-register. We can replace the
51453// undef in the concat with 0s and remove the AND. This mainly helps with
51454// v2i1/v4i1 setcc being casted to scalar.
51455 static SDValue combineScalarAndWithMaskSetcc(SDNode *N, SelectionDAG &DAG,
51456 const X86Subtarget &Subtarget) {
51457 assert(N->getOpcode() == ISD::AND && "Unexpected opcode!");
51458
51459 EVT VT = N->getValueType(0);
51460
51461 // Make sure this is an AND with constant. We will check the value of the
51462 // constant later.
51463 auto *C1 = dyn_cast<ConstantSDNode>(N->getOperand(1));
51464 if (!C1)
51465 return SDValue();
51466
51467 // This is implied by the ConstantSDNode.
51468 assert(!VT.isVector() && "Expected scalar VT!");
51469
51470 SDValue Src = N->getOperand(0);
51471 if (!Src.hasOneUse())
51472 return SDValue();
51473
51474 // (Optionally) peek through any_extend().
51475 if (Src.getOpcode() == ISD::ANY_EXTEND) {
51476 if (!Src.getOperand(0).hasOneUse())
51477 return SDValue();
51478 Src = Src.getOperand(0);
51479 }
51480
51481 if (Src.getOpcode() != ISD::BITCAST || !Src.getOperand(0).hasOneUse())
51482 return SDValue();
51483
51484 Src = Src.getOperand(0);
51485 EVT SrcVT = Src.getValueType();
51486
51487 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
51488 if (!SrcVT.isVector() || SrcVT.getVectorElementType() != MVT::i1 ||
51489 !TLI.isTypeLegal(SrcVT))
51490 return SDValue();
51491
51492 if (Src.getOpcode() != ISD::CONCAT_VECTORS)
51493 return SDValue();
51494
51495 // We only care about the first subvector of the concat, we expect the
51496 // other subvectors to be ignored due to the AND if we make the change.
51497 SDValue SubVec = Src.getOperand(0);
51498 EVT SubVecVT = SubVec.getValueType();
51499
51500 // The RHS of the AND should be a mask with as many bits as SubVec.
51501 if (!TLI.isTypeLegal(SubVecVT) ||
51502 !C1->getAPIntValue().isMask(SubVecVT.getVectorNumElements()))
51503 return SDValue();
51504
51505 // First subvector should be a setcc with a legal result type or a
51506 // AND containing at least one setcc with a legal result type.
51507 auto IsLegalSetCC = [&](SDValue V) {
51508 if (V.getOpcode() != ISD::SETCC)
51509 return false;
51510 EVT SetccVT = V.getOperand(0).getValueType();
51511 if (!TLI.isTypeLegal(SetccVT) ||
51512 !(Subtarget.hasVLX() || SetccVT.is512BitVector()))
51513 return false;
51514 if (!(Subtarget.hasBWI() || SetccVT.getScalarSizeInBits() >= 32))
51515 return false;
51516 return true;
51517 };
51518 if (!(IsLegalSetCC(SubVec) || (SubVec.getOpcode() == ISD::AND &&
51519 (IsLegalSetCC(SubVec.getOperand(0)) ||
51520 IsLegalSetCC(SubVec.getOperand(1))))))
51521 return SDValue();
51522
51523 // We passed all the checks. Rebuild the concat_vectors with zeroes
51524 // and cast it back to VT.
51525 SDLoc dl(N);
51526 SmallVector<SDValue, 4> Ops(Src.getNumOperands(),
51527 DAG.getConstant(0, dl, SubVecVT));
51528 Ops[0] = SubVec;
51529 SDValue Concat = DAG.getNode(ISD::CONCAT_VECTORS, dl, SrcVT,
51530 Ops);
51531 EVT IntVT = EVT::getIntegerVT(*DAG.getContext(), SrcVT.getSizeInBits());
51532 return DAG.getZExtOrTrunc(DAG.getBitcast(IntVT, Concat), dl, VT);
51533}
51534
51535 static SDValue getBMIMatchingOp(unsigned Opc, SelectionDAG &DAG,
51536 SDValue OpMustEq, SDValue Op, unsigned Depth) {
51537 // We don't want to go crazy with the recursion here. This isn't a super
51538 // important optimization.
51539 static constexpr unsigned kMaxDepth = 2;
51540
51541 // Only do this re-ordering if op has one use.
51542 if (!Op.hasOneUse())
51543 return SDValue();
51544
51545 SDLoc DL(Op);
51546 // If we hit another associative op, recurse further.
51547 if (Op.getOpcode() == Opc) {
51548 // Done recursing.
51549 if (Depth++ >= kMaxDepth)
51550 return SDValue();
51551
51552 for (unsigned OpIdx = 0; OpIdx < 2; ++OpIdx)
51553 if (SDValue R =
51554 getBMIMatchingOp(Opc, DAG, OpMustEq, Op.getOperand(OpIdx), Depth))
51555 return DAG.getNode(Op.getOpcode(), DL, Op.getValueType(), R,
51556 Op.getOperand(1 - OpIdx));
51557
51558 } else if (Op.getOpcode() == ISD::SUB) {
51559 if (Opc == ISD::AND) {
51560 // BLSI: (and x, (sub 0, x))
51561 if (isNullConstant(Op.getOperand(0)) && Op.getOperand(1) == OpMustEq)
51562 return DAG.getNode(Opc, DL, Op.getValueType(), OpMustEq, Op);
51563 }
51564 // Opc must be ISD::AND or ISD::XOR
51565 // BLSR: (and x, (sub x, 1))
51566 // BLSMSK: (xor x, (sub x, 1))
51567 if (isOneConstant(Op.getOperand(1)) && Op.getOperand(0) == OpMustEq)
51568 return DAG.getNode(Opc, DL, Op.getValueType(), OpMustEq, Op);
51569
51570 } else if (Op.getOpcode() == ISD::ADD) {
51571 // Opc must be ISD::AND or ISD::XOR
51572 // BLSR: (and x, (add x, -1))
51573 // BLSMSK: (xor x, (add x, -1))
51574 if (isAllOnesConstant(Op.getOperand(1)) && Op.getOperand(0) == OpMustEq)
51575 return DAG.getNode(Opc, DL, Op.getValueType(), OpMustEq, Op);
51576 }
51577 return SDValue();
51578}
51579
51580 static SDValue combineBMILogicOp(SDNode *N, SelectionDAG &DAG,
51581 const X86Subtarget &Subtarget) {
51582 EVT VT = N->getValueType(0);
51583 // Make sure this node is a candidate for BMI instructions.
51584 if (!Subtarget.hasBMI() || !VT.isScalarInteger() ||
51585 (VT != MVT::i32 && VT != MVT::i64))
51586 return SDValue();
51587
51588 assert(N->getOpcode() == ISD::AND || N->getOpcode() == ISD::XOR);
51589
51590 // Try and match LHS and RHS.
51591 for (unsigned OpIdx = 0; OpIdx < 2; ++OpIdx)
51592 if (SDValue OpMatch =
51593 getBMIMatchingOp(N->getOpcode(), DAG, N->getOperand(OpIdx),
51594 N->getOperand(1 - OpIdx), 0))
51595 return OpMatch;
51596 return SDValue();
51597}
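// Editor's illustrative sketch (not part of this file): the BMI patterns the
// matcher above recognizes, in scalar form. <cstdint> assumed.
static_assert((0xB0u & (0u - 0xB0u)) == 0x10u,
              "BLSI: x & -x isolates the lowest set bit");
static_assert((0xB0u & (0xB0u - 1u)) == 0xA0u,
              "BLSR: x & (x - 1) clears the lowest set bit");
static_assert((0xB0u ^ (0xB0u - 1u)) == 0x1Fu,
              "BLSMSK: x ^ (x - 1) masks up to the lowest set bit");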
51598
51599/// Fold AND(Y, XOR(X, NEG(X))) -> ANDN(Y, BLSMSK(X)) if BMI is available.
51600 static SDValue combineAndXorSubWithBMI(SDNode *And, const SDLoc &DL,
51601 SelectionDAG &DAG,
51602 const X86Subtarget &Subtarget) {
51603 using namespace llvm::SDPatternMatch;
51604
51605 EVT VT = And->getValueType(0);
51606 // Make sure this node is a candidate for BMI instructions.
51607 if (!Subtarget.hasBMI() || (VT != MVT::i32 && VT != MVT::i64))
51608 return SDValue();
51609
51610 SDValue X;
51611 SDValue Y;
51612 if (!sd_match(And,
51613 m_And(m_OneUse(m_Xor(m_Value(X), m_Neg(m_Deferred(X)))),
51614 m_Value(Y))))
51615 return SDValue();
51616
51617 SDValue BLSMSK =
51618 DAG.getNode(ISD::XOR, DL, VT, X,
51619 DAG.getNode(ISD::SUB, DL, VT, X, DAG.getConstant(1, DL, VT)));
51620 SDValue AndN = DAG.getNode(ISD::AND, DL, VT, Y, DAG.getNOT(DL, BLSMSK, VT));
51621 return AndN;
51622}
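// Editor's illustrative sketch (not part of this file): XOR(X, NEG(X)) is the
// complement of BLSMSK(X), so AND(Y, XOR(X, NEG(X))) == ANDN(BLSMSK(X), Y),
// which is what the fold above emits. <cstdint> assumed.
static_assert((0xB0u ^ (0u - 0xB0u)) == ~(0xB0u ^ (0xB0u - 1u)),
              "x ^ -x == ~blsmsk(x)");
static_assert((0x12345678u & (0xB0u ^ (0u - 0xB0u))) ==
                  (~(0xB0u ^ (0xB0u - 1u)) & 0x12345678u),
              "and(Y, xor(X, neg X)) == andn(blsmsk(X), Y)");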
51623
51624 static SDValue combineX86SubCmpForFlags(SDNode *N, SDValue Flag,
51625 SelectionDAG &DAG,
51626 TargetLowering::DAGCombinerInfo &DCI,
51627 const X86Subtarget &ST) {
51628 // cmp(setcc(cc, X), 0)
51629 // brcond ne
51630 // ->
51631 // X
51632 // brcond cc
51633
51634 // sub(setcc(cc, X), 1)
51635 // brcond ne
51636 // ->
51637 // X
51638 // brcond ~cc
51639 //
51640 // if only flag has users
51641
51642 SDValue SetCC = N->getOperand(0);
51643
51644 if (SetCC.getOpcode() != X86ISD::SETCC || !Flag.hasOneUse())
51645 return SDValue();
51646
51647 // Check the only user of flag is `brcond ne`.
51648 SDNode *BrCond = *Flag->user_begin();
51649 if (BrCond->getOpcode() != X86ISD::BRCOND)
51650 return SDValue();
51651 unsigned CondNo = 2;
51652 if (static_cast<X86::CondCode>(BrCond->getConstantOperandVal(CondNo)) !=
51653 X86::COND_NE)
51654 return SDValue();
51655
51656 SDValue X = SetCC.getOperand(1);
51657 // sub has two results while X only has one. DAG combine assumes the value
51658 // type matches.
51659 if (N->getOpcode() == X86ISD::SUB)
51660 X = DAG.getMergeValues({N->getOperand(0), X}, SDLoc(N));
51661
51662 SDValue CCN = SetCC.getOperand(0);
51663 X86::CondCode CC =
51664 static_cast<X86::CondCode>(CCN->getAsAPIntVal().getSExtValue());
51665 X86::CondCode OppositeCC = X86::GetOppositeBranchCondition(CC);
51666 // Update CC for the consumer of the flag.
51667 // The old CC is `ne`. Hence, when comparing the result with 0, we are
51668 // checking if the second condition evaluates to true. When comparing the
51669 // result with 1, we are checking if the second condition evaluates to false.
51670 SmallVector<SDValue> Ops(BrCond->op_values());
51671 if (isNullConstant(N->getOperand(1)))
51672 Ops[CondNo] = CCN;
51673 else if (isOneConstant(N->getOperand(1)))
51674 Ops[CondNo] = DAG.getTargetConstant(OppositeCC, SDLoc(BrCond), MVT::i8);
51675 else
51676 llvm_unreachable("expect constant 0 or 1");
51677
51678 SDValue NewBrCond =
51679 DAG.getNode(X86ISD::BRCOND, SDLoc(BrCond), BrCond->getValueType(0), Ops);
51680 // Avoid self-assign error b/c CC1 can be `e/ne`.
51681 if (BrCond != NewBrCond.getNode())
51682 DCI.CombineTo(BrCond, NewBrCond);
51683 return X;
51684}
51685
51686 static SDValue combineAndOrForCcmpCtest(SDNode *N, SelectionDAG &DAG,
51687 TargetLowering::DAGCombinerInfo &DCI,
51688 const X86Subtarget &ST) {
51689 // and/or(setcc(cc0, flag0), setcc(cc1, sub (X, Y)))
51690 // ->
51691 // setcc(cc1, ccmp(X, Y, ~cflags/cflags, cc0/~cc0, flag0))
51692
51693 // and/or(setcc(cc0, flag0), setcc(cc1, cmp (X, 0)))
51694 // ->
51695 // setcc(cc1, ctest(X, X, ~cflags/cflags, cc0/~cc0, flag0))
51696 //
51697 // where cflags is determined by cc1.
51698
51699 if (!ST.hasCCMP())
51700 return SDValue();
51701
51702 SDValue SetCC0 = N->getOperand(0);
51703 SDValue SetCC1 = N->getOperand(1);
51704 if (SetCC0.getOpcode() != X86ISD::SETCC ||
51705 SetCC1.getOpcode() != X86ISD::SETCC)
51706 return SDValue();
51707
51708 auto GetCombineToOpc = [&](SDValue V) -> unsigned {
51709 SDValue Op = V.getOperand(1);
51710 unsigned Opc = Op.getOpcode();
51711 if (Opc == X86ISD::SUB)
51712 return X86ISD::CCMP;
51713 if (Opc == X86ISD::CMP && isNullConstant(Op.getOperand(1)))
51714 return X86ISD::CTEST;
51715 return 0U;
51716 };
51717
51718 unsigned NewOpc = 0;
51719
51720 // AND/OR is commutable. Canonicalize the operands to make SETCC with SUB/CMP
51721 // appear on the right.
51722 if (!(NewOpc = GetCombineToOpc(SetCC1))) {
51723 std::swap(SetCC0, SetCC1);
51724 if (!(NewOpc = GetCombineToOpc(SetCC1)))
51725 return SDValue();
51726 }
51727
51728 X86::CondCode CC0 =
51729 static_cast<X86::CondCode>(SetCC0.getConstantOperandVal(0));
51730 // CCMP/CTEST is not conditional when the source condition is COND_P/COND_NP.
51731 if (CC0 == X86::COND_P || CC0 == X86::COND_NP)
51732 return SDValue();
51733
51734 bool IsOR = N->getOpcode() == ISD::OR;
51735
51736 // CMP/TEST is executed and updates the EFLAGS normally only when SrcCC
51737 // evaluates to true. So we need to inverse CC0 as SrcCC when the logic
51738 // operator is OR. Similar for CC1.
51739 SDValue SrcCC =
51740 IsOR ? DAG.getTargetConstant(X86::GetOppositeBranchCondition(CC0),
51741 SDLoc(SetCC0.getOperand(0)), MVT::i8)
51742 : SetCC0.getOperand(0);
51743 SDValue CC1N = SetCC1.getOperand(0);
51744 X86::CondCode CC1 =
51745 static_cast<X86::CondCode>(CC1N->getAsAPIntVal().getSExtValue());
51746 X86::CondCode OppositeCC1 = X86::GetOppositeBranchCondition(CC1);
51747 X86::CondCode CFlagsCC = IsOR ? CC1 : OppositeCC1;
51748 SDLoc DL(N);
51749 SDValue CFlags = DAG.getTargetConstant(
51750 X86::getCCMPCondFlagsFromCondCode(CFlagsCC), DL, MVT::i8);
51751 SDValue Sub = SetCC1.getOperand(1);
51752
51753 // Replace any uses of the old flag produced by SUB/CMP with the new one
51754 // produced by CCMP/CTEST.
51755 SDValue CCMP = (NewOpc == X86ISD::CCMP)
51756 ? DAG.getNode(X86ISD::CCMP, DL, MVT::i32,
51757 {Sub.getOperand(0), Sub.getOperand(1),
51758 CFlags, SrcCC, SetCC0.getOperand(1)})
51759 : DAG.getNode(X86ISD::CTEST, DL, MVT::i32,
51760 {Sub.getOperand(0), Sub.getOperand(0),
51761 CFlags, SrcCC, SetCC0.getOperand(1)});
51762
51763 return DAG.getNode(X86ISD::SETCC, DL, MVT::i8, {CC1N, CCMP});
51764}
51765
51766 static SDValue combineAnd(SDNode *N, SelectionDAG &DAG,
51767 TargetLowering::DAGCombinerInfo &DCI,
51768 const X86Subtarget &Subtarget) {
51769 using namespace SDPatternMatch;
51770
51771 SDValue N0 = N->getOperand(0);
51772 SDValue N1 = N->getOperand(1);
51773 EVT VT = N->getValueType(0);
51774 SDLoc dl(N);
51775 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
51776
51777 // If this is SSE1 only convert to FAND to avoid scalarization.
51778 if (Subtarget.hasSSE1() && !Subtarget.hasSSE2() && VT == MVT::v4i32) {
51779 return DAG.getBitcast(MVT::v4i32,
51780 DAG.getNode(X86ISD::FAND, dl, MVT::v4f32,
51781 DAG.getBitcast(MVT::v4f32, N0),
51782 DAG.getBitcast(MVT::v4f32, N1)));
51783 }
51784
51785 // Use a 32-bit and+zext if upper bits known zero.
51786 if (VT == MVT::i64 && Subtarget.is64Bit() && !isa<ConstantSDNode>(N1)) {
51787 APInt HiMask = APInt::getHighBitsSet(64, 32);
51788 if (DAG.MaskedValueIsZero(N1, HiMask) ||
51789 DAG.MaskedValueIsZero(N0, HiMask)) {
51790 SDValue LHS = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, N0);
51791 SDValue RHS = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, N1);
51792 return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i64,
51793 DAG.getNode(ISD::AND, dl, MVT::i32, LHS, RHS));
51794 }
51795 }
51796
51797 // Match all-of bool scalar reductions into a bitcast/movmsk + cmp.
51798 // TODO: Support multiple SrcOps.
51799 if (VT == MVT::i1) {
51800 SmallVector<SDValue, 2> SrcOps;
51801 SmallVector<APInt, 2> SrcPartials;
51802 if (matchScalarReduction(SDValue(N, 0), ISD::AND, SrcOps, &SrcPartials) &&
51803 SrcOps.size() == 1) {
51804 unsigned NumElts = SrcOps[0].getValueType().getVectorNumElements();
51805 EVT MaskVT = EVT::getIntegerVT(*DAG.getContext(), NumElts);
51806 SDValue Mask = combineBitcastvxi1(DAG, MaskVT, SrcOps[0], dl, Subtarget);
51807 if (!Mask && TLI.isTypeLegal(SrcOps[0].getValueType()))
51808 Mask = DAG.getBitcast(MaskVT, SrcOps[0]);
51809 if (Mask) {
51810 assert(SrcPartials[0].getBitWidth() == NumElts &&
51811 "Unexpected partial reduction mask");
51812 SDValue PartialBits = DAG.getConstant(SrcPartials[0], dl, MaskVT);
51813 Mask = DAG.getNode(ISD::AND, dl, MaskVT, Mask, PartialBits);
51814 return DAG.getSetCC(dl, MVT::i1, Mask, PartialBits, ISD::SETEQ);
51815 }
51816 }
51817 }
51818
51819 // InstCombine converts:
51820 // `(-x << C0) & C1`
51821 // to
51822 // `(x * (Pow2_Ceil(C1) - (1 << C0))) & C1`
51823 // This saves an IR instruction but on x86 the neg/shift version is preferable
51824 // so undo the transform.
51825
51826 if (N0.getOpcode() == ISD::MUL && N0.hasOneUse()) {
51827 // TODO: We don't actually need a splat for this, we just need the checks to
51828 // hold for each element.
51829 ConstantSDNode *N1C = isConstOrConstSplat(N1, /*AllowUndefs*/ true,
51830 /*AllowTruncation*/ false);
51831 ConstantSDNode *N01C =
51832 isConstOrConstSplat(N0.getOperand(1), /*AllowUndefs*/ true,
51833 /*AllowTruncation*/ false);
51834 if (N1C && N01C) {
51835 const APInt &MulC = N01C->getAPIntValue();
51836 const APInt &AndC = N1C->getAPIntValue();
51837 APInt MulCLowBit = MulC & (-MulC);
51838 if (MulC.uge(AndC) && !MulC.isPowerOf2() &&
51839 (MulCLowBit + MulC).isPowerOf2()) {
51840 SDValue Neg = DAG.getNegative(N0.getOperand(0), dl, VT);
51841 int32_t MulCLowBitLog = MulCLowBit.exactLogBase2();
51842 assert(MulCLowBitLog != -1 &&
51843 "Isolated lowbit is somehow not a power of 2!");
51844 SDValue Shift = DAG.getNode(ISD::SHL, dl, VT, Neg,
51845 DAG.getConstant(MulCLowBitLog, dl, VT));
51846 return DAG.getNode(ISD::AND, dl, VT, Shift, N1);
51847 }
51848 }
51849 }
51850
51851 if (SDValue SetCC = combineAndOrForCcmpCtest(N, DAG, DCI, Subtarget))
51852 return SetCC;
51853
51854 if (SDValue V = combineScalarAndWithMaskSetcc(N, DAG, Subtarget))
51855 return V;
51856
51857 if (SDValue R = combineBitOpWithMOVMSK(N->getOpcode(), dl, N0, N1, DAG))
51858 return R;
51859
51860 if (SDValue R = combineBitOpWithShift(N->getOpcode(), dl, VT, N0, N1, DAG))
51861 return R;
51862
51863 if (SDValue R = combineBitOpWithPACK(N->getOpcode(), dl, VT, N0, N1, DAG))
51864 return R;
51865
51866 if (SDValue FPLogic = convertIntLogicToFPLogic(N->getOpcode(), dl, VT, N0, N1,
51867 DAG, DCI, Subtarget))
51868 return FPLogic;
51869
51870 if (SDValue R = combineAndShuffleNot(N, DAG, Subtarget))
51871 return R;
51872
51873 if (DCI.isBeforeLegalizeOps())
51874 return SDValue();
51875
51876 if (SDValue R = combineCompareEqual(N, DAG, DCI, Subtarget))
51877 return R;
51878
51879 if (SDValue R = combineAndNotIntoANDNP(N, dl, DAG))
51880 return R;
51881
51882 if (SDValue ShiftRight = combineAndMaskToShift(N, dl, DAG, Subtarget))
51883 return ShiftRight;
51884
51885 if (SDValue R = combineAndLoadToBZHI(N, DAG, Subtarget))
51886 return R;
51887
51888 if (SDValue R = combineAndNotOrIntoAndNotAnd(N, dl, DAG))
51889 return R;
51890
51891 // fold (and (mul x, c1), c2) -> (mul x, (and c1, c2))
51892 // iff c2 is all/no bits mask - i.e. a select-with-zero mask.
51893 // TODO: Handle PMULDQ/PMULUDQ/VPMADDWD/VPMADDUBSW?
51894 if (VT.isVector() && getTargetConstantFromNode(N1)) {
51895 unsigned Opc0 = N0.getOpcode();
51896 if ((Opc0 == ISD::MUL || Opc0 == ISD::MULHU || Opc0 == ISD::MULHS) &&
51897 getTargetConstantFromNode(N0.getOperand(1)) &&
51898 DAG.ComputeNumSignBits(N1) == VT.getScalarSizeInBits() &&
51899 N0->hasOneUse() && N0.getOperand(1)->hasOneUse()) {
51900 SDValue MaskMul = DAG.getNode(ISD::AND, dl, VT, N0.getOperand(1), N1);
51901 return DAG.getNode(Opc0, dl, VT, N0.getOperand(0), MaskMul);
51902 }
51903 }
51904
51905 // On AVX512 targets, attempt to reverse foldVSelectToSignBitSplatMask
51906 // to make use of predicated selects.
51907 // AND(X,SEXT(SETCC())) -> SELECT(SETCC(),X,0)
51908 if (DCI.isAfterLegalizeDAG() && VT.isVector()) {
51909 SDValue X, Y;
51910 EVT CondVT = VT.changeVectorElementType(MVT::i1);
51911 if (TLI.isTypeLegal(VT) && TLI.isTypeLegal(CondVT) &&
51912 (VT.is512BitVector() || Subtarget.hasVLX()) &&
51913 (VT.getScalarSizeInBits() >= 32 || Subtarget.hasBWI()) &&
51914 sd_match(N, m_And(m_Value(X),
51915 m_OneUse(m_SExt(m_AllOf(
51916 m_Value(Y), m_SpecificVT(CondVT),
51917 m_SetCC(m_Value(), m_Value(), m_Value()))))))) {
51918 return DAG.getSelect(dl, VT, Y, X,
51919 getZeroVector(VT.getSimpleVT(), Subtarget, DAG, dl));
51920 }
51921 }
51922
51923 // Fold AND(SRL(X,Y),1) -> SETCC(BT(X,Y), COND_B) iff Y is not a constant;
51924 // this avoids a slow variable shift (moving the shift amount into ECX etc.).
51925 if (isOneConstant(N1) && N0->hasOneUse()) {
51926 SDValue Src = N0;
51927 while ((Src.getOpcode() == ISD::ZERO_EXTEND ||
51928 Src.getOpcode() == ISD::TRUNCATE) &&
51929 Src.getOperand(0)->hasOneUse())
51930 Src = Src.getOperand(0);
51931 bool ContainsNOT = false;
51932 X86::CondCode X86CC = X86::COND_B;
51933 // Peek through AND(NOT(SRL(X,Y)),1).
51934 if (isBitwiseNot(Src)) {
51935 Src = Src.getOperand(0);
51936 X86CC = X86::COND_AE;
51937 ContainsNOT = true;
51938 }
51939 if (Src.getOpcode() == ISD::SRL &&
51940 !isa<ConstantSDNode>(Src.getOperand(1))) {
51941 SDValue BitNo = Src.getOperand(1);
51942 Src = Src.getOperand(0);
51943 // Peek through AND(SRL(NOT(X),Y),1).
51944 if (isBitwiseNot(Src)) {
51945 Src = Src.getOperand(0);
51946 X86CC = X86CC == X86::COND_AE ? X86::COND_B : X86::COND_AE;
51947 ContainsNOT = true;
51948 }
51949 // If we have BMI2 then SHRX should be faster for i32/i64 cases.
51950 if (!(Subtarget.hasBMI2() && !ContainsNOT && VT.getSizeInBits() >= 32))
51951 if (SDValue BT = getBT(Src, BitNo, dl, DAG))
51952 return DAG.getZExtOrTrunc(getSETCC(X86CC, BT, dl, DAG), dl, VT);
51953 }
51954 }
51955
51956 if (VT.isVector() && (VT.getScalarSizeInBits() % 8) == 0) {
51957 // Attempt to recursively combine a bitmask AND with shuffles.
51958 SDValue Op(N, 0);
51959 if (SDValue Res = combineX86ShufflesRecursively(Op, DAG, Subtarget))
51960 return Res;
51961
51962 // If either operand is a constant mask, then only the elements that aren't
51963 // zero are actually demanded by the other operand.
51964 auto GetDemandedMasks = [&](SDValue Op) {
51965 APInt UndefElts;
51966 SmallVector<APInt> EltBits;
51967 int NumElts = VT.getVectorNumElements();
51968 int EltSizeInBits = VT.getScalarSizeInBits();
51969 APInt DemandedBits = APInt::getAllOnes(EltSizeInBits);
51970 APInt DemandedElts = APInt::getAllOnes(NumElts);
51971 if (getTargetConstantBitsFromNode(Op, EltSizeInBits, UndefElts,
51972 EltBits)) {
51973 DemandedBits.clearAllBits();
51974 DemandedElts.clearAllBits();
51975 for (int I = 0; I != NumElts; ++I) {
51976 if (UndefElts[I]) {
51977 // We can't assume an undef src element gives an undef dst - the
51978 // other src might be zero.
51979 DemandedBits.setAllBits();
51980 DemandedElts.setBit(I);
51981 } else if (!EltBits[I].isZero()) {
51982 DemandedBits |= EltBits[I];
51983 DemandedElts.setBit(I);
51984 }
51985 }
51986 }
51987 return std::make_pair(DemandedBits, DemandedElts);
51988 };
51989 APInt Bits0, Elts0;
51990 APInt Bits1, Elts1;
51991 std::tie(Bits0, Elts0) = GetDemandedMasks(N1);
51992 std::tie(Bits1, Elts1) = GetDemandedMasks(N0);
51993
51994 if (TLI.SimplifyDemandedVectorElts(N0, Elts0, DCI) ||
51995 TLI.SimplifyDemandedVectorElts(N1, Elts1, DCI) ||
51996 TLI.SimplifyDemandedBits(N0, Bits0, Elts0, DCI) ||
51997 TLI.SimplifyDemandedBits(N1, Bits1, Elts1, DCI)) {
51998 if (N->getOpcode() != ISD::DELETED_NODE)
51999 DCI.AddToWorklist(N);
52000 return SDValue(N, 0);
52001 }
52002
52003 SDValue NewN0 = TLI.SimplifyMultipleUseDemandedBits(N0, Bits0, Elts0, DAG);
52004 SDValue NewN1 = TLI.SimplifyMultipleUseDemandedBits(N1, Bits1, Elts1, DAG);
52005 if (NewN0 || NewN1)
52006 return DAG.getNode(ISD::AND, dl, VT, NewN0 ? NewN0 : N0,
52007 NewN1 ? NewN1 : N1);
52008 }
52009
52010 // Attempt to combine a scalar bitmask AND with an extracted shuffle.
52011 if ((VT.getScalarSizeInBits() % 8) == 0 &&
52012 N0.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
52013 isa<ConstantSDNode>(N0.getOperand(1)) && N0->hasOneUse()) {
52014 SDValue BitMask = N1;
52015 SDValue SrcVec = N0.getOperand(0);
52016 EVT SrcVecVT = SrcVec.getValueType();
52017
52018 // Check that the constant bitmask masks whole bytes.
52019 APInt UndefElts;
52020 SmallVector<APInt, 64> EltBits;
52021 if (VT == SrcVecVT.getScalarType() && N0->isOnlyUserOf(SrcVec.getNode()) &&
52022 getTargetConstantBitsFromNode(BitMask, 8, UndefElts, EltBits) &&
52023 llvm::all_of(EltBits, [](const APInt &M) {
52024 return M.isZero() || M.isAllOnes();
52025 })) {
52026 unsigned NumElts = SrcVecVT.getVectorNumElements();
52027 unsigned Scale = SrcVecVT.getScalarSizeInBits() / 8;
52028 unsigned Idx = N0.getConstantOperandVal(1);
52029
52030 // Create a root shuffle mask from the byte mask and the extracted index.
52031 SmallVector<int, 16> ShuffleMask(NumElts * Scale, SM_SentinelUndef);
52032 for (unsigned i = 0; i != Scale; ++i) {
52033 if (UndefElts[i])
52034 continue;
52035 int VecIdx = Scale * Idx + i;
52036 ShuffleMask[VecIdx] = EltBits[i].isZero() ? SM_SentinelZero : VecIdx;
52037 }
52038
52039 if (SDValue Shuffle = combineX86ShufflesRecursively(
52040 {SrcVec}, 0, SrcVec.getOpcode(), SrcVec.getSimpleValueType(),
52041 ShuffleMask, {}, /*Depth=*/1, X86::MaxShuffleCombineDepth,
52042 /*AllowVariableCrossLaneMask=*/true,
52043 /*AllowVariablePerLaneMask=*/true,
52044 /*IsMaskedShuffle=*/false, DAG, SDLoc(SrcVec), Subtarget))
52045 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, Shuffle,
52046 N0.getOperand(1));
52047 }
52048 }
52049
52050 if (SDValue R = combineBMILogicOp(N, DAG, Subtarget))
52051 return R;
52052
52053 if (SDValue R = combineAndXorSubWithBMI(N, dl, DAG, Subtarget))
52054 return R;
52055
52056 return SDValue();
52057}
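// Editor's illustrative sketch (not part of this file): the InstCombine form
// undone above multiplies by (Pow2Ceil(C1) - (1 << C0)), which is congruent
// to -(1 << C0) modulo Pow2Ceil(C1), so under the mask C1 both forms agree.
// Concrete check for C0 = 2, C1 = 0xFF; <cstdint> assumed.
static_assert(((37u * (0x100u - (1u << 2))) & 0xFFu) ==
                  (((0u - 37u) << 2) & 0xFFu),
              "(x * (Pow2Ceil(C1) - (1 << C0))) & C1 == (-x << C0) & C1");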
52058
52059// Canonicalize OR(AND(X,C),AND(Y,~C)) -> OR(AND(X,C),ANDNP(C,Y))
52060 static SDValue canonicalizeBitSelect(SDNode *N, const SDLoc &DL,
52061 SelectionDAG &DAG,
52062 const X86Subtarget &Subtarget) {
52063 assert(N->getOpcode() == ISD::OR && "Unexpected Opcode");
52064
52065 MVT VT = N->getSimpleValueType(0);
52066 unsigned EltSizeInBits = VT.getScalarSizeInBits();
52067 if (!VT.isVector() || (EltSizeInBits % 8) != 0)
52068 return SDValue();
52069
52070 SDValue N0 = peekThroughBitcasts(N->getOperand(0));
52071 SDValue N1 = peekThroughBitcasts(N->getOperand(1));
52072 if (N0.getOpcode() != ISD::AND || N1.getOpcode() != ISD::AND)
52073 return SDValue();
52074
52075 // On XOP we'll lower to PCMOV so accept one use. With AVX512, we can use
52076 // VPTERNLOG. Otherwise only do this if either mask has multiple uses already.
52077 if (!(Subtarget.hasXOP() || useVPTERNLOG(Subtarget, VT) ||
52078 !N0.getOperand(1).hasOneUse() || !N1.getOperand(1).hasOneUse()))
52079 return SDValue();
52080
52081 // Attempt to extract constant byte masks.
52082 APInt UndefElts0, UndefElts1;
52083 SmallVector<APInt, 32> EltBits0, EltBits1;
52084 if (!getTargetConstantBitsFromNode(N0.getOperand(1), 8, UndefElts0, EltBits0,
52085 /*AllowWholeUndefs*/ false,
52086 /*AllowPartialUndefs*/ false))
52087 return SDValue();
52088 if (!getTargetConstantBitsFromNode(N1.getOperand(1), 8, UndefElts1, EltBits1,
52089 /*AllowWholeUndefs*/ false,
52090 /*AllowPartialUndefs*/ false))
52091 return SDValue();
52092
52093 for (unsigned i = 0, e = EltBits0.size(); i != e; ++i) {
52094 // TODO - add UNDEF elts support.
52095 if (UndefElts0[i] || UndefElts1[i])
52096 return SDValue();
52097 if (EltBits0[i] != ~EltBits1[i])
52098 return SDValue();
52099 }
52100
52101 if (useVPTERNLOG(Subtarget, VT)) {
52102 // Emit a VPTERNLOG node directly - 0xCA is the imm code for A?B:C.
52103 // VPTERNLOG is only available as vXi32/64-bit types.
52104 MVT OpSVT = EltSizeInBits <= 32 ? MVT::i32 : MVT::i64;
52105 MVT OpVT =
52106 MVT::getVectorVT(OpSVT, VT.getSizeInBits() / OpSVT.getSizeInBits());
52107 SDValue A = DAG.getBitcast(OpVT, N0.getOperand(1));
52108 SDValue B = DAG.getBitcast(OpVT, N0.getOperand(0));
52109 SDValue C = DAG.getBitcast(OpVT, N1.getOperand(0));
52110 SDValue Imm = DAG.getTargetConstant(0xCA, DL, MVT::i8);
52111 SDValue Res = getAVX512Node(X86ISD::VPTERNLOG, DL, OpVT, {A, B, C, Imm},
52112 DAG, Subtarget);
52113 return DAG.getBitcast(VT, Res);
52114 }
52115
52116 SDValue X = N->getOperand(0);
52117 SDValue Y =
52118 DAG.getNode(X86ISD::ANDNP, DL, VT, DAG.getBitcast(VT, N0.getOperand(1)),
52119 DAG.getBitcast(VT, N1.getOperand(0)));
52120 return DAG.getNode(ISD::OR, DL, VT, X, Y);
52121}
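// Editor's illustrative sketch (not part of this file): VPTERNLOG's immediate
// is an 8-entry truth table indexed by the bits (A,B,C); 0xCA is the table for
// the bitwise select A ? B : C, which is why it can encode
// OR(AND(X,C),AND(Y,~C)) in one instruction. Names hypothetical, <cstdint>
// assumed.
static constexpr unsigned modelTernlogBit(unsigned A, unsigned B, unsigned C,
                                          unsigned Imm) {
  return (Imm >> ((A << 2) | (B << 1) | C)) & 1u;
}
static_assert(modelTernlogBit(1, 0, 1, 0xCA) == 0u &&
                  modelTernlogBit(1, 1, 0, 0xCA) == 1u &&
                  modelTernlogBit(0, 0, 1, 0xCA) == 1u &&
                  modelTernlogBit(0, 1, 0, 0xCA) == 0u,
              "imm 0xCA computes A ? B : C per bit");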
52122
52123// Try to match OR(ANDNP(MASK,X),AND(MASK,Y)) logic pattern.
52124// TODO: Try to match OR(AND(~MASK,X),AND(MASK,Y)) logic pattern.
52125// Waiting for ANDNP combine allows other combines to happen that prevent
52126// matching.
52127static bool matchLogicBlend(SDNode *N, SDValue &X, SDValue &Y, SDValue &Mask) {
52128 using namespace SDPatternMatch;
52129 return sd_match(N, m_Or(m_BinOp(X86ISD::ANDNP, m_Value(Mask), m_Value(X)),
52130 m_And(m_Deferred(Mask), m_Value(Y))));
52131}
52132
52133// Try to fold:
52134// (or (and (m, y), (pandn m, x)))
52135// into:
52136// (vselect m, x, y)
52137// As a special case, try to fold:
52138// (or (and (m, (sub 0, x)), (pandn m, x)))
52139// into:
52140// (sub (xor X, M), M)
52141 static SDValue combineLogicBlendIntoPBLENDV(SDNode *N, const SDLoc &DL,
52142 SelectionDAG &DAG,
52143 const X86Subtarget &Subtarget) {
52144 assert(N->getOpcode() == ISD::OR && "Unexpected Opcode");
52145
52146 EVT VT = N->getValueType(0);
52147 if (!((VT.is128BitVector() && Subtarget.hasSSE2()) ||
52148 (VT.is256BitVector() && Subtarget.hasInt256())))
52149 return SDValue();
52150
52151 SDValue X, Y, Mask;
52152 if (!matchLogicBlend(N, X, Y, Mask))
52153 return SDValue();
52154
52155 // Validate that X, Y, and Mask are bitcasts, and see through them.
52156 Mask = peekThroughBitcasts(Mask);
52157 X = peekThroughBitcasts(X);
52158 Y = peekThroughBitcasts(Y);
52159
52160 EVT MaskVT = Mask.getValueType();
52161 unsigned EltBits = MaskVT.getScalarSizeInBits();
52162
52163 // TODO: Attempt to handle floating point cases as well?
52164 if (!MaskVT.isInteger() || DAG.ComputeNumSignBits(Mask) != EltBits)
52165 return SDValue();
52166
52167 // Attempt to combine to conditional negate: (sub (xor X, M), M)
52168 if (SDValue Res = combineLogicBlendIntoConditionalNegate(VT, Mask, X, Y, DL,
52169 DAG, Subtarget))
52170 return Res;
52171
52172 // PBLENDVB is only available on SSE 4.1.
52173 if (!Subtarget.hasSSE41())
52174 return SDValue();
52175
52176 // If we have VPTERNLOG we should prefer that since PBLENDVB is multiple uops.
52177 if (Subtarget.hasVLX())
52178 return SDValue();
52179
52180 MVT BlendVT = VT.is256BitVector() ? MVT::v32i8 : MVT::v16i8;
52181
52182 X = DAG.getBitcast(BlendVT, X);
52183 Y = DAG.getBitcast(BlendVT, Y);
52184 Mask = DAG.getBitcast(BlendVT, Mask);
52185 Mask = DAG.getSelect(DL, BlendVT, Mask, Y, X);
52186 return DAG.getBitcast(VT, Mask);
52187}
52188
52189// Helper function for combineOrCmpEqZeroToCtlzSrl
52190// Transforms:
52191// seteq(cmp x, 0)
52192// into:
52193// srl(ctlz x), log2(bitsize(x))
52194// Input pattern is checked by caller.
52195static SDValue lowerX86CmpEqZeroToCtlzSrl(SDValue Op, SelectionDAG &DAG) {
52196  SDValue Cmp = Op.getOperand(1);
52197 EVT VT = Cmp.getOperand(0).getValueType();
52198 unsigned Log2b = Log2_32(VT.getSizeInBits());
52199 SDLoc dl(Op);
52200 SDValue Clz = DAG.getNode(ISD::CTLZ, dl, VT, Cmp->getOperand(0));
52201 // The result of the shift is true or false, and on X86, the 32-bit
52202 // encoding of shr and lzcnt is more desirable.
52203 SDValue Trunc = DAG.getZExtOrTrunc(Clz, dl, MVT::i32);
52204 SDValue Scc = DAG.getNode(ISD::SRL, dl, MVT::i32, Trunc,
52205 DAG.getConstant(Log2b, dl, MVT::i8));
52206 return Scc;
52207}
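// For example, for an i32 input this is expected to lower to roughly
//   lzcnt %ecx, %eax ; shrl $5, %eax
// since lzcnt produces 32 (bit 5 set) only when the input is zero.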
52208
52209// Try to transform:
52210// zext(or(setcc(eq, (cmp x, 0)), setcc(eq, (cmp y, 0))))
52211// into:
52212//          srl(or(ctlz(x), ctlz(y)), log2(bitsize(x)))
52213// Will also attempt to match more generic cases, eg:
52214// zext(or(or(setcc(eq, cmp 0), setcc(eq, cmp 0)), setcc(eq, cmp 0)))
52215// Only applies if the target supports the FastLZCNT feature.
52216static SDValue combineOrCmpEqZeroToCtlzSrl(SDNode *N, SelectionDAG &DAG,
52217                                           TargetLowering::DAGCombinerInfo &DCI,
52218                                           const X86Subtarget &Subtarget) {
52219 if (DCI.isBeforeLegalize() || !Subtarget.getTargetLowering()->isCtlzFast())
52220 return SDValue();
52221
52222 auto isORCandidate = [](SDValue N) {
52223 return (N->getOpcode() == ISD::OR && N->hasOneUse());
52224 };
52225
52226  // Check that the zero extend is extending to 32 bits or more. The code generated by
52227 // srl(ctlz) for 16-bit or less variants of the pattern would require extra
52228 // instructions to clear the upper bits.
52229 if (!N->hasOneUse() || !N->getSimpleValueType(0).bitsGE(MVT::i32) ||
52230 !isORCandidate(N->getOperand(0)))
52231 return SDValue();
52232
52233 // Check the node matches: setcc(eq, cmp 0)
52234 auto isSetCCCandidate = [](SDValue N) {
52235 return N->getOpcode() == X86ISD::SETCC && N->hasOneUse() &&
52236 X86::CondCode(N->getConstantOperandVal(0)) == X86::COND_E &&
52237 N->getOperand(1).getOpcode() == X86ISD::CMP &&
52238 isNullConstant(N->getOperand(1).getOperand(1)) &&
52239 N->getOperand(1).getValueType().bitsGE(MVT::i32);
52240 };
52241
52242 SDNode *OR = N->getOperand(0).getNode();
52243 SDValue LHS = OR->getOperand(0);
52244 SDValue RHS = OR->getOperand(1);
52245
52246 // Save nodes matching or(or, setcc(eq, cmp 0)).
52247  SmallVector<SDNode *, 2> ORNodes;
52248  while (((isORCandidate(LHS) && isSetCCCandidate(RHS)) ||
52249 (isORCandidate(RHS) && isSetCCCandidate(LHS)))) {
52250 ORNodes.push_back(OR);
52251 OR = (LHS->getOpcode() == ISD::OR) ? LHS.getNode() : RHS.getNode();
52252 LHS = OR->getOperand(0);
52253 RHS = OR->getOperand(1);
52254 }
52255
52256 // The last OR node should match or(setcc(eq, cmp 0), setcc(eq, cmp 0)).
52257 if (!(isSetCCCandidate(LHS) && isSetCCCandidate(RHS)) ||
52258 !isORCandidate(SDValue(OR, 0)))
52259 return SDValue();
52260
52261  // We have an or(setcc(eq, cmp 0), setcc(eq, cmp 0)) pattern; try to lower it
52262  // to
52263  // or(srl(ctlz), srl(ctlz)).
52264 // The dag combiner can then fold it into:
52265 // srl(or(ctlz, ctlz)).
52266 SDValue NewLHS = lowerX86CmpEqZeroToCtlzSrl(LHS, DAG);
52267 SDValue Ret, NewRHS;
52268 if (NewLHS && (NewRHS = lowerX86CmpEqZeroToCtlzSrl(RHS, DAG)))
52269 Ret = DAG.getNode(ISD::OR, SDLoc(OR), MVT::i32, NewLHS, NewRHS);
52270
52271 if (!Ret)
52272 return SDValue();
52273
52274 // Try to lower nodes matching the or(or, setcc(eq, cmp 0)) pattern.
52275 while (!ORNodes.empty()) {
52276 OR = ORNodes.pop_back_val();
52277 LHS = OR->getOperand(0);
52278 RHS = OR->getOperand(1);
52279 // Swap rhs with lhs to match or(setcc(eq, cmp, 0), or).
52280 if (RHS->getOpcode() == ISD::OR)
52281 std::swap(LHS, RHS);
52282 NewRHS = lowerX86CmpEqZeroToCtlzSrl(RHS, DAG);
52283 if (!NewRHS)
52284 return SDValue();
52285 Ret = DAG.getNode(ISD::OR, SDLoc(OR), MVT::i32, Ret, NewRHS);
52286 }
52287
52288 return DAG.getNode(ISD::ZERO_EXTEND, SDLoc(N), N->getValueType(0), Ret);
52289}
52290
52291/// If this is an add or subtract where one operand is produced by a cmp+setcc,
52292/// then try to convert it to an ADC or SBB. This replaces TEST+SET+{ADD/SUB}
52293/// with CMP+{ADC, SBB}.
52294/// Also try (ADD/SUB)+(AND(SRL,1)) bit extraction pattern with BT+{ADC, SBB}.
52295static SDValue combineAddOrSubToADCOrSBB(bool IsSub, const SDLoc &DL, EVT VT,
52296 SDValue X, SDValue Y,
52297 SelectionDAG &DAG,
52298 bool ZeroSecondOpOnly = false) {
52299 if (!DAG.getTargetLoweringInfo().isTypeLegal(VT))
52300 return SDValue();
52301
52302 // Look through a one-use zext.
52303 if (Y.getOpcode() == ISD::ZERO_EXTEND && Y.hasOneUse())
52304 Y = Y.getOperand(0);
52305
52306 X86::CondCode CC;
52307 SDValue EFLAGS;
52308 if (Y.getOpcode() == X86ISD::SETCC && Y.hasOneUse()) {
52309 CC = (X86::CondCode)Y.getConstantOperandVal(0);
52310 EFLAGS = Y.getOperand(1);
52311 } else if (Y.getOpcode() == ISD::AND && isOneConstant(Y.getOperand(1)) &&
52312 Y.hasOneUse()) {
52313 EFLAGS = LowerAndToBT(Y, ISD::SETNE, DL, DAG, CC);
52314 }
52315
52316 if (!EFLAGS)
52317 return SDValue();
52318
52319 // If X is -1 or 0, then we have an opportunity to avoid constants required in
52320 // the general case below.
52321 auto *ConstantX = dyn_cast<ConstantSDNode>(X);
52322 if (ConstantX && !ZeroSecondOpOnly) {
52323 if ((!IsSub && CC == X86::COND_AE && ConstantX->isAllOnes()) ||
52324 (IsSub && CC == X86::COND_B && ConstantX->isZero())) {
52325 // This is a complicated way to get -1 or 0 from the carry flag:
52326 // -1 + SETAE --> -1 + (!CF) --> CF ? -1 : 0 --> SBB %eax, %eax
52327 // 0 - SETB --> 0 - (CF) --> CF ? -1 : 0 --> SBB %eax, %eax
52328 return DAG.getNode(X86ISD::SETCC_CARRY, DL, VT,
52329 DAG.getTargetConstant(X86::COND_B, DL, MVT::i8),
52330 EFLAGS);
52331 }
52332
52333 if ((!IsSub && CC == X86::COND_BE && ConstantX->isAllOnes()) ||
52334 (IsSub && CC == X86::COND_A && ConstantX->isZero())) {
52335 if (EFLAGS.getOpcode() == X86ISD::SUB && EFLAGS.hasOneUse() &&
52336 EFLAGS.getValueType().isInteger() &&
52337 !isa<ConstantSDNode>(EFLAGS.getOperand(1))) {
52338 // Swap the operands of a SUB, and we have the same pattern as above.
52339 // -1 + SETBE (SUB A, B) --> -1 + SETAE (SUB B, A) --> SUB + SBB
52340 // 0 - SETA (SUB A, B) --> 0 - SETB (SUB B, A) --> SUB + SBB
52341 SDValue NewSub = DAG.getNode(
52342 X86ISD::SUB, SDLoc(EFLAGS), EFLAGS.getNode()->getVTList(),
52343 EFLAGS.getOperand(1), EFLAGS.getOperand(0));
52344 SDValue NewEFLAGS = SDValue(NewSub.getNode(), EFLAGS.getResNo());
52345 return DAG.getNode(X86ISD::SETCC_CARRY, DL, VT,
52346 DAG.getTargetConstant(X86::COND_B, DL, MVT::i8),
52347 NewEFLAGS);
52348 }
52349 }
52350 }
52351
52352 if (CC == X86::COND_B) {
52353 // X + SETB Z --> adc X, 0
52354 // X - SETB Z --> sbb X, 0
52355 return DAG.getNode(IsSub ? X86ISD::SBB : X86ISD::ADC, DL,
52356 DAG.getVTList(VT, MVT::i32), X,
52357 DAG.getConstant(0, DL, VT), EFLAGS);
52358 }
52359
52360 if (ZeroSecondOpOnly)
52361 return SDValue();
52362
52363 if (CC == X86::COND_A) {
52364 // Try to convert COND_A into COND_B in an attempt to facilitate
52365 // materializing "setb reg".
52366 //
52367 // Do not flip "e > c", where "c" is a constant, because Cmp instruction
52368 // cannot take an immediate as its first operand.
52369 //
52370 if (EFLAGS.getOpcode() == X86ISD::SUB && EFLAGS.getNode()->hasOneUse() &&
52371 EFLAGS.getValueType().isInteger() &&
52372 !isa<ConstantSDNode>(EFLAGS.getOperand(1))) {
52373 SDValue NewSub =
52374 DAG.getNode(X86ISD::SUB, SDLoc(EFLAGS), EFLAGS.getNode()->getVTList(),
52375 EFLAGS.getOperand(1), EFLAGS.getOperand(0));
52376 SDValue NewEFLAGS = NewSub.getValue(EFLAGS.getResNo());
52377 return DAG.getNode(IsSub ? X86ISD::SBB : X86ISD::ADC, DL,
52378 DAG.getVTList(VT, MVT::i32), X,
52379 DAG.getConstant(0, DL, VT), NewEFLAGS);
52380 }
52381 }
52382
52383 if (CC == X86::COND_AE) {
52384 // X + SETAE --> sbb X, -1
52385 // X - SETAE --> adc X, -1
52386 return DAG.getNode(IsSub ? X86ISD::ADC : X86ISD::SBB, DL,
52387 DAG.getVTList(VT, MVT::i32), X,
52388 DAG.getAllOnesConstant(DL, VT), EFLAGS);
52389 }
52390
52391 if (CC == X86::COND_BE) {
52392 // X + SETBE --> sbb X, -1
52393 // X - SETBE --> adc X, -1
52394 // Try to convert COND_BE into COND_AE in an attempt to facilitate
52395 // materializing "setae reg".
52396 //
52397 // Do not flip "e <= c", where "c" is a constant, because Cmp instruction
52398 // cannot take an immediate as its first operand.
52399 //
52400 if (EFLAGS.getOpcode() == X86ISD::SUB && EFLAGS.getNode()->hasOneUse() &&
52401 EFLAGS.getValueType().isInteger() &&
52402 !isa<ConstantSDNode>(EFLAGS.getOperand(1))) {
52403 SDValue NewSub =
52404 DAG.getNode(X86ISD::SUB, SDLoc(EFLAGS), EFLAGS.getNode()->getVTList(),
52405 EFLAGS.getOperand(1), EFLAGS.getOperand(0));
52406 SDValue NewEFLAGS = NewSub.getValue(EFLAGS.getResNo());
52407 return DAG.getNode(IsSub ? X86ISD::ADC : X86ISD::SBB, DL,
52408 DAG.getVTList(VT, MVT::i32), X,
52409 DAG.getAllOnesConstant(DL, VT), NewEFLAGS);
52410 }
52411 }
52412
52413 if (CC != X86::COND_E && CC != X86::COND_NE)
52414 return SDValue();
52415
52416 if (EFLAGS.getOpcode() != X86ISD::CMP || !EFLAGS.hasOneUse() ||
52417 !X86::isZeroNode(EFLAGS.getOperand(1)) ||
52418 !EFLAGS.getOperand(0).getValueType().isInteger())
52419 return SDValue();
52420
52421 SDValue Z = EFLAGS.getOperand(0);
52422 EVT ZVT = Z.getValueType();
52423
52424 // If X is -1 or 0, then we have an opportunity to avoid constants required in
52425 // the general case below.
52426 if (ConstantX) {
52427 // 'neg' sets the carry flag when Z != 0, so create 0 or -1 using 'sbb' with
52428 // fake operands:
52429 // 0 - (Z != 0) --> sbb %eax, %eax, (neg Z)
52430 // -1 + (Z == 0) --> sbb %eax, %eax, (neg Z)
52431 if ((IsSub && CC == X86::COND_NE && ConstantX->isZero()) ||
52432 (!IsSub && CC == X86::COND_E && ConstantX->isAllOnes())) {
52433 SDValue Zero = DAG.getConstant(0, DL, ZVT);
52434 SDVTList X86SubVTs = DAG.getVTList(ZVT, MVT::i32);
52435 SDValue Neg = DAG.getNode(X86ISD::SUB, DL, X86SubVTs, Zero, Z);
52436 return DAG.getNode(X86ISD::SETCC_CARRY, DL, VT,
52437 DAG.getTargetConstant(X86::COND_B, DL, MVT::i8),
52438 SDValue(Neg.getNode(), 1));
52439 }
52440
52441 // cmp with 1 sets the carry flag when Z == 0, so create 0 or -1 using 'sbb'
52442 // with fake operands:
52443 // 0 - (Z == 0) --> sbb %eax, %eax, (cmp Z, 1)
52444 // -1 + (Z != 0) --> sbb %eax, %eax, (cmp Z, 1)
52445 if ((IsSub && CC == X86::COND_E && ConstantX->isZero()) ||
52446 (!IsSub && CC == X86::COND_NE && ConstantX->isAllOnes())) {
52447 SDValue One = DAG.getConstant(1, DL, ZVT);
52448 SDVTList X86SubVTs = DAG.getVTList(ZVT, MVT::i32);
52449 SDValue Cmp1 = DAG.getNode(X86ISD::SUB, DL, X86SubVTs, Z, One);
52450 return DAG.getNode(X86ISD::SETCC_CARRY, DL, VT,
52451 DAG.getTargetConstant(X86::COND_B, DL, MVT::i8),
52452 Cmp1.getValue(1));
52453 }
52454 }
52455
52456 // (cmp Z, 1) sets the carry flag if Z is 0.
52457 SDValue One = DAG.getConstant(1, DL, ZVT);
52458 SDVTList X86SubVTs = DAG.getVTList(ZVT, MVT::i32);
52459 SDValue Cmp1 = DAG.getNode(X86ISD::SUB, DL, X86SubVTs, Z, One);
52460
52461 // Add the flags type for ADC/SBB nodes.
52462 SDVTList VTs = DAG.getVTList(VT, MVT::i32);
52463
52464 // X - (Z != 0) --> sub X, (zext(setne Z, 0)) --> adc X, -1, (cmp Z, 1)
52465 // X + (Z != 0) --> add X, (zext(setne Z, 0)) --> sbb X, -1, (cmp Z, 1)
52466 if (CC == X86::COND_NE)
52467 return DAG.getNode(IsSub ? X86ISD::ADC : X86ISD::SBB, DL, VTs, X,
52468 DAG.getAllOnesConstant(DL, VT), Cmp1.getValue(1));
52469
52470 // X - (Z == 0) --> sub X, (zext(sete Z, 0)) --> sbb X, 0, (cmp Z, 1)
52471 // X + (Z == 0) --> add X, (zext(sete Z, 0)) --> adc X, 0, (cmp Z, 1)
52472 return DAG.getNode(IsSub ? X86ISD::SBB : X86ISD::ADC, DL, VTs, X,
52473 DAG.getConstant(0, DL, VT), Cmp1.getValue(1));
52474}
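// For example, 'x + (z != 0)' is expected to lower to roughly
//   cmpl $1, %ecx ; sbbl $-1, %eax
// since CF is set exactly when z == 0 and x - (-1) - CF equals x + 1 - CF.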
52475
52476/// If this is an add or subtract where one operand is produced by a cmp+setcc,
52477/// then try to convert it to an ADC or SBB. This replaces TEST+SET+{ADD/SUB}
52478/// with CMP+{ADC, SBB}.
52479static SDValue combineAddOrSubToADCOrSBB(SDNode *N, const SDLoc &DL,
52480                                         SelectionDAG &DAG) {
52481 bool IsSub = N->getOpcode() == ISD::SUB;
52482 SDValue X = N->getOperand(0);
52483 SDValue Y = N->getOperand(1);
52484 EVT VT = N->getValueType(0);
52485
52486 if (SDValue ADCOrSBB = combineAddOrSubToADCOrSBB(IsSub, DL, VT, X, Y, DAG))
52487 return ADCOrSBB;
52488
52489 // Commute and try again (negate the result for subtracts).
52490 if (SDValue ADCOrSBB = combineAddOrSubToADCOrSBB(IsSub, DL, VT, Y, X, DAG)) {
52491 if (IsSub)
52492 ADCOrSBB = DAG.getNegative(ADCOrSBB, DL, VT);
52493 return ADCOrSBB;
52494 }
52495
52496 return SDValue();
52497}
52498
52499static SDValue combineOrXorWithSETCC(unsigned Opc, const SDLoc &DL, EVT VT,
52500 SDValue N0, SDValue N1,
52501 SelectionDAG &DAG) {
52502 assert((Opc == ISD::XOR || Opc == ISD::OR) && "Unexpected opcode");
52503
52504 // Delegate to combineAddOrSubToADCOrSBB if we have:
52505 //
52506 // (xor/or (zero_extend (setcc)) imm)
52507 //
52508 // where imm is odd if and only if we have xor, in which case the XOR/OR are
52509 // equivalent to a SUB/ADD, respectively.
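  // (Since zext(setcc) is 0 or 1, e.g. (xor 5, zext(setcc)) == 5 - zext(setcc)
  // while (or 4, zext(setcc)) == 4 + zext(setcc).)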
52510 if (N0.getOpcode() == ISD::ZERO_EXTEND &&
52511 N0.getOperand(0).getOpcode() == X86ISD::SETCC && N0.hasOneUse()) {
52512 if (auto *N1C = dyn_cast<ConstantSDNode>(N1)) {
52513 bool IsSub = Opc == ISD::XOR;
52514 bool N1COdd = N1C->getZExtValue() & 1;
52515 if (IsSub ? N1COdd : !N1COdd)
52516 if (SDValue R = combineAddOrSubToADCOrSBB(IsSub, DL, VT, N1, N0, DAG))
52517 return R;
52518 }
52519 }
52520
52521 // not(pcmpeq(and(X,CstPow2),0)) -> pcmpeq(and(X,CstPow2),CstPow2)
52522 if (Opc == ISD::XOR && N0.getOpcode() == X86ISD::PCMPEQ &&
52523 N0.getOperand(0).getOpcode() == ISD::AND &&
52527 VT.getScalarSizeInBits(), /*AllowUndefs=*/true)) {
52528 return DAG.getNode(X86ISD::PCMPEQ, DL, VT, N0.getOperand(0),
52529 N0.getOperand(0).getOperand(1));
52530 }
52531
52532 return SDValue();
52533}
52534
52535static SDValue combineOr(SDNode *N, SelectionDAG &DAG,
52536                         TargetLowering::DAGCombinerInfo &DCI,
52537                         const X86Subtarget &Subtarget) {
52538 SDValue N0 = N->getOperand(0);
52539 SDValue N1 = N->getOperand(1);
52540 EVT VT = N->getValueType(0);
52541 SDLoc dl(N);
52542 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
52543
52544 // If this is SSE1 only convert to FOR to avoid scalarization.
52545 if (Subtarget.hasSSE1() && !Subtarget.hasSSE2() && VT == MVT::v4i32) {
52546 return DAG.getBitcast(MVT::v4i32,
52547 DAG.getNode(X86ISD::FOR, dl, MVT::v4f32,
52548 DAG.getBitcast(MVT::v4f32, N0),
52549 DAG.getBitcast(MVT::v4f32, N1)));
52550 }
52551
52552 // Match any-of bool scalar reductions into a bitcast/movmsk + cmp.
52553 // TODO: Support multiple SrcOps.
52554 if (VT == MVT::i1) {
52555    SmallVector<SDValue, 2> SrcOps;
52556    SmallVector<APInt, 2> SrcPartials;
52557 if (matchScalarReduction(SDValue(N, 0), ISD::OR, SrcOps, &SrcPartials) &&
52558 SrcOps.size() == 1) {
52559 unsigned NumElts = SrcOps[0].getValueType().getVectorNumElements();
52560 EVT MaskVT = EVT::getIntegerVT(*DAG.getContext(), NumElts);
52561 SDValue Mask = combineBitcastvxi1(DAG, MaskVT, SrcOps[0], dl, Subtarget);
52562 if (!Mask && TLI.isTypeLegal(SrcOps[0].getValueType()))
52563 Mask = DAG.getBitcast(MaskVT, SrcOps[0]);
52564 if (Mask) {
52565 assert(SrcPartials[0].getBitWidth() == NumElts &&
52566 "Unexpected partial reduction mask");
52567 SDValue ZeroBits = DAG.getConstant(0, dl, MaskVT);
52568 SDValue PartialBits = DAG.getConstant(SrcPartials[0], dl, MaskVT);
52569 Mask = DAG.getNode(ISD::AND, dl, MaskVT, Mask, PartialBits);
52570 return DAG.getSetCC(dl, MVT::i1, Mask, ZeroBits, ISD::SETNE);
52571 }
52572 }
52573 }
52574
52575 if (SDValue SetCC = combineAndOrForCcmpCtest(N, DAG, DCI, Subtarget))
52576 return SetCC;
52577
52578 if (SDValue R = combineBitOpWithMOVMSK(N->getOpcode(), dl, N0, N1, DAG))
52579 return R;
52580
52581 if (SDValue R = combineBitOpWithShift(N->getOpcode(), dl, VT, N0, N1, DAG))
52582 return R;
52583
52584 if (SDValue R = combineBitOpWithPACK(N->getOpcode(), dl, VT, N0, N1, DAG))
52585 return R;
52586
52587 if (SDValue FPLogic = convertIntLogicToFPLogic(N->getOpcode(), dl, VT, N0, N1,
52588 DAG, DCI, Subtarget))
52589 return FPLogic;
52590
52591 if (DCI.isBeforeLegalizeOps())
52592 return SDValue();
52593
52594 if (SDValue R = combineCompareEqual(N, DAG, DCI, Subtarget))
52595 return R;
52596
52597 if (SDValue R = canonicalizeBitSelect(N, dl, DAG, Subtarget))
52598 return R;
52599
52600 if (SDValue R = combineLogicBlendIntoPBLENDV(N, dl, DAG, Subtarget))
52601 return R;
52602
52603 // Combine `(x86isd::setcc_carry) | C` and `(0 - SetCC) | C`
52604 // into `(zext (not SetCC)) * (C + 1) - 1` if we can get a LEA out of it.
52605 if ((VT == MVT::i32 || VT == MVT::i64) && N0.hasOneUse()) {
52606 if (auto *CN = dyn_cast<ConstantSDNode>(N1)) {
52607 uint64_t Val = CN->getZExtValue();
52608 if (Val == 1 || Val == 2 || Val == 3 || Val == 4 || Val == 7 ||
52609 Val == 8) {
52610 SDValue NotCond;
52611 if (N0.getOpcode() == X86ISD::SETCC_CARRY &&
52612 N0.getOperand(1).hasOneUse()) {
52613          X86::CondCode OldCC = (X86::CondCode)N0.getConstantOperandVal(0);
52614          X86::CondCode NewCC = X86::GetOppositeBranchCondition(OldCC);
52615          NotCond = getSETCC(NewCC, N0.getOperand(1), SDLoc(N0), DAG);
52616 } else if (N0.getOpcode() == ISD::SUB &&
52617 isNullConstant(N0.getOperand(0))) {
52618 SDValue Cond = N0.getOperand(1);
52619 if (Cond.getOpcode() == ISD::ZERO_EXTEND && Cond.hasOneUse())
52620 Cond = Cond.getOperand(0);
52621 if (Cond.getOpcode() == X86ISD::SETCC && Cond.hasOneUse()) {
52622 X86::CondCode OldCC = (X86::CondCode)Cond.getConstantOperandVal(0);
52623            X86::CondCode NewCC = X86::GetOppositeBranchCondition(OldCC);
52624            NotCond = getSETCC(NewCC, Cond.getOperand(1), SDLoc(Cond), DAG);
52625 }
52626 }
52627
52628 if (NotCond) {
52629 SDValue R = DAG.getZExtOrTrunc(NotCond, dl, VT);
52630 R = DAG.getNode(ISD::MUL, dl, VT, R, DAG.getConstant(Val + 1, dl, VT));
52631 R = DAG.getNode(ISD::SUB, dl, VT, R, DAG.getConstant(1, dl, VT));
52632 return R;
52633 }
52634 }
52635 }
52636 }
52637
52638 // Combine OR(X,KSHIFTL(Y,Elts/2)) -> CONCAT_VECTORS(X,Y) == KUNPCK(X,Y).
52639 // Combine OR(KSHIFTL(X,Elts/2),Y) -> CONCAT_VECTORS(Y,X) == KUNPCK(Y,X).
52640 // iff the upper elements of the non-shifted arg are zero.
52641  // KUNPCK requires 16+ bool vector elements.
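  // For example, a v32i1 OR of X with KSHIFTL(Y, 16), where the upper 16
  // elements of X are known zero, is just the concatenation of the two
  // 16-element halves and can be selected as KUNPCKWD.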
52642 if (N0.getOpcode() == X86ISD::KSHIFTL || N1.getOpcode() == X86ISD::KSHIFTL) {
52643 unsigned NumElts = VT.getVectorNumElements();
52644 unsigned HalfElts = NumElts / 2;
52645 APInt UpperElts = APInt::getHighBitsSet(NumElts, HalfElts);
52646 if (NumElts >= 16 && N1.getOpcode() == X86ISD::KSHIFTL &&
52647 N1.getConstantOperandAPInt(1) == HalfElts &&
52648 DAG.MaskedVectorIsZero(N0, UpperElts)) {
52649 return DAG.getNode(
52650 ISD::CONCAT_VECTORS, dl, VT,
52651 extractSubVector(N0, 0, DAG, dl, HalfElts),
52652 extractSubVector(N1.getOperand(0), 0, DAG, dl, HalfElts));
52653 }
52654 if (NumElts >= 16 && N0.getOpcode() == X86ISD::KSHIFTL &&
52655 N0.getConstantOperandAPInt(1) == HalfElts &&
52656 DAG.MaskedVectorIsZero(N1, UpperElts)) {
52657 return DAG.getNode(
52658 ISD::CONCAT_VECTORS, dl, VT,
52659 extractSubVector(N1, 0, DAG, dl, HalfElts),
52660 extractSubVector(N0.getOperand(0), 0, DAG, dl, HalfElts));
52661 }
52662 }
52663
52664 if (VT.isVector() && (VT.getScalarSizeInBits() % 8) == 0) {
52665 // Attempt to recursively combine an OR of shuffles.
52666 SDValue Op(N, 0);
52667 if (SDValue Res = combineX86ShufflesRecursively(Op, DAG, Subtarget))
52668 return Res;
52669
52670 // If either operand is a constant mask, then only the elements that aren't
52671 // allones are actually demanded by the other operand.
52672 auto SimplifyUndemandedElts = [&](SDValue Op, SDValue OtherOp) {
52673 APInt UndefElts;
52674 SmallVector<APInt> EltBits;
52675 int NumElts = VT.getVectorNumElements();
52676 int EltSizeInBits = VT.getScalarSizeInBits();
52677 if (!getTargetConstantBitsFromNode(Op, EltSizeInBits, UndefElts, EltBits))
52678 return false;
52679
52680 APInt DemandedElts = APInt::getZero(NumElts);
52681 for (int I = 0; I != NumElts; ++I)
52682 if (!EltBits[I].isAllOnes())
52683 DemandedElts.setBit(I);
52684
52685 return TLI.SimplifyDemandedVectorElts(OtherOp, DemandedElts, DCI);
52686 };
52687 if (SimplifyUndemandedElts(N0, N1) || SimplifyUndemandedElts(N1, N0)) {
52688 if (N->getOpcode() != ISD::DELETED_NODE)
52689 DCI.AddToWorklist(N);
52690 return SDValue(N, 0);
52691 }
52692 }
52693
52694 if (SDValue R = combineOrXorWithSETCC(N->getOpcode(), dl, VT, N0, N1, DAG))
52695 return R;
52696
52697 return SDValue();
52698}
52699
52700/// Try to turn tests against the signbit in the form of:
52701/// XOR(TRUNCATE(SRL(X, size(X)-1)), 1)
52702/// into:
52703/// SETGT(X, -1)
52704static SDValue foldXorTruncShiftIntoCmp(SDNode *N, const SDLoc &DL,
52705                                        SelectionDAG &DAG) {
52706 // This is only worth doing if the output type is i8 or i1.
52707 EVT ResultType = N->getValueType(0);
52708 if (ResultType != MVT::i8 && ResultType != MVT::i1)
52709 return SDValue();
52710
52711 SDValue N0 = N->getOperand(0);
52712 SDValue N1 = N->getOperand(1);
52713
52714 // We should be performing an xor against a truncated shift.
52715 if (N0.getOpcode() != ISD::TRUNCATE || !N0.hasOneUse())
52716 return SDValue();
52717
52718 // Make sure we are performing an xor against one.
52719 if (!isOneConstant(N1))
52720 return SDValue();
52721
52722 // SetCC on x86 zero extends so only act on this if it's a logical shift.
52723 SDValue Shift = N0.getOperand(0);
52724 if (Shift.getOpcode() != ISD::SRL || !Shift.hasOneUse())
52725 return SDValue();
52726
52727 // Make sure we are truncating from one of i16, i32 or i64.
52728 EVT ShiftTy = Shift.getValueType();
52729 if (ShiftTy != MVT::i16 && ShiftTy != MVT::i32 && ShiftTy != MVT::i64)
52730 return SDValue();
52731
52732 // Make sure the shift amount extracts the sign bit.
52733 if (!isa<ConstantSDNode>(Shift.getOperand(1)) ||
52734 Shift.getConstantOperandAPInt(1) != (ShiftTy.getSizeInBits() - 1))
52735 return SDValue();
52736
52737 // Create a greater-than comparison against -1.
52738  // N.B. Using SETGE against 0 works but we want a canonical-looking
52739  // comparison; using SETGT matches up with what TranslateX86CC expects.
52740 SDValue ShiftOp = Shift.getOperand(0);
52741 EVT ShiftOpTy = ShiftOp.getValueType();
52742 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
52743 EVT SetCCResultType = TLI.getSetCCResultType(DAG.getDataLayout(),
52744 *DAG.getContext(), ResultType);
52745 SDValue Cond =
52746 DAG.getSetCC(DL, SetCCResultType, ShiftOp,
52747 DAG.getAllOnesConstant(DL, ShiftOpTy), ISD::SETGT);
52748 if (SetCCResultType != ResultType)
52749 Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, ResultType, Cond);
52750 return Cond;
52751}
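// For example, for an i32 X, (xor (trunc (srl X, 31)), 1) is 1 exactly when
// the sign bit of X is clear, which is the same predicate as X > -1.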
52752
52753/// Turn vector tests of the signbit in the form of:
52754/// xor (sra X, elt_size(X)-1), -1
52755/// into:
52756/// pcmpgt X, -1
52757///
52758/// This should be called before type legalization because the pattern may not
52759/// persist after that.
52760static SDValue foldVectorXorShiftIntoCmp(SDNode *N, SelectionDAG &DAG,
52761                                         const X86Subtarget &Subtarget) {
52762 EVT VT = N->getValueType(0);
52763 if (!VT.isSimple())
52764 return SDValue();
52765
52766 switch (VT.getSimpleVT().SimpleTy) {
52767 // clang-format off
52768 default: return SDValue();
52769 case MVT::v16i8:
52770 case MVT::v8i16:
52771 case MVT::v4i32:
52772 case MVT::v2i64: if (!Subtarget.hasSSE2()) return SDValue(); break;
52773 case MVT::v32i8:
52774 case MVT::v16i16:
52775 case MVT::v8i32:
52776 case MVT::v4i64: if (!Subtarget.hasAVX2()) return SDValue(); break;
52777 // clang-format on
52778 }
52779
52780 // There must be a shift right algebraic before the xor, and the xor must be a
52781 // 'not' operation.
52782 SDValue Shift = N->getOperand(0);
52783 SDValue Ones = N->getOperand(1);
52784 if (Shift.getOpcode() != ISD::SRA || !Shift.hasOneUse() ||
52785      !ISD::isBuildVectorAllOnes(Ones.getNode()))
52786    return SDValue();
52787
52788 // The shift should be smearing the sign bit across each vector element.
52789 auto *ShiftAmt =
52790 isConstOrConstSplat(Shift.getOperand(1), /*AllowUndefs*/ true);
52791 if (!ShiftAmt ||
52792 ShiftAmt->getAPIntValue() != (Shift.getScalarValueSizeInBits() - 1))
52793 return SDValue();
52794
52795 // Create a greater-than comparison against -1. We don't use the more obvious
52796 // greater-than-or-equal-to-zero because SSE/AVX don't have that instruction.
52797 return DAG.getSetCC(SDLoc(N), VT, Shift.getOperand(0), Ones, ISD::SETGT);
52798}
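// For example, for v4i32: xor(sra X, 31), -1 becomes pcmpgtd X, <-1,-1,-1,-1>,
// which is all-ones exactly in the lanes where X is non-negative.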
52799
52800/// Detect patterns of truncation with unsigned saturation:
52801///
52802/// 1. (truncate (umin (x, unsigned_max_of_dest_type)) to dest_type).
52803/// Return the source value x to be truncated or SDValue() if the pattern was
52804/// not matched.
52805///
52806/// 2. (truncate (smin (smax (x, C1), C2)) to dest_type),
52807/// where C1 >= 0 and C2 is unsigned max of destination type.
52808///
52809/// (truncate (smax (smin (x, C2), C1)) to dest_type)
52810/// where C1 >= 0, C2 is unsigned max of destination type and C1 <= C2.
52811///
52812/// These two patterns are equivalent to:
52813/// (truncate (umin (smax(x, C1), unsigned_max_of_dest_type)) to dest_type)
52814/// So return the smax(x, C1) value to be truncated or SDValue() if the
52815/// pattern was not matched.
52816static SDValue detectUSatPattern(SDValue In, EVT VT, SelectionDAG &DAG,
52817                                 const SDLoc &DL) {
52818 using namespace llvm::SDPatternMatch;
52819 EVT InVT = In.getValueType();
52820
52821 // Saturation with truncation. We truncate from InVT to VT.
52822  assert(InVT.getScalarSizeInBits() > VT.getScalarSizeInBits() &&
52823         "Unexpected types for truncate operation");
52824
52825 APInt C1, C2;
52826  SDValue UMin, SMin, SMax;
52827
52828  // C2 should be equal to UINT32_MAX / UINT16_MAX / UINT8_MAX according to
52829  // the element size of the destination type.
52830 if (sd_match(In, m_UMin(m_Value(UMin), m_ConstInt(C2))) &&
52831 C2.isMask(VT.getScalarSizeInBits()))
52832 return UMin;
52833
52834 if (sd_match(In, m_SMin(m_Value(SMin), m_ConstInt(C2))) &&
52835      sd_match(SMin, m_SMax(m_Value(SMax), m_ConstInt(C1))) &&
52836      C1.isNonNegative() && C2.isMask(VT.getScalarSizeInBits()))
52837 return SMin;
52838
52839 if (sd_match(In, m_SMax(m_Value(SMax), m_ConstInt(C1))) &&
52840      sd_match(SMax, m_SMin(m_Value(SMin), m_ConstInt(C2))) &&
52841      C1.isNonNegative() && C2.isMask(VT.getScalarSizeInBits()) && C2.uge(C1))
52842 return DAG.getNode(ISD::SMAX, DL, InVT, SMin, In.getOperand(1));
52843
52844 return SDValue();
52845}
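// For example, for an i32 -> i8 truncate, C2 must be 255, so (umin x, 255)
// (or an equivalent smax/smin clamp to [0, 255]) truncates to the same value
// as a saturating truncate.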
52846
52847/// Detect patterns of truncation with signed saturation:
52848/// (truncate (smin ((smax (x, signed_min_of_dest_type)),
52849/// signed_max_of_dest_type)) to dest_type)
52850/// or:
52851/// (truncate (smax ((smin (x, signed_max_of_dest_type)),
52852/// signed_min_of_dest_type)) to dest_type).
52853/// With MatchPackUS, the smax/smin range is [0, unsigned_max_of_dest_type].
52854/// Return the source value to be truncated or SDValue() if the pattern was not
52855/// matched.
52856static SDValue detectSSatPattern(SDValue In, EVT VT, bool MatchPackUS = false) {
52857 using namespace llvm::SDPatternMatch;
52858 unsigned NumDstBits = VT.getScalarSizeInBits();
52859 unsigned NumSrcBits = In.getScalarValueSizeInBits();
52860 assert(NumSrcBits > NumDstBits && "Unexpected types for truncate operation");
52861
52862 APInt SignedMax, SignedMin;
52863 if (MatchPackUS) {
52864 SignedMax = APInt::getAllOnes(NumDstBits).zext(NumSrcBits);
52865 SignedMin = APInt::getZero(NumSrcBits);
52866 } else {
52867 SignedMax = APInt::getSignedMaxValue(NumDstBits).sext(NumSrcBits);
52868 SignedMin = APInt::getSignedMinValue(NumDstBits).sext(NumSrcBits);
52869 }
52870
52871 SDValue SMin, SMax;
52872 if (sd_match(In, m_SMin(m_Value(SMin), m_SpecificInt(SignedMax))) &&
52873 sd_match(SMin, m_SMax(m_Value(SMax), m_SpecificInt(SignedMin))))
52874 return SMax;
52875
52876 if (sd_match(In, m_SMax(m_Value(SMax), m_SpecificInt(SignedMin))) &&
52877 sd_match(SMax, m_SMin(m_Value(SMin), m_SpecificInt(SignedMax))))
52878 return SMin;
52879
52880 return SDValue();
52881}
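// For example, for an i16 -> i8 truncate this matches smin(smax(x, -128), 127),
// or with MatchPackUS the clamp to [0, 255], mirroring the PACKSSWB / PACKUSWB
// saturation semantics.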
52882
52883static SDValue combineTruncateWithSat(SDValue In, EVT VT, const SDLoc &DL,
52884                                      SelectionDAG &DAG,
52885 const X86Subtarget &Subtarget) {
52886 if (!Subtarget.hasSSE2() || !VT.isVector())
52887 return SDValue();
52888
52889 EVT SVT = VT.getVectorElementType();
52890 EVT InVT = In.getValueType();
52891 EVT InSVT = InVT.getVectorElementType();
52892
52893 // If we're clamping a signed 32-bit vector to 0-255 and the 32-bit vector is
52894  // split across two registers, we can use a packusdw+perm to clamp to 0-65535
52895 // and concatenate at the same time. Then we can use a final vpmovuswb to
52896 // clip to 0-255.
52897 if (Subtarget.hasBWI() && !Subtarget.useAVX512Regs() &&
52898 InVT == MVT::v16i32 && VT == MVT::v16i8) {
52899 if (SDValue USatVal = detectSSatPattern(In, VT, true)) {
52900 // Emit a VPACKUSDW+VPERMQ followed by a VPMOVUSWB.
52901 SDValue Mid = truncateVectorWithPACK(X86ISD::PACKUS, MVT::v16i16, USatVal,
52902 DL, DAG, Subtarget);
52903 assert(Mid && "Failed to pack!");
52904 return DAG.getNode(X86ISD::VTRUNCUS, DL, VT, Mid);
52905 }
52906 }
52907
52908 // vXi32 truncate instructions are available with AVX512F.
52909 // vXi16 truncate instructions are only available with AVX512BW.
52910 // For 256-bit or smaller vectors, we require VLX.
52911 // FIXME: We could widen truncates to 512 to remove the VLX restriction.
52912  // If the result type is 256 bits or larger and we have disabled 512-bit
52913 // registers, we should go ahead and use the pack instructions if possible.
52914 bool PreferAVX512 = ((Subtarget.hasAVX512() && InSVT == MVT::i32) ||
52915 (Subtarget.hasBWI() && InSVT == MVT::i16)) &&
52916 (InVT.getSizeInBits() > 128) &&
52917 (Subtarget.hasVLX() || InVT.getSizeInBits() > 256) &&
52918 !(!Subtarget.useAVX512Regs() && VT.getSizeInBits() >= 256);
52919
52920 if (!PreferAVX512 && VT.getVectorNumElements() > 1 &&
52921      isPowerOf2_32(VT.getVectorNumElements()) &&
52922      (SVT == MVT::i8 || SVT == MVT::i16) &&
52923 (InSVT == MVT::i16 || InSVT == MVT::i32)) {
52924 if (SDValue USatVal = detectSSatPattern(In, VT, true)) {
52925 // vXi32 -> vXi8 must be performed as PACKUSWB(PACKSSDW,PACKSSDW).
52926 if (SVT == MVT::i8 && InSVT == MVT::i32) {
52927 EVT MidVT = VT.changeVectorElementType(MVT::i16);
52928 SDValue Mid = truncateVectorWithPACK(X86ISD::PACKSS, MidVT, USatVal, DL,
52929 DAG, Subtarget);
52930 assert(Mid && "Failed to pack!");
52931        SDValue V = truncateVectorWithPACK(X86ISD::PACKUS, VT, Mid, DL, DAG,
52932                                           Subtarget);
52933 assert(V && "Failed to pack!");
52934 return V;
52935 } else if (SVT == MVT::i8 || Subtarget.hasSSE41())
52936 return truncateVectorWithPACK(X86ISD::PACKUS, VT, USatVal, DL, DAG,
52937 Subtarget);
52938 }
52939 if (SDValue SSatVal = detectSSatPattern(In, VT))
52940 return truncateVectorWithPACK(X86ISD::PACKSS, VT, SSatVal, DL, DAG,
52941 Subtarget);
52942 }
52943
52944 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
52945 if (TLI.isTypeLegal(InVT) && InVT.isVector() && SVT != MVT::i1 &&
52946 Subtarget.hasAVX512() && (InSVT != MVT::i16 || Subtarget.hasBWI()) &&
52947 (SVT == MVT::i32 || SVT == MVT::i16 || SVT == MVT::i8)) {
52948 unsigned TruncOpc = 0;
52949 SDValue SatVal;
52950 if (SDValue SSatVal = detectSSatPattern(In, VT)) {
52951 SatVal = SSatVal;
52952 TruncOpc = X86ISD::VTRUNCS;
52953 } else if (SDValue USatVal = detectUSatPattern(In, VT, DAG, DL)) {
52954 SatVal = USatVal;
52955 TruncOpc = X86ISD::VTRUNCUS;
52956 }
52957 if (SatVal) {
52958 unsigned ResElts = VT.getVectorNumElements();
52959 // If the input type is less than 512 bits and we don't have VLX, we need
52960 // to widen to 512 bits.
52961 if (!Subtarget.hasVLX() && !InVT.is512BitVector()) {
52962 unsigned NumConcats = 512 / InVT.getSizeInBits();
52963 ResElts *= NumConcats;
52964 SmallVector<SDValue, 4> ConcatOps(NumConcats, DAG.getUNDEF(InVT));
52965 ConcatOps[0] = SatVal;
52966 InVT = EVT::getVectorVT(*DAG.getContext(), InSVT,
52967 NumConcats * InVT.getVectorNumElements());
52968 SatVal = DAG.getNode(ISD::CONCAT_VECTORS, DL, InVT, ConcatOps);
52969 }
52970      // Widen the result if it's narrower than 128 bits.
52971 if (ResElts * SVT.getSizeInBits() < 128)
52972 ResElts = 128 / SVT.getSizeInBits();
52973 EVT TruncVT = EVT::getVectorVT(*DAG.getContext(), SVT, ResElts);
52974 SDValue Res = DAG.getNode(TruncOpc, DL, TruncVT, SatVal);
52975 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Res,
52976 DAG.getVectorIdxConstant(0, DL));
52977 }
52978 }
52979
52980 return SDValue();
52981}
52982
52983static SDValue combineConstantPoolLoads(SDNode *N, const SDLoc &dl,
52984                                        SelectionDAG &DAG,
52985                                        TargetLowering::DAGCombinerInfo &DCI,
52986 const X86Subtarget &Subtarget) {
52987 auto *Ld = cast<LoadSDNode>(N);
52988 EVT RegVT = Ld->getValueType(0);
52989 SDValue Ptr = Ld->getBasePtr();
52990 SDValue Chain = Ld->getChain();
52991 ISD::LoadExtType Ext = Ld->getExtensionType();
52992
52993 if (Ext != ISD::NON_EXTLOAD || !Subtarget.hasAVX() || !Ld->isSimple())
52994 return SDValue();
52995
52996 if (!(RegVT.is128BitVector() || RegVT.is256BitVector()))
52997 return SDValue();
52998
52999  const Constant *LdC = getTargetConstantFromBasePtr(Ptr);
53000  if (!LdC)
53001 return SDValue();
53002
53003 auto MatchingBits = [](const APInt &Undefs, const APInt &UserUndefs,
53004 ArrayRef<APInt> Bits, ArrayRef<APInt> UserBits) {
53005 for (unsigned I = 0, E = Undefs.getBitWidth(); I != E; ++I) {
53006 if (Undefs[I])
53007 continue;
53008 if (UserUndefs[I] || Bits[I] != UserBits[I])
53009 return false;
53010 }
53011 return true;
53012 };
53013
53014 // Look through all other loads/broadcasts in the chain for another constant
53015 // pool entry.
53016 for (SDNode *User : Chain->users()) {
53017 auto *UserLd = dyn_cast<MemSDNode>(User);
53018 if (User != N && UserLd &&
53019 (User->getOpcode() == X86ISD::SUBV_BROADCAST_LOAD ||
53020 User->getOpcode() == X86ISD::VBROADCAST_LOAD ||
53021         ISD::isNormalLoad(User)) &&
53022        UserLd->getChain() == Chain && User->hasAnyUseOfValue(0) &&
53023 User->getValueSizeInBits(0).getFixedValue() >
53024 RegVT.getFixedSizeInBits()) {
53025 EVT UserVT = User->getValueType(0);
53026 SDValue UserPtr = UserLd->getBasePtr();
53027 const Constant *UserC = getTargetConstantFromBasePtr(UserPtr);
53028
53029 // See if we are loading a constant that matches in the lower
53030 // bits of a longer constant (but from a different constant pool ptr).
53031 if (UserC && UserPtr != Ptr) {
53032 unsigned LdSize = LdC->getType()->getPrimitiveSizeInBits();
53033 unsigned UserSize = UserC->getType()->getPrimitiveSizeInBits();
53034 if (LdSize < UserSize || !ISD::isNormalLoad(User)) {
53035 APInt Undefs, UserUndefs;
53036 SmallVector<APInt> Bits, UserBits;
53037 unsigned NumBits = std::min(RegVT.getScalarSizeInBits(),
53038 UserVT.getScalarSizeInBits());
53039 if (getTargetConstantBitsFromNode(SDValue(N, 0), NumBits, Undefs,
53040 Bits) &&
53041              getTargetConstantBitsFromNode(SDValue(User, 0), NumBits,
53042                                            UserUndefs, UserBits)) {
53043 if (MatchingBits(Undefs, UserUndefs, Bits, UserBits)) {
53045 SDValue Extract = extractSubVector(SDValue(User, 0), 0, DAG, dl,
53046 RegVT.getSizeInBits());
53047 Extract = DAG.getBitcast(RegVT, Extract);
53048 return DCI.CombineTo(N, Extract, SDValue(User, 1));
53049 }
53050 }
53051 }
53052 }
53053 }
53054 }
53055
53056 return SDValue();
53057}
53058
53059static SDValue combineLoad(SDNode *N, SelectionDAG &DAG,
53060                           TargetLowering::DAGCombinerInfo &DCI,
53061                           const X86Subtarget &Subtarget) {
53062 auto *Ld = cast<LoadSDNode>(N);
53063 EVT RegVT = Ld->getValueType(0);
53064 EVT MemVT = Ld->getMemoryVT();
53065 SDLoc dl(Ld);
53066 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
53067
53068 // For chips with slow 32-byte unaligned loads, break the 32-byte operation
53069 // into two 16-byte operations. Also split non-temporal aligned loads on
53070 // pre-AVX2 targets as 32-byte loads will lower to regular temporal loads.
53071 ISD::LoadExtType Ext = Ld->getExtensionType();
53072 unsigned Fast;
53073 if (RegVT.is256BitVector() && !DCI.isBeforeLegalizeOps() &&
53074 Ext == ISD::NON_EXTLOAD &&
53075 ((Ld->isNonTemporal() && !Subtarget.hasInt256() &&
53076 Ld->getAlign() >= Align(16)) ||
53077 (TLI.allowsMemoryAccess(*DAG.getContext(), DAG.getDataLayout(), RegVT,
53078 *Ld->getMemOperand(), &Fast) &&
53079 !Fast))) {
53080 unsigned NumElems = RegVT.getVectorNumElements();
53081 if (NumElems < 2)
53082 return SDValue();
53083
53084 unsigned HalfOffset = 16;
53085 SDValue Ptr1 = Ld->getBasePtr();
53086 SDValue Ptr2 =
53087 DAG.getMemBasePlusOffset(Ptr1, TypeSize::getFixed(HalfOffset), dl);
53088 EVT HalfVT = EVT::getVectorVT(*DAG.getContext(), MemVT.getScalarType(),
53089 NumElems / 2);
53090 SDValue Load1 =
53091 DAG.getLoad(HalfVT, dl, Ld->getChain(), Ptr1, Ld->getPointerInfo(),
53092 Ld->getBaseAlign(), Ld->getMemOperand()->getFlags());
53093 SDValue Load2 =
53094 DAG.getLoad(HalfVT, dl, Ld->getChain(), Ptr2,
53095 Ld->getPointerInfo().getWithOffset(HalfOffset),
53096 Ld->getBaseAlign(), Ld->getMemOperand()->getFlags());
53097 SDValue TF = DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
53098 Load1.getValue(1), Load2.getValue(1));
53099
53100 SDValue NewVec = DAG.getNode(ISD::CONCAT_VECTORS, dl, RegVT, Load1, Load2);
53101 return DCI.CombineTo(N, NewVec, TF, true);
53102 }
53103
53104 // Bool vector load - attempt to cast to an integer, as we have good
53105 // (vXiY *ext(vXi1 bitcast(iX))) handling.
53106 if (Ext == ISD::NON_EXTLOAD && !Subtarget.hasAVX512() && RegVT.isVector() &&
53107 RegVT.getScalarType() == MVT::i1 && DCI.isBeforeLegalize()) {
53108 unsigned NumElts = RegVT.getVectorNumElements();
53109 EVT IntVT = EVT::getIntegerVT(*DAG.getContext(), NumElts);
53110 if (TLI.isTypeLegal(IntVT)) {
53111 SDValue IntLoad = DAG.getLoad(IntVT, dl, Ld->getChain(), Ld->getBasePtr(),
53112 Ld->getPointerInfo(), Ld->getBaseAlign(),
53113 Ld->getMemOperand()->getFlags());
53114 SDValue BoolVec = DAG.getBitcast(RegVT, IntLoad);
53115 return DCI.CombineTo(N, BoolVec, IntLoad.getValue(1), true);
53116 }
53117 }
53118
53119 // If we also broadcast this vector to a wider type, then just extract the
53120 // lowest subvector.
53121 if (Ext == ISD::NON_EXTLOAD && Subtarget.hasAVX() && Ld->isSimple() &&
53122 (RegVT.is128BitVector() || RegVT.is256BitVector())) {
53123 SDValue Ptr = Ld->getBasePtr();
53124 SDValue Chain = Ld->getChain();
53125 for (SDNode *User : Chain->users()) {
53126 auto *UserLd = dyn_cast<MemSDNode>(User);
53127 if (User != N && UserLd &&
53128 User->getOpcode() == X86ISD::SUBV_BROADCAST_LOAD &&
53129 UserLd->getChain() == Chain && UserLd->getBasePtr() == Ptr &&
53130 UserLd->getMemoryVT().getSizeInBits() == MemVT.getSizeInBits() &&
53131 User->hasAnyUseOfValue(0) &&
53132 User->getValueSizeInBits(0).getFixedValue() >
53133 RegVT.getFixedSizeInBits()) {
53135 SDValue Extract = extractSubVector(SDValue(User, 0), 0, DAG, dl,
53136 RegVT.getSizeInBits());
53137 Extract = DAG.getBitcast(RegVT, Extract);
53138 return DCI.CombineTo(N, Extract, SDValue(User, 1));
53139 }
53140 }
53141 }
53142
53143 if (SDValue V = combineConstantPoolLoads(Ld, dl, DAG, DCI, Subtarget))
53144 return V;
53145
53146 // Cast ptr32 and ptr64 pointers to the default address space before a load.
53147 unsigned AddrSpace = Ld->getAddressSpace();
53148 if (AddrSpace == X86AS::PTR64 || AddrSpace == X86AS::PTR32_SPTR ||
53149 AddrSpace == X86AS::PTR32_UPTR) {
53150 MVT PtrVT = TLI.getPointerTy(DAG.getDataLayout());
53151 if (PtrVT != Ld->getBasePtr().getSimpleValueType()) {
53152 SDValue Cast =
53153 DAG.getAddrSpaceCast(dl, PtrVT, Ld->getBasePtr(), AddrSpace, 0);
53154 return DAG.getExtLoad(Ext, dl, RegVT, Ld->getChain(), Cast,
53155 Ld->getPointerInfo(), MemVT, Ld->getBaseAlign(),
53156 Ld->getMemOperand()->getFlags());
53157 }
53158 }
53159
53160 return SDValue();
53161}
53162
53163/// If V is a build vector of boolean constants and exactly one of those
53164/// constants is true, return the operand index of that true element.
53165/// Otherwise, return -1.
53166static int getOneTrueElt(SDValue V) {
53167 // This needs to be a build vector of booleans.
53168 // TODO: Checking for the i1 type matches the IR definition for the mask,
53169 // but the mask check could be loosened to i8 or other types. That might
53170 // also require checking more than 'allOnesValue'; eg, the x86 HW
53171 // instructions only require that the MSB is set for each mask element.
53172 // The ISD::MSTORE comments/definition do not specify how the mask operand
53173 // is formatted.
53174 auto *BV = dyn_cast<BuildVectorSDNode>(V);
53175 if (!BV || BV->getValueType(0).getVectorElementType() != MVT::i1)
53176 return -1;
53177
53178 int TrueIndex = -1;
53179 unsigned NumElts = BV->getValueType(0).getVectorNumElements();
53180 for (unsigned i = 0; i < NumElts; ++i) {
53181 const SDValue &Op = BV->getOperand(i);
53182 if (Op.isUndef())
53183 continue;
53184 auto *ConstNode = dyn_cast<ConstantSDNode>(Op);
53185 if (!ConstNode)
53186 return -1;
53187 if (ConstNode->getAPIntValue().countr_one() >= 1) {
53188 // If we already found a one, this is too many.
53189 if (TrueIndex >= 0)
53190 return -1;
53191 TrueIndex = i;
53192 }
53193 }
53194 return TrueIndex;
53195}
53196
53197/// Given a masked memory load/store operation, return true if it has one mask
53198/// bit set. If it has one mask bit set, then also return the memory address of
53199/// the scalar element to load/store, the vector index to insert/extract that
53200/// scalar element, and the alignment for the scalar memory access.
53201static bool getParamsForOneTrueMaskedElt(MaskedLoadStoreSDNode *MaskedOp,
53202                                         SelectionDAG &DAG, SDValue &Addr,
53203 SDValue &Index, Align &Alignment,
53204 unsigned &Offset) {
53205 int TrueMaskElt = getOneTrueElt(MaskedOp->getMask());
53206 if (TrueMaskElt < 0)
53207 return false;
53208
53209 // Get the address of the one scalar element that is specified by the mask
53210 // using the appropriate offset from the base pointer.
53211 EVT EltVT = MaskedOp->getMemoryVT().getVectorElementType();
53212 Offset = 0;
53213 Addr = MaskedOp->getBasePtr();
53214 if (TrueMaskElt != 0) {
53215 Offset = TrueMaskElt * EltVT.getStoreSize();
53216    Addr = DAG.getMemBasePlusOffset(Addr, TypeSize::getFixed(Offset),
53217                                    SDLoc(MaskedOp));
53218 }
53219
53220 Index = DAG.getIntPtrConstant(TrueMaskElt, SDLoc(MaskedOp));
53221 Alignment = commonAlignment(MaskedOp->getBaseAlign(), EltVT.getStoreSize());
53222 return true;
53223}
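// For example, a v4f32 masked op with mask <0,0,1,0> yields Offset = 8,
// Index = 2, and an alignment of commonAlignment(BaseAlign, 4).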
53224
53225/// If exactly one element of the mask is set for a non-extending masked load,
53226/// it is a scalar load and vector insert.
53227/// Note: It is expected that the degenerate cases of an all-zeros or all-ones
53228/// mask have already been optimized in IR, so we don't bother with those here.
53229static SDValue
53230reduceMaskedLoadToScalarLoad(MaskedLoadSDNode *ML, SelectionDAG &DAG,
53231                             TargetLowering::DAGCombinerInfo &DCI,
53232                             const X86Subtarget &Subtarget) {
53233 assert(ML->isUnindexed() && "Unexpected indexed masked load!");
53234 // TODO: This is not x86-specific, so it could be lifted to DAGCombiner.
53235 // However, some target hooks may need to be added to know when the transform
53236 // is profitable. Endianness would also have to be considered.
53237
53238 SDValue Addr, VecIndex;
53239 Align Alignment;
53240 unsigned Offset;
53241 if (!getParamsForOneTrueMaskedElt(ML, DAG, Addr, VecIndex, Alignment, Offset))
53242 return SDValue();
53243
53244 // Load the one scalar element that is specified by the mask using the
53245 // appropriate offset from the base pointer.
53246 SDLoc DL(ML);
53247 EVT VT = ML->getValueType(0);
53248 EVT EltVT = VT.getVectorElementType();
53249
53250 EVT CastVT = VT;
53251 if (EltVT == MVT::i64 && !Subtarget.is64Bit()) {
53252 EltVT = MVT::f64;
53253 CastVT = VT.changeVectorElementType(EltVT);
53254 }
53255
53256 SDValue Load =
53257 DAG.getLoad(EltVT, DL, ML->getChain(), Addr,
53258 ML->getPointerInfo().getWithOffset(Offset),
53259 Alignment, ML->getMemOperand()->getFlags());
53260
53261 SDValue PassThru = DAG.getBitcast(CastVT, ML->getPassThru());
53262
53263 // Insert the loaded element into the appropriate place in the vector.
53264 SDValue Insert =
53265 DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, CastVT, PassThru, Load, VecIndex);
53266 Insert = DAG.getBitcast(VT, Insert);
53267 return DCI.CombineTo(ML, Insert, Load.getValue(1), true);
53268}
53269
53270static SDValue
53271combineMaskedLoadConstantMask(MaskedLoadSDNode *ML, SelectionDAG &DAG,
53272                              TargetLowering::DAGCombinerInfo &DCI) {
53273  assert(ML->isUnindexed() && "Unexpected indexed masked load!");
53274 if (!ISD::isBuildVectorOfConstantSDNodes(ML->getMask().getNode()))
53275 return SDValue();
53276
53277 SDLoc DL(ML);
53278 EVT VT = ML->getValueType(0);
53279
53280 // If we are loading the first and last elements of a vector, it is safe and
53281 // always faster to load the whole vector. Replace the masked load with a
53282 // vector load and select.
53283 unsigned NumElts = VT.getVectorNumElements();
53284 BuildVectorSDNode *MaskBV = cast<BuildVectorSDNode>(ML->getMask());
53285 bool LoadFirstElt = !isNullConstant(MaskBV->getOperand(0));
53286 bool LoadLastElt = !isNullConstant(MaskBV->getOperand(NumElts - 1));
53287 if (LoadFirstElt && LoadLastElt) {
53288 SDValue VecLd = DAG.getLoad(VT, DL, ML->getChain(), ML->getBasePtr(),
53289 ML->getMemOperand());
53290 SDValue Blend = DAG.getSelect(DL, VT, ML->getMask(), VecLd,
53291 ML->getPassThru());
53292 return DCI.CombineTo(ML, Blend, VecLd.getValue(1), true);
53293 }
53294
53295 // Convert a masked load with a constant mask into a masked load and a select.
53296 // This allows the select operation to use a faster kind of select instruction
53297 // (for example, vblendvps -> vblendps).
53298
53299 // Don't try this if the pass-through operand is already undefined. That would
53300 // cause an infinite loop because that's what we're about to create.
53301 if (ML->getPassThru().isUndef())
53302 return SDValue();
53303
53304 if (ISD::isBuildVectorAllZeros(ML->getPassThru().getNode()))
53305 return SDValue();
53306
53307 // The new masked load has an undef pass-through operand. The select uses the
53308 // original pass-through operand.
53309 SDValue NewML = DAG.getMaskedLoad(
53310 VT, DL, ML->getChain(), ML->getBasePtr(), ML->getOffset(), ML->getMask(),
53311 DAG.getUNDEF(VT), ML->getMemoryVT(), ML->getMemOperand(),
53312 ML->getAddressingMode(), ML->getExtensionType());
53313 SDValue Blend = DAG.getSelect(DL, VT, ML->getMask(), NewML,
53314 ML->getPassThru());
53315
53316 return DCI.CombineTo(ML, Blend, NewML.getValue(1), true);
53317}
53318
53319static SDValue combineMaskedLoad(SDNode *N, SelectionDAG &DAG,
53320                                 TargetLowering::DAGCombinerInfo &DCI,
53321                                 const X86Subtarget &Subtarget) {
53322 auto *Mld = cast<MaskedLoadSDNode>(N);
53323
53324 // TODO: Expanding load with constant mask may be optimized as well.
53325 if (Mld->isExpandingLoad())
53326 return SDValue();
53327
53328 if (Mld->getExtensionType() == ISD::NON_EXTLOAD) {
53329 if (SDValue ScalarLoad =
53330 reduceMaskedLoadToScalarLoad(Mld, DAG, DCI, Subtarget))
53331 return ScalarLoad;
53332
53333 // TODO: Do some AVX512 subsets benefit from this transform?
53334 if (!Subtarget.hasAVX512())
53335 if (SDValue Blend = combineMaskedLoadConstantMask(Mld, DAG, DCI))
53336 return Blend;
53337 }
53338
53339 // If the mask value has been legalized to a non-boolean vector, try to
53340 // simplify ops leading up to it. We only demand the MSB of each lane.
53341 SDValue Mask = Mld->getMask();
53342 if (Mask.getScalarValueSizeInBits() != 1) {
53343 EVT VT = Mld->getValueType(0);
53344 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
53345    APInt DemandedBits(APInt::getSignMask(VT.getScalarSizeInBits()));
53346    if (TLI.SimplifyDemandedBits(Mask, DemandedBits, DCI)) {
53347 if (N->getOpcode() != ISD::DELETED_NODE)
53348 DCI.AddToWorklist(N);
53349 return SDValue(N, 0);
53350 }
53351 if (SDValue NewMask =
53352            TLI.SimplifyMultipleUseDemandedBits(Mask, DemandedBits, DAG))
53353      return DAG.getMaskedLoad(
53354 VT, SDLoc(N), Mld->getChain(), Mld->getBasePtr(), Mld->getOffset(),
53355 NewMask, Mld->getPassThru(), Mld->getMemoryVT(), Mld->getMemOperand(),
53356 Mld->getAddressingMode(), Mld->getExtensionType());
53357 }
53358
53359 return SDValue();
53360}
53361
53362/// If exactly one element of the mask is set for a non-truncating masked store,
53363/// it is a vector extract and scalar store.
53364/// Note: It is expected that the degenerate cases of an all-zeros or all-ones
53365/// mask have already been optimized in IR, so we don't bother with those here.
53366static SDValue reduceMaskedStoreToScalarStore(MaskedStoreSDNode *MS,
53367                                              SelectionDAG &DAG,
53368 const X86Subtarget &Subtarget) {
53369 // TODO: This is not x86-specific, so it could be lifted to DAGCombiner.
53370 // However, some target hooks may need to be added to know when the transform
53371 // is profitable. Endianness would also have to be considered.
53372
53373 SDValue Addr, VecIndex;
53374 Align Alignment;
53375 unsigned Offset;
53376 if (!getParamsForOneTrueMaskedElt(MS, DAG, Addr, VecIndex, Alignment, Offset))
53377 return SDValue();
53378
53379 // Extract the one scalar element that is actually being stored.
53380 SDLoc DL(MS);
53381 SDValue Value = MS->getValue();
53382 EVT VT = Value.getValueType();
53383 EVT EltVT = VT.getVectorElementType();
53384 if (EltVT == MVT::i64 && !Subtarget.is64Bit()) {
53385 EltVT = MVT::f64;
53386 EVT CastVT = VT.changeVectorElementType(EltVT);
53387 Value = DAG.getBitcast(CastVT, Value);
53388 }
53389 SDValue Extract =
53390 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, EltVT, Value, VecIndex);
53391
53392 // Store that element at the appropriate offset from the base pointer.
53393 return DAG.getStore(MS->getChain(), DL, Extract, Addr,
53394                      MS->getPointerInfo().getWithOffset(Offset),
53395                      Alignment, MS->getMemOperand()->getFlags());
53396}
53397
53398static SDValue combineMaskedStore(SDNode *N, SelectionDAG &DAG,
53399                                  TargetLowering::DAGCombinerInfo &DCI,
53400                                  const X86Subtarget &Subtarget) {
53401  auto *Mst = cast<MaskedStoreSDNode>(N);
53402  if (Mst->isCompressingStore())
53403 return SDValue();
53404
53405 EVT VT = Mst->getValue().getValueType();
53406 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
53407
53408 if (Mst->isTruncatingStore())
53409 return SDValue();
53410
53411 if (SDValue ScalarStore = reduceMaskedStoreToScalarStore(Mst, DAG, Subtarget))
53412 return ScalarStore;
53413
53414 // If the mask value has been legalized to a non-boolean vector, try to
53415 // simplify ops leading up to it. We only demand the MSB of each lane.
53416 SDValue Mask = Mst->getMask();
53417 if (Mask.getScalarValueSizeInBits() != 1) {
53418    APInt DemandedBits(APInt::getSignMask(VT.getScalarSizeInBits()));
53419    if (TLI.SimplifyDemandedBits(Mask, DemandedBits, DCI)) {
53420 if (N->getOpcode() != ISD::DELETED_NODE)
53421 DCI.AddToWorklist(N);
53422 return SDValue(N, 0);
53423 }
53424 if (SDValue NewMask =
53425            TLI.SimplifyMultipleUseDemandedBits(Mask, DemandedBits, DAG))
53426      return DAG.getMaskedStore(Mst->getChain(), SDLoc(N), Mst->getValue(),
53427 Mst->getBasePtr(), Mst->getOffset(), NewMask,
53428 Mst->getMemoryVT(), Mst->getMemOperand(),
53429 Mst->getAddressingMode());
53430 }
53431
53432 SDValue Value = Mst->getValue();
53433 if (Value.getOpcode() == ISD::TRUNCATE && Value.getNode()->hasOneUse() &&
53434 TLI.isTruncStoreLegal(Value.getOperand(0).getValueType(),
53435 Mst->getMemoryVT())) {
53436 return DAG.getMaskedStore(Mst->getChain(), SDLoc(N), Value.getOperand(0),
53437 Mst->getBasePtr(), Mst->getOffset(), Mask,
53438 Mst->getMemoryVT(), Mst->getMemOperand(),
53439 Mst->getAddressingMode(), true);
53440 }
53441
53442 return SDValue();
53443}
53444
53445static SDValue combineStore(SDNode *N, SelectionDAG &DAG,
53446                            TargetLowering::DAGCombinerInfo &DCI,
53447                            const X86Subtarget &Subtarget) {
53448  StoreSDNode *St = cast<StoreSDNode>(N);
53449  EVT StVT = St->getMemoryVT();
53450 SDLoc dl(St);
53451 SDValue StoredVal = St->getValue();
53452 EVT VT = StoredVal.getValueType();
53453 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
53454
53455 // Convert a store of vXi1 into a store of iX and a bitcast.
53456 if (!Subtarget.hasAVX512() && VT == StVT && VT.isVector() &&
53457 VT.getVectorElementType() == MVT::i1) {
53458
53459    EVT NewVT = EVT::getIntegerVT(*DAG.getContext(), VT.getVectorNumElements());
53460    StoredVal = DAG.getBitcast(NewVT, StoredVal);
53461
53462 return DAG.getStore(St->getChain(), dl, StoredVal, St->getBasePtr(),
53463 St->getPointerInfo(), St->getBaseAlign(),
53464 St->getMemOperand()->getFlags());
53465 }
53466
53467 // If this is a store of a scalar_to_vector to v1i1, just use a scalar store.
53468 // This will avoid a copy to k-register.
53469 if (VT == MVT::v1i1 && VT == StVT && Subtarget.hasAVX512() &&
53470 StoredVal.getOpcode() == ISD::SCALAR_TO_VECTOR &&
53471 StoredVal.getOperand(0).getValueType() == MVT::i8) {
53472 SDValue Val = StoredVal.getOperand(0);
53473 // We must store zeros to the unused bits.
53474 Val = DAG.getZeroExtendInReg(Val, dl, MVT::i1);
53475 return DAG.getStore(St->getChain(), dl, Val, St->getBasePtr(),
53476 St->getPointerInfo(), St->getBaseAlign(),
53477 St->getMemOperand()->getFlags());
53478 }
53479
53480 // Widen v2i1/v4i1 stores to v8i1.
53481 if ((VT == MVT::v1i1 || VT == MVT::v2i1 || VT == MVT::v4i1) && VT == StVT &&
53482 Subtarget.hasAVX512()) {
53483 unsigned NumConcats = 8 / VT.getVectorNumElements();
53484 // We must store zeros to the unused bits.
53485 SmallVector<SDValue, 4> Ops(NumConcats, DAG.getConstant(0, dl, VT));
53486 Ops[0] = StoredVal;
53487 StoredVal = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v8i1, Ops);
53488 return DAG.getStore(St->getChain(), dl, StoredVal, St->getBasePtr(),
53489 St->getPointerInfo(), St->getBaseAlign(),
53490 St->getMemOperand()->getFlags());
53491 }
53492
53493 // Turn vXi1 stores of constants into a scalar store.
53494 if ((VT == MVT::v8i1 || VT == MVT::v16i1 || VT == MVT::v32i1 ||
53495 VT == MVT::v64i1) && VT == StVT && TLI.isTypeLegal(VT) &&
53496      ISD::isBuildVectorOfConstantSDNodes(StoredVal.getNode())) {
53497    // If it's a v64i1 store without 64-bit support, we need two stores.
53498 if (!DCI.isBeforeLegalize() && VT == MVT::v64i1 && !Subtarget.is64Bit()) {
53499 SDValue Lo = DAG.getBuildVector(MVT::v32i1, dl,
53500 StoredVal->ops().slice(0, 32));
53501      Lo = combinevXi1ConstantToInteger(Lo, DAG);
53502      SDValue Hi = DAG.getBuildVector(MVT::v32i1, dl,
53503 StoredVal->ops().slice(32, 32));
53504      Hi = combinevXi1ConstantToInteger(Hi, DAG);
53505
53506 SDValue Ptr0 = St->getBasePtr();
53507 SDValue Ptr1 = DAG.getMemBasePlusOffset(Ptr0, TypeSize::getFixed(4), dl);
53508
53509 SDValue Ch0 =
53510 DAG.getStore(St->getChain(), dl, Lo, Ptr0, St->getPointerInfo(),
53511 St->getBaseAlign(), St->getMemOperand()->getFlags());
53512 SDValue Ch1 = DAG.getStore(
53513 St->getChain(), dl, Hi, Ptr1, St->getPointerInfo().getWithOffset(4),
53514 St->getBaseAlign(), St->getMemOperand()->getFlags());
53515 return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Ch0, Ch1);
53516 }
53517
53518 StoredVal = combinevXi1ConstantToInteger(StoredVal, DAG);
53519 return DAG.getStore(St->getChain(), dl, StoredVal, St->getBasePtr(),
53520 St->getPointerInfo(), St->getBaseAlign(),
53521 St->getMemOperand()->getFlags());
53522 }
53523
53524 // Convert scalar fabs/fneg load-store to integer equivalents.
53525 if ((VT == MVT::f16 || VT == MVT::bf16 || VT == MVT::f32 || VT == MVT::f64) &&
53526 (StoredVal.getOpcode() == ISD::FABS ||
53527 StoredVal.getOpcode() == ISD::FNEG) &&
53528 ISD::isNormalLoad(StoredVal.getOperand(0).getNode()) &&
53529 StoredVal.hasOneUse() && StoredVal.getOperand(0).hasOneUse()) {
53530 MVT IntVT = VT.getSimpleVT().changeTypeToInteger();
53531 if (TLI.isTypeLegal(IntVT)) {
53532      APInt SignMask = APInt::getSignMask(VT.getScalarSizeInBits());
53533      unsigned SignOp = ISD::XOR;
53534 if (StoredVal.getOpcode() == ISD::FABS) {
53535 SignMask = ~SignMask;
53536 SignOp = ISD::AND;
53537 }
53538 SDValue LogicOp = DAG.getNode(
53539 SignOp, dl, IntVT, DAG.getBitcast(IntVT, StoredVal.getOperand(0)),
53540 DAG.getConstant(SignMask, dl, IntVT));
53541 return DAG.getStore(St->getChain(), dl, LogicOp, St->getBasePtr(),
53542 St->getPointerInfo(), St->getBaseAlign(),
53543 St->getMemOperand()->getFlags());
53544 }
53545 }
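  // For example, storing fneg(load float) becomes an i32 xor with 0x80000000,
  // and fabs becomes an i32 and with 0x7FFFFFFF.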
53546
53547 // If we are saving a 32-byte vector and 32-byte stores are slow, such as on
53548 // Sandy Bridge, perform two 16-byte stores.
53549 unsigned Fast;
53550 if (VT.is256BitVector() && StVT == VT &&
53551 TLI.allowsMemoryAccess(*DAG.getContext(), DAG.getDataLayout(), VT,
53552 *St->getMemOperand(), &Fast) &&
53553 !Fast) {
53554 unsigned NumElems = VT.getVectorNumElements();
53555 if (NumElems < 2)
53556 return SDValue();
53557
53558 return splitVectorStore(St, DAG);
53559 }
53560
53561 // Split under-aligned vector non-temporal stores.
53562 if (St->isNonTemporal() && StVT == VT &&
53563 St->getAlign().value() < VT.getStoreSize()) {
53564 // ZMM/YMM nt-stores - either it can be stored as a series of shorter
53565 // vectors or the legalizer can scalarize it to use MOVNTI.
53566 if (VT.is256BitVector() || VT.is512BitVector()) {
53567 unsigned NumElems = VT.getVectorNumElements();
53568 if (NumElems < 2)
53569 return SDValue();
53570 return splitVectorStore(St, DAG);
53571 }
53572
53573 // XMM nt-stores - scalarize this to f64 nt-stores on SSE4A, else i32/i64
53574 // to use MOVNTI.
53575 if (VT.is128BitVector() && Subtarget.hasSSE2()) {
53576 MVT NTVT = Subtarget.hasSSE4A()
53577 ? MVT::v2f64
53578 : (TLI.isTypeLegal(MVT::i64) ? MVT::v2i64 : MVT::v4i32);
53579 return scalarizeVectorStore(St, NTVT, DAG);
53580 }
53581 }
53582
53583 // Try to optimize v16i16->v16i8 truncating stores when BWI is not
53584 // supported, but avx512f is by extending to v16i32 and truncating.
53585 if (!St->isTruncatingStore() && VT == MVT::v16i8 && !Subtarget.hasBWI() &&
53586 St->getValue().getOpcode() == ISD::TRUNCATE &&
53587 St->getValue().getOperand(0).getValueType() == MVT::v16i16 &&
53588 TLI.isTruncStoreLegal(MVT::v16i32, MVT::v16i8) &&
53589 St->getValue().hasOneUse() && !DCI.isBeforeLegalizeOps()) {
53590 SDValue Ext = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::v16i32,
53591 St->getValue().getOperand(0));
53592 return DAG.getTruncStore(St->getChain(), dl, Ext, St->getBasePtr(),
53593 MVT::v16i8, St->getMemOperand());
53594 }
53595
53596 // Try to fold a VTRUNCUS or VTRUNCS into a truncating store.
53597 if (!St->isTruncatingStore() &&
53598 (StoredVal.getOpcode() == X86ISD::VTRUNCUS ||
53599 StoredVal.getOpcode() == X86ISD::VTRUNCS) &&
53600 StoredVal.hasOneUse() &&
53601 TLI.isTruncStoreLegal(StoredVal.getOperand(0).getValueType(), VT)) {
53602 bool IsSigned = StoredVal.getOpcode() == X86ISD::VTRUNCS;
53603 return EmitTruncSStore(IsSigned, St->getChain(),
53604 dl, StoredVal.getOperand(0), St->getBasePtr(),
53605 VT, St->getMemOperand(), DAG);
53606 }
53607
53608 // Try to fold a extract_element(VTRUNC) pattern into a truncating store.
53609 if (!St->isTruncatingStore()) {
53610 auto IsExtractedElement = [](SDValue V) {
53611 if (V.getOpcode() == ISD::TRUNCATE && V.hasOneUse())
53612 V = V.getOperand(0);
53613 unsigned Opc = V.getOpcode();
53614      if ((Opc == ISD::EXTRACT_VECTOR_ELT || Opc == X86ISD::PEXTRW) &&
53615          isNullConstant(V.getOperand(1)) && V.hasOneUse() &&
53616 V.getOperand(0).hasOneUse())
53617 return V.getOperand(0);
53618 return SDValue();
53619 };
53620 if (SDValue Extract = IsExtractedElement(StoredVal)) {
53621 SDValue Trunc = peekThroughOneUseBitcasts(Extract);
53622 if (Trunc.getOpcode() == X86ISD::VTRUNC) {
53623 SDValue Src = Trunc.getOperand(0);
53624 MVT DstVT = Trunc.getSimpleValueType();
53625 MVT SrcVT = Src.getSimpleValueType();
53626 unsigned NumSrcElts = SrcVT.getVectorNumElements();
53627 unsigned NumTruncBits = DstVT.getScalarSizeInBits() * NumSrcElts;
53628 MVT TruncVT = MVT::getVectorVT(DstVT.getScalarType(), NumSrcElts);
53629 if (NumTruncBits == VT.getSizeInBits() &&
53630 TLI.isTruncStoreLegal(SrcVT, TruncVT)) {
53631 return DAG.getTruncStore(St->getChain(), dl, Src, St->getBasePtr(),
53632 TruncVT, St->getMemOperand());
53633 }
53634 }
53635 }
53636 }
53637
53638 // Optimize trunc store (of multiple scalars) to shuffle and store.
53639 // First, pack all of the elements in one place. Next, store to memory
53640 // in fewer chunks.
53641 if (St->isTruncatingStore() && VT.isVector()) {
53642 if (TLI.isTruncStoreLegal(VT, StVT)) {
53643 if (SDValue Val = detectSSatPattern(St->getValue(), St->getMemoryVT()))
53644 return EmitTruncSStore(true /* Signed saturation */, St->getChain(),
53645 dl, Val, St->getBasePtr(),
53646 St->getMemoryVT(), St->getMemOperand(), DAG);
53647 if (SDValue Val = detectUSatPattern(St->getValue(), St->getMemoryVT(),
53648 DAG, dl))
53649 return EmitTruncSStore(false /* Unsigned saturation */, St->getChain(),
53650 dl, Val, St->getBasePtr(),
53651 St->getMemoryVT(), St->getMemOperand(), DAG);
53652 }
53653
53654 return SDValue();
53655 }
53656
53657 // Cast ptr32 and ptr64 pointers to the default address space before a store.
53658 unsigned AddrSpace = St->getAddressSpace();
53659 if (AddrSpace == X86AS::PTR64 || AddrSpace == X86AS::PTR32_SPTR ||
53660 AddrSpace == X86AS::PTR32_UPTR) {
53661 MVT PtrVT = TLI.getPointerTy(DAG.getDataLayout());
53662 if (PtrVT != St->getBasePtr().getSimpleValueType()) {
53663 SDValue Cast =
53664 DAG.getAddrSpaceCast(dl, PtrVT, St->getBasePtr(), AddrSpace, 0);
53665 return DAG.getTruncStore(
53666 St->getChain(), dl, StoredVal, Cast, St->getPointerInfo(), StVT,
53667 St->getBaseAlign(), St->getMemOperand()->getFlags(), St->getAAInfo());
53668 }
53669 }
53670
53671 // Convert store(cmov(load(p), x, CC), p) to cstore(x, p, CC)
53672 // store(cmov(x, load(p), CC), p) to cstore(x, p, InvertCC)
53673 if ((VT == MVT::i16 || VT == MVT::i32 || VT == MVT::i64) &&
53674 Subtarget.hasCF() && St->isSimple()) {
53675 SDValue Cmov;
53676 if (StoredVal.getOpcode() == X86ISD::CMOV)
53677 Cmov = StoredVal;
53678 else if (StoredVal.getOpcode() == ISD::TRUNCATE &&
53679 StoredVal.getOperand(0).getOpcode() == X86ISD::CMOV)
53680 Cmov = StoredVal.getOperand(0);
53681 else
53682 return SDValue();
53683
53684 auto *Ld = dyn_cast<LoadSDNode>(St->getChain());
53685 if (!Ld || !Ld->isSimple() || Ld->getBasePtr() != St->getBasePtr())
53686 return SDValue();
53687
53688 bool InvertCC = false;
53689 SDValue V = SDValue(Ld, 0);
53690 if (V == Cmov.getOperand(1))
53691 InvertCC = true;
53692 else if (V != Cmov.getOperand(0))
53693 return SDValue();
53694
53695 SDVTList Tys = DAG.getVTList(MVT::Other);
53696 SDValue CC = Cmov.getOperand(2);
53697 SDValue Src = DAG.getAnyExtOrTrunc(Cmov.getOperand(!InvertCC), dl, VT);
53698 if (InvertCC)
53699 CC = DAG.getTargetConstant(
53700 X86::GetOppositeBranchCondition(
53701 (X86::CondCode)Cmov.getConstantOperandVal(2)),
53702 dl, MVT::i8);
53703 SDValue Ops[] = {St->getChain(), Src, St->getBasePtr(), CC,
53704 Cmov.getOperand(3)};
53705 return DAG.getMemIntrinsicNode(X86ISD::CSTORE, dl, Tys, Ops, VT,
53706 St->getMemOperand());
53707 }
53708
53709 // Turn load->store of MMX types into GPR load/stores. This avoids clobbering
53710 // the FP state in cases where an emms may be missing.
53711 // A preferable solution to the general problem is to figure out the right
53712 // places to insert EMMS. This qualifies as a quick hack.
53713
53714 // Similarly, turn load->store of i64 into double load/stores in 32-bit mode.
53715 if (VT.getSizeInBits() != 64)
53716 return SDValue();
53717
53718 const Function &F = DAG.getMachineFunction().getFunction();
53719 bool NoImplicitFloatOps = F.hasFnAttribute(Attribute::NoImplicitFloat);
53720 bool F64IsLegal =
53721 !Subtarget.useSoftFloat() && !NoImplicitFloatOps && Subtarget.hasSSE2();
53722
53723 if (!F64IsLegal || Subtarget.is64Bit())
53724 return SDValue();
53725
53726 if (VT == MVT::i64 && isa<LoadSDNode>(St->getValue()) &&
53727 cast<LoadSDNode>(St->getValue())->isSimple() &&
53728 St->getChain().hasOneUse() && St->isSimple()) {
53729 auto *Ld = cast<LoadSDNode>(St->getValue());
53730
53731 if (!ISD::isNormalLoad(Ld))
53732 return SDValue();
53733
53734 // Avoid the transformation if there are multiple uses of the loaded value.
53735 if (!Ld->hasNUsesOfValue(1, 0))
53736 return SDValue();
53737
53738 SDLoc LdDL(Ld);
53739 SDLoc StDL(N);
53740
53741 // Remove any range metadata as we're converting to f64 load/store.
53742 Ld->getMemOperand()->clearRanges();
53743
53744 // Lower to a single movq load/store pair.
53745 SDValue NewLd = DAG.getLoad(MVT::f64, LdDL, Ld->getChain(),
53746 Ld->getBasePtr(), Ld->getMemOperand());
53747
53748 // Make sure new load is placed in same chain order.
53749 DAG.makeEquivalentMemoryOrdering(Ld, NewLd);
53750 return DAG.getStore(St->getChain(), StDL, NewLd, St->getBasePtr(),
53751 St->getMemOperand());
53752 }
53753
53754 // This is similar to the above case, but here we handle a scalar 64-bit
53755 // integer store that is extracted from a vector on a 32-bit target.
53756 // If we have SSE2, then we can treat it like a floating-point double
53757 // to get past legalization. The execution dependencies fixup pass will
53758 // choose the optimal machine instruction for the store if this really is
53759 // an integer or v2f32 rather than an f64.
53760 if (VT == MVT::i64 &&
53761 St->getOperand(1).getOpcode() == ISD::EXTRACT_VECTOR_ELT) {
53762 SDValue OldExtract = St->getOperand(1);
53763 SDValue ExtOp0 = OldExtract.getOperand(0);
53764 unsigned VecSize = ExtOp0.getValueSizeInBits();
53765 EVT VecVT = EVT::getVectorVT(*DAG.getContext(), MVT::f64, VecSize / 64);
53766 SDValue BitCast = DAG.getBitcast(VecVT, ExtOp0);
53767 SDValue NewExtract = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64,
53768 BitCast, OldExtract.getOperand(1));
53769 return DAG.getStore(St->getChain(), dl, NewExtract, St->getBasePtr(),
53770 St->getPointerInfo(), St->getBaseAlign(),
53771 St->getMemOperand()->getFlags());
53772 }
53773
53774 return SDValue();
53775}
53776
53777 static SDValue combineVEXTRACT_STORE(SDNode *N, SelectionDAG &DAG,
53778 TargetLowering::DAGCombinerInfo &DCI,
53779 const X86Subtarget &Subtarget) {
53780 auto *St = cast<MemIntrinsicSDNode>(N);
53781
53782 SDValue StoredVal = N->getOperand(1);
53783 MVT VT = StoredVal.getSimpleValueType();
53784 EVT MemVT = St->getMemoryVT();
53785
53786 // Figure out which elements we demand.
53787 unsigned StElts = MemVT.getSizeInBits() / VT.getScalarSizeInBits();
53788 APInt DemandedElts = APInt::getLowBitsSet(VT.getVectorNumElements(), StElts);
53789
53790 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
53791 if (TLI.SimplifyDemandedVectorElts(StoredVal, DemandedElts, DCI)) {
53792 if (N->getOpcode() != ISD::DELETED_NODE)
53793 DCI.AddToWorklist(N);
53794 return SDValue(N, 0);
53795 }
53796
53797 return SDValue();
53798}
53799
53800/// Return 'true' if this vector operation is "horizontal"
53801/// and return the operands for the horizontal operation in LHS and RHS. A
53802/// horizontal operation performs the binary operation on successive elements
53803/// of its first operand, then on successive elements of its second operand,
53804/// returning the resulting values in a vector. For example, if
53805/// A = < float a0, float a1, float a2, float a3 >
53806/// and
53807/// B = < float b0, float b1, float b2, float b3 >
53808/// then the result of doing a horizontal operation on A and B is
53809/// A horizontal-op B = < a0 op a1, a2 op a3, b0 op b1, b2 op b3 >.
53810/// In short, LHS and RHS are inspected to see if LHS op RHS is of the form
53811/// A horizontal-op B, for some already available A and B, and if so then LHS is
53812/// set to A, RHS to B, and the routine returns 'true'.
53813static bool isHorizontalBinOp(unsigned HOpcode, SDValue &LHS, SDValue &RHS,
53814 SelectionDAG &DAG, const X86Subtarget &Subtarget,
53815 bool IsCommutative,
53816 SmallVectorImpl<int> &PostShuffleMask,
53817 bool ForceHorizOp) {
53818 // If either operand is undef, bail out. The binop should be simplified.
53819 if (LHS.isUndef() || RHS.isUndef())
53820 return false;
53821
53822 // Look for the following pattern:
53823 // A = < float a0, float a1, float a2, float a3 >
53824 // B = < float b0, float b1, float b2, float b3 >
53825 // and
53826 // LHS = VECTOR_SHUFFLE A, B, <0, 2, 4, 6>
53827 // RHS = VECTOR_SHUFFLE A, B, <1, 3, 5, 7>
53828 // then LHS op RHS = < a0 op a1, a2 op a3, b0 op b1, b2 op b3 >
53829 // which is A horizontal-op B.
53830
53831 MVT VT = LHS.getSimpleValueType();
53832 assert((VT.is128BitVector() || VT.is256BitVector()) &&
53833 "Unsupported vector type for horizontal add/sub");
53834 unsigned NumElts = VT.getVectorNumElements();
53835
53836 auto GetShuffle = [&](SDValue Op, SDValue &N0, SDValue &N1,
53837 SmallVectorImpl<int> &ShuffleMask) {
53838 bool UseSubVector = false;
53839 if (Op.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
53840 Op.getOperand(0).getValueType().is256BitVector() &&
53841 llvm::isNullConstant(Op.getOperand(1))) {
53842 Op = Op.getOperand(0);
53843 UseSubVector = true;
53844 }
53845 SDValue BC = peekThroughBitcasts(Op);
53846 SmallVector<int, 16> SrcMask, ScaledMask;
53847 SmallVector<SDValue, 2> SrcOps;
53848 if (getTargetShuffleInputs(BC, SrcOps, SrcMask, DAG) &&
53849 !isAnyZero(SrcMask) && all_of(SrcOps, [BC](SDValue Op) {
53850 return Op.getValueSizeInBits() == BC.getValueSizeInBits();
53851 })) {
53852 resolveTargetShuffleInputsAndMask(SrcOps, SrcMask);
53853 if (!UseSubVector && SrcOps.size() <= 2 &&
53854 scaleShuffleElements(SrcMask, NumElts, ScaledMask)) {
53855 N0 = !SrcOps.empty() ? SrcOps[0] : SDValue();
53856 N1 = SrcOps.size() > 1 ? SrcOps[1] : SDValue();
53857 ShuffleMask.assign(ScaledMask.begin(), ScaledMask.end());
53858 }
53859 if (UseSubVector && SrcOps.size() == 1 &&
53860 scaleShuffleElements(SrcMask, 2 * NumElts, ScaledMask)) {
53861 std::tie(N0, N1) = DAG.SplitVector(SrcOps[0], SDLoc(Op));
53862 ArrayRef<int> Mask = ArrayRef<int>(ScaledMask).slice(0, NumElts);
53863 ShuffleMask.assign(Mask.begin(), Mask.end());
53864 }
53865 }
53866 };
53867
53868 // View LHS in the form
53869 // LHS = VECTOR_SHUFFLE A, B, LMask
53870 // If LHS is not a shuffle, then pretend it is the identity shuffle:
53871 // LHS = VECTOR_SHUFFLE LHS, undef, <0, 1, ..., N-1>
53872 // NOTE: A default initialized SDValue represents an UNDEF of type VT.
53873 SDValue A, B;
53874 SmallVector<int, 16> LMask;
53875 GetShuffle(LHS, A, B, LMask);
53876
53877 // Likewise, view RHS in the form
53878 // RHS = VECTOR_SHUFFLE C, D, RMask
53879 SDValue C, D;
53880 SmallVector<int, 16> RMask;
53881 GetShuffle(RHS, C, D, RMask);
53882
53883 // At least one of the operands should be a vector shuffle.
53884 unsigned NumShuffles = (LMask.empty() ? 0 : 1) + (RMask.empty() ? 0 : 1);
53885 if (NumShuffles == 0)
53886 return false;
53887
53888 if (LMask.empty()) {
53889 A = LHS;
53890 for (unsigned i = 0; i != NumElts; ++i)
53891 LMask.push_back(i);
53892 }
53893
53894 if (RMask.empty()) {
53895 C = RHS;
53896 for (unsigned i = 0; i != NumElts; ++i)
53897 RMask.push_back(i);
53898 }
53899
53900 // If we have a unary mask, ensure the other op is set to null.
53901 if (isUndefOrInRange(LMask, 0, NumElts))
53902 B = SDValue();
53903 else if (isUndefOrInRange(LMask, NumElts, NumElts * 2))
53904 A = SDValue();
53905
53906 if (isUndefOrInRange(RMask, 0, NumElts))
53907 D = SDValue();
53908 else if (isUndefOrInRange(RMask, NumElts, NumElts * 2))
53909 C = SDValue();
53910
53911 // If A and B occur in reverse order in RHS, then canonicalize by commuting
53912 // RHS operands and shuffle mask.
53913 if (A != C) {
53914 std::swap(C, D);
53915 ShuffleVectorSDNode::commuteMask(RMask);
53916 }
53917 // Check that the shuffles are both shuffling the same vectors.
53918 if (!(A == C && B == D))
53919 return false;
53920
53921 PostShuffleMask.clear();
53922 PostShuffleMask.append(NumElts, SM_SentinelUndef);
53923
53924 // LHS and RHS are now:
53925 // LHS = shuffle A, B, LMask
53926 // RHS = shuffle A, B, RMask
53927 // Check that the masks correspond to performing a horizontal operation.
53928 // AVX defines horizontal add/sub to operate independently on 128-bit lanes,
53929 // so we just repeat the inner loop if this is a 256-bit op.
53930 unsigned Num128BitChunks = VT.getSizeInBits() / 128;
53931 unsigned NumEltsPer128BitChunk = NumElts / Num128BitChunks;
53932 unsigned NumEltsPer64BitChunk = NumEltsPer128BitChunk / 2;
53933 assert((NumEltsPer128BitChunk % 2 == 0) &&
53934 "Vector type should have an even number of elements in each lane");
53935 for (unsigned j = 0; j != NumElts; j += NumEltsPer128BitChunk) {
53936 for (unsigned i = 0; i != NumEltsPer128BitChunk; ++i) {
53937 // Ignore undefined components.
53938 int LIdx = LMask[i + j], RIdx = RMask[i + j];
53939 if (LIdx < 0 || RIdx < 0 ||
53940 (!A.getNode() && (LIdx < (int)NumElts || RIdx < (int)NumElts)) ||
53941 (!B.getNode() && (LIdx >= (int)NumElts || RIdx >= (int)NumElts)))
53942 continue;
53943
53944 // Check that successive odd/even elements are being operated on. If not,
53945 // this is not a horizontal operation.
53946 if (!((RIdx & 1) == 1 && (LIdx + 1) == RIdx) &&
53947 !((LIdx & 1) == 1 && (RIdx + 1) == LIdx && IsCommutative))
53948 return false;
53949
53950 // Compute the post-shuffle mask index based on where the element
53951 // is stored in the HOP result, and where it needs to be moved to.
53952 int Base = LIdx & ~1u;
53953 int Index = ((Base % NumEltsPer128BitChunk) / 2) +
53954 ((Base % NumElts) & ~(NumEltsPer128BitChunk - 1));
53955
53956 // The low half of the 128-bit result must choose from A.
53957 // The high half of the 128-bit result must choose from B,
53958 // unless B is undef. In that case, we are always choosing from A.
53959 if ((B && Base >= (int)NumElts) || (!B && i >= NumEltsPer64BitChunk))
53960 Index += NumEltsPer64BitChunk;
53961 PostShuffleMask[i + j] = Index;
53962 }
53963 }
53964
53965 SDValue NewLHS = A.getNode() ? A : B; // If A is 'UNDEF', use B for it.
53966 SDValue NewRHS = B.getNode() ? B : A; // If B is 'UNDEF', use A for it.
53967
53968 bool IsIdentityPostShuffle =
53969 isSequentialOrUndefInRange(PostShuffleMask, 0, NumElts, 0);
53970 if (IsIdentityPostShuffle)
53971 PostShuffleMask.clear();
53972
53973 // Avoid 128-bit multi lane shuffles if pre-AVX2 and FP (integer will split).
53974 if (!IsIdentityPostShuffle && !Subtarget.hasAVX2() && VT.isFloatingPoint() &&
53975 isMultiLaneShuffleMask(128, VT.getScalarSizeInBits(), PostShuffleMask))
53976 return false;
53977
53978 // If the source nodes are already used in HorizOps then always accept this.
53979 // Shuffle folding should merge these back together.
53980 auto FoundHorizUser = [&](SDNode *User) {
53981 return User->getOpcode() == HOpcode && User->getValueType(0) == VT;
53982 };
53983 ForceHorizOp =
53984 ForceHorizOp || (llvm::any_of(NewLHS->users(), FoundHorizUser) &&
53985 llvm::any_of(NewRHS->users(), FoundHorizUser));
53986
53987 // Assume a SingleSource HOP if we only shuffle one input and don't need to
53988 // shuffle the result.
53989 if (!ForceHorizOp &&
53990 !shouldUseHorizontalOp(NewLHS == NewRHS &&
53991 (NumShuffles < 2 || !IsIdentityPostShuffle),
53992 DAG, Subtarget))
53993 return false;
53994
53995 LHS = DAG.getBitcast(VT, NewLHS);
53996 RHS = DAG.getBitcast(VT, NewRHS);
53997 return true;
53998}
53999
54000// Try to synthesize horizontal (f)hadd/hsub from (f)adds/subs of shuffles.
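// e.g. for v4f32:
//   (fadd (shuffle X, Y, <0,2,4,6>), (shuffle X, Y, <1,3,5,7>)) --> (fhadd X, Y)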
54001 static SDValue combineToHorizontalAddSub(SDNode *N, SelectionDAG &DAG,
54002 const X86Subtarget &Subtarget) {
54003 EVT VT = N->getValueType(0);
54004 unsigned Opcode = N->getOpcode();
54005 bool IsAdd = (Opcode == ISD::FADD) || (Opcode == ISD::ADD);
54006 SmallVector<int, 8> PostShuffleMask;
54007
54008 auto MergableHorizOp = [N](unsigned HorizOpcode) {
54009 return N->hasOneUse() &&
54010 N->user_begin()->getOpcode() == ISD::VECTOR_SHUFFLE &&
54011 (N->user_begin()->getOperand(0).getOpcode() == HorizOpcode ||
54012 N->user_begin()->getOperand(1).getOpcode() == HorizOpcode);
54013 };
54014
54015 switch (Opcode) {
54016 case ISD::FADD:
54017 case ISD::FSUB:
54018 if ((Subtarget.hasSSE3() && (VT == MVT::v4f32 || VT == MVT::v2f64)) ||
54019 (Subtarget.hasAVX() && (VT == MVT::v8f32 || VT == MVT::v4f64))) {
54020 SDValue LHS = N->getOperand(0);
54021 SDValue RHS = N->getOperand(1);
54022 auto HorizOpcode = IsAdd ? X86ISD::FHADD : X86ISD::FHSUB;
54023 if (isHorizontalBinOp(HorizOpcode, LHS, RHS, DAG, Subtarget, IsAdd,
54024 PostShuffleMask, MergableHorizOp(HorizOpcode))) {
54025 SDValue HorizBinOp = DAG.getNode(HorizOpcode, SDLoc(N), VT, LHS, RHS);
54026 if (!PostShuffleMask.empty())
54027 HorizBinOp = DAG.getVectorShuffle(VT, SDLoc(HorizBinOp), HorizBinOp,
54028 DAG.getUNDEF(VT), PostShuffleMask);
54029 return HorizBinOp;
54030 }
54031 }
54032 break;
54033 case ISD::ADD:
54034 case ISD::SUB:
54035 if (Subtarget.hasSSSE3() && (VT == MVT::v8i16 || VT == MVT::v4i32 ||
54036 VT == MVT::v16i16 || VT == MVT::v8i32)) {
54037 SDValue LHS = N->getOperand(0);
54038 SDValue RHS = N->getOperand(1);
54039 auto HorizOpcode = IsAdd ? X86ISD::HADD : X86ISD::HSUB;
54040 if (isHorizontalBinOp(HorizOpcode, LHS, RHS, DAG, Subtarget, IsAdd,
54041 PostShuffleMask, MergableHorizOp(HorizOpcode))) {
54042 auto HOpBuilder = [HorizOpcode](SelectionDAG &DAG, const SDLoc &DL,
54043 ArrayRef<SDValue> Ops) {
54044 return DAG.getNode(HorizOpcode, DL, Ops[0].getValueType(), Ops);
54045 };
54046 SDValue HorizBinOp = SplitOpsAndApply(DAG, Subtarget, SDLoc(N), VT,
54047 {LHS, RHS}, HOpBuilder);
54048 if (!PostShuffleMask.empty())
54049 HorizBinOp = DAG.getVectorShuffle(VT, SDLoc(HorizBinOp), HorizBinOp,
54050 DAG.getUNDEF(VT), PostShuffleMask);
54051 return HorizBinOp;
54052 }
54053 }
54054 break;
54055 }
54056
54057 return SDValue();
54058}
54059
54060// Try to combine the following nodes
54061// t29: i64 = X86ISD::Wrapper TargetConstantPool:i64
54062// <i32 -2147483648[float -0.000000e+00]> 0
54063// t27: v16i32[v16f32],ch = X86ISD::VBROADCAST_LOAD
54064// <(load 4 from constant-pool)> t0, t29
54065// [t30: v16i32 = bitcast t27]
54066// t6: v16i32 = xor t7, t27[t30]
54067// t11: v16f32 = bitcast t6
54068// t21: v16f32 = X86ISD::VFMULC[X86ISD::VCFMULC] t11, t8
54069// into X86ISD::VFCMULC[X86ISD::VFMULC] if possible:
54070// t22: v16f32 = bitcast t7
54071// t23: v16f32 = X86ISD::VFCMULC[X86ISD::VFMULC] t8, t22
54072// t24: v32f16 = bitcast t23
54073 static SDValue combineFMulcFCMulc(SDNode *N, SelectionDAG &DAG,
54074 const X86Subtarget &Subtarget) {
54075 EVT VT = N->getValueType(0);
54076 SDValue LHS = N->getOperand(0);
54077 SDValue RHS = N->getOperand(1);
54078 int CombineOpcode =
54079 N->getOpcode() == X86ISD::VFCMULC ? X86ISD::VFMULC : X86ISD::VFCMULC;
54080 auto combineConjugation = [&](SDValue &r) {
54081 if (LHS->getOpcode() == ISD::BITCAST) {
54082 SDValue XOR = LHS.getOperand(0);
54083 if (XOR->getOpcode() == ISD::XOR) {
54084 KnownBits XORRHS = DAG.computeKnownBits(XOR.getOperand(1));
54085 if (XORRHS.isConstant()) {
54086 APInt ConjugationInt32 = APInt(32, 0x80000000);
54087 APInt ConjugationInt64 = APInt(64, 0x8000000080000000ULL);
54088 if ((XORRHS.getBitWidth() == 32 &&
54089 XORRHS.getConstant() == ConjugationInt32) ||
54090 (XORRHS.getBitWidth() == 64 &&
54091 XORRHS.getConstant() == ConjugationInt64)) {
54092 SelectionDAG::FlagInserter FlagsInserter(DAG, N);
54093 SDValue I2F = DAG.getBitcast(VT, LHS.getOperand(0).getOperand(0));
54094 SDValue FCMulC = DAG.getNode(CombineOpcode, SDLoc(N), VT, RHS, I2F);
54095 r = DAG.getBitcast(VT, FCMulC);
54096 return true;
54097 }
54098 }
54099 }
54100 }
54101 return false;
54102 };
54103 SDValue Res;
54104 if (combineConjugation(Res))
54105 return Res;
54106 std::swap(LHS, RHS);
54107 if (combineConjugation(Res))
54108 return Res;
54109 return Res;
54110}
54111
54112// Try to combine the following nodes:
54113// FADD(A, FMA(B, C, 0)) and FADD(A, FMUL(B, C)) to FMA(B, C, A)
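// (Here FMUL/FMA refer to the complex-half nodes X86ISD::VF(C)MULC / VF(C)MADDC,
// which operate on pairs of f16 lanes interpreted as complex numbers.)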
54114 static SDValue combineFaddCFmul(SDNode *N, SelectionDAG &DAG,
54115 const X86Subtarget &Subtarget) {
54116 auto AllowContract = [&DAG](const SDNodeFlags &Flags) {
54117 return DAG.getTarget().Options.AllowFPOpFusion == FPOpFusion::Fast ||
54118 Flags.hasAllowContract();
54119 };
54120
54121 auto HasNoSignedZero = [&DAG](const SDNodeFlags &Flags) {
54122 return DAG.getTarget().Options.NoSignedZerosFPMath ||
54123 Flags.hasNoSignedZeros();
54124 };
54125 auto IsVectorAllNegativeZero = [&DAG](SDValue Op) {
54126 APInt AI = APInt(32, 0x80008000);
54127 KnownBits Bits = DAG.computeKnownBits(Op);
54128 return Bits.getBitWidth() == 32 && Bits.isConstant() &&
54129 Bits.getConstant() == AI;
54130 };
54131
54132 if (N->getOpcode() != ISD::FADD || !Subtarget.hasFP16() ||
54133 !AllowContract(N->getFlags()))
54134 return SDValue();
54135
54136 EVT VT = N->getValueType(0);
54137 if (VT != MVT::v8f16 && VT != MVT::v16f16 && VT != MVT::v32f16)
54138 return SDValue();
54139
54140 SDValue LHS = N->getOperand(0);
54141 SDValue RHS = N->getOperand(1);
54142 bool IsConj;
54143 SDValue FAddOp1, MulOp0, MulOp1;
54144 auto GetCFmulFrom = [&MulOp0, &MulOp1, &IsConj, &AllowContract,
54145 &IsVectorAllNegativeZero,
54146 &HasNoSignedZero](SDValue N) -> bool {
54147 if (!N.hasOneUse() || N.getOpcode() != ISD::BITCAST)
54148 return false;
54149 SDValue Op0 = N.getOperand(0);
54150 unsigned Opcode = Op0.getOpcode();
54151 if (Op0.hasOneUse() && AllowContract(Op0->getFlags())) {
54152 if ((Opcode == X86ISD::VFMULC || Opcode == X86ISD::VFCMULC)) {
54153 MulOp0 = Op0.getOperand(0);
54154 MulOp1 = Op0.getOperand(1);
54155 IsConj = Opcode == X86ISD::VFCMULC;
54156 return true;
54157 }
54158 if ((Opcode == X86ISD::VFMADDC || Opcode == X86ISD::VFCMADDC) &&
54159 ((ISD::isBuildVectorAllZeros(Op0->getOperand(2).getNode()) &&
54160 HasNoSignedZero(Op0->getFlags())) ||
54161 IsVectorAllNegativeZero(Op0->getOperand(2)))) {
54162 MulOp0 = Op0.getOperand(0);
54163 MulOp1 = Op0.getOperand(1);
54164 IsConj = Opcode == X86ISD::VFCMADDC;
54165 return true;
54166 }
54167 }
54168 return false;
54169 };
54170
54171 if (GetCFmulFrom(LHS))
54172 FAddOp1 = RHS;
54173 else if (GetCFmulFrom(RHS))
54174 FAddOp1 = LHS;
54175 else
54176 return SDValue();
54177
54178 MVT CVT = MVT::getVectorVT(MVT::f32, VT.getVectorNumElements() / 2);
54179 FAddOp1 = DAG.getBitcast(CVT, FAddOp1);
54180 unsigned NewOp = IsConj ? X86ISD::VFCMADDC : X86ISD::VFMADDC;
54181 // FIXME: How do we handle when fast math flags of FADD are different from
54182 // CFMUL's?
54183 SDValue CFmul =
54184 DAG.getNode(NewOp, SDLoc(N), CVT, MulOp0, MulOp1, FAddOp1, N->getFlags());
54185 return DAG.getBitcast(VT, CFmul);
54186}
54187
54188/// Do target-specific dag combines on floating-point adds/subs.
54189 static SDValue combineFaddFsub(SDNode *N, SelectionDAG &DAG,
54190 const X86Subtarget &Subtarget) {
54191 if (SDValue HOp = combineToHorizontalAddSub(N, DAG, Subtarget))
54192 return HOp;
54193
54194 if (SDValue COp = combineFaddCFmul(N, DAG, Subtarget))
54195 return COp;
54196
54197 return SDValue();
54198}
54199
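// Widen narrow f16/f32 vector sources (v2f16/v4f16/v2f32) with undef elements so
// the FP-to-signed-integer conversion below can use a single X86ISD::CVTP2SI node.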
54200 static SDValue combineLRINT_LLRINT(SDNode *N, SelectionDAG &DAG,
54201 const X86Subtarget &Subtarget) {
54202 EVT VT = N->getValueType(0);
54203 SDValue Src = N->getOperand(0);
54204 EVT SrcVT = Src.getValueType();
54205 SDLoc DL(N);
54206
54207 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
54208
54209 // Let legalize expand this if it isn't a legal type yet.
54210 if (!TLI.isTypeLegal(VT))
54211 return SDValue();
54212
54213 if ((SrcVT.getScalarType() == MVT::f16 && !Subtarget.hasFP16()) ||
54214 (SrcVT.getScalarType() == MVT::f32 && !Subtarget.hasDQI()))
54215 return SDValue();
54216
54217 if (SrcVT == MVT::v2f16) {
54218 SrcVT = MVT::v4f16;
54219 Src = DAG.getNode(ISD::CONCAT_VECTORS, DL, SrcVT, Src,
54220 DAG.getUNDEF(MVT::v2f16));
54221 }
54222
54223 if (SrcVT == MVT::v4f16) {
54224 SrcVT = MVT::v8f16;
54225 Src = DAG.getNode(ISD::CONCAT_VECTORS, DL, SrcVT, Src,
54226 DAG.getUNDEF(MVT::v4f16));
54227 } else if (SrcVT == MVT::v2f32) {
54228 SrcVT = MVT::v4f32;
54229 Src = DAG.getNode(ISD::CONCAT_VECTORS, DL, SrcVT, Src,
54230 DAG.getUNDEF(MVT::v2f32));
54231 } else {
54232 return SDValue();
54233 }
54234
54235 return DAG.getNode(X86ISD::CVTP2SI, DL, VT, Src);
54236}
54237
54238// Attempt to fold some (truncate (srl (add/or/xor X, C1), C2)) patterns to
54239// (add/or/xor (truncate (srl X, C2)), C1'). C1' will be smaller than C1 so we
54240// are able to avoid generating code with MOVABS and large constants in certain
54241// cases.
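// e.g. (i32 (trunc (srl (or X:i64, 0xBAAAAAAAAAAAAAAA), 48)))
//        --> (or (i32 (trunc (srl X, 48))), 0xBAAA)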
54242 static SDValue combinei64TruncSrlConstant(SDValue N, EVT VT, SelectionDAG &DAG,
54243 const SDLoc &DL) {
54244 assert(N.getOpcode() == ISD::SRL && "Unknown shift opcode");
54245 std::optional<unsigned> ValidSrlConst = DAG.getValidShiftAmount(N);
54246 if (!ValidSrlConst)
54247 return SDValue();
54248 unsigned SrlConstVal = *ValidSrlConst;
54249
54250 SDValue Op = N.getOperand(0);
54251 unsigned Opcode = Op.getOpcode();
54252 assert(VT == MVT::i32 && Op.getValueType() == MVT::i64 &&
54253 "Illegal truncation types");
54254
54255 if ((Opcode != ISD::ADD && Opcode != ISD::OR && Opcode != ISD::XOR) ||
54256 !isa<ConstantSDNode>(Op.getOperand(1)))
54257 return SDValue();
54258 const APInt &OpConst = Op.getConstantOperandAPInt(1);
54259
54260 if (SrlConstVal <= 32 ||
54261 (Opcode == ISD::ADD && OpConst.countr_zero() < SrlConstVal))
54262 return SDValue();
54263
54264 SDValue OpLhsSrl =
54265 DAG.getNode(ISD::SRL, DL, MVT::i64, Op.getOperand(0), N.getOperand(1));
54266 SDValue Trunc = DAG.getNode(ISD::TRUNCATE, DL, VT, OpLhsSrl);
54267
54268 APInt NewOpConstVal = OpConst.lshr(SrlConstVal).trunc(VT.getSizeInBits());
54269 SDValue NewOpConst = DAG.getConstant(NewOpConstVal, DL, VT);
54270 SDValue NewOpNode = DAG.getNode(Opcode, DL, VT, Trunc, NewOpConst);
54271
54272 if (Opcode == ISD::ADD) {
54273 EVT CleanUpVT = EVT::getIntegerVT(*DAG.getContext(), 64 - SrlConstVal);
54274 return DAG.getZeroExtendInReg(NewOpNode, DL, CleanUpVT);
54275 }
54276 return NewOpNode;
54277}
54278
54279/// Attempt to pre-truncate inputs to arithmetic ops if it will simplify
54280/// the codegen.
54281/// e.g. TRUNC( BINOP( X, Y ) ) --> BINOP( TRUNC( X ), TRUNC( Y ) )
54282/// TODO: This overlaps with the generic combiner's visitTRUNCATE. Remove
54283/// anything that is guaranteed to be transformed by DAGCombiner.
54284 static SDValue combineTruncatedArithmetic(SDNode *N, SelectionDAG &DAG,
54285 const X86Subtarget &Subtarget,
54286 const SDLoc &DL) {
54287 assert(N->getOpcode() == ISD::TRUNCATE && "Wrong opcode");
54288 SDValue Src = N->getOperand(0);
54289 unsigned SrcOpcode = Src.getOpcode();
54290 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
54291
54292 EVT VT = N->getValueType(0);
54293 EVT SrcVT = Src.getValueType();
54294
54295 auto IsFreeTruncation = [VT](SDValue Op) {
54296 unsigned TruncSizeInBits = VT.getScalarSizeInBits();
54297
54298 // See if this has been extended from a smaller/equal size to
54299 // the truncation size, allowing a truncation to combine with the extend.
54300 unsigned Opcode = Op.getOpcode();
54301 if ((Opcode == ISD::ANY_EXTEND || Opcode == ISD::SIGN_EXTEND ||
54302 Opcode == ISD::ZERO_EXTEND) &&
54303 Op.getOperand(0).getScalarValueSizeInBits() <= TruncSizeInBits)
54304 return true;
54305
54306 // See if this is a single use constant which can be constant folded.
54307 // NOTE: We don't peek through bitcasts here because there is currently
54308 // no support for constant folding truncate+bitcast+vector_of_constants. So
54309 // we'll just end up with a truncate on both operands which will
54310 // get turned back into (truncate (binop)) causing an infinite loop.
54311 return ISD::isBuildVectorOfConstantSDNodes(Op.getNode());
54312 };
54313
54314 auto TruncateArithmetic = [&](SDValue N0, SDValue N1) {
54315 SDValue Trunc0 = DAG.getNode(ISD::TRUNCATE, DL, VT, N0);
54316 SDValue Trunc1 = DAG.getNode(ISD::TRUNCATE, DL, VT, N1);
54317 return DAG.getNode(SrcOpcode, DL, VT, Trunc0, Trunc1);
54318 };
54319
54320 // Don't combine if the operation has other uses.
54321 if (!Src.hasOneUse())
54322 return SDValue();
54323
54324 if (VT == MVT::i32 && SrcVT == MVT::i64 && SrcOpcode == ISD::SRL)
54325 return combinei64TruncSrlConstant(Src, VT, DAG, DL);
54326
54327 if (!VT.isVector())
54328 return SDValue();
54329
54330 // In most cases it's only worth pre-truncating if we're only facing the cost
54331 // of one truncation.
54332 // i.e. if one of the inputs will constant fold or the input is repeated.
54333 switch (SrcOpcode) {
54334 case ISD::MUL:
54335 // X86 is rubbish at scalar and vector i64 multiplies (until AVX512DQ) - it's
54336 // better to truncate if we have the chance.
54337 if (SrcVT.getScalarType() == MVT::i64 &&
54338 TLI.isOperationLegal(SrcOpcode, VT) &&
54339 !TLI.isOperationLegal(SrcOpcode, SrcVT))
54340 return TruncateArithmetic(Src.getOperand(0), Src.getOperand(1));
54341 [[fallthrough]];
54342 case ISD::AND:
54343 case ISD::XOR:
54344 case ISD::OR:
54345 case ISD::ADD:
54346 case ISD::SUB: {
54347 SDValue Op0 = Src.getOperand(0);
54348 SDValue Op1 = Src.getOperand(1);
54349 if (TLI.isOperationLegal(SrcOpcode, VT) &&
54350 (Op0 == Op1 || IsFreeTruncation(Op0) || IsFreeTruncation(Op1)))
54351 return TruncateArithmetic(Op0, Op1);
54352 break;
54353 }
54354 }
54355
54356 return SDValue();
54357}
54358
54359// Try to form a MULHU or MULHS node by looking for
54360// (trunc (srl (mul ext, ext), >= 16))
54361// TODO: This is X86 specific because we want to be able to handle wide types
54362// before type legalization. But we can only do it if the vector will be
54363// legalized via widening/splitting. Type legalization can't handle promotion
54364// of a MULHU/MULHS. There isn't a way to convey this to the generic DAG
54365// combiner.
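// e.g. (v8i16 (trunc (srl (mul (zext v8i16:X to v8i32), (zext v8i16:Y to v8i32)), 16)))
//        --> (v8i16 (mulhu X, Y))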
54366static SDValue combinePMULH(SDValue Src, EVT VT, const SDLoc &DL,
54367 SelectionDAG &DAG, const X86Subtarget &Subtarget) {
54368 using namespace llvm::SDPatternMatch;
54369
54370 if (!Subtarget.hasSSE2())
54371 return SDValue();
54372
54373 // Only handle vXi16 types that are at least 128-bits unless they will be
54374 // widened.
54375 if (!VT.isVector() || VT.getVectorElementType() != MVT::i16)
54376 return SDValue();
54377
54378 // Input type should be at least vXi32.
54379 EVT InVT = Src.getValueType();
54380 if (InVT.getVectorElementType().getSizeInBits() < 32)
54381 return SDValue();
54382
54383 // First instruction should be a right shift by at least 16 of a multiply.
54384 SDValue LHS, RHS;
54385 APInt ShiftAmt;
54386 if (!sd_match(Src,
54387 m_Srl(m_Mul(m_Value(LHS), m_Value(RHS)), m_ConstInt(ShiftAmt))))
54388 return SDValue();
54389
54390 if (ShiftAmt.ult(16) || ShiftAmt.uge(InVT.getScalarSizeInBits()))
54391 return SDValue();
54392
54393 uint64_t AdditionalShift = ShiftAmt.getZExtValue() - 16;
54394
54395 // Count leading sign/zero bits on both inputs - if there are enough then
54396 // truncation back to vXi16 will be cheap - either as a pack/shuffle
54397 // sequence or using AVX512 truncations. If the inputs are sext/zext then the
54398 // truncations may actually be free by peeking through to the ext source.
54399 auto IsSext = [&DAG](SDValue V) {
54400 return DAG.ComputeMaxSignificantBits(V) <= 16;
54401 };
54402 auto IsZext = [&DAG](SDValue V) {
54403 return DAG.computeKnownBits(V).countMaxActiveBits() <= 16;
54404 };
54405
54406 bool IsSigned = IsSext(LHS) && IsSext(RHS);
54407 bool IsUnsigned = IsZext(LHS) && IsZext(RHS);
54408 if (!IsSigned && !IsUnsigned)
54409 return SDValue();
54410
54411 // Check if both inputs are extensions, which will be removed by truncation.
54412 auto isOpTruncateFree = [](SDValue Op) {
54413 if (Op.getOpcode() == ISD::SIGN_EXTEND ||
54414 Op.getOpcode() == ISD::ZERO_EXTEND)
54415 return Op.getOperand(0).getScalarValueSizeInBits() <= 16;
54416 return ISD::isBuildVectorOfConstantSDNodes(Op.getNode());
54417 };
54418 bool IsTruncateFree = isOpTruncateFree(LHS) && isOpTruncateFree(RHS);
54419
54420 // For AVX2+ targets, with the upper bits known zero, we can perform MULHU on
54421 // the (bitcasted) inputs directly, and then cheaply pack/truncate the result
54422 // (upper elts will be zero). Don't attempt this with just AVX512F as MULHU
54423 // will have to split anyway.
54424 unsigned InSizeInBits = InVT.getSizeInBits();
54425 if (IsUnsigned && !IsTruncateFree && Subtarget.hasInt256() &&
54426 !(Subtarget.hasAVX512() && !Subtarget.hasBWI() && VT.is256BitVector()) &&
54427 (InSizeInBits % 16) == 0) {
54428 EVT BCVT = EVT::getVectorVT(*DAG.getContext(), MVT::i16,
54429 InVT.getSizeInBits() / 16);
54430 SDValue Res = DAG.getNode(ISD::MULHU, DL, BCVT, DAG.getBitcast(BCVT, LHS),
54431 DAG.getBitcast(BCVT, RHS));
54432 Res = DAG.getNode(ISD::TRUNCATE, DL, VT, DAG.getBitcast(InVT, Res));
54433 return DAG.getNode(ISD::SRL, DL, VT, Res,
54434 DAG.getShiftAmountConstant(AdditionalShift, VT, DL));
54435 }
54436
54437 // Truncate back to source type.
54438 LHS = DAG.getNode(ISD::TRUNCATE, DL, VT, LHS);
54439 RHS = DAG.getNode(ISD::TRUNCATE, DL, VT, RHS);
54440
54441 unsigned Opc = IsSigned ? ISD::MULHS : ISD::MULHU;
54442 SDValue Res = DAG.getNode(Opc, DL, VT, LHS, RHS);
54443 return DAG.getNode(ISD::SRL, DL, VT, Res,
54444 DAG.getShiftAmountConstant(AdditionalShift, VT, DL));
54445}
54446
54447// Attempt to match PMADDUBSW, which multiplies corresponding unsigned bytes
54448// from one vector with signed bytes from another vector, adds together
54449// adjacent pairs of 16-bit products, and saturates the result before
54450// truncating to 16-bits.
54451//
54452// Which looks something like this:
54453// (i16 (ssat (add (mul (zext (even elts (i8 A))), (sext (even elts (i8 B)))),
54454// (mul (zext (odd elts (i8 A)), (sext (odd elts (i8 B))))))))
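// i.e. the semantics of a single X86ISD::VPMADDUBSW node, which computes for each i:
//   ssat.i16(zext(A[2*i]) * sext(B[2*i]) + zext(A[2*i+1]) * sext(B[2*i+1]))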
54455 static SDValue detectPMADDUBSW(SDValue In, EVT VT, SelectionDAG &DAG,
54456 const X86Subtarget &Subtarget,
54457 const SDLoc &DL) {
54458 if (!VT.isVector() || !Subtarget.hasSSSE3())
54459 return SDValue();
54460
54461 unsigned NumElems = VT.getVectorNumElements();
54462 EVT ScalarVT = VT.getVectorElementType();
54463 if (ScalarVT != MVT::i16 || NumElems < 8 || !isPowerOf2_32(NumElems))
54464 return SDValue();
54465
54466 SDValue SSatVal = detectSSatPattern(In, VT);
54467 if (!SSatVal || SSatVal.getOpcode() != ISD::ADD)
54468 return SDValue();
54469
54470 // Ok this is a signed saturation of an ADD. See if this ADD is adding pairs
54471 // of multiplies from even/odd elements.
54472 SDValue N0 = SSatVal.getOperand(0);
54473 SDValue N1 = SSatVal.getOperand(1);
54474
54475 if (N0.getOpcode() != ISD::MUL || N1.getOpcode() != ISD::MUL)
54476 return SDValue();
54477
54478 SDValue N00 = N0.getOperand(0);
54479 SDValue N01 = N0.getOperand(1);
54480 SDValue N10 = N1.getOperand(0);
54481 SDValue N11 = N1.getOperand(1);
54482
54483 // TODO: Handle constant vectors and use knownbits/computenumsignbits?
54484 // Canonicalize zero_extend to LHS.
54485 if (N01.getOpcode() == ISD::ZERO_EXTEND)
54486 std::swap(N00, N01);
54487 if (N11.getOpcode() == ISD::ZERO_EXTEND)
54488 std::swap(N10, N11);
54489
54490 // Ensure we have a zero_extend and a sign_extend.
54491 if (N00.getOpcode() != ISD::ZERO_EXTEND ||
54492 N01.getOpcode() != ISD::SIGN_EXTEND ||
54493 N10.getOpcode() != ISD::ZERO_EXTEND ||
54494 N11.getOpcode() != ISD::SIGN_EXTEND)
54495 return SDValue();
54496
54497 // Peek through the extends.
54498 N00 = N00.getOperand(0);
54499 N01 = N01.getOperand(0);
54500 N10 = N10.getOperand(0);
54501 N11 = N11.getOperand(0);
54502
54503 // Ensure the extend is from vXi8.
54504 if (N00.getValueType().getVectorElementType() != MVT::i8 ||
54505 N01.getValueType().getVectorElementType() != MVT::i8 ||
54506 N10.getValueType().getVectorElementType() != MVT::i8 ||
54507 N11.getValueType().getVectorElementType() != MVT::i8)
54508 return SDValue();
54509
54510 // All inputs should be build_vectors.
54511 if (N00.getOpcode() != ISD::BUILD_VECTOR ||
54512 N01.getOpcode() != ISD::BUILD_VECTOR ||
54513 N10.getOpcode() != ISD::BUILD_VECTOR ||
54514 N11.getOpcode() != ISD::BUILD_VECTOR)
54515 return SDValue();
54516
54517 // N00/N10 are zero extended. N01/N11 are sign extended.
54518
54519 // For each element, we need to ensure we have an odd element from one vector
54520 // multiplied by the odd element of another vector and the even element from
54521 // one of the same vectors being multiplied by the even element from the
54522 // other vector. So we need to make sure for each element i, this operator
54523 // is being performed:
54524 // A[2 * i] * B[2 * i] + A[2 * i + 1] * B[2 * i + 1]
54525 SDValue ZExtIn, SExtIn;
54526 for (unsigned i = 0; i != NumElems; ++i) {
54527 SDValue N00Elt = N00.getOperand(i);
54528 SDValue N01Elt = N01.getOperand(i);
54529 SDValue N10Elt = N10.getOperand(i);
54530 SDValue N11Elt = N11.getOperand(i);
54531 // TODO: Be more tolerant to undefs.
54532 if (N00Elt.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
54533 N01Elt.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
54534 N10Elt.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
54535 N11Elt.getOpcode() != ISD::EXTRACT_VECTOR_ELT)
54536 return SDValue();
54537 auto *ConstN00Elt = dyn_cast<ConstantSDNode>(N00Elt.getOperand(1));
54538 auto *ConstN01Elt = dyn_cast<ConstantSDNode>(N01Elt.getOperand(1));
54539 auto *ConstN10Elt = dyn_cast<ConstantSDNode>(N10Elt.getOperand(1));
54540 auto *ConstN11Elt = dyn_cast<ConstantSDNode>(N11Elt.getOperand(1));
54541 if (!ConstN00Elt || !ConstN01Elt || !ConstN10Elt || !ConstN11Elt)
54542 return SDValue();
54543 unsigned IdxN00 = ConstN00Elt->getZExtValue();
54544 unsigned IdxN01 = ConstN01Elt->getZExtValue();
54545 unsigned IdxN10 = ConstN10Elt->getZExtValue();
54546 unsigned IdxN11 = ConstN11Elt->getZExtValue();
54547 // Add is commutative so indices can be reordered.
54548 if (IdxN00 > IdxN10) {
54549 std::swap(IdxN00, IdxN10);
54550 std::swap(IdxN01, IdxN11);
54551 }
54552 // N0 indices must be the even element. N1 indices must be the next odd element.
54553 if (IdxN00 != 2 * i || IdxN10 != 2 * i + 1 ||
54554 IdxN01 != 2 * i || IdxN11 != 2 * i + 1)
54555 return SDValue();
54556 SDValue N00In = N00Elt.getOperand(0);
54557 SDValue N01In = N01Elt.getOperand(0);
54558 SDValue N10In = N10Elt.getOperand(0);
54559 SDValue N11In = N11Elt.getOperand(0);
54560 // First time we find an input capture it.
54561 if (!ZExtIn) {
54562 ZExtIn = N00In;
54563 SExtIn = N01In;
54564 }
54565 if (ZExtIn != N00In || SExtIn != N01In ||
54566 ZExtIn != N10In || SExtIn != N11In)
54567 return SDValue();
54568 }
54569
54570 auto ExtractVec = [&DAG, &DL, NumElems](SDValue &Ext) {
54571 EVT ExtVT = Ext.getValueType();
54572 if (ExtVT.getVectorNumElements() != NumElems * 2) {
54573 MVT NVT = MVT::getVectorVT(MVT::i8, NumElems * 2);
54574 Ext = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, NVT, Ext,
54575 DAG.getVectorIdxConstant(0, DL));
54576 }
54577 };
54578 ExtractVec(ZExtIn);
54579 ExtractVec(SExtIn);
54580
54581 auto PMADDBuilder = [](SelectionDAG &DAG, const SDLoc &DL,
54582 ArrayRef<SDValue> Ops) {
54583 // Shrink by adding truncate nodes and let DAGCombine fold with the
54584 // sources.
54585 EVT InVT = Ops[0].getValueType();
54586 assert(InVT.getScalarType() == MVT::i8 &&
54587 "Unexpected scalar element type");
54588 assert(InVT == Ops[1].getValueType() && "Operands' types mismatch");
54589 EVT ResVT = EVT::getVectorVT(*DAG.getContext(), MVT::i16,
54590 InVT.getVectorNumElements() / 2);
54591 return DAG.getNode(X86ISD::VPMADDUBSW, DL, ResVT, Ops[0], Ops[1]);
54592 };
54593 return SplitOpsAndApply(DAG, Subtarget, DL, VT, { ZExtIn, SExtIn },
54594 PMADDBuilder);
54595}
54596
54597 static SDValue combineTruncate(SDNode *N, SelectionDAG &DAG,
54598 const X86Subtarget &Subtarget) {
54599 EVT VT = N->getValueType(0);
54600 SDValue Src = N->getOperand(0);
54601 SDLoc DL(N);
54602
54603 // Attempt to pre-truncate inputs to arithmetic ops instead.
54604 if (SDValue V = combineTruncatedArithmetic(N, DAG, Subtarget, DL))
54605 return V;
54606
54607 // Try to detect PMADD
54608 if (SDValue PMAdd = detectPMADDUBSW(Src, VT, DAG, Subtarget, DL))
54609 return PMAdd;
54610
54611 // Try to combine truncation with signed/unsigned saturation.
54612 if (SDValue Val = combineTruncateWithSat(Src, VT, DL, DAG, Subtarget))
54613 return Val;
54614
54615 // Try to combine PMULHUW/PMULHW for vXi16.
54616 if (SDValue V = combinePMULH(Src, VT, DL, DAG, Subtarget))
54617 return V;
54618
54619 // The bitcast source is a direct mmx result.
54620 // Detect i32 truncation of a bitcast from x86mmx and use MMX_MOVD2W.
54621 if (Src.getOpcode() == ISD::BITCAST && VT == MVT::i32) {
54622 SDValue BCSrc = Src.getOperand(0);
54623 if (BCSrc.getValueType() == MVT::x86mmx)
54624 return DAG.getNode(X86ISD::MMX_MOVD2W, DL, MVT::i32, BCSrc);
54625 }
54626
54627 return SDValue();
54628}
54629
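// Fold a vector truncation whose source is a signed/unsigned saturation pattern
// into VTRUNCS/VTRUNCUS, otherwise simplify the demanded bits of the result.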
54630 static SDValue combineVTRUNC(SDNode *N, SelectionDAG &DAG,
54631 TargetLowering::DAGCombinerInfo &DCI) {
54632 EVT VT = N->getValueType(0);
54633 SDValue In = N->getOperand(0);
54634 SDLoc DL(N);
54635
54636 if (SDValue SSatVal = detectSSatPattern(In, VT))
54637 return DAG.getNode(X86ISD::VTRUNCS, DL, VT, SSatVal);
54638 if (SDValue USatVal = detectUSatPattern(In, VT, DAG, DL))
54639 return DAG.getNode(X86ISD::VTRUNCUS, DL, VT, USatVal);
54640
54641 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
54642 APInt DemandedMask(APInt::getAllOnes(VT.getScalarSizeInBits()));
54643 if (TLI.SimplifyDemandedBits(SDValue(N, 0), DemandedMask, DCI))
54644 return SDValue(N, 0);
54645
54646 return SDValue();
54647}
54648
54649/// Returns the negated value if the node \p N flips sign of FP value.
54650///
54651/// FP-negation node may have different forms: FNEG(x), FXOR (x, 0x80000000)
54652/// or FSUB(0, x)
54653/// AVX512F does not have FXOR, so FNEG is lowered as
54654/// (bitcast (xor (bitcast x), (bitcast ConstantFP(0x80000000)))).
54655/// In this case we go through all bitcasts.
54656/// This also recognizes splat of a negated value and returns the splat of that
54657/// value.
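/// For example, isFNEG of (FXOR v4f32:X, splat(0x80000000)) returns X.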
54658static SDValue isFNEG(SelectionDAG &DAG, SDNode *N, unsigned Depth = 0) {
54659 if (N->getOpcode() == ISD::FNEG)
54660 return N->getOperand(0);
54661
54662 // Don't recurse exponentially.
54663 if (Depth >= SelectionDAG::MaxRecursionDepth)
54664 return SDValue();
54665
54666 unsigned ScalarSize = N->getValueType(0).getScalarSizeInBits();
54667
54668 SDValue Op = peekThroughBitcasts(SDValue(N, 0));
54669 EVT VT = Op->getValueType(0);
54670
54671 // Make sure the element size doesn't change.
54672 if (VT.getScalarSizeInBits() != ScalarSize)
54673 return SDValue();
54674
54675 unsigned Opc = Op.getOpcode();
54676 switch (Opc) {
54677 case ISD::VECTOR_SHUFFLE: {
54678 // For a VECTOR_SHUFFLE(VEC1, VEC2), if the VEC2 is undef, then the negate
54679 // of this is VECTOR_SHUFFLE(-VEC1, UNDEF). The mask can be anything here.
54680 if (!Op.getOperand(1).isUndef())
54681 return SDValue();
54682 if (SDValue NegOp0 = isFNEG(DAG, Op.getOperand(0).getNode(), Depth + 1))
54683 if (NegOp0.getValueType() == VT) // FIXME: Can we do better?
54684 return DAG.getVectorShuffle(VT, SDLoc(Op), NegOp0, DAG.getUNDEF(VT),
54685 cast<ShuffleVectorSDNode>(Op)->getMask());
54686 break;
54687 }
54688 case ISD::INSERT_VECTOR_ELT: {
54689 // Negate of INSERT_VECTOR_ELT(UNDEF, V, INDEX) is INSERT_VECTOR_ELT(UNDEF,
54690 // -V, INDEX).
54691 SDValue InsVector = Op.getOperand(0);
54692 SDValue InsVal = Op.getOperand(1);
54693 if (!InsVector.isUndef())
54694 return SDValue();
54695 if (SDValue NegInsVal = isFNEG(DAG, InsVal.getNode(), Depth + 1))
54696 if (NegInsVal.getValueType() == VT.getVectorElementType()) // FIXME
54697 return DAG.getNode(ISD::INSERT_VECTOR_ELT, SDLoc(Op), VT, InsVector,
54698 NegInsVal, Op.getOperand(2));
54699 break;
54700 }
54701 case ISD::FSUB:
54702 case ISD::XOR:
54703 case X86ISD::FXOR: {
54704 SDValue Op1 = Op.getOperand(1);
54705 SDValue Op0 = Op.getOperand(0);
54706
54707 // For XOR and FXOR, we want to check if constant
54708 // bits of Op1 are sign bit masks. For FSUB, we
54709 // have to check if constant bits of Op0 are sign
54710 // bit masks and hence we swap the operands.
54711 if (Opc == ISD::FSUB)
54712 std::swap(Op0, Op1);
54713
54714 APInt UndefElts;
54715 SmallVector<APInt, 16> EltBits;
54716 // Extract constant bits and see if they are all
54717 // sign bit masks. Ignore the undef elements.
54718 if (getTargetConstantBitsFromNode(Op1, ScalarSize, UndefElts, EltBits,
54719 /* AllowWholeUndefs */ true,
54720 /* AllowPartialUndefs */ false)) {
54721 for (unsigned I = 0, E = EltBits.size(); I < E; I++)
54722 if (!UndefElts[I] && !EltBits[I].isSignMask())
54723 return SDValue();
54724
54725 // Only allow bitcast from correctly-sized constant.
54726 Op0 = peekThroughBitcasts(Op0);
54727 if (Op0.getScalarValueSizeInBits() == ScalarSize)
54728 return Op0;
54729 }
54730 break;
54731 } // case
54732 } // switch
54733
54734 return SDValue();
54735}
54736
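// Map an FMA-family opcode to the opcode obtained by negating the product
// (NegMul), the accumulator (NegAcc) and/or the whole result (NegRes),
// e.g. FMADD with NegMul becomes FNMADD and FMADD with NegAcc becomes FMSUB.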
54737static unsigned negateFMAOpcode(unsigned Opcode, bool NegMul, bool NegAcc,
54738 bool NegRes) {
54739 if (NegMul) {
54740 switch (Opcode) {
54741 // clang-format off
54742 default: llvm_unreachable("Unexpected opcode");
54743 case ISD::FMA: Opcode = X86ISD::FNMADD; break;
54744 case ISD::STRICT_FMA: Opcode = X86ISD::STRICT_FNMADD; break;
54745 case X86ISD::FMADD_RND: Opcode = X86ISD::FNMADD_RND; break;
54746 case X86ISD::FMSUB: Opcode = X86ISD::FNMSUB; break;
54747 case X86ISD::STRICT_FMSUB: Opcode = X86ISD::STRICT_FNMSUB; break;
54748 case X86ISD::FMSUB_RND: Opcode = X86ISD::FNMSUB_RND; break;
54749 case X86ISD::FNMADD: Opcode = ISD::FMA; break;
54750 case X86ISD::STRICT_FNMADD: Opcode = ISD::STRICT_FMA; break;
54751 case X86ISD::FNMADD_RND: Opcode = X86ISD::FMADD_RND; break;
54752 case X86ISD::FNMSUB: Opcode = X86ISD::FMSUB; break;
54753 case X86ISD::STRICT_FNMSUB: Opcode = X86ISD::STRICT_FMSUB; break;
54754 case X86ISD::FNMSUB_RND: Opcode = X86ISD::FMSUB_RND; break;
54755 // clang-format on
54756 }
54757 }
54758
54759 if (NegAcc) {
54760 switch (Opcode) {
54761 // clang-format off
54762 default: llvm_unreachable("Unexpected opcode");
54763 case ISD::FMA: Opcode = X86ISD::FMSUB; break;
54764 case ISD::STRICT_FMA: Opcode = X86ISD::STRICT_FMSUB; break;
54765 case X86ISD::FMADD_RND: Opcode = X86ISD::FMSUB_RND; break;
54766 case X86ISD::FMSUB: Opcode = ISD::FMA; break;
54767 case X86ISD::STRICT_FMSUB: Opcode = ISD::STRICT_FMA; break;
54768 case X86ISD::FMSUB_RND: Opcode = X86ISD::FMADD_RND; break;
54769 case X86ISD::FNMADD: Opcode = X86ISD::FNMSUB; break;
54770 case X86ISD::STRICT_FNMADD: Opcode = X86ISD::STRICT_FNMSUB; break;
54771 case X86ISD::FNMADD_RND: Opcode = X86ISD::FNMSUB_RND; break;
54772 case X86ISD::FNMSUB: Opcode = X86ISD::FNMADD; break;
54773 case X86ISD::STRICT_FNMSUB: Opcode = X86ISD::STRICT_FNMADD; break;
54774 case X86ISD::FNMSUB_RND: Opcode = X86ISD::FNMADD_RND; break;
54775 case X86ISD::FMADDSUB: Opcode = X86ISD::FMSUBADD; break;
54776 case X86ISD::FMADDSUB_RND: Opcode = X86ISD::FMSUBADD_RND; break;
54777 case X86ISD::FMSUBADD: Opcode = X86ISD::FMADDSUB; break;
54778 case X86ISD::FMSUBADD_RND: Opcode = X86ISD::FMADDSUB_RND; break;
54779 // clang-format on
54780 }
54781 }
54782
54783 if (NegRes) {
54784 switch (Opcode) {
54785 // For accuracy reasons, we never combine fneg and fma under strict FP.
54786 // clang-format off
54787 default: llvm_unreachable("Unexpected opcode");
54788 case ISD::FMA: Opcode = X86ISD::FNMSUB; break;
54789 case X86ISD::FMADD_RND: Opcode = X86ISD::FNMSUB_RND; break;
54790 case X86ISD::FMSUB: Opcode = X86ISD::FNMADD; break;
54791 case X86ISD::FMSUB_RND: Opcode = X86ISD::FNMADD_RND; break;
54792 case X86ISD::FNMADD: Opcode = X86ISD::FMSUB; break;
54793 case X86ISD::FNMADD_RND: Opcode = X86ISD::FMSUB_RND; break;
54794 case X86ISD::FNMSUB: Opcode = ISD::FMA; break;
54795 case X86ISD::FNMSUB_RND: Opcode = X86ISD::FMADD_RND; break;
54796 // clang-format on
54797 }
54798 }
54799
54800 return Opcode;
54801}
54802
54803/// Do target-specific dag combines on floating point negations.
54804 static SDValue combineFneg(SDNode *N, SelectionDAG &DAG,
54805 TargetLowering::DAGCombinerInfo &DCI,
54806 const X86Subtarget &Subtarget) {
54807 EVT OrigVT = N->getValueType(0);
54808 SDValue Arg = isFNEG(DAG, N);
54809 if (!Arg)
54810 return SDValue();
54811
54812 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
54813 EVT VT = Arg.getValueType();
54814 EVT SVT = VT.getScalarType();
54815 SDLoc DL(N);
54816
54817 // Let legalize expand this if it isn't a legal type yet.
54818 if (!TLI.isTypeLegal(VT))
54819 return SDValue();
54820
54821 // If we're negating a FMUL node on a target with FMA, then we can avoid the
54822 // use of a constant by performing (-0 - A*B) instead.
54823 // FIXME: Check rounding control flags as well once it becomes available.
54824 if (Arg.getOpcode() == ISD::FMUL && (SVT == MVT::f32 || SVT == MVT::f64) &&
54825 Arg->getFlags().hasNoSignedZeros() && Subtarget.hasAnyFMA()) {
54826 SDValue Zero = DAG.getConstantFP(0.0, DL, VT);
54827 SDValue NewNode = DAG.getNode(X86ISD::FNMSUB, DL, VT, Arg.getOperand(0),
54828 Arg.getOperand(1), Zero);
54829 return DAG.getBitcast(OrigVT, NewNode);
54830 }
54831
54832 bool CodeSize = DAG.getMachineFunction().getFunction().hasOptSize();
54833 bool LegalOperations = !DCI.isBeforeLegalizeOps();
54834 if (SDValue NegArg =
54835 TLI.getNegatedExpression(Arg, DAG, LegalOperations, CodeSize))
54836 return DAG.getBitcast(OrigVT, NegArg);
54837
54838 return SDValue();
54839}
54840
54841 SDValue X86TargetLowering::getNegatedExpression(SDValue Op, SelectionDAG &DAG,
54842 bool LegalOperations,
54843 bool ForCodeSize,
54844 NegatibleCost &Cost,
54845 unsigned Depth) const {
54846 // fneg patterns are removable even if they have multiple uses.
54847 if (SDValue Arg = isFNEG(DAG, Op.getNode(), Depth)) {
54848 Cost = NegatibleCost::Cheaper;
54849 return DAG.getBitcast(Op.getValueType(), Arg);
54850 }
54851
54852 EVT VT = Op.getValueType();
54853 EVT SVT = VT.getScalarType();
54854 unsigned Opc = Op.getOpcode();
54855 SDNodeFlags Flags = Op.getNode()->getFlags();
54856 switch (Opc) {
54857 case ISD::FMA:
54858 case X86ISD::FMSUB:
54859 case X86ISD::FNMADD:
54860 case X86ISD::FNMSUB:
54861 case X86ISD::FMADD_RND:
54862 case X86ISD::FMSUB_RND:
54863 case X86ISD::FNMADD_RND:
54864 case X86ISD::FNMSUB_RND: {
54865 if (!Op.hasOneUse() || !Subtarget.hasAnyFMA() || !isTypeLegal(VT) ||
54866 !(SVT == MVT::f32 || SVT == MVT::f64) ||
54867 !isOperationLegal(ISD::FMA, VT))
54868 break;
54869
54870 // Don't fold (fneg (fma (fneg x), y, (fneg z))) to (fma x, y, z)
54871 // if it may have signed zeros.
54872 if (!Flags.hasNoSignedZeros())
54873 break;
54874
54875 // Because getCheaperNegatedExpression can delete nodes we need a handle to
54876 // keep temporary nodes alive.
54877 std::list<HandleSDNode> Handles;
54878
54879 // This is always negatible for free but we might be able to remove some
54880 // extra operand negations as well.
54881 SmallVector<SDValue, 4> NewOps(Op.getNumOperands(), SDValue());
54882 for (int i = 0; i != 3; ++i) {
54883 NewOps[i] = getCheaperNegatedExpression(
54884 Op.getOperand(i), DAG, LegalOperations, ForCodeSize, Depth + 1);
54885 if (!!NewOps[i])
54886 Handles.emplace_back(NewOps[i]);
54887 }
54888
54889 bool NegA = !!NewOps[0];
54890 bool NegB = !!NewOps[1];
54891 bool NegC = !!NewOps[2];
54892 unsigned NewOpc = negateFMAOpcode(Opc, NegA != NegB, NegC, true);
54893
54894 Cost = (NegA || NegB || NegC) ? NegatibleCost::Cheaper
54895 : NegatibleCost::Neutral;
54896
54897 // Fill in the non-negated ops with the original values.
54898 for (int i = 0, e = Op.getNumOperands(); i != e; ++i)
54899 if (!NewOps[i])
54900 NewOps[i] = Op.getOperand(i);
54901 return DAG.getNode(NewOpc, SDLoc(Op), VT, NewOps);
54902 }
54903 case X86ISD::FRCP:
54904 if (SDValue NegOp0 =
54905 getNegatedExpression(Op.getOperand(0), DAG, LegalOperations,
54906 ForCodeSize, Cost, Depth + 1))
54907 return DAG.getNode(Opc, SDLoc(Op), VT, NegOp0);
54908 break;
54909 }
54910
54911 return TargetLowering::getNegatedExpression(Op, DAG, LegalOperations,
54912 ForCodeSize, Cost, Depth);
54913}
54914
54915 static SDValue lowerX86FPLogicOp(SDNode *N, SelectionDAG &DAG,
54916 const X86Subtarget &Subtarget) {
54917 MVT VT = N->getSimpleValueType(0);
54918 // If we have integer vector types available, use the integer opcodes.
54919 if (!VT.isVector() || !Subtarget.hasSSE2())
54920 return SDValue();
54921
54922 SDLoc dl(N);
54924 SDValue Op0 = DAG.getBitcast(IntVT, N->getOperand(0));
54925 SDValue Op1 = DAG.getBitcast(IntVT, N->getOperand(1));
54926 unsigned IntOpcode;
54927 switch (N->getOpcode()) {
54928 // clang-format off
54929 default: llvm_unreachable("Unexpected FP logic op");
54930 case X86ISD::FOR: IntOpcode = ISD::OR; break;
54931 case X86ISD::FXOR: IntOpcode = ISD::XOR; break;
54932 case X86ISD::FAND: IntOpcode = ISD::AND; break;
54933 case X86ISD::FANDN: IntOpcode = X86ISD::ANDNP; break;
54934 // clang-format on
54935 }
54936 SDValue IntOp = DAG.getNode(IntOpcode, dl, IntVT, Op0, Op1);
54937 return DAG.getBitcast(VT, IntOp);
54938}
54939
54940/// Fold a xor(setcc cond, val), 1 --> setcc (inverted(cond), val)
54941 static SDValue foldXor1SetCC(SDNode *N, const SDLoc &DL, SelectionDAG &DAG) {
54942 if (N->getOpcode() != ISD::XOR)
54943 return SDValue();
54944
54945 SDValue LHS = N->getOperand(0);
54946 if (!isOneConstant(N->getOperand(1)) || LHS->getOpcode() != X86ISD::SETCC)
54947 return SDValue();
54948
54949 X86::CondCode NewCC = X86::GetOppositeBranchCondition(
54950 X86::CondCode(LHS->getConstantOperandVal(0)));
54951 return getSETCC(NewCC, LHS->getOperand(1), DL, DAG);
54952}
54953
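// Fold (xor (ctlz_zero_undef X), BitWidth - 1) or
// (sub (BitWidth - 1), (ctlz_zero_undef X)) into (bsr X): for X != 0,
// ctlz(X) == BitWidth - 1 - bsr(X), and XOR with BitWidth - 1 performs that
// subtraction because ctlz(X) < BitWidth.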
54954 static SDValue combineXorSubCTLZ(SDNode *N, const SDLoc &DL, SelectionDAG &DAG,
54955 const X86Subtarget &Subtarget) {
54956 assert((N->getOpcode() == ISD::XOR || N->getOpcode() == ISD::SUB) &&
54957 "Invalid opcode for combining with CTLZ");
54958 if (Subtarget.hasFastLZCNT())
54959 return SDValue();
54960
54961 EVT VT = N->getValueType(0);
54962 if (VT != MVT::i8 && VT != MVT::i16 && VT != MVT::i32 &&
54963 (VT != MVT::i64 || !Subtarget.is64Bit()))
54964 return SDValue();
54965
54966 SDValue N0 = N->getOperand(0);
54967 SDValue N1 = N->getOperand(1);
54968
54969 if (N0.getOpcode() != ISD::CTLZ_ZERO_UNDEF &&
54970 N1.getOpcode() != ISD::CTLZ_ZERO_UNDEF)
54971 return SDValue();
54972
54973 SDValue OpCTLZ;
54974 SDValue OpSizeTM1;
54975
54976 if (N1.getOpcode() == ISD::CTLZ_ZERO_UNDEF) {
54977 OpCTLZ = N1;
54978 OpSizeTM1 = N0;
54979 } else if (N->getOpcode() == ISD::SUB) {
54980 return SDValue();
54981 } else {
54982 OpCTLZ = N0;
54983 OpSizeTM1 = N1;
54984 }
54985
54986 if (!OpCTLZ.hasOneUse())
54987 return SDValue();
54988 auto *C = dyn_cast<ConstantSDNode>(OpSizeTM1);
54989 if (!C)
54990 return SDValue();
54991
54992 if (C->getZExtValue() != uint64_t(OpCTLZ.getValueSizeInBits() - 1))
54993 return SDValue();
54994 EVT OpVT = VT;
54995 SDValue Op = OpCTLZ.getOperand(0);
54996 if (VT == MVT::i8) {
54997 // Zero extend to i32 since there is no i8 BSR.
54998 OpVT = MVT::i32;
54999 Op = DAG.getNode(ISD::ZERO_EXTEND, DL, OpVT, Op);
55000 }
55001
55002 SDVTList VTs = DAG.getVTList(OpVT, MVT::i32);
55003 Op = DAG.getNode(X86ISD::BSR, DL, VTs, DAG.getUNDEF(OpVT), Op);
55004 if (VT == MVT::i8)
55005 Op = DAG.getNode(ISD::TRUNCATE, DL, MVT::i8, Op);
55006
55007 return Op;
55008}
55009
55010 static SDValue combineXor(SDNode *N, SelectionDAG &DAG,
55011 TargetLowering::DAGCombinerInfo &DCI,
55012 const X86Subtarget &Subtarget) {
55013 SDValue N0 = N->getOperand(0);
55014 SDValue N1 = N->getOperand(1);
55015 EVT VT = N->getValueType(0);
55016 SDLoc DL(N);
55017
55018 // If this is SSE1 only convert to FXOR to avoid scalarization.
55019 if (Subtarget.hasSSE1() && !Subtarget.hasSSE2() && VT == MVT::v4i32) {
55020 return DAG.getBitcast(MVT::v4i32,
55021 DAG.getNode(X86ISD::FXOR, DL, MVT::v4f32,
55022 DAG.getBitcast(MVT::v4f32, N0),
55023 DAG.getBitcast(MVT::v4f32, N1)));
55024 }
55025
55026 if (SDValue Cmp = foldVectorXorShiftIntoCmp(N, DAG, Subtarget))
55027 return Cmp;
55028
55029 if (SDValue R = combineBitOpWithMOVMSK(N->getOpcode(), DL, N0, N1, DAG))
55030 return R;
55031
55032 if (SDValue R = combineBitOpWithShift(N->getOpcode(), DL, VT, N0, N1, DAG))
55033 return R;
55034
55035 if (SDValue R = combineBitOpWithPACK(N->getOpcode(), DL, VT, N0, N1, DAG))
55036 return R;
55037
55038 if (SDValue FPLogic = convertIntLogicToFPLogic(N->getOpcode(), DL, VT, N0, N1,
55039 DAG, DCI, Subtarget))
55040 return FPLogic;
55041
55042 if (SDValue R = combineXorSubCTLZ(N, DL, DAG, Subtarget))
55043 return R;
55044
55045 if (DCI.isBeforeLegalizeOps())
55046 return SDValue();
55047
55048 if (SDValue SetCC = foldXor1SetCC(N, DL, DAG))
55049 return SetCC;
55050
55051 if (SDValue R = combineOrXorWithSETCC(N->getOpcode(), DL, VT, N0, N1, DAG))
55052 return R;
55053
55054 if (SDValue RV = foldXorTruncShiftIntoCmp(N, DL, DAG))
55055 return RV;
55056
55057 // Fold not(iX bitcast(vXi1)) -> (iX bitcast(not(vec))) for legal boolvecs.
55058 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
55059 if (llvm::isAllOnesConstant(N1) && N0.getOpcode() == ISD::BITCAST &&
55060 N0.getOperand(0).getValueType().isVector() &&
55061 N0.getOperand(0).getValueType().getVectorElementType() == MVT::i1 &&
55062 TLI.isTypeLegal(N0.getOperand(0).getValueType()) && N0.hasOneUse()) {
55063 return DAG.getBitcast(
55064 VT, DAG.getNOT(DL, N0.getOperand(0), N0.getOperand(0).getValueType()));
55065 }
55066
55067 // Handle AVX512 mask widening.
55068 // Fold not(insert_subvector(undef,sub)) -> insert_subvector(undef,not(sub))
55069 if (ISD::isBuildVectorAllOnes(N1.getNode()) && VT.isVector() &&
55070 VT.getVectorElementType() == MVT::i1 &&
55071 N0.getOpcode() == ISD::INSERT_SUBVECTOR && N0.getOperand(0).isUndef() &&
55072 TLI.isTypeLegal(N0.getOperand(1).getValueType())) {
55073 return DAG.getNode(
55074 ISD::INSERT_SUBVECTOR, DL, VT, N0.getOperand(0),
55075 DAG.getNOT(DL, N0.getOperand(1), N0.getOperand(1).getValueType()),
55076 N0.getOperand(2));
55077 }
55078
55079 // Fold xor(zext(xor(x,c1)),c2) -> xor(zext(x),xor(zext(c1),c2))
55080 // Fold xor(truncate(xor(x,c1)),c2) -> xor(truncate(x),xor(truncate(c1),c2))
55081 // TODO: Under what circumstances could this be performed in DAGCombine?
55082 if ((N0.getOpcode() == ISD::TRUNCATE || N0.getOpcode() == ISD::ZERO_EXTEND) &&
55083 N0.getOperand(0).getOpcode() == N->getOpcode()) {
55084 SDValue TruncExtSrc = N0.getOperand(0);
55085 auto *N1C = dyn_cast<ConstantSDNode>(N1);
55086 auto *N001C = dyn_cast<ConstantSDNode>(TruncExtSrc.getOperand(1));
55087 if (N1C && !N1C->isOpaque() && N001C && !N001C->isOpaque()) {
55088 SDValue LHS = DAG.getZExtOrTrunc(TruncExtSrc.getOperand(0), DL, VT);
55089 SDValue RHS = DAG.getZExtOrTrunc(TruncExtSrc.getOperand(1), DL, VT);
55090 return DAG.getNode(ISD::XOR, DL, VT, LHS,
55091 DAG.getNode(ISD::XOR, DL, VT, RHS, N1));
55092 }
55093 }
55094
55095 if (SDValue R = combineBMILogicOp(N, DAG, Subtarget))
55096 return R;
55097
55098 return combineFneg(N, DAG, DCI, Subtarget);
55099}
55100
55101 static SDValue combineBITREVERSE(SDNode *N, SelectionDAG &DAG,
55102 TargetLowering::DAGCombinerInfo &DCI,
55103 const X86Subtarget &Subtarget) {
55104 SDValue N0 = N->getOperand(0);
55105 EVT VT = N->getValueType(0);
55106
55107 // Convert a (iX bitreverse(bitcast(vXi1 X))) -> (iX bitcast(shuffle(X)))
55108 if (VT.isInteger() && N0.getOpcode() == ISD::BITCAST && N0.hasOneUse()) {
55109 SDValue Src = N0.getOperand(0);
55110 EVT SrcVT = Src.getValueType();
55111 if (SrcVT.isVector() && SrcVT.getScalarType() == MVT::i1 &&
55112 (DCI.isBeforeLegalize() ||
55113 DAG.getTargetLoweringInfo().isTypeLegal(SrcVT)) &&
55114 Subtarget.hasSSSE3()) {
55115 unsigned NumElts = SrcVT.getVectorNumElements();
55116 SmallVector<int, 32> ReverseMask(NumElts);
55117 for (unsigned I = 0; I != NumElts; ++I)
55118 ReverseMask[I] = (NumElts - 1) - I;
55119 SDValue Rev =
55120 DAG.getVectorShuffle(SrcVT, SDLoc(N), Src, Src, ReverseMask);
55121 return DAG.getBitcast(VT, Rev);
55122 }
55123 }
55124
55125 return SDValue();
55126}
55127
55128// Various combines to try to convert to avgceilu.
55129static SDValue combineAVG(SDNode *N, SelectionDAG &DAG,
55130                          TargetLowering::DAGCombinerInfo &DCI,
55131 const X86Subtarget &Subtarget) {
55132 unsigned Opcode = N->getOpcode();
55133 SDValue N0 = N->getOperand(0);
55134 SDValue N1 = N->getOperand(1);
55135 EVT VT = N->getValueType(0);
55136 EVT SVT = VT.getScalarType();
55137 SDLoc DL(N);
55138
55139 // avgceils(x,y) -> flipsign(avgceilu(flipsign(x),flipsign(y)))
55140 // Only useful on vXi8 which doesn't have good SRA handling.
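  // Worked example (illustrative): for i8 x = -1 (0xFF) and y = 3, flipping
  // the sign bit gives 0x7F (127) and 0x83 (131); avgceilu(127,131) = 129
  // (0x81), and flipping the sign bit back yields 0x01 == avgceils(-1,3).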
55141  if (Opcode == ISD::AVGCEILS && VT.isVector() && SVT == MVT::i8) {
55142    APInt SignBit = APInt::getSignMask(8);
55143 SDValue SignMask = DAG.getConstant(SignBit, DL, VT);
55144 N0 = DAG.getNode(ISD::XOR, DL, VT, N0, SignMask);
55145 N1 = DAG.getNode(ISD::XOR, DL, VT, N1, SignMask);
55146 return DAG.getNode(ISD::XOR, DL, VT,
55147 DAG.getNode(ISD::AVGCEILU, DL, VT, N0, N1), SignMask);
55148 }
55149
55150 return SDValue();
55151}
55152
55153static SDValue combineBEXTR(SDNode *N, SelectionDAG &DAG,
55154                            TargetLowering::DAGCombinerInfo &DCI,
55155 const X86Subtarget &Subtarget) {
55156 EVT VT = N->getValueType(0);
55157 unsigned NumBits = VT.getSizeInBits();
55158
55159 // TODO - Constant Folding.
55160
55161 // Simplify the inputs.
55162 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
55163 APInt DemandedMask(APInt::getAllOnes(NumBits));
55164 if (TLI.SimplifyDemandedBits(SDValue(N, 0), DemandedMask, DCI))
55165 return SDValue(N, 0);
55166
55167 return SDValue();
55168}
55169
55170static bool isNullFPScalarOrVectorConst(SDValue V) {
55171 return isNullFPConstant(V) || ISD::isBuildVectorAllZeros(V.getNode());
55172}
55173
55174/// If a value is a scalar FP zero or a vector FP zero (potentially including
55175/// undefined elements), return a zero constant that may be used to fold away
55176/// that value. In the case of a vector, the returned constant will not contain
55177/// undefined elements even if the input parameter does. This makes it suitable
55178/// to be used as a replacement operand with operations (eg, bitwise-and) where
55179/// an undef should not propagate.
55180static SDValue getNullFPConstForNullVal(SDValue V, SelectionDAG &DAG,
55181                                        const X86Subtarget &Subtarget) {
55182  if (!isNullFPScalarOrVectorConst(V))
55183    return SDValue();
55184
55185 if (V.getValueType().isVector())
55186 return getZeroVector(V.getSimpleValueType(), Subtarget, DAG, SDLoc(V));
55187
55188 return V;
55189}
55190
55191static SDValue combineFAndFNotToFAndn(SDNode *N, SelectionDAG &DAG,
55192 const X86Subtarget &Subtarget) {
55193 SDValue N0 = N->getOperand(0);
55194 SDValue N1 = N->getOperand(1);
55195 EVT VT = N->getValueType(0);
55196 SDLoc DL(N);
55197
55198 // Vector types are handled in combineANDXORWithAllOnesIntoANDNP().
55199 if (!((VT == MVT::f32 && Subtarget.hasSSE1()) ||
55200 (VT == MVT::f64 && Subtarget.hasSSE2()) ||
55201 (VT == MVT::v4f32 && Subtarget.hasSSE1() && !Subtarget.hasSSE2())))
55202 return SDValue();
55203
55204 auto isAllOnesConstantFP = [](SDValue V) {
55205 if (V.getSimpleValueType().isVector())
55206 return ISD::isBuildVectorAllOnes(V.getNode());
55207 auto *C = dyn_cast<ConstantFPSDNode>(V);
55208 return C && C->getConstantFPValue()->isAllOnesValue();
55209 };
55210
55211 // fand (fxor X, -1), Y --> fandn X, Y
55212 if (N0.getOpcode() == X86ISD::FXOR && isAllOnesConstantFP(N0.getOperand(1)))
55213 return DAG.getNode(X86ISD::FANDN, DL, VT, N0.getOperand(0), N1);
55214
55215 // fand X, (fxor Y, -1) --> fandn Y, X
55216 if (N1.getOpcode() == X86ISD::FXOR && isAllOnesConstantFP(N1.getOperand(1)))
55217 return DAG.getNode(X86ISD::FANDN, DL, VT, N1.getOperand(0), N0);
55218
55219 return SDValue();
55220}
55221
55222/// Do target-specific dag combines on X86ISD::FAND nodes.
55223static SDValue combineFAnd(SDNode *N, SelectionDAG &DAG,
55224 const X86Subtarget &Subtarget) {
55225 // FAND(0.0, x) -> 0.0
55226 if (SDValue V = getNullFPConstForNullVal(N->getOperand(0), DAG, Subtarget))
55227 return V;
55228
55229 // FAND(x, 0.0) -> 0.0
55230 if (SDValue V = getNullFPConstForNullVal(N->getOperand(1), DAG, Subtarget))
55231 return V;
55232
55233 if (SDValue V = combineFAndFNotToFAndn(N, DAG, Subtarget))
55234 return V;
55235
55236 return lowerX86FPLogicOp(N, DAG, Subtarget);
55237}
55238
55239/// Do target-specific dag combines on X86ISD::FANDN nodes.
55240static SDValue combineFAndn(SDNode *N, SelectionDAG &DAG,
55241 const X86Subtarget &Subtarget) {
55242 // FANDN(0.0, x) -> x
55243 if (isNullFPScalarOrVectorConst(N->getOperand(0)))
55244 return N->getOperand(1);
55245
55246 // FANDN(x, 0.0) -> 0.0
55247 if (SDValue V = getNullFPConstForNullVal(N->getOperand(1), DAG, Subtarget))
55248 return V;
55249
55250 return lowerX86FPLogicOp(N, DAG, Subtarget);
55251}
55252
55253/// Do target-specific dag combines on X86ISD::FOR and X86ISD::FXOR nodes.
55254static SDValue combineFOr(SDNode *N, SelectionDAG &DAG,
55255                          TargetLowering::DAGCombinerInfo &DCI,
55256 const X86Subtarget &Subtarget) {
55257 assert(N->getOpcode() == X86ISD::FOR || N->getOpcode() == X86ISD::FXOR);
55258
55259 // F[X]OR(0.0, x) -> x
55260 if (isNullFPScalarOrVectorConst(N->getOperand(0)))
55261 return N->getOperand(1);
55262
55263 // F[X]OR(x, 0.0) -> x
55264 if (isNullFPScalarOrVectorConst(N->getOperand(1)))
55265 return N->getOperand(0);
55266
55267 if (SDValue NewVal = combineFneg(N, DAG, DCI, Subtarget))
55268 return NewVal;
55269
55270 return lowerX86FPLogicOp(N, DAG, Subtarget);
55271}
55272
55273/// Do target-specific dag combines on X86ISD::FMIN and X86ISD::FMAX nodes.
55274static SDValue combineFMinFMax(SDNode *N, SelectionDAG &DAG) {
55275 assert(N->getOpcode() == X86ISD::FMIN || N->getOpcode() == X86ISD::FMAX);
55276
55277 // FMIN/FMAX are commutative if no NaNs and no negative zeros are allowed.
55278  if (!DAG.getTarget().Options.NoNaNsFPMath ||
55279      !DAG.getTarget().Options.NoSignedZerosFPMath)
55280 return SDValue();
55281
55282  // If we run in unsafe-math mode, then convert the FMAX and FMIN nodes
55283  // into FMAXC and FMINC, which are commutative operations.
55284 unsigned NewOp = 0;
55285 switch (N->getOpcode()) {
55286 default: llvm_unreachable("unknown opcode");
55287 case X86ISD::FMIN: NewOp = X86ISD::FMINC; break;
55288 case X86ISD::FMAX: NewOp = X86ISD::FMAXC; break;
55289 }
55290
55291 return DAG.getNode(NewOp, SDLoc(N), N->getValueType(0),
55292 N->getOperand(0), N->getOperand(1));
55293}
55294
55295static SDValue combineFMinNumFMaxNum(SDNode *N, SelectionDAG &DAG,
55296 const X86Subtarget &Subtarget) {
55297 EVT VT = N->getValueType(0);
55298 if (Subtarget.useSoftFloat() || isSoftF16(VT, Subtarget))
55299 return SDValue();
55300
55301 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
55302
55303 auto IsMinMaxLegal = [&](EVT VT) {
55304 if (!TLI.isTypeLegal(VT))
55305 return false;
55306 return VT.getScalarType() != MVT::f16 ||
55307 (Subtarget.hasFP16() && (VT == MVT::v32f16 || Subtarget.hasVLX()));
55308 };
55309
55310 if (!((Subtarget.hasSSE1() && VT == MVT::f32) ||
55311 (Subtarget.hasSSE2() && VT == MVT::f64) ||
55312 (Subtarget.hasFP16() && VT == MVT::f16) ||
55313 (VT.isVector() && IsMinMaxLegal(VT))))
55314 return SDValue();
55315
55316 SDValue Op0 = N->getOperand(0);
55317 SDValue Op1 = N->getOperand(1);
55318 SDLoc DL(N);
55319 auto MinMaxOp = N->getOpcode() == ISD::FMAXNUM ? X86ISD::FMAX : X86ISD::FMIN;
55320
55321 // If we don't have to respect NaN inputs, this is a direct translation to x86
55322 // min/max instructions.
55323 if (DAG.getTarget().Options.NoNaNsFPMath || N->getFlags().hasNoNaNs())
55324 return DAG.getNode(MinMaxOp, DL, VT, Op0, Op1, N->getFlags());
55325
55326 // If one of the operands is known non-NaN use the native min/max instructions
55327 // with the non-NaN input as second operand.
55328 if (DAG.isKnownNeverNaN(Op1))
55329 return DAG.getNode(MinMaxOp, DL, VT, Op0, Op1, N->getFlags());
55330 if (DAG.isKnownNeverNaN(Op0))
55331 return DAG.getNode(MinMaxOp, DL, VT, Op1, Op0, N->getFlags());
55332
55333 // If we have to respect NaN inputs, this takes at least 3 instructions.
55334 // Favor a library call when operating on a scalar and minimizing code size.
55335 if (!VT.isVector() && DAG.getMachineFunction().getFunction().hasMinSize())
55336 return SDValue();
55337
55338 EVT SetCCType = TLI.getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(),
55339 VT);
55340
55341 // There are 4 possibilities involving NaN inputs, and these are the required
55342 // outputs:
55343 // Op1
55344 // Num NaN
55345 // ----------------
55346 // Num | Max | Op0 |
55347 // Op0 ----------------
55348 // NaN | Op1 | NaN |
55349 // ----------------
55350 //
55351 // The SSE FP max/min instructions were not designed for this case, but rather
55352 // to implement:
55353 // Min = Op1 < Op0 ? Op1 : Op0
55354 // Max = Op1 > Op0 ? Op1 : Op0
55355 //
55356 // So they always return Op0 if either input is a NaN. However, we can still
55357 // use those instructions for fmaxnum by selecting away a NaN input.
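  // Illustrative walk-through: for fmaxnum(Op0 = NaN, Op1 = 42.0), the node
  // built below computes max(Op1, Op0), which passes Op0 (NaN) through; the
  // SETUO check then detects that Op0 is NaN and selects Op1, giving 42.0 as
  // required. If instead Op1 is NaN, max(Op1, Op0) already returns Op0.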
55358
55359 // If either operand is NaN, the 2nd source operand (Op0) is passed through.
55360 SDValue MinOrMax = DAG.getNode(MinMaxOp, DL, VT, Op1, Op0);
55361 SDValue IsOp0Nan = DAG.getSetCC(DL, SetCCType, Op0, Op0, ISD::SETUO);
55362
55363 // If Op0 is a NaN, select Op1. Otherwise, select the max. If both operands
55364 // are NaN, the NaN value of Op1 is the result.
55365 return DAG.getSelect(DL, VT, IsOp0Nan, Op1, MinOrMax);
55366}
55367
55368static SDValue combineX86INT_TO_FP(SDNode *N, SelectionDAG &DAG,
55369                                   TargetLowering::DAGCombinerInfo &DCI) {
55370 EVT VT = N->getValueType(0);
55371 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
55372
55373 APInt DemandedElts = APInt::getAllOnes(VT.getVectorNumElements());
55374 if (TLI.SimplifyDemandedVectorElts(SDValue(N, 0), DemandedElts, DCI))
55375 return SDValue(N, 0);
55376
55377 // Convert a full vector load into vzload when not all bits are needed.
55378 SDValue In = N->getOperand(0);
55379 MVT InVT = In.getSimpleValueType();
55380 if (VT.getVectorNumElements() < InVT.getVectorNumElements() &&
55381 ISD::isNormalLoad(In.getNode()) && In.hasOneUse()) {
55382 assert(InVT.is128BitVector() && "Expected 128-bit input vector");
55383 LoadSDNode *LN = cast<LoadSDNode>(N->getOperand(0));
55384 unsigned NumBits = InVT.getScalarSizeInBits() * VT.getVectorNumElements();
55385 MVT MemVT = MVT::getIntegerVT(NumBits);
55386 MVT LoadVT = MVT::getVectorVT(MemVT, 128 / NumBits);
55387 if (SDValue VZLoad = narrowLoadToVZLoad(LN, MemVT, LoadVT, DAG)) {
55388 SDLoc dl(N);
55389 SDValue Convert = DAG.getNode(N->getOpcode(), dl, VT,
55390 DAG.getBitcast(InVT, VZLoad));
55391 DCI.CombineTo(N, Convert);
55392      DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), VZLoad.getValue(1));
55393      DCI.recursivelyDeleteUnusedNodes(LN);
55394 return SDValue(N, 0);
55395 }
55396 }
55397
55398 return SDValue();
55399}
55400
55401static SDValue combineCVTP2I_CVTTP2I(SDNode *N, SelectionDAG &DAG,
55402                                     TargetLowering::DAGCombinerInfo &DCI) {
55403  const SelectionDAGTargetInfo &TSI = DAG.getSelectionDAGInfo();
55404 bool IsStrict = TSI.isTargetStrictFPOpcode(N->getOpcode());
55405 EVT VT = N->getValueType(0);
55406
55407 // Convert a full vector load into vzload when not all bits are needed.
55408 SDValue In = N->getOperand(IsStrict ? 1 : 0);
55409 MVT InVT = In.getSimpleValueType();
55410 if (VT.getVectorNumElements() < InVT.getVectorNumElements() &&
55411 ISD::isNormalLoad(In.getNode()) && In.hasOneUse()) {
55412 assert(InVT.is128BitVector() && "Expected 128-bit input vector");
55413 LoadSDNode *LN = cast<LoadSDNode>(In);
55414 unsigned NumBits = InVT.getScalarSizeInBits() * VT.getVectorNumElements();
55415 MVT MemVT = MVT::getFloatingPointVT(NumBits);
55416 MVT LoadVT = MVT::getVectorVT(MemVT, 128 / NumBits);
55417 if (SDValue VZLoad = narrowLoadToVZLoad(LN, MemVT, LoadVT, DAG)) {
55418 SDLoc dl(N);
55419 if (IsStrict) {
55420 SDValue Convert =
55421 DAG.getNode(N->getOpcode(), dl, {VT, MVT::Other},
55422 {N->getOperand(0), DAG.getBitcast(InVT, VZLoad)});
55423 DCI.CombineTo(N, Convert, Convert.getValue(1));
55424 } else {
55425 SDValue Convert =
55426 DAG.getNode(N->getOpcode(), dl, VT, DAG.getBitcast(InVT, VZLoad));
55427 DCI.CombineTo(N, Convert);
55428 }
55429      DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), VZLoad.getValue(1));
55430      DCI.recursivelyDeleteUnusedNodes(LN);
55431 return SDValue(N, 0);
55432 }
55433 }
55434
55435 return SDValue();
55436}
55437
55438/// Do target-specific dag combines on X86ISD::ANDNP nodes.
55439static SDValue combineAndnp(SDNode *N, SelectionDAG &DAG,
55440                            TargetLowering::DAGCombinerInfo &DCI,
55441 const X86Subtarget &Subtarget) {
55442 SDValue N0 = N->getOperand(0);
55443 SDValue N1 = N->getOperand(1);
55444 MVT VT = N->getSimpleValueType(0);
55445 int NumElts = VT.getVectorNumElements();
55446 unsigned EltSizeInBits = VT.getScalarSizeInBits();
55447 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
55448 SDLoc DL(N);
55449
55450 // ANDNP(undef, x) -> 0
55451 // ANDNP(x, undef) -> 0
55452 if (N0.isUndef() || N1.isUndef())
55453 return DAG.getConstant(0, DL, VT);
55454
55455  // ANDNP(0, x) -> x
55456  if (ISD::isBuildVectorAllZeros(N0.getNode()))
55457    return N1;
55458
55459  // ANDNP(x, 0) -> 0
55460  if (ISD::isBuildVectorAllZeros(N1.getNode()))
55461    return DAG.getConstant(0, DL, VT);
55462
55463  // ANDNP(x, -1) -> NOT(x) -> XOR(x, -1)
55464  if (ISD::isBuildVectorAllOnes(N1.getNode()))
55465    return DAG.getNOT(DL, N0, VT);
55466
55467 // Turn ANDNP back to AND if input is inverted.
55468 if (SDValue Not = IsNOT(N0, DAG))
55469 return DAG.getNode(ISD::AND, DL, VT, DAG.getBitcast(VT, Not), N1);
55470
55471  // On AVX512 targets, attempt to reverse foldVSelectToSignBitSplatMask
55472  // to make use of predicated selects.
55473 // ANDN(SEXT(SETCC()),X) -> SELECT(NOT(SETCC()),X,0)
55474 if (DCI.isAfterLegalizeDAG() && N0.getOpcode() == ISD::SIGN_EXTEND) {
55475 SDValue Src = N0.getOperand(0);
55476 EVT SrcVT = Src.getValueType();
55477 if (Src.getOpcode() == ISD::SETCC && SrcVT.getScalarType() == MVT::i1 &&
55478 (VT.is512BitVector() || Subtarget.hasVLX()) &&
55479 (VT.getScalarSizeInBits() >= 32 || Subtarget.hasBWI()) &&
55480 TLI.isTypeLegal(SrcVT) && N0.hasOneUse() && Src.hasOneUse())
55481 return DAG.getSelect(DL, VT, DAG.getNOT(DL, Src, SrcVT), N1,
55482 getZeroVector(VT, Subtarget, DAG, DL));
55483 }
55484
55485 // Constant Folding
55486 APInt Undefs0, Undefs1;
55487 SmallVector<APInt> EltBits0, EltBits1;
55488 if (getTargetConstantBitsFromNode(N0, EltSizeInBits, Undefs0, EltBits0,
55489 /*AllowWholeUndefs*/ true,
55490 /*AllowPartialUndefs*/ true)) {
55491 if (getTargetConstantBitsFromNode(N1, EltSizeInBits, Undefs1, EltBits1,
55492 /*AllowWholeUndefs*/ true,
55493 /*AllowPartialUndefs*/ true)) {
55494 SmallVector<APInt> ResultBits;
55495 for (int I = 0; I != NumElts; ++I)
55496 ResultBits.push_back(~EltBits0[I] & EltBits1[I]);
55497 return getConstVector(ResultBits, VT, DAG, DL);
55498 }
55499
55500    // Constant fold NOT(N0) to allow us to use AND.
55501    // Ensure this is only performed if we can confirm that the bitcasted source
55502    // has one use, to prevent an infinite loop with canonicalizeBitSelect.
55503    if (N0->hasOneUse()) {
55504      SDValue BC0 = peekThroughOneUseBitcasts(N0);
55505      if (BC0.getOpcode() != ISD::BITCAST) {
55506 for (APInt &Elt : EltBits0)
55507 Elt = ~Elt;
55508 SDValue Not = getConstVector(EltBits0, VT, DAG, DL);
55509 return DAG.getNode(ISD::AND, DL, VT, Not, N1);
55510 }
55511 }
55512 }
55513
55514 // Attempt to recursively combine a bitmask ANDNP with shuffles.
55515 if (VT.isVector() && (VT.getScalarSizeInBits() % 8) == 0) {
55516 SDValue Op(N, 0);
55517 if (SDValue Res = combineX86ShufflesRecursively(Op, DAG, Subtarget))
55518 return Res;
55519
55520 // If either operand is a constant mask, then only the elements that aren't
55521 // zero are actually demanded by the other operand.
55522 auto GetDemandedMasks = [&](SDValue Op, bool Invert = false) {
55523 APInt UndefElts;
55524 SmallVector<APInt> EltBits;
55525 APInt DemandedBits = APInt::getAllOnes(EltSizeInBits);
55526 APInt DemandedElts = APInt::getAllOnes(NumElts);
55527 if (getTargetConstantBitsFromNode(Op, EltSizeInBits, UndefElts,
55528 EltBits)) {
55529 DemandedBits.clearAllBits();
55530 DemandedElts.clearAllBits();
55531 for (int I = 0; I != NumElts; ++I) {
55532 if (UndefElts[I]) {
55533 // We can't assume an undef src element gives an undef dst - the
55534 // other src might be zero.
55535 DemandedBits.setAllBits();
55536 DemandedElts.setBit(I);
55537 } else if ((Invert && !EltBits[I].isAllOnes()) ||
55538 (!Invert && !EltBits[I].isZero())) {
55539 DemandedBits |= Invert ? ~EltBits[I] : EltBits[I];
55540 DemandedElts.setBit(I);
55541 }
55542 }
55543 }
55544 return std::make_pair(DemandedBits, DemandedElts);
55545 };
55546 APInt Bits0, Elts0;
55547 APInt Bits1, Elts1;
55548 std::tie(Bits0, Elts0) = GetDemandedMasks(N1);
55549 std::tie(Bits1, Elts1) = GetDemandedMasks(N0, true);
55550
55551 if (TLI.SimplifyDemandedVectorElts(N0, Elts0, DCI) ||
55552 TLI.SimplifyDemandedVectorElts(N1, Elts1, DCI) ||
55553 TLI.SimplifyDemandedBits(N0, Bits0, Elts0, DCI) ||
55554 TLI.SimplifyDemandedBits(N1, Bits1, Elts1, DCI)) {
55555 if (N->getOpcode() != ISD::DELETED_NODE)
55556 DCI.AddToWorklist(N);
55557 return SDValue(N, 0);
55558 }
55559 }
55560
55561 // Folds for better commutativity:
55562 if (N1->hasOneUse()) {
55563 // ANDNP(x,NOT(y)) -> AND(NOT(x),NOT(y)) -> NOT(OR(X,Y)).
55564 if (SDValue Not = IsNOT(N1, DAG))
55565 return DAG.getNOT(
55566 DL, DAG.getNode(ISD::OR, DL, VT, N0, DAG.getBitcast(VT, Not)), VT);
55567
55568 // ANDNP(x,PSHUFB(y,z)) -> PSHUFB(y,OR(z,x))
55569 // Zero out elements by setting the PSHUFB mask value to 0xFF.
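      // (Illustrative note: N0 is known here to be a sign-splat mask, so each
      // element is 0 or all-ones; OR-ing an all-ones element into the PSHUFB
      // mask sets its top bit, which PSHUFB interprets as "write zero".)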
55570    if (DAG.ComputeNumSignBits(N0) == EltSizeInBits) {
55571      SDValue BC1 = peekThroughOneUseBitcasts(N1);
55572 if (BC1.getOpcode() == X86ISD::PSHUFB) {
55573 EVT ShufVT = BC1.getValueType();
55574 SDValue NewMask = DAG.getNode(ISD::OR, DL, ShufVT, BC1.getOperand(1),
55575 DAG.getBitcast(ShufVT, N0));
55576 SDValue NewShuf =
55577 DAG.getNode(X86ISD::PSHUFB, DL, ShufVT, BC1.getOperand(0), NewMask);
55578 return DAG.getBitcast(VT, NewShuf);
55579 }
55580 }
55581 }
55582
55583 return SDValue();
55584}
55585
55586static SDValue combineBT(SDNode *N, SelectionDAG &DAG,
55587                         TargetLowering::DAGCombinerInfo &DCI) {
55588 SDValue N1 = N->getOperand(1);
55589
55590 // BT ignores high bits in the bit index operand.
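  // (For example, with a 32-bit operand the hardware takes the bit index
  // modulo 32, so only the low 5 bits of N1 are actually demanded.)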
55591  unsigned BitWidth = N1.getValueSizeInBits();
55592  APInt DemandedMask(APInt::getLowBitsSet(BitWidth, Log2_32(BitWidth)));
55593 if (DAG.getTargetLoweringInfo().SimplifyDemandedBits(N1, DemandedMask, DCI)) {
55594 if (N->getOpcode() != ISD::DELETED_NODE)
55595 DCI.AddToWorklist(N);
55596 return SDValue(N, 0);
55597 }
55598
55599 return SDValue();
55600}
55601
55602static SDValue combineCVTPH2PS(SDNode *N, SelectionDAG &DAG,
55603                               TargetLowering::DAGCombinerInfo &DCI) {
55604 bool IsStrict = N->getOpcode() == X86ISD::STRICT_CVTPH2PS;
55605 SDValue Src = N->getOperand(IsStrict ? 1 : 0);
55606
55607 if (N->getValueType(0) == MVT::v4f32 && Src.getValueType() == MVT::v8i16) {
55608 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
55609 APInt DemandedElts = APInt::getLowBitsSet(8, 4);
55610 if (TLI.SimplifyDemandedVectorElts(Src, DemandedElts, DCI)) {
55611 if (N->getOpcode() != ISD::DELETED_NODE)
55612 DCI.AddToWorklist(N);
55613 return SDValue(N, 0);
55614 }
55615
55616 // Convert a full vector load into vzload when not all bits are needed.
55617 if (ISD::isNormalLoad(Src.getNode()) && Src.hasOneUse()) {
55618 LoadSDNode *LN = cast<LoadSDNode>(N->getOperand(IsStrict ? 1 : 0));
55619 if (SDValue VZLoad = narrowLoadToVZLoad(LN, MVT::i64, MVT::v2i64, DAG)) {
55620 SDLoc dl(N);
55621 if (IsStrict) {
55622 SDValue Convert = DAG.getNode(
55623 N->getOpcode(), dl, {MVT::v4f32, MVT::Other},
55624 {N->getOperand(0), DAG.getBitcast(MVT::v8i16, VZLoad)});
55625 DCI.CombineTo(N, Convert, Convert.getValue(1));
55626 } else {
55627 SDValue Convert = DAG.getNode(N->getOpcode(), dl, MVT::v4f32,
55628 DAG.getBitcast(MVT::v8i16, VZLoad));
55629 DCI.CombineTo(N, Convert);
55630 }
55631
55632        DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), VZLoad.getValue(1));
55633        DCI.recursivelyDeleteUnusedNodes(LN);
55634 return SDValue(N, 0);
55635 }
55636 }
55637 }
55638
55639 return SDValue();
55640}
55641
55642// Try to combine sext_in_reg of a cmov of constants by extending the constants.
55643static SDValue combineSextInRegCmov(SDNode *N, SelectionDAG &DAG) {
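  // Worked example (illustrative): (sext_in_reg (cmov 5, 250), i8) folds to
  // (cmov 5, -6), since sign-extending the low i8 of 250 (0xFA) gives -6; the
  // constants absorb the extension and the sext_in_reg node disappears.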
55644 assert(N->getOpcode() == ISD::SIGN_EXTEND_INREG);
55645
55646 EVT DstVT = N->getValueType(0);
55647
55648 SDValue N0 = N->getOperand(0);
55649 SDValue N1 = N->getOperand(1);
55650 EVT ExtraVT = cast<VTSDNode>(N1)->getVT();
55651
55652 if (ExtraVT != MVT::i8 && ExtraVT != MVT::i16)
55653 return SDValue();
55654
55655 // Look through single use any_extends / truncs.
55656 SDValue IntermediateBitwidthOp;
55657 if ((N0.getOpcode() == ISD::ANY_EXTEND || N0.getOpcode() == ISD::TRUNCATE) &&
55658 N0.hasOneUse()) {
55659 IntermediateBitwidthOp = N0;
55660 N0 = N0.getOperand(0);
55661 }
55662
55663 // See if we have a single use cmov.
55664 if (N0.getOpcode() != X86ISD::CMOV || !N0.hasOneUse())
55665 return SDValue();
55666
55667 SDValue CMovOp0 = N0.getOperand(0);
55668 SDValue CMovOp1 = N0.getOperand(1);
55669
55670 // Make sure both operands are constants.
55671 if (!isa<ConstantSDNode>(CMovOp0.getNode()) ||
55672 !isa<ConstantSDNode>(CMovOp1.getNode()))
55673 return SDValue();
55674
55675 SDLoc DL(N);
55676
55677  // If we looked through an any_extend/trunc above, apply it to the constants too.
55678 if (IntermediateBitwidthOp) {
55679 unsigned IntermediateOpc = IntermediateBitwidthOp.getOpcode();
55680 CMovOp0 = DAG.getNode(IntermediateOpc, DL, DstVT, CMovOp0);
55681 CMovOp1 = DAG.getNode(IntermediateOpc, DL, DstVT, CMovOp1);
55682 }
55683
55684 CMovOp0 = DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, DstVT, CMovOp0, N1);
55685 CMovOp1 = DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, DstVT, CMovOp1, N1);
55686
55687 EVT CMovVT = DstVT;
55688 // We do not want i16 CMOV's. Promote to i32 and truncate afterwards.
55689 if (DstVT == MVT::i16) {
55690 CMovVT = MVT::i32;
55691 CMovOp0 = DAG.getNode(ISD::ZERO_EXTEND, DL, CMovVT, CMovOp0);
55692 CMovOp1 = DAG.getNode(ISD::ZERO_EXTEND, DL, CMovVT, CMovOp1);
55693 }
55694
55695 SDValue CMov = DAG.getNode(X86ISD::CMOV, DL, CMovVT, CMovOp0, CMovOp1,
55696 N0.getOperand(2), N0.getOperand(3));
55697
55698 if (CMovVT != DstVT)
55699 CMov = DAG.getNode(ISD::TRUNCATE, DL, DstVT, CMov);
55700
55701 return CMov;
55702}
55703
55704static SDValue combineSignExtendInReg(SDNode *N, SelectionDAG &DAG,
55705 const X86Subtarget &Subtarget) {
55706 assert(N->getOpcode() == ISD::SIGN_EXTEND_INREG);
55707
55708 if (SDValue V = combineSextInRegCmov(N, DAG))
55709 return V;
55710
55711 EVT VT = N->getValueType(0);
55712 SDValue N0 = N->getOperand(0);
55713 SDValue N1 = N->getOperand(1);
55714 EVT ExtraVT = cast<VTSDNode>(N1)->getVT();
55715 SDLoc dl(N);
55716
55717  // SIGN_EXTEND_INREG to v4i64 is an expensive operation on both SSE and
55718  // AVX2 since there is no sign-extended shift right operation on a vector
55719  // with 64-bit elements.
55720  // (sext_in_reg (v4i64 anyext (v4i32 x)), ExtraVT) ->
55721  //   (v4i64 sext (v4i32 sext_in_reg (v4i32 x, ExtraVT)))
55722 if (VT == MVT::v4i64 && (N0.getOpcode() == ISD::ANY_EXTEND ||
55723 N0.getOpcode() == ISD::SIGN_EXTEND)) {
55724 SDValue N00 = N0.getOperand(0);
55725
55726    // EXTLOAD has a better solution on AVX2: it may be replaced with an
55727    // X86ISD::VSEXT node.
55728 if (N00.getOpcode() == ISD::LOAD && Subtarget.hasInt256())
55729 if (!ISD::isNormalLoad(N00.getNode()))
55730 return SDValue();
55731
55732    // Attempt to promote any comparison mask ops before the SIGN_EXTEND_INREG
55733    // gets in the way.
55734 if (SDValue Promote = PromoteMaskArithmetic(N0, dl, DAG, Subtarget))
55735 return DAG.getNode(ISD::SIGN_EXTEND_INREG, dl, VT, Promote, N1);
55736
55737 if (N00.getValueType() == MVT::v4i32 && ExtraVT.getSizeInBits() < 128) {
55738 SDValue Tmp =
55739 DAG.getNode(ISD::SIGN_EXTEND_INREG, dl, MVT::v4i32, N00, N1);
55740 return DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v4i64, Tmp);
55741 }
55742 }
55743 return SDValue();
55744}
55745
55746/// sext(add_nsw(x, C)) --> add(sext(x), C_sext)
55747/// zext(add_nuw(x, C)) --> add(zext(x), C_zext)
55748/// Promoting a sign/zero extension ahead of a no overflow 'add' exposes
55749/// opportunities to combine math ops, use an LEA, or use a complex addressing
55750/// mode. This can eliminate extend, add, and shift instructions.
55751static SDValue promoteExtBeforeAdd(SDNode *Ext, SelectionDAG &DAG,
55752 const X86Subtarget &Subtarget) {
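  // Illustrative example: (i64 zext (i32 add nuw %x, 20)) is rewritten as
  // (i64 add nuw (i64 zext %x), 20), so a later 'add' or 'shl' user can fold
  // the constant into an LEA displacement.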
55753 if (Ext->getOpcode() != ISD::SIGN_EXTEND &&
55754 Ext->getOpcode() != ISD::ZERO_EXTEND)
55755 return SDValue();
55756
55757 // TODO: This should be valid for other integer types.
55758 EVT VT = Ext->getValueType(0);
55759 if (VT != MVT::i64)
55760 return SDValue();
55761
55762 SDValue Add = Ext->getOperand(0);
55763 if (Add.getOpcode() != ISD::ADD)
55764 return SDValue();
55765
55766 SDValue AddOp0 = Add.getOperand(0);
55767 SDValue AddOp1 = Add.getOperand(1);
55768 bool Sext = Ext->getOpcode() == ISD::SIGN_EXTEND;
55769 bool NSW = Add->getFlags().hasNoSignedWrap();
55770 bool NUW = Add->getFlags().hasNoUnsignedWrap();
55771 NSW = NSW || (Sext && DAG.willNotOverflowAdd(true, AddOp0, AddOp1));
55772 NUW = NUW || (!Sext && DAG.willNotOverflowAdd(false, AddOp0, AddOp1));
55773
55774 // We need an 'add nsw' feeding into the 'sext' or 'add nuw' feeding
55775 // into the 'zext'
55776 if ((Sext && !NSW) || (!Sext && !NUW))
55777 return SDValue();
55778
55779 // Having a constant operand to the 'add' ensures that we are not increasing
55780 // the instruction count because the constant is extended for free below.
55781 // A constant operand can also become the displacement field of an LEA.
55782 auto *AddOp1C = dyn_cast<ConstantSDNode>(AddOp1);
55783 if (!AddOp1C)
55784 return SDValue();
55785
55786 // Don't make the 'add' bigger if there's no hope of combining it with some
55787 // other 'add' or 'shl' instruction.
55788 // TODO: It may be profitable to generate simpler LEA instructions in place
55789 // of single 'add' instructions, but the cost model for selecting an LEA
55790 // currently has a high threshold.
55791 bool HasLEAPotential = false;
55792 for (auto *User : Ext->users()) {
55793 if (User->getOpcode() == ISD::ADD || User->getOpcode() == ISD::SHL) {
55794 HasLEAPotential = true;
55795 break;
55796 }
55797 }
55798 if (!HasLEAPotential)
55799 return SDValue();
55800
55801 // Everything looks good, so pull the '{s|z}ext' ahead of the 'add'.
55802 int64_t AddC = Sext ? AddOp1C->getSExtValue() : AddOp1C->getZExtValue();
55803 SDValue NewExt = DAG.getNode(Ext->getOpcode(), SDLoc(Ext), VT, AddOp0);
55804 SDValue NewConstant = DAG.getConstant(AddC, SDLoc(Add), VT);
55805
55806 // The wider add is guaranteed to not wrap because both operands are
55807 // sign-extended.
55808 SDNodeFlags Flags;
55809 Flags.setNoSignedWrap(NSW);
55810 Flags.setNoUnsignedWrap(NUW);
55811 return DAG.getNode(ISD::ADD, SDLoc(Add), VT, NewExt, NewConstant, Flags);
55812}
55813
55814// If we face {ANY,SIGN,ZERO}_EXTEND that is applied to a CMOV with constant
55815// operands and the result of CMOV is not used anywhere else - promote CMOV
55816// itself instead of promoting its result. This could be beneficial, because:
55817// 1) X86TargetLowering::EmitLoweredSelect later can do merging of two
55818// (or more) pseudo-CMOVs only when they go one-after-another and
55819// getting rid of result extension code after CMOV will help that.
55820// 2) Promotion of constant CMOV arguments is free, hence the
55821// {ANY,SIGN,ZERO}_EXTEND will just be deleted.
55822// 3) 16-bit CMOV encoding is 4 bytes, 32-bit CMOV is 3 bytes, so this
55823// promotion is also good in terms of code-size.
55824// (64-bit CMOV is 4 bytes, that's why we don't do 32-bit => 64-bit
55825// promotion).
55826static SDValue combineToExtendCMOV(SDNode *Extend, SelectionDAG &DAG) {
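  // Illustrative example: (i32 zext (i16 cmov 7, 200)) becomes
  // (i32 cmov 7, 200) directly, since widening the constant operands is free
  // and the separate extension of the CMOV result disappears.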
55827 SDValue CMovN = Extend->getOperand(0);
55828 if (CMovN.getOpcode() != X86ISD::CMOV || !CMovN.hasOneUse())
55829 return SDValue();
55830
55831 EVT TargetVT = Extend->getValueType(0);
55832 unsigned ExtendOpcode = Extend->getOpcode();
55833 SDLoc DL(Extend);
55834
55835 EVT VT = CMovN.getValueType();
55836 SDValue CMovOp0 = CMovN.getOperand(0);
55837 SDValue CMovOp1 = CMovN.getOperand(1);
55838
55839 if (!isa<ConstantSDNode>(CMovOp0.getNode()) ||
55840 !isa<ConstantSDNode>(CMovOp1.getNode()))
55841 return SDValue();
55842
55843 // Only extend to i32 or i64.
55844 if (TargetVT != MVT::i32 && TargetVT != MVT::i64)
55845 return SDValue();
55846
55847  // Only extend from i16 unless it's a sign_extend from i32. Zext/aext from
55848  // i32 are free.
55849 if (VT != MVT::i16 && !(ExtendOpcode == ISD::SIGN_EXTEND && VT == MVT::i32))
55850 return SDValue();
55851
55852  // If this is a zero extend to i64, we should only extend to i32 and use a free
55853 // zero extend to finish.
55854 EVT ExtendVT = TargetVT;
55855 if (TargetVT == MVT::i64 && ExtendOpcode != ISD::SIGN_EXTEND)
55856 ExtendVT = MVT::i32;
55857
55858 CMovOp0 = DAG.getNode(ExtendOpcode, DL, ExtendVT, CMovOp0);
55859 CMovOp1 = DAG.getNode(ExtendOpcode, DL, ExtendVT, CMovOp1);
55860
55861 SDValue Res = DAG.getNode(X86ISD::CMOV, DL, ExtendVT, CMovOp0, CMovOp1,
55862 CMovN.getOperand(2), CMovN.getOperand(3));
55863
55864 // Finish extending if needed.
55865 if (ExtendVT != TargetVT)
55866 Res = DAG.getNode(ExtendOpcode, DL, TargetVT, Res);
55867
55868 return Res;
55869}
55870
55871// Attempt to combine a (sext/zext (setcc)) to a setcc with a xmm/ymm/zmm
55872// result type.
55873static SDValue combineExtSetcc(SDNode *N, SelectionDAG &DAG,
55874 const X86Subtarget &Subtarget) {
55875 SDValue N0 = N->getOperand(0);
55876 EVT VT = N->getValueType(0);
55877 SDLoc dl(N);
55878
55879 // Only do this combine with AVX512 for vector extends.
55880 if (!Subtarget.hasAVX512() || !VT.isVector() || N0.getOpcode() != ISD::SETCC)
55881 return SDValue();
55882
55883 // Only combine legal element types.
55884 EVT SVT = VT.getVectorElementType();
55885 if (SVT != MVT::i8 && SVT != MVT::i16 && SVT != MVT::i32 &&
55886 SVT != MVT::i64 && SVT != MVT::f32 && SVT != MVT::f64)
55887 return SDValue();
55888
55889  // We don't have a CMPP instruction for vXf16.
55890 if (N0.getOperand(0).getValueType().getVectorElementType() == MVT::f16)
55891 return SDValue();
55892  // We can only do this if the vector size is 256 bits or less.
55893 unsigned Size = VT.getSizeInBits();
55894 if (Size > 256 && Subtarget.useAVX512Regs())
55895 return SDValue();
55896
55897 EVT N00VT = N0.getOperand(0).getValueType();
55898
55899  // Don't fold if the condition code can't be handled by PCMPEQ/PCMPGT since
55900  // those are the only integer compares we have.
55901  ISD::CondCode CC = cast<CondCodeSDNode>(N0.getOperand(2))->get();
55902 if (N00VT.isInteger() && ISD::isUnsignedIntSetCC(CC))
55903 return SDValue();
55904
55905 // Only do this combine if the extension will be fully consumed by the setcc.
55906 EVT MatchingVecType = N00VT.changeVectorElementTypeToInteger();
55907 if (Size != MatchingVecType.getSizeInBits())
55908 return SDValue();
55909
55910 SDValue Res = DAG.getSetCC(dl, VT, N0.getOperand(0), N0.getOperand(1), CC);
55911
55912 if (N->getOpcode() == ISD::ZERO_EXTEND)
55913 Res = DAG.getZeroExtendInReg(Res, dl, N0.getValueType());
55914
55915 return Res;
55916}
55917
55918static SDValue combineSext(SDNode *N, SelectionDAG &DAG,
55919                           TargetLowering::DAGCombinerInfo &DCI,
55920 const X86Subtarget &Subtarget) {
55921 SDValue N0 = N->getOperand(0);
55922 EVT VT = N->getValueType(0);
55923 SDLoc DL(N);
55924
55925 // (i32 (sext (i8 (x86isd::setcc_carry)))) -> (i32 (x86isd::setcc_carry))
55926  if (!DCI.isBeforeLegalizeOps() &&
55927      N0.getOpcode() == X86ISD::SETCC_CARRY) {
55928 SDValue Setcc = DAG.getNode(X86ISD::SETCC_CARRY, DL, VT, N0->getOperand(0),
55929 N0->getOperand(1));
55930 bool ReplaceOtherUses = !N0.hasOneUse();
55931 DCI.CombineTo(N, Setcc);
55932 // Replace other uses with a truncate of the widened setcc_carry.
55933 if (ReplaceOtherUses) {
55934 SDValue Trunc = DAG.getNode(ISD::TRUNCATE, SDLoc(N0),
55935 N0.getValueType(), Setcc);
55936 DCI.CombineTo(N0.getNode(), Trunc);
55937 }
55938
55939 return SDValue(N, 0);
55940 }
55941
55942 if (SDValue NewCMov = combineToExtendCMOV(N, DAG))
55943 return NewCMov;
55944
55945 if (!DCI.isBeforeLegalizeOps())
55946 return SDValue();
55947
55948 if (SDValue V = combineExtSetcc(N, DAG, Subtarget))
55949 return V;
55950
55951 if (SDValue V = combineToExtendBoolVectorInReg(N->getOpcode(), DL, VT, N0,
55952 DAG, DCI, Subtarget))
55953 return V;
55954
55955 if (VT.isVector()) {
55956 if (SDValue R = PromoteMaskArithmetic(SDValue(N, 0), DL, DAG, Subtarget))
55957 return R;
55958
55960 return DAG.getNode(N0.getOpcode(), DL, VT, N0.getOperand(0));
55961 }
55962
55963 if (SDValue NewAdd = promoteExtBeforeAdd(N, DAG, Subtarget))
55964 return NewAdd;
55965
55966 return SDValue();
55967}
55968
55969// Inverting a constant vector is profitable if it can be eliminated and the
55970// inverted vector is already present in DAG. Otherwise, it will be loaded
55971// anyway.
55972//
55973// We determine which of the values can be completely eliminated and invert it.
55974// If both are eliminable, select a vector with the first negative element.
55975static SDValue getInvertedVectorForFMA(SDValue V, SelectionDAG &DAG) {
55976  assert(ISD::isBuildVectorOfConstantFPSDNodes(V.getNode()) &&
55977 "ConstantFP build vector expected");
55978  // Check if we can eliminate V. We assume that if a value is only used in
55979  // FMAs we can eliminate it, since this function is invoked for each FMA
55980  // with this vector.
55981 auto IsNotFMA = [](SDNode *User) {
55982 return User->getOpcode() != ISD::FMA &&
55983 User->getOpcode() != ISD::STRICT_FMA;
55984 };
55985 if (llvm::any_of(V->users(), IsNotFMA))
55986 return SDValue();
55987
55988  SmallVector<SDValue, 8> Ops;
55989 EVT VT = V.getValueType();
55990 EVT EltVT = VT.getVectorElementType();
55991 for (const SDValue &Op : V->op_values()) {
55992 if (auto *Cst = dyn_cast<ConstantFPSDNode>(Op)) {
55993 Ops.push_back(DAG.getConstantFP(-Cst->getValueAPF(), SDLoc(Op), EltVT));
55994 } else {
55995 assert(Op.isUndef());
55996 Ops.push_back(DAG.getUNDEF(EltVT));
55997 }
55998 }
55999
56000  SDNode *NV = DAG.getNodeIfExists(ISD::BUILD_VECTOR, DAG.getVTList(VT), Ops);
56001 if (!NV)
56002 return SDValue();
56003
56004 // If an inverted version cannot be eliminated, choose it instead of the
56005 // original version.
56006 if (llvm::any_of(NV->users(), IsNotFMA))
56007 return SDValue(NV, 0);
56008
56009 // If the inverted version also can be eliminated, we have to consistently
56010  // prefer one of the values. We prefer a constant with a negative value in
56011  // the first place.
56012 // N.B. We need to skip undefs that may precede a value.
56013 for (const SDValue &Op : V->op_values()) {
56014 if (auto *Cst = dyn_cast<ConstantFPSDNode>(Op)) {
56015 if (Cst->isNegative())
56016 return SDValue();
56017 break;
56018 }
56019 }
56020 return SDValue(NV, 0);
56021}
56022
56023static SDValue combineFMA(SDNode *N, SelectionDAG &DAG,
56024                          TargetLowering::DAGCombinerInfo &DCI,
56025 const X86Subtarget &Subtarget) {
56026 SDLoc dl(N);
56027  EVT VT = N->getValueType(0);
56028  const SelectionDAGTargetInfo &TSI = DAG.getSelectionDAGInfo();
56029 bool IsStrict = N->isTargetOpcode()
56030 ? TSI.isTargetStrictFPOpcode(N->getOpcode())
56031 : N->isStrictFPOpcode();
56032
56033 // Let legalize expand this if it isn't a legal type yet.
56034 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
56035 if (!TLI.isTypeLegal(VT))
56036 return SDValue();
56037
56038 SDValue A = N->getOperand(IsStrict ? 1 : 0);
56039 SDValue B = N->getOperand(IsStrict ? 2 : 1);
56040 SDValue C = N->getOperand(IsStrict ? 3 : 2);
56041
56042 // If the operation allows fast-math and the target does not support FMA,
56043 // split this into mul+add to avoid libcall(s).
56044 SDNodeFlags Flags = N->getFlags();
56045 if (!IsStrict && Flags.hasAllowReassociation() &&
56046 TLI.isOperationExpand(ISD::FMA, VT)) {
56047 SDValue Fmul = DAG.getNode(ISD::FMUL, dl, VT, A, B, Flags);
56048 return DAG.getNode(ISD::FADD, dl, VT, Fmul, C, Flags);
56049 }
56050
56051 EVT ScalarVT = VT.getScalarType();
56052 if (((ScalarVT != MVT::f32 && ScalarVT != MVT::f64) ||
56053 !Subtarget.hasAnyFMA()) &&
56054 !(ScalarVT == MVT::f16 && Subtarget.hasFP16()) &&
56055 !(ScalarVT == MVT::bf16 && Subtarget.hasAVX10_2()))
56056 return SDValue();
56057
56058  auto invertIfNegative = [&DAG, &TLI, &DCI](SDValue &V) {
56059    bool CodeSize = DAG.getMachineFunction().getFunction().hasOptSize();
56060 bool LegalOperations = !DCI.isBeforeLegalizeOps();
56061 if (SDValue NegV = TLI.getCheaperNegatedExpression(V, DAG, LegalOperations,
56062 CodeSize)) {
56063 V = NegV;
56064 return true;
56065 }
56066 // Look through extract_vector_elts. If it comes from an FNEG, create a
56067 // new extract from the FNEG input.
56068 if (V.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
56069 isNullConstant(V.getOperand(1))) {
56070 SDValue Vec = V.getOperand(0);
56071 if (SDValue NegV = TLI.getCheaperNegatedExpression(
56072 Vec, DAG, LegalOperations, CodeSize)) {
56073 V = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SDLoc(V), V.getValueType(),
56074 NegV, V.getOperand(1));
56075 return true;
56076 }
56077 }
56078 // Lookup if there is an inverted version of constant vector V in DAG.
56079 if (ISD::isBuildVectorOfConstantFPSDNodes(V.getNode())) {
56080 if (SDValue NegV = getInvertedVectorForFMA(V, DAG)) {
56081 V = NegV;
56082 return true;
56083 }
56084 }
56085 return false;
56086 };
56087
56088 // Do not convert the passthru input of scalar intrinsics.
56089 // FIXME: We could allow negations of the lower element only.
56090 bool NegA = invertIfNegative(A);
56091 // Create a dummy use for A so that in the process of negating B or C
56092 // recursively, it is not deleted.
56093 HandleSDNode NegAHandle(A);
56094 bool NegB = invertIfNegative(B);
56095 // Similar to A, get a handle on B.
56096 HandleSDNode NegBHandle(B);
56097 bool NegC = invertIfNegative(C);
56098
56099 if (!NegA && !NegB && !NegC)
56100 return SDValue();
56101
56102 unsigned NewOpcode =
56103 negateFMAOpcode(N->getOpcode(), NegA != NegB, NegC, false);
56104
56105 // Propagate fast-math-flags to new FMA node.
56106 SelectionDAG::FlagInserter FlagsInserter(DAG, Flags);
56107 if (IsStrict) {
56108 assert(N->getNumOperands() == 4 && "Shouldn't be greater than 4");
56109 return DAG.getNode(NewOpcode, dl, {VT, MVT::Other},
56110 {N->getOperand(0), A, B, C});
56111 } else {
56112 if (N->getNumOperands() == 4)
56113 return DAG.getNode(NewOpcode, dl, VT, A, B, C, N->getOperand(3));
56114 return DAG.getNode(NewOpcode, dl, VT, A, B, C);
56115 }
56116}
56117
56118// Combine FMADDSUB(A, B, FNEG(C)) -> FMSUBADD(A, B, C)
56119// Combine FMSUBADD(A, B, FNEG(C)) -> FMADDSUB(A, B, C)
56120static SDValue combineFMADDSUB(SDNode *N, SelectionDAG &DAG,
56121                               TargetLowering::DAGCombinerInfo &DCI) {
56122 SDLoc dl(N);
56123 EVT VT = N->getValueType(0);
56124  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
56125  bool CodeSize = DAG.getMachineFunction().getFunction().hasOptSize();
56126 bool LegalOperations = !DCI.isBeforeLegalizeOps();
56127
56128 SDValue N2 = N->getOperand(2);
56129
56130 SDValue NegN2 =
56131 TLI.getCheaperNegatedExpression(N2, DAG, LegalOperations, CodeSize);
56132 if (!NegN2)
56133 return SDValue();
56134 unsigned NewOpcode = negateFMAOpcode(N->getOpcode(), false, true, false);
56135
56136 if (N->getNumOperands() == 4)
56137 return DAG.getNode(NewOpcode, dl, VT, N->getOperand(0), N->getOperand(1),
56138 NegN2, N->getOperand(3));
56139 return DAG.getNode(NewOpcode, dl, VT, N->getOperand(0), N->getOperand(1),
56140 NegN2);
56141}
56142
56143// Try to widen the build vector and bitcast it to the type of zext.
56144// This is a special case for the 128-bit vector types. The intention is to
56145// remove the zext and replace it with a bitcast to the wider type. While
56146// lowering, the bitcast is removed and the extra commutation due to the zext
56147// is avoided. For example:
56148// zext v4i16 (v4i8 build_vector (x, y, z, w)) ->
56149//   bitcast v4i16 (v8i8 build_vector (x, 0, y, 0, z, 0, w, 0))
56150static SDValue widenBuildVec(SDNode *Extend, SelectionDAG &DAG) {
56151
56152 if (Extend->getOpcode() != ISD::ZERO_EXTEND)
56153 return SDValue();
56154
56155 EVT ExtendVT = Extend->getValueType(0);
56156
56157 SDValue BV = Extend->getOperand(0);
56158 if (BV.getOpcode() != ISD::BUILD_VECTOR || !BV.hasOneUse())
56159 return SDValue();
56160
56161 if (any_of(BV->op_values(), [](SDValue Op) { return Op.isUndef(); })) {
56162 // If the build vector has undef elements, we cannot widen it.
56163 // The widening would create a vector with more undef elements, which
56164 // is not valid.
56165 return SDValue();
56166 }
56167
56168 if (!all_of(BV->op_values(),
56169 [](SDValue Op) { return Op.getOpcode() == ISD::LOAD; })) {
56170    // If the build vector has any element other than an ISD::LOAD, we cannot
56171    // widen it.
56172 return SDValue();
56173 }
56174
56175 SDLoc dl(BV);
56176 EVT VT = BV.getValueType();
56177 EVT EltVT = BV.getOperand(0).getValueType();
56178 unsigned NumElts = VT.getVectorNumElements();
56179
56180 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
56181
56182  if (TLI.getTypeAction(*DAG.getContext(), VT) !=
56183      TargetLowering::TypeWidenVector)
56184 return SDValue();
56185
56186 EVT WidenVT = TLI.getTypeToTransformTo(*DAG.getContext(), VT);
56187 unsigned WidenNumElts = WidenVT.getVectorNumElements();
56188
56189 SmallVector<SDValue, 16> NewOps(BV->op_begin(), BV->op_end());
56190 assert(WidenNumElts >= NumElts && "Shrinking vector instead of widening!");
56191 // Fill the new elements with Zero.
56192 NewOps.append(WidenNumElts - NumElts, DAG.getConstant(0, dl, EltVT));
56193 // Compute the step to place the elements in the right place and control the
56194 // iteration.
56195 unsigned step = WidenNumElts / NumElts;
56196 if (WidenVT.is128BitVector()) {
56197 if (step > 1 && Extend->getValueSizeInBits(0) == WidenVT.getSizeInBits()) {
56198 for (int i = NumElts - 1, j = WidenNumElts - step; i > 0;
56199 i--, j -= step) {
56200 SDValue temp = NewOps[i];
56201 NewOps[i] = NewOps[j];
56202 NewOps[j] = temp;
56203 }
56204 // Create new build vector with WidenVT and NewOps
56205 SDValue NewBV = DAG.getBuildVector(WidenVT, dl, NewOps);
56206 // Replace the old build vector with the new one. Bitcast the
56207 // new build vector to the type of the zext.
56208 SDValue NewBVBitcast = DAG.getBitcast(ExtendVT, NewBV);
56209 DAG.ReplaceAllUsesOfValueWith(SDValue(Extend, 0), NewBVBitcast);
56210 return NewBV;
56211 }
56212 }
56213 return SDValue();
56214}
56215
56216static SDValue combineZext(SDNode *N, SelectionDAG &DAG,
56217                           TargetLowering::DAGCombinerInfo &DCI,
56218 const X86Subtarget &Subtarget) {
56219 SDLoc dl(N);
56220 SDValue N0 = N->getOperand(0);
56221 EVT VT = N->getValueType(0);
56222
56223 // (i32 (aext (i8 (x86isd::setcc_carry)))) -> (i32 (x86isd::setcc_carry))
56224 // FIXME: Is this needed? We don't seem to have any tests for it.
56225  if (!DCI.isBeforeLegalizeOps() && N->getOpcode() == ISD::ANY_EXTEND &&
56226      N0.getOpcode() == X86ISD::SETCC_CARRY) {
56227 SDValue Setcc = DAG.getNode(X86ISD::SETCC_CARRY, dl, VT, N0->getOperand(0),
56228 N0->getOperand(1));
56229 bool ReplaceOtherUses = !N0.hasOneUse();
56230 DCI.CombineTo(N, Setcc);
56231 // Replace other uses with a truncate of the widened setcc_carry.
56232 if (ReplaceOtherUses) {
56233 SDValue Trunc = DAG.getNode(ISD::TRUNCATE, SDLoc(N0),
56234 N0.getValueType(), Setcc);
56235 DCI.CombineTo(N0.getNode(), Trunc);
56236 }
56237
56238 return SDValue(N, 0);
56239 }
56240
56241 if (SDValue NewCMov = combineToExtendCMOV(N, DAG))
56242 return NewCMov;
56243
56244 if (DCI.isBeforeLegalizeOps())
56245 if (SDValue V = combineExtSetcc(N, DAG, Subtarget))
56246 return V;
56247
56248 if (SDValue V = combineToExtendBoolVectorInReg(N->getOpcode(), dl, VT, N0,
56249 DAG, DCI, Subtarget))
56250 return V;
56251
56252 if (VT.isVector())
56253 if (SDValue R = PromoteMaskArithmetic(SDValue(N, 0), dl, DAG, Subtarget))
56254 return R;
56255
56256 if (SDValue NewAdd = promoteExtBeforeAdd(N, DAG, Subtarget))
56257 return NewAdd;
56258
56259 if (SDValue R = combineOrCmpEqZeroToCtlzSrl(N, DAG, DCI, Subtarget))
56260 return R;
56261
56262 // TODO: Combine with any target/faux shuffle.
56263  if (N0.getOpcode() == X86ISD::PACKUS && N0.getValueSizeInBits() == 128 &&
56264      VT.getScalarSizeInBits() == N0.getOperand(0).getScalarValueSizeInBits()) {
56265 SDValue N00 = N0.getOperand(0);
56266 SDValue N01 = N0.getOperand(1);
56267 unsigned NumSrcEltBits = N00.getScalarValueSizeInBits();
56268 APInt ZeroMask = APInt::getHighBitsSet(NumSrcEltBits, NumSrcEltBits / 2);
56269 if ((N00.isUndef() || DAG.MaskedValueIsZero(N00, ZeroMask)) &&
56270 (N01.isUndef() || DAG.MaskedValueIsZero(N01, ZeroMask))) {
56271 return concatSubVectors(N00, N01, DAG, dl);
56272 }
56273 }
56274
56275 if (SDValue V = widenBuildVec(N, DAG))
56276 return V;
56277
56278 return SDValue();
56279}
56280
56281/// If we have AVX512, but not BWI and this is a vXi16/vXi8 setcc, just
56282/// pre-promote its result type since vXi1 vectors don't get promoted
56283/// during type legalization.
56284static SDValue truncateAVX512SetCCNoBWI(EVT VT, EVT OpVT, SDValue LHS,
56285                                        SDValue RHS, ISD::CondCode CC,
56286 const SDLoc &DL, SelectionDAG &DAG,
56287 const X86Subtarget &Subtarget) {
56288 if (Subtarget.hasAVX512() && !Subtarget.hasBWI() && VT.isVector() &&
56289 VT.getVectorElementType() == MVT::i1 &&
56290 (OpVT.getVectorElementType() == MVT::i8 ||
56291 OpVT.getVectorElementType() == MVT::i16)) {
56292 SDValue Setcc = DAG.getSetCC(DL, OpVT, LHS, RHS, CC);
56293 return DAG.getNode(ISD::TRUNCATE, DL, VT, Setcc);
56294 }
56295 return SDValue();
56296}
56297
56298// The pattern (setcc (and (broadcast x), (2^n, 2^{n+1}, ...)), (0, 0, ...),
56299// eq/ne) is generated when using an integer as a mask. Instead of generating a
56300// broadcast + vptest, we can directly move the integer to a mask register.
56301static SDValue combineAVX512SetCCToKMOV(EVT VT, SDValue Op0, ISD::CondCode CC,
56302 const SDLoc &DL, SelectionDAG &DAG,
56303 const X86Subtarget &Subtarget) {
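  // Illustrative sketch: with the constant table <1,2,4,...,128> (N = 0),
  // lane I of (and (broadcast x), C) is nonzero exactly when bit I of x is
  // set, so the eq/ne-with-zero setcc reduces to moving the low bits of x
  // straight into a mask register (shifted right by N first when the table
  // starts at 2^N).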
56304 if (CC != ISD::SETNE && CC != ISD::SETEQ)
56305 return SDValue();
56306
56307 if (!Subtarget.hasAVX512())
56308 return SDValue();
56309
56310 if (Op0.getOpcode() != ISD::AND)
56311 return SDValue();
56312
56313 SDValue Broadcast = Op0.getOperand(0);
56314 if (Broadcast.getOpcode() != X86ISD::VBROADCAST &&
56315 Broadcast.getOpcode() != X86ISD::VBROADCAST_LOAD)
56316 return SDValue();
56317
56318 SDValue Load = Op0.getOperand(1);
56319 EVT LoadVT = Load.getSimpleValueType();
56320
56321 APInt UndefElts;
56322  SmallVector<APInt, 32> EltBits;
56323  if (!getTargetConstantBitsFromNode(Load, LoadVT.getScalarSizeInBits(),
56324 UndefElts, EltBits,
56325 /*AllowWholeUndefs*/ true,
56326 /*AllowPartialUndefs*/ false) ||
56327 UndefElts[0] || !EltBits[0].isPowerOf2() || UndefElts.getBitWidth() > 16)
56328 return SDValue();
56329
56330 // Check if the constant pool contains only powers of 2 starting from some
56331 // 2^N. The table may also contain undefs because of widening of vector
56332 // operands.
56333 unsigned N = EltBits[0].logBase2();
56334 unsigned Len = UndefElts.getBitWidth();
56335 for (unsigned I = 1; I != Len; ++I) {
56336 if (UndefElts[I]) {
56337 if (!UndefElts.extractBits(Len - (I + 1), I + 1).isAllOnes())
56338 return SDValue();
56339 break;
56340 }
56341
56342 if (EltBits[I].getBitWidth() <= N + I || !EltBits[I].isOneBitSet(N + I))
56343 return SDValue();
56344 }
56345
56346 MVT BroadcastOpVT = Broadcast.getSimpleValueType().getVectorElementType();
56347 SDValue BroadcastOp;
56348 if (Broadcast.getOpcode() != X86ISD::VBROADCAST) {
56349 BroadcastOp = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, BroadcastOpVT,
56350 Broadcast, DAG.getVectorIdxConstant(0, DL));
56351 } else {
56352 BroadcastOp = Broadcast.getOperand(0);
56353 if (BroadcastOp.getValueType().isVector())
56354 return SDValue();
56355 }
56356
56357 SDValue Masked = BroadcastOp;
56358 if (N != 0) {
56359 unsigned BroadcastOpBitWidth = BroadcastOpVT.getSizeInBits();
56360 unsigned NumDefinedElts = UndefElts.countTrailingZeros();
56361
56362 if (NumDefinedElts > BroadcastOpBitWidth)
56363 return SDValue();
56364
56365 APInt Mask = APInt::getLowBitsSet(BroadcastOpBitWidth, NumDefinedElts);
56366 SDValue ShiftedValue = DAG.getNode(ISD::SRL, DL, BroadcastOpVT, BroadcastOp,
56367 DAG.getConstant(N, DL, BroadcastOpVT));
56368 Masked = DAG.getNode(ISD::AND, DL, BroadcastOpVT, ShiftedValue,
56369 DAG.getConstant(Mask, DL, BroadcastOpVT));
56370 }
56371 // We can't extract more than 16 bits using this pattern, because 2^{17} will
56372 // not fit in an i16 and a vXi32 where X > 16 is more than 512 bits.
56373 SDValue Trunc = DAG.getAnyExtOrTrunc(Masked, DL, MVT::i16);
56374 SDValue Bitcast = DAG.getNode(ISD::BITCAST, DL, MVT::v16i1, Trunc);
56375
56376 if (CC == ISD::SETEQ)
56377 Bitcast = DAG.getNOT(DL, Bitcast, MVT::v16i1);
56378
56379 if (VT != MVT::v16i1)
56380 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Bitcast,
56381 DAG.getVectorIdxConstant(0, DL));
56382
56383 return Bitcast;
56384}
56385
56386static SDValue combineSetCC(SDNode *N, SelectionDAG &DAG,
56387                            TargetLowering::DAGCombinerInfo &DCI,
56388 const X86Subtarget &Subtarget) {
56389 const ISD::CondCode CC = cast<CondCodeSDNode>(N->getOperand(2))->get();
56390 const SDValue LHS = N->getOperand(0);
56391 const SDValue RHS = N->getOperand(1);
56392 EVT VT = N->getValueType(0);
56393 EVT OpVT = LHS.getValueType();
56394 SDLoc DL(N);
56395
56396 if (CC == ISD::SETNE || CC == ISD::SETEQ) {
56397 if (SDValue V = combineVectorSizedSetCCEquality(VT, LHS, RHS, CC, DL, DAG,
56398 Subtarget))
56399 return V;
56400 }
56401
56402 if (VT == MVT::i1) {
56403 X86::CondCode X86CC;
56404 if (SDValue V =
56405 MatchVectorAllEqualTest(LHS, RHS, CC, DL, Subtarget, DAG, X86CC))
56406 return DAG.getNode(ISD::TRUNCATE, DL, VT, getSETCC(X86CC, V, DL, DAG));
56407 }
56408
56409 if (CC == ISD::SETNE || CC == ISD::SETEQ) {
56410 if (OpVT.isScalarInteger()) {
56411 // cmpeq(or(X,Y),X) --> cmpeq(and(~X,Y),0)
56412 // cmpne(or(X,Y),X) --> cmpne(and(~X,Y),0)
56413 auto MatchOrCmpEq = [&](SDValue N0, SDValue N1) {
56414 if (N0.getOpcode() == ISD::OR && N0->hasOneUse()) {
56415 if (N0.getOperand(0) == N1)
56416 return DAG.getNode(ISD::AND, DL, OpVT, DAG.getNOT(DL, N1, OpVT),
56417 N0.getOperand(1));
56418 if (N0.getOperand(1) == N1)
56419 return DAG.getNode(ISD::AND, DL, OpVT, DAG.getNOT(DL, N1, OpVT),
56420 N0.getOperand(0));
56421 }
56422 return SDValue();
56423 };
56424 if (SDValue AndN = MatchOrCmpEq(LHS, RHS))
56425 return DAG.getSetCC(DL, VT, AndN, DAG.getConstant(0, DL, OpVT), CC);
56426 if (SDValue AndN = MatchOrCmpEq(RHS, LHS))
56427 return DAG.getSetCC(DL, VT, AndN, DAG.getConstant(0, DL, OpVT), CC);
56428
56429 // cmpeq(and(X,Y),Y) --> cmpeq(and(~X,Y),0)
56430 // cmpne(and(X,Y),Y) --> cmpne(and(~X,Y),0)
56431 auto MatchAndCmpEq = [&](SDValue N0, SDValue N1) {
56432 if (N0.getOpcode() == ISD::AND && N0->hasOneUse()) {
56433 if (N0.getOperand(0) == N1)
56434 return DAG.getNode(ISD::AND, DL, OpVT, N1,
56435 DAG.getNOT(DL, N0.getOperand(1), OpVT));
56436 if (N0.getOperand(1) == N1)
56437 return DAG.getNode(ISD::AND, DL, OpVT, N1,
56438 DAG.getNOT(DL, N0.getOperand(0), OpVT));
56439 }
56440 return SDValue();
56441 };
56442 if (SDValue AndN = MatchAndCmpEq(LHS, RHS))
56443 return DAG.getSetCC(DL, VT, AndN, DAG.getConstant(0, DL, OpVT), CC);
56444 if (SDValue AndN = MatchAndCmpEq(RHS, LHS))
56445 return DAG.getSetCC(DL, VT, AndN, DAG.getConstant(0, DL, OpVT), CC);
56446
56447 // cmpeq(trunc(x),C) --> cmpeq(x,C)
56448 // cmpne(trunc(x),C) --> cmpne(x,C)
56449 // iff x upper bits are zero.
56450 if (LHS.getOpcode() == ISD::TRUNCATE &&
56451 LHS.getOperand(0).getScalarValueSizeInBits() >= 32 &&
56453        EVT SrcVT = LHS.getOperand(0).getValueType();
56454        APInt UpperBits = APInt::getBitsSetFrom(SrcVT.getScalarSizeInBits(),
56455 OpVT.getScalarSizeInBits());
56456 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
56457 if (DAG.MaskedValueIsZero(LHS.getOperand(0), UpperBits) &&
56458 TLI.isTypeLegal(LHS.getOperand(0).getValueType()))
56459 return DAG.getSetCC(DL, VT, LHS.getOperand(0),
56460 DAG.getZExtOrTrunc(RHS, DL, SrcVT), CC);
56461 }
56462
56463 // With C as a power of 2 and C != 0 and C != INT_MIN:
56464 // icmp eq Abs(X) C ->
56465 // (icmp eq A, C) | (icmp eq A, -C)
56466 // icmp ne Abs(X) C ->
56467 // (icmp ne A, C) & (icmp ne A, -C)
56468 // Both of these patterns can be better optimized in
56469 // DAGCombiner::foldAndOrOfSETCC. Note this only applies for scalar
56470 // integers which is checked above.
56471 if (LHS.getOpcode() == ISD::ABS && LHS.hasOneUse()) {
56472 if (auto *C = dyn_cast<ConstantSDNode>(RHS)) {
56473 const APInt &CInt = C->getAPIntValue();
56474 // We can better optimize this case in DAGCombiner::foldAndOrOfSETCC.
56475 if (CInt.isPowerOf2() && !CInt.isMinSignedValue()) {
56476 SDValue BaseOp = LHS.getOperand(0);
56477 SDValue SETCC0 = DAG.getSetCC(DL, VT, BaseOp, RHS, CC);
56478 SDValue SETCC1 = DAG.getSetCC(
56479 DL, VT, BaseOp, DAG.getConstant(-CInt, DL, OpVT), CC);
56480 return DAG.getNode(CC == ISD::SETEQ ? ISD::OR : ISD::AND, DL, VT,
56481 SETCC0, SETCC1);
56482 }
56483 }
56484 }
56485 }
56486 }
56487
56488 if (VT.isVector() && VT.getVectorElementType() == MVT::i1 &&
56489 (CC == ISD::SETNE || CC == ISD::SETEQ || ISD::isSignedIntSetCC(CC))) {
56490 // Using temporaries to avoid messing up operand ordering for later
56491 // transformations if this doesn't work.
56492 SDValue Op0 = LHS;
56493 SDValue Op1 = RHS;
56494 ISD::CondCode TmpCC = CC;
56495 // Put build_vector on the right.
56496 if (Op0.getOpcode() == ISD::BUILD_VECTOR) {
56497 std::swap(Op0, Op1);
56498 TmpCC = ISD::getSetCCSwappedOperands(TmpCC);
56499 }
56500
56501 bool IsSEXT0 =
56502 (Op0.getOpcode() == ISD::SIGN_EXTEND) &&
56503 (Op0.getOperand(0).getValueType().getVectorElementType() == MVT::i1);
56504 bool IsVZero1 = ISD::isBuildVectorAllZeros(Op1.getNode());
56505
56506 if (IsSEXT0 && IsVZero1) {
56507 assert(VT == Op0.getOperand(0).getValueType() &&
56508 "Unexpected operand type");
56509 if (TmpCC == ISD::SETGT)
56510 return DAG.getConstant(0, DL, VT);
56511 if (TmpCC == ISD::SETLE)
56512 return DAG.getConstant(1, DL, VT);
56513 if (TmpCC == ISD::SETEQ || TmpCC == ISD::SETGE)
56514 return DAG.getNOT(DL, Op0.getOperand(0), VT);
56515
56516 assert((TmpCC == ISD::SETNE || TmpCC == ISD::SETLT) &&
56517 "Unexpected condition code!");
56518 return Op0.getOperand(0);
56519 }
56520
56521 if (IsVZero1)
56522 if (SDValue V =
56523 combineAVX512SetCCToKMOV(VT, Op0, TmpCC, DL, DAG, Subtarget))
56524 return V;
56525 }
56526
56527  // Try to make an unsigned vector comparison signed. On pre-AVX512 targets
56528  // there are only signed comparisons (`PCMPGT`), and on AVX512 it's often
56529  // better to use `PCMPGT` if the result is meant to stay in a vector (if it's
56530  // going to a mask, AVX512 has unsigned comparisons directly).
56531 if (VT.isVector() && OpVT.isVector() && OpVT.isInteger()) {
56532 bool CanMakeSigned = false;
56533 if (ISD::isUnsignedIntSetCC(CC)) {
56534      KnownBits CmpKnown =
56535          DAG.computeKnownBits(LHS).intersectWith(DAG.computeKnownBits(RHS));
56536 // If we know LHS/RHS share the same sign bit at each element we can
56537 // make this signed.
56538 // NOTE: `computeKnownBits` on a vector type aggregates common bits
56539 // across all lanes. So a pattern where the sign varies from lane to
56540 // lane, but at each lane Sign(LHS) is known to equal Sign(RHS), will be
56541 // missed. We could get around this by demanding each lane
56542 // independently, but this isn't the most important optimization and
56543 // that may eat into compile time.
56544 CanMakeSigned =
56545 CmpKnown.Zero.isSignBitSet() || CmpKnown.One.isSignBitSet();
56546 }
56547 if (CanMakeSigned || ISD::isSignedIntSetCC(CC)) {
56548 SDValue LHSOut = LHS;
56549 SDValue RHSOut = RHS;
56550 ISD::CondCode NewCC = CC;
56551 switch (CC) {
56552 case ISD::SETGE:
56553 case ISD::SETUGE:
56554 if (SDValue NewLHS = incDecVectorConstant(LHS, DAG, /*IsInc*/ true,
56555 /*NSW*/ true))
56556 LHSOut = NewLHS;
56557 else if (SDValue NewRHS = incDecVectorConstant(
56558 RHS, DAG, /*IsInc*/ false, /*NSW*/ true))
56559 RHSOut = NewRHS;
56560 else
56561 break;
56562
56563 [[fallthrough]];
56564 case ISD::SETUGT:
56565 NewCC = ISD::SETGT;
56566 break;
56567
56568 case ISD::SETLE:
56569 case ISD::SETULE:
56570 if (SDValue NewLHS = incDecVectorConstant(LHS, DAG, /*IsInc*/ false,
56571 /*NSW*/ true))
56572 LHSOut = NewLHS;
56573 else if (SDValue NewRHS = incDecVectorConstant(RHS, DAG, /*IsInc*/ true,
56574 /*NSW*/ true))
56575 RHSOut = NewRHS;
56576 else
56577 break;
56578
56579 [[fallthrough]];
56580 case ISD::SETULT:
56581 // Will be swapped to SETGT in LowerVSETCC*.
56582 NewCC = ISD::SETLT;
56583 break;
56584 default:
56585 break;
56586 }
56587 if (NewCC != CC) {
56588 if (SDValue R = truncateAVX512SetCCNoBWI(VT, OpVT, LHSOut, RHSOut,
56589 NewCC, DL, DAG, Subtarget))
56590 return R;
56591 return DAG.getSetCC(DL, VT, LHSOut, RHSOut, NewCC);
56592 }
56593 }
56594 }
56595
56596 if (SDValue R =
56597 truncateAVX512SetCCNoBWI(VT, OpVT, LHS, RHS, CC, DL, DAG, Subtarget))
56598 return R;
56599
56600 // In the middle end transforms:
56601 // `(or (icmp eq X, C), (icmp eq X, C+1))`
56602 // -> `(icmp ult (add x, -C), 2)`
56603 // Likewise inverted cases with `ugt`.
56604 //
56605 // Since x86, pre avx512, doesn't have unsigned vector compares, this results
56606 // in worse codegen. So, undo the middle-end transform and go back to `(or
56607 // (icmp eq), (icmp eq))` form.
56608 // Also skip AVX1 with ymm vectors, as the umin approach combines better than
56609 // the xmm approach.
56610 //
56611  // NB: We don't handle the similar simplification of `(and (icmp ne), (icmp
56612  // ne))` as it doesn't end up instruction positive.
56613 // TODO: We might want to do this for avx512 as well if we `sext` the result.
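  // Worked example (illustrative), with C = 5: the middle end turns
  // (or (icmp eq x, 5), (icmp eq x, 6)) into (icmp ult (add x, -5), 2); below
  // we recover C0 = 5 (the negation of the add constant) and C1 = 6, and emit
  // the two equality compares OR'd together again.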
56614 if (VT.isVector() && OpVT.isVector() && OpVT.isInteger() &&
56615 ISD::isUnsignedIntSetCC(CC) && LHS.getOpcode() == ISD::ADD &&
56616 !Subtarget.hasAVX512() &&
56617 (OpVT.getSizeInBits() <= 128 || !Subtarget.hasAVX() ||
56618 Subtarget.hasAVX2()) &&
56619 LHS.hasOneUse()) {
56620
56621 APInt CmpC;
56622 SDValue AddC = LHS.getOperand(1);
56623 if (ISD::isConstantSplatVector(RHS.getNode(), CmpC) &&
56625 // See which form we have depending on the constant/condition.
56626 SDValue C0 = SDValue();
56627 SDValue C1 = SDValue();
56628
56629 // If we had `(add x, -1)` and can lower with `umin`, don't transform as
56630      // we will end up generating an additional constant. Keeping it in the
56631      // current form has a slight latency cost, but it's probably worth saving a
56632      // constant.
56635 // Pass
56636 }
56637 // Normal Cases
56638 else if ((CC == ISD::SETULT && CmpC == 2) ||
56639 (CC == ISD::SETULE && CmpC == 1)) {
56640 // These will constant fold.
56641 C0 = DAG.getNegative(AddC, DL, OpVT);
56642 C1 = DAG.getNode(ISD::SUB, DL, OpVT, C0,
56643 DAG.getAllOnesConstant(DL, OpVT));
56644 }
56645 // Inverted Cases
56646 else if ((CC == ISD::SETUGT && (-CmpC) == 3) ||
56647 (CC == ISD::SETUGE && (-CmpC) == 2)) {
56648 // These will constant fold.
56649 C0 = DAG.getNOT(DL, AddC, OpVT);
56650 C1 = DAG.getNode(ISD::ADD, DL, OpVT, C0,
56651 DAG.getAllOnesConstant(DL, OpVT));
56652 }
56653 if (C0 && C1) {
56654 SDValue NewLHS =
56655 DAG.getSetCC(DL, VT, LHS.getOperand(0), C0, ISD::SETEQ);
56656 SDValue NewRHS =
56657 DAG.getSetCC(DL, VT, LHS.getOperand(0), C1, ISD::SETEQ);
56658 return DAG.getNode(ISD::OR, DL, VT, NewLHS, NewRHS);
56659 }
56660 }
56661 }
56662
56663 // For an SSE1-only target, lower a comparison of v4f32 to X86ISD::CMPP early
56664 // to avoid scalarization via legalization because v4i32 is not a legal type.
56665 if (Subtarget.hasSSE1() && !Subtarget.hasSSE2() && VT == MVT::v4i32 &&
56666 LHS.getValueType() == MVT::v4f32)
56667 return LowerVSETCC(SDValue(N, 0), Subtarget, DAG);
56668
56669 // X pred 0.0 --> X pred -X
56670 // If the negation of X already exists, use it in the comparison. This removes
56671 // the need to materialize 0.0 and allows matching to SSE's MIN/MAX
56672 // instructions in patterns with a 'select' node.
56674 SDVTList FNegVT = DAG.getVTList(OpVT);
56675 if (SDNode *FNeg = DAG.getNodeIfExists(ISD::FNEG, FNegVT, {LHS}))
56676 return DAG.getSetCC(DL, VT, LHS, SDValue(FNeg, 0), CC);
56677 }
56678
56679 return SDValue();
56680}
56681
56684 const X86Subtarget &Subtarget) {
56685 SDValue Src = N->getOperand(0);
56686 MVT SrcVT = Src.getSimpleValueType();
56687 MVT VT = N->getSimpleValueType(0);
56688 unsigned NumBits = VT.getScalarSizeInBits();
56689 unsigned NumElts = SrcVT.getVectorNumElements();
56690 unsigned NumBitsPerElt = SrcVT.getScalarSizeInBits();
56691 assert(VT == MVT::i32 && NumElts <= NumBits && "Unexpected MOVMSK types");
56692
56693 // Perform constant folding.
56694 APInt UndefElts;
56695 SmallVector<APInt, 32> EltBits;
56696 if (getTargetConstantBitsFromNode(Src, NumBitsPerElt, UndefElts, EltBits,
56697 /*AllowWholeUndefs*/ true,
56698 /*AllowPartialUndefs*/ true)) {
56699 APInt Imm(32, 0);
56700 for (unsigned Idx = 0; Idx != NumElts; ++Idx)
56701 if (!UndefElts[Idx] && EltBits[Idx].isNegative())
56702 Imm.setBit(Idx);
56703
56704 return DAG.getConstant(Imm, SDLoc(N), VT);
56705 }
56706
56707 // Look through int->fp bitcasts that don't change the element width.
56708 unsigned EltWidth = SrcVT.getScalarSizeInBits();
56709 if (Subtarget.hasSSE2() && Src.getOpcode() == ISD::BITCAST &&
56710 Src.getOperand(0).getScalarValueSizeInBits() == EltWidth)
56711 return DAG.getNode(X86ISD::MOVMSK, SDLoc(N), VT, Src.getOperand(0));
56712
56713 // Fold movmsk(not(x)) -> not(movmsk(x)) to improve folding of movmsk results
56714 // with scalar comparisons.
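  // Illustrative example (v4i32): (i32 (movmsk (not X))) becomes
  //   (xor (movmsk X), 0xF)
  // where 0xF covers the 4 valid mask bits.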
56715 if (SDValue NotSrc = IsNOT(Src, DAG)) {
56716 SDLoc DL(N);
56717 APInt NotMask = APInt::getLowBitsSet(NumBits, NumElts);
56718 NotSrc = DAG.getBitcast(SrcVT, NotSrc);
56719 return DAG.getNode(ISD::XOR, DL, VT,
56720 DAG.getNode(X86ISD::MOVMSK, DL, VT, NotSrc),
56721 DAG.getConstant(NotMask, DL, VT));
56722 }
56723
56724 // Fold movmsk(icmp_sgt(x,-1)) -> not(movmsk(x)) to improve folding of movmsk
56725 // results with scalar comparisons.
56726 if (Src.getOpcode() == X86ISD::PCMPGT &&
56727 ISD::isBuildVectorAllOnes(Src.getOperand(1).getNode())) {
56728 SDLoc DL(N);
56729 APInt NotMask = APInt::getLowBitsSet(NumBits, NumElts);
56730 return DAG.getNode(ISD::XOR, DL, VT,
56731 DAG.getNode(X86ISD::MOVMSK, DL, VT, Src.getOperand(0)),
56732 DAG.getConstant(NotMask, DL, VT));
56733 }
56734
56735 // Fold movmsk(icmp_eq(and(x,c1),c1)) -> movmsk(shl(x,c2))
56736 // Fold movmsk(icmp_eq(and(x,c1),0)) -> movmsk(not(shl(x,c2)))
56737 // iff pow2splat(c1).
56738 // Use KnownBits to determine if only a single bit is non-zero
56739 // in each element (pow2 or zero), and shift that bit to the msb.
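  // Illustrative example (v4i32, c1 = splat(4), i.e. only bit 2 may be set):
  //   movmsk(pcmpeq(and(x, splat(4)), splat(4))) -> movmsk(shl(x, 29))
  //   movmsk(pcmpeq(and(x, splat(4)), zero))     -> movmsk(not(shl(x, 29)))
  // since shifting left by 29 moves bit 2 into the sign bit of each element.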
56740 if (Src.getOpcode() == X86ISD::PCMPEQ) {
56741 KnownBits KnownLHS = DAG.computeKnownBits(Src.getOperand(0));
56742 KnownBits KnownRHS = DAG.computeKnownBits(Src.getOperand(1));
56743 unsigned ShiftAmt = KnownLHS.countMinLeadingZeros();
56744 if (KnownLHS.countMaxPopulation() == 1 &&
56745 (KnownRHS.isZero() || (KnownRHS.countMaxPopulation() == 1 &&
56746 ShiftAmt == KnownRHS.countMinLeadingZeros()))) {
56747 SDLoc DL(N);
56748 MVT ShiftVT = SrcVT;
56749 SDValue ShiftLHS = Src.getOperand(0);
56750 SDValue ShiftRHS = Src.getOperand(1);
56751 if (ShiftVT.getScalarType() == MVT::i8) {
56752 // vXi8 shifts - we only care about the signbit so can use PSLLW.
56753 ShiftVT = MVT::getVectorVT(MVT::i16, NumElts / 2);
56754 ShiftLHS = DAG.getBitcast(ShiftVT, ShiftLHS);
56755 ShiftRHS = DAG.getBitcast(ShiftVT, ShiftRHS);
56756 }
56757 ShiftLHS = getTargetVShiftByConstNode(X86ISD::VSHLI, DL, ShiftVT,
56758 ShiftLHS, ShiftAmt, DAG);
56759 ShiftRHS = getTargetVShiftByConstNode(X86ISD::VSHLI, DL, ShiftVT,
56760 ShiftRHS, ShiftAmt, DAG);
56761 ShiftLHS = DAG.getBitcast(SrcVT, ShiftLHS);
56762 ShiftRHS = DAG.getBitcast(SrcVT, ShiftRHS);
56763 SDValue Res = DAG.getNode(ISD::XOR, DL, SrcVT, ShiftLHS, ShiftRHS);
56764 return DAG.getNode(X86ISD::MOVMSK, DL, VT, DAG.getNOT(DL, Res, SrcVT));
56765 }
56766 }
56767
56768 // Fold movmsk(logic(X,C)) -> logic(movmsk(X),C)
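  // For example (illustrative, v4i32): movmsk(or(X, splat(0x80000000))) becomes
  //   or(movmsk(X), 0xF)
  // because every element of the constant has its sign bit set.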
56769 if (N->isOnlyUserOf(Src.getNode())) {
56771 if (ISD::isBitwiseLogicOp(SrcBC.getOpcode())) {
56772 APInt UndefElts;
56773 SmallVector<APInt, 32> EltBits;
56774 if (getTargetConstantBitsFromNode(SrcBC.getOperand(1), NumBitsPerElt,
56775 UndefElts, EltBits)) {
56776 APInt Mask = APInt::getZero(NumBits);
56777 for (unsigned Idx = 0; Idx != NumElts; ++Idx) {
56778 if (!UndefElts[Idx] && EltBits[Idx].isNegative())
56779 Mask.setBit(Idx);
56780 }
56781 SDLoc DL(N);
56782 SDValue NewSrc = DAG.getBitcast(SrcVT, SrcBC.getOperand(0));
56783 SDValue NewMovMsk = DAG.getNode(X86ISD::MOVMSK, DL, VT, NewSrc);
56784 return DAG.getNode(SrcBC.getOpcode(), DL, VT, NewMovMsk,
56785 DAG.getConstant(Mask, DL, VT));
56786 }
56787 }
56788 }
56789
56790 // Simplify the inputs.
56791 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
56792 APInt DemandedMask(APInt::getAllOnes(NumBits));
56793 if (TLI.SimplifyDemandedBits(SDValue(N, 0), DemandedMask, DCI))
56794 return SDValue(N, 0);
56795
56796 return SDValue();
56797}
56798
56801 const X86Subtarget &Subtarget) {
56802 MVT VT = N->getSimpleValueType(0);
56803 unsigned NumBits = VT.getScalarSizeInBits();
56804
56805 // Simplify the inputs.
56806 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
56807 APInt DemandedMask(APInt::getAllOnes(NumBits));
56808 if (TLI.SimplifyDemandedBits(SDValue(N, 0), DemandedMask, DCI))
56809 return SDValue(N, 0);
56810
56811 return SDValue();
56812}
56813
56817 SDValue Mask = MemOp->getMask();
56818
56819 // With vector masks we only demand the upper bit of the mask.
56820 if (Mask.getScalarValueSizeInBits() != 1) {
56821 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
56822 APInt DemandedMask(APInt::getSignMask(Mask.getScalarValueSizeInBits()));
56823 if (TLI.SimplifyDemandedBits(Mask, DemandedMask, DCI)) {
56824 if (N->getOpcode() != ISD::DELETED_NODE)
56825 DCI.AddToWorklist(N);
56826 return SDValue(N, 0);
56827 }
56828 }
56829
56830 return SDValue();
56831}
56832
56834 SDValue Index, SDValue Base, SDValue Scale,
56835 SelectionDAG &DAG) {
56836 SDLoc DL(GorS);
56837
56838 if (auto *Gather = dyn_cast<MaskedGatherSDNode>(GorS)) {
56839 SDValue Ops[] = { Gather->getChain(), Gather->getPassThru(),
56840 Gather->getMask(), Base, Index, Scale } ;
56841 return DAG.getMaskedGather(Gather->getVTList(),
56842 Gather->getMemoryVT(), DL, Ops,
56843 Gather->getMemOperand(),
56844 Gather->getIndexType(),
56845 Gather->getExtensionType());
56846 }
56847 auto *Scatter = cast<MaskedScatterSDNode>(GorS);
56848 SDValue Ops[] = { Scatter->getChain(), Scatter->getValue(),
56849 Scatter->getMask(), Base, Index, Scale };
56850 return DAG.getMaskedScatter(Scatter->getVTList(),
56851 Scatter->getMemoryVT(), DL,
56852 Ops, Scatter->getMemOperand(),
56853 Scatter->getIndexType(),
56854 Scatter->isTruncatingStore());
56855}
56856
56859 SDLoc DL(N);
56860 auto *GorS = cast<MaskedGatherScatterSDNode>(N);
56861 SDValue Index = GorS->getIndex();
56862 SDValue Base = GorS->getBasePtr();
56863 SDValue Scale = GorS->getScale();
56864 EVT IndexVT = Index.getValueType();
56865 EVT IndexSVT = IndexVT.getVectorElementType();
56866 unsigned IndexWidth = Index.getScalarValueSizeInBits();
56867 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
56868 EVT PtrVT = TLI.getPointerTy(DAG.getDataLayout());
56869
56870 if (DCI.isBeforeLegalize()) {
56871 // Attempt to move shifted index into the address scale, allows further
56872 // index truncation below.
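    // Illustrative example: a gather with Index = (shl X, 2) and Scale = 2 can
    // be rebuilt with Index = (shl X, 1) and Scale = 4, repeating on later
    // combine iterations until the shift is absorbed or Scale reaches 8.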
56873 if (Index.getOpcode() == ISD::SHL && IndexSVT == PtrVT &&
56874 isa<ConstantSDNode>(Scale)) {
56875 unsigned ScaleAmt = Scale->getAsZExtVal();
56876 assert(isPowerOf2_32(ScaleAmt) && "Scale must be a power of 2");
56877 unsigned Log2ScaleAmt = Log2_32(ScaleAmt);
56878 unsigned MaskBits = IndexWidth - Log2ScaleAmt;
56879 APInt DemandedBits = APInt::getLowBitsSet(IndexWidth, MaskBits);
56880 if (TLI.SimplifyDemandedBits(Index, DemandedBits, DCI)) {
56881 if (N->getOpcode() != ISD::DELETED_NODE)
56882 DCI.AddToWorklist(N);
56883 return SDValue(N, 0);
56884 }
56885 if (auto MinShAmt = DAG.getValidMinimumShiftAmount(Index)) {
56886 if (*MinShAmt >= 1 && Log2ScaleAmt < 3 &&
56887 DAG.ComputeNumSignBits(Index.getOperand(0)) > 1) {
56888 SDValue ShAmt = Index.getOperand(1);
56889 SDValue NewShAmt =
56890 DAG.getNode(ISD::SUB, DL, ShAmt.getValueType(), ShAmt,
56891 DAG.getConstant(1, DL, ShAmt.getValueType()));
56892 SDValue NewIndex = DAG.getNode(ISD::SHL, DL, Index.getValueType(),
56893 Index.getOperand(0), NewShAmt);
56894 SDValue NewScale =
56895 DAG.getConstant(ScaleAmt * 2, DL, Scale.getValueType());
56896 return rebuildGatherScatter(GorS, NewIndex, Base, NewScale, DAG);
56897 }
56898 }
56899 }
56900
56901 // Shrink indices if they are larger than 32-bits.
56902 // Only do this before legalize types since v2i64 could become v2i32.
56903 // FIXME: We could check that the type is legal if we're after legalize
56904 // types, but then we would need to construct test cases where that happens.
56905 if (IndexWidth > 32 && DAG.ComputeNumSignBits(Index) > (IndexWidth - 32)) {
56906 EVT NewVT = IndexVT.changeVectorElementType(MVT::i32);
56907
56908 // FIXME: We could support more than just constant folding, but we need to
56909 // be careful with costing. A truncate that can be optimized out would be
56910 // fine. Otherwise we might only want to create a truncate if it avoids
56911 // a split.
56912 if (SDValue TruncIndex =
56913 DAG.FoldConstantArithmetic(ISD::TRUNCATE, DL, NewVT, Index))
56914 return rebuildGatherScatter(GorS, TruncIndex, Base, Scale, DAG);
56915
56916 // Shrink any sign/zero extend whose source type is 32 bits or smaller but
56917 // whose result is wider than 32 bits, if there are sufficient sign bits. Only
56918 // do this before legalize types to avoid creating illegal types in the truncate.
56919 if ((Index.getOpcode() == ISD::SIGN_EXTEND ||
56920 Index.getOpcode() == ISD::ZERO_EXTEND) &&
56921 Index.getOperand(0).getScalarValueSizeInBits() <= 32) {
56922 Index = DAG.getNode(ISD::TRUNCATE, DL, NewVT, Index);
56923 return rebuildGatherScatter(GorS, Index, Base, Scale, DAG);
56924 }
56925
56926 // Shrink if we remove an illegal type.
56927 if (!TLI.isTypeLegal(Index.getValueType()) && TLI.isTypeLegal(NewVT)) {
56928 Index = DAG.getNode(ISD::TRUNCATE, DL, NewVT, Index);
56929 return rebuildGatherScatter(GorS, Index, Base, Scale, DAG);
56930 }
56931 }
56932 }
56933
56934 // Try to move splat adders from the index operand to the base pointer
56935 // operand, taking care to multiply by the scale. We can only do this when
56936 // the index element type is the same as the pointer type. Otherwise we need
56937 // to be sure the math doesn't wrap before the scale is applied.
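  // Illustrative example: a gather with Index = (add X, splat(16)) and Scale = 4
  // can be rebuilt with Base = (add Base, 64) and Index = X.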
56938 if (Index.getOpcode() == ISD::ADD && IndexSVT == PtrVT &&
56939 isa<ConstantSDNode>(Scale)) {
56940 uint64_t ScaleAmt = Scale->getAsZExtVal();
56941
56942 for (unsigned I = 0; I != 2; ++I)
56943 if (auto *BV = dyn_cast<BuildVectorSDNode>(Index.getOperand(I))) {
56944 BitVector UndefElts;
56945 if (SDValue Splat = BV->getSplatValue(&UndefElts)) {
56946 if (UndefElts.none()) {
56947 // If the splat value is constant we can add the scaled splat value
56948 // to the existing base.
56949 if (auto *C = dyn_cast<ConstantSDNode>(Splat)) {
56950 APInt Adder = C->getAPIntValue() * ScaleAmt;
56951 SDValue NewBase = DAG.getNode(ISD::ADD, DL, PtrVT, Base,
56952 DAG.getConstant(Adder, DL, PtrVT));
56953 SDValue NewIndex = Index.getOperand(1 - I);
56954 return rebuildGatherScatter(GorS, NewIndex, NewBase, Scale, DAG);
56955 }
56956 // For non-constant cases, limit this to non-scaled cases.
56957 if (ScaleAmt == 1) {
56958 SDValue NewBase = DAG.getNode(ISD::ADD, DL, PtrVT, Base, Splat);
56959 SDValue NewIndex = Index.getOperand(1 - I);
56960 return rebuildGatherScatter(GorS, NewIndex, NewBase, Scale, DAG);
56961 }
56962 }
56963 }
56964 // It's also possible base is just a constant. In that case, just
56965 // replace it with 0 and move the displacement into the index.
56966 if (ScaleAmt == 1 && BV->isConstant() && isa<ConstantSDNode>(Base)) {
56967 SDValue Splat = DAG.getSplatBuildVector(IndexVT, DL, Base);
56968 // Combine the constant build_vector and the constant base.
56969 Splat =
56970 DAG.getNode(ISD::ADD, DL, IndexVT, Index.getOperand(I), Splat);
56971 // Add to the other half of the original Index add.
56972 SDValue NewIndex = DAG.getNode(ISD::ADD, DL, IndexVT,
56973 Index.getOperand(1 - I), Splat);
56974 SDValue NewBase = DAG.getConstant(0, DL, PtrVT);
56975 return rebuildGatherScatter(GorS, NewIndex, NewBase, Scale, DAG);
56976 }
56977 }
56978 }
56979
56980 if (DCI.isBeforeLegalizeOps()) {
56981 // Make sure the index is either i32 or i64
56982 if (IndexWidth != 32 && IndexWidth != 64) {
56983 MVT EltVT = IndexWidth > 32 ? MVT::i64 : MVT::i32;
56984 IndexVT = IndexVT.changeVectorElementType(EltVT);
56985 Index = DAG.getSExtOrTrunc(Index, DL, IndexVT);
56986 return rebuildGatherScatter(GorS, Index, Base, Scale, DAG);
56987 }
56988 }
56989
56990 // With vector masks we only demand the upper bit of the mask.
56991 SDValue Mask = GorS->getMask();
56992 if (Mask.getScalarValueSizeInBits() != 1) {
56993 APInt DemandedMask(APInt::getSignMask(Mask.getScalarValueSizeInBits()));
56994 if (TLI.SimplifyDemandedBits(Mask, DemandedMask, DCI)) {
56995 if (N->getOpcode() != ISD::DELETED_NODE)
56996 DCI.AddToWorklist(N);
56997 return SDValue(N, 0);
56998 }
56999 }
57000
57001 return SDValue();
57002}
57003
57004// Optimize RES = X86ISD::SETCC CONDCODE, EFLAG_INPUT
57006 const X86Subtarget &Subtarget) {
57007 SDLoc DL(N);
57008 X86::CondCode CC = X86::CondCode(N->getConstantOperandVal(0));
57009 SDValue EFLAGS = N->getOperand(1);
57010
57011 // Try to simplify the EFLAGS and condition code operands.
57012 if (SDValue Flags = combineSetCCEFLAGS(EFLAGS, CC, DAG, Subtarget))
57013 return getSETCC(CC, Flags, DL, DAG);
57014
57015 return SDValue();
57016}
57017
57018/// Optimize branch condition evaluation.
57020 const X86Subtarget &Subtarget) {
57021 SDLoc DL(N);
57022 SDValue EFLAGS = N->getOperand(3);
57023 X86::CondCode CC = X86::CondCode(N->getConstantOperandVal(2));
57024
57025 // Try to simplify the EFLAGS and condition code operands.
57026 // Make sure to not keep references to operands, as combineSetCCEFLAGS can
57027 // RAUW them under us.
57028 if (SDValue Flags = combineSetCCEFLAGS(EFLAGS, CC, DAG, Subtarget)) {
57029 SDValue Cond = DAG.getTargetConstant(CC, DL, MVT::i8);
57030 return DAG.getNode(X86ISD::BRCOND, DL, N->getVTList(), N->getOperand(0),
57031 N->getOperand(1), Cond, Flags);
57032 }
57033
57034 return SDValue();
57035}
57036
57037// TODO: Could we move this to DAGCombine?
57039 SelectionDAG &DAG) {
57040 // Take advantage of vector comparisons (etc.) producing 0 or -1 in each lane
57041 // to optimize away operation when it's from a constant.
57042 //
57043 // The general transformation is:
57044 // UNARYOP(AND(VECTOR_CMP(x,y), constant)) -->
57045 // AND(VECTOR_CMP(x,y), constant2)
57046 // constant2 = UNARYOP(constant)
57047
57048 // Early exit if this isn't a vector operation, the operand of the
57049 // unary operation isn't a bitwise AND, or if the sizes of the operations
57050 // aren't the same.
57051 EVT VT = N->getValueType(0);
57052 bool IsStrict = N->isStrictFPOpcode();
57053 unsigned NumEltBits = VT.getScalarSizeInBits();
57054 SDValue Op0 = N->getOperand(IsStrict ? 1 : 0);
57055 if (!VT.isVector() || Op0.getOpcode() != ISD::AND ||
57056 DAG.ComputeNumSignBits(Op0.getOperand(0)) != NumEltBits ||
57057 VT.getSizeInBits() != Op0.getValueSizeInBits())
57058 return SDValue();
57059
57060 // Now check that the other operand of the AND is a constant. We could
57061 // make the transformation for non-constant splats as well, but it's unclear
57062 // that would be a benefit as it would not eliminate any operations, just
57063 // perform one more step in scalar code before moving to the vector unit.
57064 if (auto *BV = dyn_cast<BuildVectorSDNode>(Op0.getOperand(1))) {
57065 // Bail out if the vector isn't a constant.
57066 if (!BV->isConstant())
57067 return SDValue();
57068
57069 // Everything checks out. Build up the new and improved node.
57070 SDLoc DL(N);
57071 EVT IntVT = BV->getValueType(0);
57072 // Create a new constant of the appropriate type for the transformed
57073 // DAG.
57074 SDValue SourceConst;
57075 if (IsStrict)
57076 SourceConst = DAG.getNode(N->getOpcode(), DL, {VT, MVT::Other},
57077 {N->getOperand(0), SDValue(BV, 0)});
57078 else
57079 SourceConst = DAG.getNode(N->getOpcode(), DL, VT, SDValue(BV, 0));
57080 // The AND node needs bitcasts to/from an integer vector type around it.
57081 SDValue MaskConst = DAG.getBitcast(IntVT, SourceConst);
57082 SDValue NewAnd = DAG.getNode(ISD::AND, DL, IntVT, Op0->getOperand(0),
57083 MaskConst);
57084 SDValue Res = DAG.getBitcast(VT, NewAnd);
57085 if (IsStrict)
57086 return DAG.getMergeValues({Res, SourceConst.getValue(1)}, DL);
57087 return Res;
57088 }
57089
57090 return SDValue();
57091}
57092
57093/// If we are converting a value to floating-point, try to replace scalar
57094/// truncate of an extracted vector element with a bitcast. This tries to keep
57095/// the sequence on XMM registers rather than moving between vector and GPRs.
57097 // TODO: This is currently only used by combineSIntToFP, but it is generalized
57098 // to allow being called by any similar cast opcode.
57099 // TODO: Consider merging this into lowering: vectorizeExtractedCast().
57100 SDValue Trunc = N->getOperand(0);
57101 if (!Trunc.hasOneUse() || Trunc.getOpcode() != ISD::TRUNCATE)
57102 return SDValue();
57103
57104 SDValue ExtElt = Trunc.getOperand(0);
57105 if (!ExtElt.hasOneUse() || ExtElt.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
57106 !isNullConstant(ExtElt.getOperand(1)))
57107 return SDValue();
57108
57109 EVT TruncVT = Trunc.getValueType();
57110 EVT SrcVT = ExtElt.getValueType();
57111 unsigned DestWidth = TruncVT.getSizeInBits();
57112 unsigned SrcWidth = SrcVT.getSizeInBits();
57113 if (SrcWidth % DestWidth != 0)
57114 return SDValue();
57115
57116 // inttofp (trunc (extelt X, 0)) --> inttofp (extelt (bitcast X), 0)
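  // As an illustration:
  //   (sint_to_fp (trunc (extelt (v2i64 X), 0)))
  // becomes
  //   (sint_to_fp (extelt (bitcast X to v4i32), 0))
  // which keeps the value in an XMM register on little-endian x86.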
57117 EVT SrcVecVT = ExtElt.getOperand(0).getValueType();
57118 unsigned VecWidth = SrcVecVT.getSizeInBits();
57119 unsigned NumElts = VecWidth / DestWidth;
57120 EVT BitcastVT = EVT::getVectorVT(*DAG.getContext(), TruncVT, NumElts);
57121 SDValue BitcastVec = DAG.getBitcast(BitcastVT, ExtElt.getOperand(0));
57122 SDLoc DL(N);
57123 SDValue NewExtElt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, TruncVT,
57124 BitcastVec, ExtElt.getOperand(1));
57125 return DAG.getNode(N->getOpcode(), DL, N->getValueType(0), NewExtElt);
57126}
57127
57129 const X86Subtarget &Subtarget) {
57130 bool IsStrict = N->isStrictFPOpcode();
57131 SDValue Op0 = N->getOperand(IsStrict ? 1 : 0);
57132 EVT VT = N->getValueType(0);
57133 EVT InVT = Op0.getValueType();
57134
57135 // Using i16 as an intermediate type is a bad idea, unless we have HW support
57136 // for it. Therefore, for type sizes equal to or smaller than 32, just go with i32.
57137 // if hasFP16 support:
57138 // UINT_TO_FP(vXi1~15) -> SINT_TO_FP(ZEXT(vXi1~15 to vXi16))
57139 // UINT_TO_FP(vXi17~31) -> SINT_TO_FP(ZEXT(vXi17~31 to vXi32))
57140 // else
57141 // UINT_TO_FP(vXi1~31) -> SINT_TO_FP(ZEXT(vXi1~31 to vXi32))
57142 // UINT_TO_FP(vXi33~63) -> SINT_TO_FP(ZEXT(vXi33~63 to vXi64))
57143 if (InVT.isVector() && VT.getVectorElementType() == MVT::f16) {
57144 unsigned ScalarSize = InVT.getScalarSizeInBits();
57145 if ((ScalarSize == 16 && Subtarget.hasFP16()) || ScalarSize == 32 ||
57146 ScalarSize >= 64)
57147 return SDValue();
57148 SDLoc dl(N);
57149 EVT DstVT =
57151 (Subtarget.hasFP16() && ScalarSize < 16) ? MVT::i16
57152 : ScalarSize < 32 ? MVT::i32
57153 : MVT::i64,
57154 InVT.getVectorNumElements());
57155 SDValue P = DAG.getNode(ISD::ZERO_EXTEND, dl, DstVT, Op0);
57156 if (IsStrict)
57157 return DAG.getNode(ISD::STRICT_SINT_TO_FP, dl, {VT, MVT::Other},
57158 {N->getOperand(0), P});
57159 return DAG.getNode(ISD::SINT_TO_FP, dl, VT, P);
57160 }
57161
57162 // UINT_TO_FP(vXi1) -> SINT_TO_FP(ZEXT(vXi1 to vXi32))
57163 // UINT_TO_FP(vXi8) -> SINT_TO_FP(ZEXT(vXi8 to vXi32))
57164 // UINT_TO_FP(vXi16) -> SINT_TO_FP(ZEXT(vXi16 to vXi32))
57165 if (InVT.isVector() && InVT.getScalarSizeInBits() < 32 &&
57166 VT.getScalarType() != MVT::f16) {
57167 SDLoc dl(N);
57168 EVT DstVT = InVT.changeVectorElementType(MVT::i32);
57169 SDValue P = DAG.getNode(ISD::ZERO_EXTEND, dl, DstVT, Op0);
57170
57171 // UINT_TO_FP isn't legal without AVX512 so use SINT_TO_FP.
57172 if (IsStrict)
57173 return DAG.getNode(ISD::STRICT_SINT_TO_FP, dl, {VT, MVT::Other},
57174 {N->getOperand(0), P});
57175 return DAG.getNode(ISD::SINT_TO_FP, dl, VT, P);
57176 }
57177
57178 // Since UINT_TO_FP is legal (it's marked custom), the DAG combiner won't
57179 // optimize it to a SINT_TO_FP when the sign bit is known zero. Perform
57180 // the optimization here.
57181 SDNodeFlags Flags = N->getFlags();
57182 if (Flags.hasNonNeg() || DAG.SignBitIsZero(Op0)) {
57183 if (IsStrict)
57184 return DAG.getNode(ISD::STRICT_SINT_TO_FP, SDLoc(N), {VT, MVT::Other},
57185 {N->getOperand(0), Op0});
57186 return DAG.getNode(ISD::SINT_TO_FP, SDLoc(N), VT, Op0);
57187 }
57188
57189 return SDValue();
57190}
57191
57194 const X86Subtarget &Subtarget) {
57195 // First try to optimize away the conversion entirely when it's
57196 // conditionally from a constant. Vectors only.
57197 bool IsStrict = N->isStrictFPOpcode();
57199 return Res;
57200
57201 // Now move on to more general possibilities.
57202 SDValue Op0 = N->getOperand(IsStrict ? 1 : 0);
57203 EVT VT = N->getValueType(0);
57204 EVT InVT = Op0.getValueType();
57205
57206 // Using i16 as an intermediate type is a bad idea, unless we have HW support
57207 // for it. Therefore, for type sizes equal to or smaller than 32, just go with i32.
57208 // if hasFP16 support:
57209 // SINT_TO_FP(vXi1~15) -> SINT_TO_FP(SEXT(vXi1~15 to vXi16))
57210 // SINT_TO_FP(vXi17~31) -> SINT_TO_FP(SEXT(vXi17~31 to vXi32))
57211 // else
57212 // SINT_TO_FP(vXi1~31) -> SINT_TO_FP(SEXT(vXi1~31 to vXi32))
57213 // SINT_TO_FP(vXi33~63) -> SINT_TO_FP(SEXT(vXi33~63 to vXi64))
57214 if (InVT.isVector() && VT.getVectorElementType() == MVT::f16) {
57215 unsigned ScalarSize = InVT.getScalarSizeInBits();
57216 if ((ScalarSize == 16 && Subtarget.hasFP16()) || ScalarSize == 32 ||
57217 ScalarSize >= 64)
57218 return SDValue();
57219 SDLoc dl(N);
57220 EVT DstVT =
57222 (Subtarget.hasFP16() && ScalarSize < 16) ? MVT::i16
57223 : ScalarSize < 32 ? MVT::i32
57224 : MVT::i64,
57225 InVT.getVectorNumElements());
57226 SDValue P = DAG.getNode(ISD::SIGN_EXTEND, dl, DstVT, Op0);
57227 if (IsStrict)
57228 return DAG.getNode(ISD::STRICT_SINT_TO_FP, dl, {VT, MVT::Other},
57229 {N->getOperand(0), P});
57230 return DAG.getNode(ISD::SINT_TO_FP, dl, VT, P);
57231 }
57232
57233 // SINT_TO_FP(vXi1) -> SINT_TO_FP(SEXT(vXi1 to vXi32))
57234 // SINT_TO_FP(vXi8) -> SINT_TO_FP(SEXT(vXi8 to vXi32))
57235 // SINT_TO_FP(vXi16) -> SINT_TO_FP(SEXT(vXi16 to vXi32))
57236 if (InVT.isVector() && InVT.getScalarSizeInBits() < 32 &&
57237 VT.getScalarType() != MVT::f16) {
57238 SDLoc dl(N);
57239 EVT DstVT = InVT.changeVectorElementType(MVT::i32);
57240 SDValue P = DAG.getNode(ISD::SIGN_EXTEND, dl, DstVT, Op0);
57241 if (IsStrict)
57242 return DAG.getNode(ISD::STRICT_SINT_TO_FP, dl, {VT, MVT::Other},
57243 {N->getOperand(0), P});
57244 return DAG.getNode(ISD::SINT_TO_FP, dl, VT, P);
57245 }
57246
57247 // Without AVX512DQ we only support i64 to float scalar conversion. For both
57248 // vectors and scalars, see if we know that the upper bits are all the sign
57249 // bit, in which case we can truncate the input to i32 and convert from that.
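  // Illustrative example: (f64 (sint_to_fp (sext i32 X to i64))) has at least
  // 33 sign bits, so it is converted via (sint_to_fp (trunc ... to i32)), and
  // the trunc/sext pair then folds away, leaving (f64 (sint_to_fp X)).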
57250 if (InVT.getScalarSizeInBits() > 32 && !Subtarget.hasDQI()) {
57251 unsigned BitWidth = InVT.getScalarSizeInBits();
57252 unsigned NumSignBits = DAG.ComputeNumSignBits(Op0);
57253 if (NumSignBits >= (BitWidth - 31)) {
57254 EVT TruncVT = MVT::i32;
57255 if (InVT.isVector())
57256 TruncVT = InVT.changeVectorElementType(TruncVT);
57257 SDLoc dl(N);
57258 if (DCI.isBeforeLegalize() || TruncVT != MVT::v2i32) {
57259 SDValue Trunc = DAG.getNode(ISD::TRUNCATE, dl, TruncVT, Op0);
57260 if (IsStrict)
57261 return DAG.getNode(ISD::STRICT_SINT_TO_FP, dl, {VT, MVT::Other},
57262 {N->getOperand(0), Trunc});
57263 return DAG.getNode(ISD::SINT_TO_FP, dl, VT, Trunc);
57264 }
57265 // If we're after legalize and the type is v2i32 we need to shuffle and
57266 // use CVTSI2P.
57267 assert(InVT == MVT::v2i64 && "Unexpected VT!");
57268 SDValue Cast = DAG.getBitcast(MVT::v4i32, Op0);
57269 SDValue Shuf = DAG.getVectorShuffle(MVT::v4i32, dl, Cast, Cast,
57270 { 0, 2, -1, -1 });
57271 if (IsStrict)
57272 return DAG.getNode(X86ISD::STRICT_CVTSI2P, dl, {VT, MVT::Other},
57273 {N->getOperand(0), Shuf});
57274 return DAG.getNode(X86ISD::CVTSI2P, dl, VT, Shuf);
57275 }
57276 }
57277
57278 // Transform (SINT_TO_FP (i64 ...)) into an x87 operation if we have
57279 // a 32-bit target where SSE doesn't support i64->FP operations.
57280 if (!Subtarget.useSoftFloat() && Subtarget.hasX87() &&
57281 Op0.getOpcode() == ISD::LOAD) {
57282 LoadSDNode *Ld = cast<LoadSDNode>(Op0.getNode());
57283
57284 // This transformation is not supported if the result type is f16 or f128.
57285 if (VT == MVT::f16 || VT == MVT::f128)
57286 return SDValue();
57287
57288 // If we have AVX512DQ we can use packed conversion instructions unless
57289 // the VT is f80.
57290 if (Subtarget.hasDQI() && VT != MVT::f80)
57291 return SDValue();
57292
57293 if (Ld->isSimple() && !VT.isVector() && ISD::isNormalLoad(Op0.getNode()) &&
57294 Op0.hasOneUse() && !Subtarget.is64Bit() && InVT == MVT::i64) {
57295 std::pair<SDValue, SDValue> Tmp =
57296 Subtarget.getTargetLowering()->BuildFILD(
57297 VT, InVT, SDLoc(N), Ld->getChain(), Ld->getBasePtr(),
57298 Ld->getPointerInfo(), Ld->getBaseAlign(), DAG);
57299 DAG.ReplaceAllUsesOfValueWith(Op0.getValue(1), Tmp.second);
57300 return Tmp.first;
57301 }
57302 }
57303
57304 if (IsStrict)
57305 return SDValue();
57306
57307 if (SDValue V = combineToFPTruncExtElt(N, DAG))
57308 return V;
57309
57310 return SDValue();
57311}
57312
57314 const X86Subtarget &Subtarget) {
57315 EVT VT = N->getValueType(0);
57316 SDValue Src = N->getOperand(0);
57317 if (Subtarget.hasSSE2() && Src.getOpcode() == ISD::FRINT &&
57318 VT.getScalarType() == MVT::i32 && Src.hasOneUse())
57319 return DAG.getNode(ISD::LRINT, SDLoc(N), VT, Src.getOperand(0));
57320
57321 return SDValue();
57322}
57323
57324// Custom handling for VCVTTPS2QQS/VCVTTPS2UQQS
57326 const X86Subtarget &Subtarget) {
57327 if (!Subtarget.hasAVX10_2())
57328 return SDValue();
57329
57330 bool IsSigned = N->getOpcode() == ISD::FP_TO_SINT_SAT;
57331 EVT SrcVT = N->getOperand(0).getValueType();
57332 EVT DstVT = N->getValueType(0);
57333 SDLoc dl(N);
57334
57335 if (SrcVT == MVT::v2f32 && DstVT == MVT::v2i64) {
57336 SDValue V2F32Value = DAG.getUNDEF(SrcVT);
57337
57338 // Concatenate the original v2f32 input and V2F32Value to create v4f32
57339 SDValue NewSrc = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f32,
57340 N->getOperand(0), V2F32Value);
57341
57342 // Select the FP_TO_SINT_SAT/FP_TO_UINT_SAT node
57343 if (IsSigned)
57344 return DAG.getNode(X86ISD::FP_TO_SINT_SAT, dl, MVT::v2i64, NewSrc);
57345
57346 return DAG.getNode(X86ISD::FP_TO_UINT_SAT, dl, MVT::v2i64, NewSrc);
57347 }
57348 return SDValue();
57349}
57350
57352 assert(Flags.getValueType() == MVT::i32 && "Unexpected VT!");
57353
57354 for (const SDNode *User : Flags->users()) {
57355 X86::CondCode CC;
57356 switch (User->getOpcode()) {
57357 default:
57358 // Be conservative.
57359 return true;
57360 case X86ISD::SETCC:
57362 CC = (X86::CondCode)User->getConstantOperandVal(0);
57363 break;
57364 case X86ISD::BRCOND:
57365 case X86ISD::CMOV:
57366 CC = (X86::CondCode)User->getConstantOperandVal(2);
57367 break;
57368 }
57369
57370 switch (CC) {
57371 // clang-format off
57372 default: break;
57373 case X86::COND_A: case X86::COND_AE:
57374 case X86::COND_B: case X86::COND_BE:
57375 case X86::COND_O: case X86::COND_NO:
57376 case X86::COND_G: case X86::COND_GE:
57377 case X86::COND_L: case X86::COND_LE:
57378 return true;
57379 // clang-format on
57380 }
57381 }
57382
57383 return false;
57384}
57385
57386static bool onlyZeroFlagUsed(SDValue Flags) {
57387 assert(Flags.getValueType() == MVT::i32 && "Unexpected VT!");
57388
57389 for (const SDNode *User : Flags->users()) {
57390 unsigned CCOpNo;
57391 switch (User->getOpcode()) {
57392 default:
57393 // Be conservative.
57394 return false;
57395 case X86ISD::SETCC:
57397 CCOpNo = 0;
57398 break;
57399 case X86ISD::BRCOND:
57400 case X86ISD::CMOV:
57401 CCOpNo = 2;
57402 break;
57403 }
57404
57405 X86::CondCode CC = (X86::CondCode)User->getConstantOperandVal(CCOpNo);
57406 if (CC != X86::COND_E && CC != X86::COND_NE)
57407 return false;
57408 }
57409
57410 return true;
57411}
57412
57415 const X86Subtarget &Subtarget) {
57416 // Only handle test patterns.
57417 if (!isNullConstant(N->getOperand(1)))
57418 return SDValue();
57419
57420 // If we have a CMP of a truncated binop, see if we can make a smaller binop
57421 // and use its flags directly.
57422 // TODO: Maybe we should try promoting compares that only use the zero flag
57423 // first if we can prove the upper bits with computeKnownBits?
57424 SDLoc dl(N);
57425 SDValue Op = N->getOperand(0);
57426 EVT VT = Op.getValueType();
57427 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
57428
57429 if (SDValue CMP =
57430 combineX86SubCmpForFlags(N, SDValue(N, 0), DAG, DCI, Subtarget))
57431 return CMP;
57432
57433 // If we have a constant logical shift that's only used in a comparison
57434 // against zero turn it into an equivalent AND. This allows turning it into
57435 // a TEST instruction later.
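  // Worked example (illustrative, i32): (cmp (srl X, 8), 0) only needs the Z
  // flag, so it can be rewritten as (cmp (and X, 0xFFFFFF00), 0), which isel can
  // then select as a TEST instruction.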
57436 if ((Op.getOpcode() == ISD::SRL || Op.getOpcode() == ISD::SHL) &&
57437 Op.hasOneUse() && isa<ConstantSDNode>(Op.getOperand(1)) &&
57438 onlyZeroFlagUsed(SDValue(N, 0))) {
57439 unsigned BitWidth = VT.getSizeInBits();
57440 const APInt &ShAmt = Op.getConstantOperandAPInt(1);
57441 if (ShAmt.ult(BitWidth)) { // Avoid undefined shifts.
57442 unsigned MaskBits = BitWidth - ShAmt.getZExtValue();
57443 APInt Mask = Op.getOpcode() == ISD::SRL
57444 ? APInt::getHighBitsSet(BitWidth, MaskBits)
57445 : APInt::getLowBitsSet(BitWidth, MaskBits);
57446 if (Mask.isSignedIntN(32)) {
57447 Op = DAG.getNode(ISD::AND, dl, VT, Op.getOperand(0),
57448 DAG.getConstant(Mask, dl, VT));
57449 return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op,
57450 DAG.getConstant(0, dl, VT));
57451 }
57452 }
57453 }
57454
57455 // If we're extracting from a avx512 bool vector and comparing against zero,
57456 // then try to just bitcast the vector to an integer to use TEST/BT directly.
57457 // (and (extract_elt (kshiftr vXi1, C), 0), 1) -> (and (bc vXi1), 1<<C)
57458 if (Op.getOpcode() == ISD::AND && isOneConstant(Op.getOperand(1)) &&
57459 Op.hasOneUse() && onlyZeroFlagUsed(SDValue(N, 0))) {
57460 SDValue Src = Op.getOperand(0);
57461 if (Src.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
57462 isNullConstant(Src.getOperand(1)) &&
57463 Src.getOperand(0).getValueType().getScalarType() == MVT::i1) {
57464 SDValue BoolVec = Src.getOperand(0);
57465 unsigned ShAmt = 0;
57466 if (BoolVec.getOpcode() == X86ISD::KSHIFTR) {
57467 ShAmt = BoolVec.getConstantOperandVal(1);
57468 BoolVec = BoolVec.getOperand(0);
57469 }
57470 BoolVec = widenMaskVector(BoolVec, false, Subtarget, DAG, dl);
57471 EVT VecVT = BoolVec.getValueType();
57472 unsigned BitWidth = VecVT.getVectorNumElements();
57473 EVT BCVT = EVT::getIntegerVT(*DAG.getContext(), BitWidth);
57474 if (TLI.isTypeLegal(VecVT) && TLI.isTypeLegal(BCVT)) {
57475 APInt Mask = APInt::getOneBitSet(BitWidth, ShAmt);
57476 Op = DAG.getBitcast(BCVT, BoolVec);
57477 Op = DAG.getNode(ISD::AND, dl, BCVT, Op,
57478 DAG.getConstant(Mask, dl, BCVT));
57479 return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op,
57480 DAG.getConstant(0, dl, BCVT));
57481 }
57482 }
57483 }
57484
57485 // Peek through any zero-extend if we're only testing for a zero result.
57486 if (Op.getOpcode() == ISD::ZERO_EXTEND && onlyZeroFlagUsed(SDValue(N, 0))) {
57487 SDValue Src = Op.getOperand(0);
57488 EVT SrcVT = Src.getValueType();
57489 if (SrcVT.getScalarSizeInBits() >= 8 && TLI.isTypeLegal(SrcVT))
57490 return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Src,
57491 DAG.getConstant(0, dl, SrcVT));
57492 }
57493
57494 // Look for a truncate.
57495 if (Op.getOpcode() != ISD::TRUNCATE)
57496 return SDValue();
57497
57498 SDValue Trunc = Op;
57499 Op = Op.getOperand(0);
57500
57501 // See if we can compare with zero against the truncation source,
57502 // which should help using the Z flag from many ops. Only do this for
57503 // i32 truncated ops to prevent partial-reg compares of promoted ops.
57504 EVT OpVT = Op.getValueType();
57505 APInt UpperBits =
57507 if (OpVT == MVT::i32 && DAG.MaskedValueIsZero(Op, UpperBits) &&
57508 onlyZeroFlagUsed(SDValue(N, 0))) {
57509 return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op,
57510 DAG.getConstant(0, dl, OpVT));
57511 }
57512
57513 // After this the truncate and arithmetic op must have a single use.
57514 if (!Trunc.hasOneUse() || !Op.hasOneUse())
57515 return SDValue();
57516
57517 unsigned NewOpc;
57518 switch (Op.getOpcode()) {
57519 default: return SDValue();
57520 case ISD::AND:
57521 // Skip and with constant. We have special handling for and with immediate
57522 // during isel to generate test instructions.
57523 if (isa<ConstantSDNode>(Op.getOperand(1)))
57524 return SDValue();
57525 NewOpc = X86ISD::AND;
57526 break;
57527 case ISD::OR: NewOpc = X86ISD::OR; break;
57528 case ISD::XOR: NewOpc = X86ISD::XOR; break;
57529 case ISD::ADD:
57530 // If the carry or overflow flag is used, we can't truncate.
57532 return SDValue();
57533 NewOpc = X86ISD::ADD;
57534 break;
57535 case ISD::SUB:
57536 // If the carry or overflow flag is used, we can't truncate.
57538 return SDValue();
57539 NewOpc = X86ISD::SUB;
57540 break;
57541 }
57542
57543 // We found an op we can narrow. Truncate its inputs.
57544 SDValue Op0 = DAG.getNode(ISD::TRUNCATE, dl, VT, Op.getOperand(0));
57545 SDValue Op1 = DAG.getNode(ISD::TRUNCATE, dl, VT, Op.getOperand(1));
57546
57547 // Use an X86-specific opcode to avoid DAG combine messing with it.
57548 SDVTList VTs = DAG.getVTList(VT, MVT::i32);
57549 Op = DAG.getNode(NewOpc, dl, VTs, Op0, Op1);
57550
57551 // For AND, keep a CMP so that we can match the test pattern.
57552 if (NewOpc == X86ISD::AND)
57553 return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op,
57554 DAG.getConstant(0, dl, VT));
57555
57556 // Return the flags.
57557 return Op.getValue(1);
57558}
57559
57562 const X86Subtarget &ST) {
57563 assert((X86ISD::ADD == N->getOpcode() || X86ISD::SUB == N->getOpcode()) &&
57564 "Expected X86ISD::ADD or X86ISD::SUB");
57565
57566 SDLoc DL(N);
57567 SDValue LHS = N->getOperand(0);
57568 SDValue RHS = N->getOperand(1);
57569 MVT VT = LHS.getSimpleValueType();
57570 bool IsSub = X86ISD::SUB == N->getOpcode();
57571 unsigned GenericOpc = IsSub ? ISD::SUB : ISD::ADD;
57572
57573 if (IsSub && isOneConstant(RHS) && !N->hasAnyUseOfValue(0))
57574 if (SDValue CMP = combineX86SubCmpForFlags(N, SDValue(N, 1), DAG, DCI, ST))
57575 return CMP;
57576
57577 // If we don't use the flag result, simplify back to a generic ADD/SUB.
57578 if (!N->hasAnyUseOfValue(1)) {
57579 SDValue Res = DAG.getNode(GenericOpc, DL, VT, LHS, RHS);
57580 return DAG.getMergeValues({Res, DAG.getConstant(0, DL, MVT::i32)}, DL);
57581 }
57582
57583 // Fold any similar generic ADD/SUB opcodes to reuse this node.
57584 auto MatchGeneric = [&](SDValue N0, SDValue N1, bool Negate) {
57585 SDValue Ops[] = {N0, N1};
57586 SDVTList VTs = DAG.getVTList(N->getValueType(0));
57587 if (SDNode *GenericAddSub = DAG.getNodeIfExists(GenericOpc, VTs, Ops)) {
57588 SDValue Op(N, 0);
57589 if (Negate) {
57590 // Bail if this is only used by a user of the x86 add/sub.
57591 if (GenericAddSub->hasOneUse() &&
57592 GenericAddSub->user_begin()->isOnlyUserOf(N))
57593 return;
57594 Op = DAG.getNegative(Op, DL, VT);
57595 }
57596 DCI.CombineTo(GenericAddSub, Op);
57597 }
57598 };
57599 MatchGeneric(LHS, RHS, false);
57600 MatchGeneric(RHS, LHS, X86ISD::SUB == N->getOpcode());
57601
57602 // TODO: Can we drop the ZeroSecondOpOnly limit? This is to guarantee that the
57603 // EFLAGS result doesn't change.
57604 return combineAddOrSubToADCOrSBB(IsSub, DL, VT, LHS, RHS, DAG,
57605 /*ZeroSecondOpOnly*/ true);
57606}
57607
57609 SDValue LHS = N->getOperand(0);
57610 SDValue RHS = N->getOperand(1);
57611 SDValue BorrowIn = N->getOperand(2);
57612
57613 if (SDValue Flags = combineCarryThroughADD(BorrowIn, DAG)) {
57614 MVT VT = N->getSimpleValueType(0);
57615 SDVTList VTs = DAG.getVTList(VT, MVT::i32);
57616 return DAG.getNode(X86ISD::SBB, SDLoc(N), VTs, LHS, RHS, Flags);
57617 }
57618
57619 // Fold SBB(SUB(X,Y),0,Carry) -> SBB(X,Y,Carry)
57620 // iff the flag result is dead.
57621 if (LHS.getOpcode() == ISD::SUB && isNullConstant(RHS) &&
57622 !N->hasAnyUseOfValue(1))
57623 return DAG.getNode(X86ISD::SBB, SDLoc(N), N->getVTList(), LHS.getOperand(0),
57624 LHS.getOperand(1), BorrowIn);
57625
57626 return SDValue();
57627}
57628
57629// Optimize RES, EFLAGS = X86ISD::ADC LHS, RHS, EFLAGS
57632 SDValue LHS = N->getOperand(0);
57633 SDValue RHS = N->getOperand(1);
57634 SDValue CarryIn = N->getOperand(2);
57635 auto *LHSC = dyn_cast<ConstantSDNode>(LHS);
57636 auto *RHSC = dyn_cast<ConstantSDNode>(RHS);
57637
57638 // Canonicalize constant to RHS.
57639 if (LHSC && !RHSC)
57640 return DAG.getNode(X86ISD::ADC, SDLoc(N), N->getVTList(), RHS, LHS,
57641 CarryIn);
57642
57643 // If the LHS and RHS of the ADC node are zero, then it can't overflow and
57644 // the result is either zero or one (depending on the input carry bit).
57645 // Strength reduce this down to a "set on carry" aka SETCC_CARRY&1.
57646 if (LHSC && RHSC && LHSC->isZero() && RHSC->isZero() &&
57647 // We don't have a good way to replace an EFLAGS use, so only do this when
57648 // dead right now.
57649 SDValue(N, 1).use_empty()) {
57650 SDLoc DL(N);
57651 EVT VT = N->getValueType(0);
57652 SDValue CarryOut = DAG.getConstant(0, DL, N->getValueType(1));
57653 SDValue Res1 = DAG.getNode(
57654 ISD::AND, DL, VT,
57656 DAG.getTargetConstant(X86::COND_B, DL, MVT::i8), CarryIn),
57657 DAG.getConstant(1, DL, VT));
57658 return DCI.CombineTo(N, Res1, CarryOut);
57659 }
57660
57661 // Fold ADC(C1,C2,Carry) -> ADC(0,C1+C2,Carry)
57662 // iff the flag result is dead.
57663 // TODO: Allow the flag result if C1+C2 doesn't overflow (signed or unsigned).
57664 if (LHSC && RHSC && !LHSC->isZero() && !N->hasAnyUseOfValue(1)) {
57665 SDLoc DL(N);
57666 APInt Sum = LHSC->getAPIntValue() + RHSC->getAPIntValue();
57667 return DAG.getNode(X86ISD::ADC, DL, N->getVTList(),
57668 DAG.getConstant(0, DL, LHS.getValueType()),
57669 DAG.getConstant(Sum, DL, LHS.getValueType()), CarryIn);
57670 }
57671
57672 if (SDValue Flags = combineCarryThroughADD(CarryIn, DAG)) {
57673 MVT VT = N->getSimpleValueType(0);
57674 SDVTList VTs = DAG.getVTList(VT, MVT::i32);
57675 return DAG.getNode(X86ISD::ADC, SDLoc(N), VTs, LHS, RHS, Flags);
57676 }
57677
57678 // Fold ADC(ADD(X,Y),0,Carry) -> ADC(X,Y,Carry)
57679 // iff the flag result is dead.
57680 if (LHS.getOpcode() == ISD::ADD && RHSC && RHSC->isZero() &&
57681 !N->hasAnyUseOfValue(1))
57682 return DAG.getNode(X86ISD::ADC, SDLoc(N), N->getVTList(), LHS.getOperand(0),
57683 LHS.getOperand(1), CarryIn);
57684
57685 return SDValue();
57686}
57687
57689 const SDLoc &DL, EVT VT,
57690 const X86Subtarget &Subtarget) {
57691 using namespace SDPatternMatch;
57692
57693 // Example of pattern we try to detect:
57694 // t := (v8i32 mul (sext (v8i16 x0), (sext (v8i16 x1))))
57695 //(add (build_vector (extract_elt t, 0),
57696 // (extract_elt t, 2),
57697 // (extract_elt t, 4),
57698 // (extract_elt t, 6)),
57699 // (build_vector (extract_elt t, 1),
57700 // (extract_elt t, 3),
57701 // (extract_elt t, 5),
57702 // (extract_elt t, 7)))
57703
57704 if (!Subtarget.hasSSE2())
57705 return SDValue();
57706
57707 if (!VT.isVector() || VT.getVectorElementType() != MVT::i32 ||
57708 VT.getVectorNumElements() < 4 ||
57710 return SDValue();
57711
57712 SDValue Op0, Op1, Accum;
57717 m_Value(Op1))))))
57718 return SDValue();
57719
57720 // Check if one of Op0,Op1 is of the form:
57721 // (build_vector (extract_elt Mul, 0),
57722 // (extract_elt Mul, 2),
57723 // (extract_elt Mul, 4),
57724 // ...
57725 // the other is of the form:
57726 // (build_vector (extract_elt Mul, 1),
57727 // (extract_elt Mul, 3),
57728 // (extract_elt Mul, 5),
57729 // ...
57730 // and identify Mul.
57731 SDValue Mul;
57732 for (unsigned i = 0, e = VT.getVectorNumElements(); i != e; i += 2) {
57733 SDValue Op0L = Op0->getOperand(i), Op1L = Op1->getOperand(i),
57734 Op0H = Op0->getOperand(i + 1), Op1H = Op1->getOperand(i + 1);
57735 // TODO: Be more tolerant to undefs.
57736 APInt Idx0L, Idx0H, Idx1L, Idx1H;
57737 SDValue Vec0L, Vec0H, Vec1L, Vec1H;
57738 if (!sd_match(Op0L, m_ExtractElt(m_Value(Vec0L), m_ConstInt(Idx0L))) ||
57739 !sd_match(Op0H, m_ExtractElt(m_Value(Vec0H), m_ConstInt(Idx0H))) ||
57740 !sd_match(Op1L, m_ExtractElt(m_Value(Vec1L), m_ConstInt(Idx1L))) ||
57741 !sd_match(Op1H, m_ExtractElt(m_Value(Vec1H), m_ConstInt(Idx1H))))
57742 return SDValue();
57743 // Commutativity of mul allows factors of a product to reorder.
57744 if (Idx0L.getZExtValue() > Idx1L.getZExtValue())
57745 std::swap(Idx0L, Idx1L);
57746 if (Idx0H.getZExtValue() > Idx1H.getZExtValue())
57747 std::swap(Idx0H, Idx1H);
57748 // Commutativity of add allows pairs of factors to reorder.
57749 if (Idx0L.getZExtValue() > Idx0H.getZExtValue()) {
57750 std::swap(Idx0L, Idx0H);
57751 std::swap(Idx1L, Idx1H);
57752 }
57753 if (Idx0L != 2 * i || Idx1L != 2 * i + 1 || Idx0H != 2 * i + 2 ||
57754 Idx1H != 2 * i + 3)
57755 return SDValue();
57756 if (!Mul) {
57757 // First time an extract_elt's source vector is visited. Must be a MUL
57758 // with 2X number of vector elements than the BUILD_VECTOR.
57759 // Both extracts must be from same MUL.
57760 Mul = Vec0L;
57761 if (Mul.getOpcode() != ISD::MUL ||
57762 Mul.getValueType().getVectorNumElements() != 2 * e)
57763 return SDValue();
57764 }
57765 // Check that the extract is from the same MUL previously seen.
57766 if (Mul != Vec0L || Mul != Vec1L || Mul != Vec0H || Mul != Vec1H)
57767 return SDValue();
57768 }
57769
57770 // Check if the Mul source can be safely shrunk.
57772 if (!canReduceVMulWidth(Mul.getNode(), DAG, Mode) ||
57774 return SDValue();
57775
57776 EVT TruncVT = EVT::getVectorVT(*DAG.getContext(), MVT::i16,
57777 VT.getVectorNumElements() * 2);
57778 SDValue N0 = DAG.getNode(ISD::TRUNCATE, DL, TruncVT, Mul.getOperand(0));
57779 SDValue N1 = DAG.getNode(ISD::TRUNCATE, DL, TruncVT, Mul.getOperand(1));
57780
57781 auto PMADDBuilder = [](SelectionDAG &DAG, const SDLoc &DL,
57783 EVT InVT = Ops[0].getValueType();
57784 assert(InVT == Ops[1].getValueType() && "Operands' types mismatch");
57785 EVT ResVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32,
57786 InVT.getVectorNumElements() / 2);
57787 return DAG.getNode(X86ISD::VPMADDWD, DL, ResVT, Ops[0], Ops[1]);
57788 };
57789 SDValue R = SplitOpsAndApply(DAG, Subtarget, DL, VT, {N0, N1}, PMADDBuilder);
57790 if (Accum)
57791 R = DAG.getNode(ISD::ADD, DL, VT, R, Accum);
57792 return R;
57793}
57794
57795// Attempt to turn this pattern into PMADDWD.
57796// (add (mul (sext (build_vector)), (sext (build_vector))),
57797 // (mul (sext (build_vector)), (sext (build_vector))))
57799 const SDLoc &DL, EVT VT,
57800 const X86Subtarget &Subtarget) {
57801 using namespace SDPatternMatch;
57802
57803 if (!Subtarget.hasSSE2())
57804 return SDValue();
57805
57806 if (!VT.isVector() || VT.getVectorElementType() != MVT::i32 ||
57807 VT.getVectorNumElements() < 4 ||
57809 return SDValue();
57810
57811 // All inputs need to be sign extends.
57812 // TODO: Support ZERO_EXTEND from known positive?
57813 SDValue N00, N01, N10, N11;
57814 if (!sd_match(N, m_Add(m_Mul(m_SExt(m_Value(N00)), m_SExt(m_Value(N01))),
57815 m_Mul(m_SExt(m_Value(N10)), m_SExt(m_Value(N11))))))
57816 return SDValue();
57817
57818 // Must be extending from vXi16.
57819 EVT InVT = N00.getValueType();
57820 if (InVT.getVectorElementType() != MVT::i16 || N01.getValueType() != InVT ||
57821 N10.getValueType() != InVT || N11.getValueType() != InVT)
57822 return SDValue();
57823
57824 // All inputs should be build_vectors.
57825 if (N00.getOpcode() != ISD::BUILD_VECTOR ||
57826 N01.getOpcode() != ISD::BUILD_VECTOR ||
57827 N10.getOpcode() != ISD::BUILD_VECTOR ||
57829 return SDValue();
57830
57831 // For each element, we need to ensure we have an odd element from one vector
57832 // multiplied by the odd element of another vector and the even element from
57833 // one of the same vectors being multiplied by the even element from the
57834 // other vector. So we need to make sure that, for each element i, this
57835 // operation is performed:
57836 // A[2 * i] * B[2 * i] + A[2 * i + 1] * B[2 * i + 1]
57837 SDValue In0, In1;
57838 for (unsigned i = 0; i != N00.getNumOperands(); ++i) {
57839 SDValue N00Elt = N00.getOperand(i);
57840 SDValue N01Elt = N01.getOperand(i);
57841 SDValue N10Elt = N10.getOperand(i);
57842 SDValue N11Elt = N11.getOperand(i);
57843 // TODO: Be more tolerant to undefs.
57844 SDValue N00In, N01In, N10In, N11In;
57845 APInt IdxN00, IdxN01, IdxN10, IdxN11;
57846 if (!sd_match(N00Elt, m_ExtractElt(m_Value(N00In), m_ConstInt(IdxN00))) ||
57847 !sd_match(N01Elt, m_ExtractElt(m_Value(N01In), m_ConstInt(IdxN01))) ||
57848 !sd_match(N10Elt, m_ExtractElt(m_Value(N10In), m_ConstInt(IdxN10))) ||
57849 !sd_match(N11Elt, m_ExtractElt(m_Value(N11In), m_ConstInt(IdxN11))))
57850 return SDValue();
57851 // Add is commutative so indices can be reordered.
57852 if (IdxN00.getZExtValue() > IdxN10.getZExtValue()) {
57853 std::swap(IdxN00, IdxN10);
57854 std::swap(IdxN01, IdxN11);
57855 }
57856 // N0 indices must be the even element. N1 indices must be the next odd element.
57857 if (IdxN00 != 2 * i || IdxN10 != 2 * i + 1 || IdxN01 != 2 * i ||
57858 IdxN11 != 2 * i + 1)
57859 return SDValue();
57860
57861 // First time we find an input capture it.
57862 if (!In0) {
57863 In0 = N00In;
57864 In1 = N01In;
57865
57866 // The input vectors must be at least as wide as the output.
57867 // If they are larger than the output, we extract a subvector below.
57868 if (In0.getValueSizeInBits() < VT.getSizeInBits() ||
57869 In1.getValueSizeInBits() < VT.getSizeInBits())
57870 return SDValue();
57871 }
57872 // Mul is commutative so the input vectors can be in any order.
57873 // Canonicalize to make the compares easier.
57874 if (In0 != N00In)
57875 std::swap(N00In, N01In);
57876 if (In0 != N10In)
57877 std::swap(N10In, N11In);
57878 if (In0 != N00In || In1 != N01In || In0 != N10In || In1 != N11In)
57879 return SDValue();
57880 }
57881
57882 auto PMADDBuilder = [](SelectionDAG &DAG, const SDLoc &DL,
57884 EVT OpVT = Ops[0].getValueType();
57885 assert(OpVT.getScalarType() == MVT::i16 &&
57886 "Unexpected scalar element type");
57887 assert(OpVT == Ops[1].getValueType() && "Operands' types mismatch");
57888 EVT ResVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32,
57889 OpVT.getVectorNumElements() / 2);
57890 return DAG.getNode(X86ISD::VPMADDWD, DL, ResVT, Ops[0], Ops[1]);
57891 };
57892
57893 // If the output is narrower than an input, extract the low part of the input
57894 // vector.
57895 EVT OutVT16 = EVT::getVectorVT(*DAG.getContext(), MVT::i16,
57896 VT.getVectorNumElements() * 2);
57897 if (OutVT16.bitsLT(In0.getValueType())) {
57898 In0 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, OutVT16, In0,
57899 DAG.getVectorIdxConstant(0, DL));
57900 }
57901 if (OutVT16.bitsLT(In1.getValueType())) {
57902 In1 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, OutVT16, In1,
57903 DAG.getVectorIdxConstant(0, DL));
57904 }
57905 return SplitOpsAndApply(DAG, Subtarget, DL, VT, { In0, In1 },
57906 PMADDBuilder);
57907}
57908
57909// ADD(VPMADDWD(X,Y),VPMADDWD(Z,W)) -> VPMADDWD(SHUFFLE(X,Z), SHUFFLE(Y,W))
57910 // If the upper element in each pair of both VPMADDWD operands is zero then we
57911 // can merge the operand elements and use the implicit add of VPMADDWD.
57912// TODO: Add support for VPMADDUBSW (which isn't commutable).
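// Illustrative example for VT = v4i32 (so the operands are v8i16): the shuffle
// mask built below is {0,8, 2,10, 4,12, 6,14}, interleaving the low element of
// each pair from the two operands. Because the high element of every pair is
// known zero,
//   vpmaddwd(shuffle(X,Z), shuffle(Y,W))[i] = X[2i]*Y[2i] + Z[2i]*W[2i]
// which matches add(vpmaddwd(X,Y), vpmaddwd(Z,W))[i].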
57914 const SDLoc &DL, EVT VT) {
57915 if (N0.getOpcode() != N1.getOpcode() || N0.getOpcode() != X86ISD::VPMADDWD)
57916 return SDValue();
57917
57918 // TODO: Add 256/512-bit support once VPMADDWD combines with shuffles.
57919 if (VT.getSizeInBits() > 128)
57920 return SDValue();
57921
57922 unsigned NumElts = VT.getVectorNumElements();
57923 MVT OpVT = N0.getOperand(0).getSimpleValueType();
57925 APInt DemandedHiElts = APInt::getSplat(2 * NumElts, APInt(2, 2));
57926
57927 bool Op0HiZero =
57928 DAG.MaskedValueIsZero(N0.getOperand(0), DemandedBits, DemandedHiElts) ||
57929 DAG.MaskedValueIsZero(N0.getOperand(1), DemandedBits, DemandedHiElts);
57930 bool Op1HiZero =
57931 DAG.MaskedValueIsZero(N1.getOperand(0), DemandedBits, DemandedHiElts) ||
57932 DAG.MaskedValueIsZero(N1.getOperand(1), DemandedBits, DemandedHiElts);
57933
57934 // TODO: Check for zero lower elements once we have actual codegen that
57935 // creates them.
57936 if (!Op0HiZero || !Op1HiZero)
57937 return SDValue();
57938
57939 // Create a shuffle mask packing the lower elements from each VPMADDWD.
57940 SmallVector<int> Mask;
57941 for (int i = 0; i != (int)NumElts; ++i) {
57942 Mask.push_back(2 * i);
57943 Mask.push_back(2 * (i + NumElts));
57944 }
57945
57946 SDValue LHS =
57947 DAG.getVectorShuffle(OpVT, DL, N0.getOperand(0), N1.getOperand(0), Mask);
57948 SDValue RHS =
57949 DAG.getVectorShuffle(OpVT, DL, N0.getOperand(1), N1.getOperand(1), Mask);
57950 return DAG.getNode(X86ISD::VPMADDWD, DL, VT, LHS, RHS);
57951}
57952
57953/// CMOV of constants requires materializing constant operands in registers.
57954/// Try to fold those constants into an 'add' instruction to reduce instruction
57955 /// count. We do this with CMOV rather than the generic 'select' because there are
57956/// earlier folds that may be used to turn select-of-constants into logic hacks.
57958 SelectionDAG &DAG,
57959 const X86Subtarget &Subtarget) {
57960 // If an operand is zero, add-of-0 gets simplified away, so that's clearly
57961 // better because we eliminate 1-2 instructions. This transform is still
57962 // an improvement without zero operands because we trade 2 constant moves and
57963 // 1 add for 2 adds (LEAs) as long as the constants can be represented as
57964 // immediate asm operands (they fit in 32 bits).
57965 auto isSuitableCmov = [](SDValue V) {
57966 if (V.getOpcode() != X86ISD::CMOV || !V.hasOneUse())
57967 return false;
57968 if (!isa<ConstantSDNode>(V.getOperand(0)) ||
57969 !isa<ConstantSDNode>(V.getOperand(1)))
57970 return false;
57971 return isNullConstant(V.getOperand(0)) || isNullConstant(V.getOperand(1)) ||
57972 (V.getConstantOperandAPInt(0).isSignedIntN(32) &&
57973 V.getConstantOperandAPInt(1).isSignedIntN(32));
57974 };
57975
57976 // Match an appropriate CMOV as the first operand of the add.
57977 SDValue Cmov = N->getOperand(0);
57978 SDValue OtherOp = N->getOperand(1);
57979 if (!isSuitableCmov(Cmov))
57980 std::swap(Cmov, OtherOp);
57981 if (!isSuitableCmov(Cmov))
57982 return SDValue();
57983
57984 // Don't remove a load folding opportunity for the add. That would neutralize
57985 // any improvements from removing constant materializations.
57986 if (X86::mayFoldLoad(OtherOp, Subtarget))
57987 return SDValue();
57988
57989 EVT VT = N->getValueType(0);
57990 SDValue FalseOp = Cmov.getOperand(0);
57991 SDValue TrueOp = Cmov.getOperand(1);
57992
57993 // We will push the add through the select, but we can potentially do better
57994 // if we know there is another add in the sequence and this is pointer math.
57995 // In that case, we can absorb an add into the trailing memory op and avoid
57996 // a 3-operand LEA which is likely slower than a 2-operand LEA.
57997 // TODO: If target has "slow3OpsLEA", do this even without the trailing memop?
57998 if (OtherOp.getOpcode() == ISD::ADD && OtherOp.hasOneUse() &&
57999 !isa<ConstantSDNode>(OtherOp.getOperand(0)) &&
58000 all_of(N->users(), [&](SDNode *Use) {
58001 auto *MemNode = dyn_cast<MemSDNode>(Use);
58002 return MemNode && MemNode->getBasePtr().getNode() == N;
58003 })) {
58004 // add (cmov C1, C2), add (X, Y) --> add (cmov (add X, C1), (add X, C2)), Y
58005 // TODO: We are arbitrarily choosing op0 as the 1st piece of the sum, but
58006 // it is possible that choosing op1 might be better.
58007 SDValue X = OtherOp.getOperand(0), Y = OtherOp.getOperand(1);
58008 FalseOp = DAG.getNode(ISD::ADD, DL, VT, X, FalseOp);
58009 TrueOp = DAG.getNode(ISD::ADD, DL, VT, X, TrueOp);
58010 Cmov = DAG.getNode(X86ISD::CMOV, DL, VT, FalseOp, TrueOp,
58011 Cmov.getOperand(2), Cmov.getOperand(3));
58012 return DAG.getNode(ISD::ADD, DL, VT, Cmov, Y);
58013 }
58014
58015 // add (cmov C1, C2), OtherOp --> cmov (add OtherOp, C1), (add OtherOp, C2)
58016 FalseOp = DAG.getNode(ISD::ADD, DL, VT, OtherOp, FalseOp);
58017 TrueOp = DAG.getNode(ISD::ADD, DL, VT, OtherOp, TrueOp);
58018 return DAG.getNode(X86ISD::CMOV, DL, VT, FalseOp, TrueOp, Cmov.getOperand(2),
58019 Cmov.getOperand(3));
58020}
58021
58022 // Attempt to turn ADD(MUL(x, y), acc) -> VPMADD52L
58023 // when the upper 12 bits of x, y and MUL(x, y) are known to be 0.
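// Illustrative example: for v8i64 inputs already masked to 26 bits, e.g.
//   add (mul (and X, splat((1 << 26) - 1)), (and Y, splat((1 << 26) - 1))), Acc
// the product fits in 52 bits, so this becomes
//   X86ISD::VPMADD52L (and X, ...), (and Y, ...), Acc
// which adds the low 52 bits of the 104-bit product to Acc.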
58025 EVT VT, const X86Subtarget &Subtarget) {
58026 using namespace SDPatternMatch;
58027 if (!VT.isVector() || VT.getScalarSizeInBits() != 64 ||
58028 (!Subtarget.hasAVXIFMA() && !Subtarget.hasIFMA()))
58029 return SDValue();
58030
58031 // Need AVX-512VL vector length extensions if operating on XMM/YMM registers
58032 if (!Subtarget.hasAVXIFMA() && !Subtarget.hasVLX() &&
58033 VT.getSizeInBits() < 512)
58034 return SDValue();
58035
58036 const auto TotalSize = VT.getSizeInBits();
58037 if (TotalSize < 128 || !isPowerOf2_64(TotalSize))
58038 return SDValue();
58039
58040 SDValue X, Y, Acc;
58041 if (!sd_match(N, m_Add(m_Mul(m_Value(X), m_Value(Y)), m_Value(Acc))))
58042 return SDValue();
58043
58044 KnownBits KnownX = DAG.computeKnownBits(X);
58045 if (KnownX.countMinLeadingZeros() < 12)
58046 return SDValue();
58047 KnownBits KnownY = DAG.computeKnownBits(Y);
58048 if (KnownY.countMinLeadingZeros() < 12)
58049 return SDValue();
58050 KnownBits KnownMul = KnownBits::mul(KnownX, KnownY);
58051 if (KnownMul.countMinLeadingZeros() < 12)
58052 return SDValue();
58053
58054 auto VPMADD52Builder = [](SelectionDAG &G, SDLoc DL,
58055 ArrayRef<SDValue> SubOps) {
58056 EVT SubVT = SubOps[0].getValueType();
58057 assert(SubVT.getScalarSizeInBits() == 64 &&
58058 "Unexpected element size, only supports 64bit size");
58059 return G.getNode(X86ISD::VPMADD52L, DL, SubVT, SubOps[1] /*X*/,
58060 SubOps[2] /*Y*/, SubOps[0] /*Acc*/);
58061 };
58062
58063 return SplitOpsAndApply(DAG, Subtarget, DL, VT, {Acc, X, Y}, VPMADD52Builder,
58064 /*CheckBWI*/ false,
58065 /*AllowAVX512*/ Subtarget.hasIFMA());
58066}
58067
58070 const X86Subtarget &Subtarget) {
58071 using namespace SDPatternMatch;
58072 EVT VT = N->getValueType(0);
58073 SDValue Op0 = N->getOperand(0);
58074 SDValue Op1 = N->getOperand(1);
58075 SDLoc DL(N);
58076
58077 if (SDValue Select = pushAddIntoCmovOfConsts(N, DL, DAG, Subtarget))
58078 return Select;
58079
58080 if (SDValue MAdd = matchPMADDWD(DAG, N, DL, VT, Subtarget))
58081 return MAdd;
58082 if (SDValue MAdd = matchPMADDWD_2(DAG, N, DL, VT, Subtarget))
58083 return MAdd;
58084 if (SDValue MAdd = combineAddOfPMADDWD(DAG, Op0, Op1, DL, VT))
58085 return MAdd;
58086
58087 // Try to synthesize horizontal adds from adds of shuffles.
58088 if (SDValue V = combineToHorizontalAddSub(N, DAG, Subtarget))
58089 return V;
58090
58091 // Canonicalize hidden LEA pattern:
58092 // Fold (add (sub (shl x, c), y), z) -> (sub (add (shl x, c), z), y)
58093 // iff c < 4
58094 if (VT == MVT::i32 || VT == MVT::i64) {
58095 SDValue Y, Z, Shift;
58096 APInt Amt;
58097 if (sd_match(
58099 m_Shl(m_Value(), m_ConstInt(Amt))),
58100 m_Value(Y))),
58101 m_Value(Z))) &&
58102 Amt.ult(4) && !isa<ConstantSDNode>(Z)) {
58103 return DAG.getNode(ISD::SUB, DL, VT,
58104 DAG.getNode(ISD::ADD, DL, VT, Shift, Z), Y);
58105 }
58106 }
58107
58108 SDValue X, Y;
58109
58110 // add(psadbw(X,0),psadbw(Y,0)) -> psadbw(add(X,Y),0)
58111 // iff X and Y won't overflow.
58112 if (sd_match(Op0, m_c_BinOp(X86ISD::PSADBW, m_Value(X), m_Zero())) &&
58114 DAG.willNotOverflowAdd(/*IsSigned=*/false, X, Y)) {
58115 MVT OpVT = X.getSimpleValueType();
58116 SDValue Sum = DAG.getNode(ISD::ADD, DL, OpVT, X, Y);
58117 return DAG.getNode(X86ISD::PSADBW, DL, VT, Sum,
58118 getZeroVector(OpVT, Subtarget, DAG, DL));
58119 }
58120
58121 if (VT.isVector()) {
58122 EVT BoolVT = EVT::getVectorVT(*DAG.getContext(), MVT::i1,
58123 VT.getVectorElementCount());
58124
58125 // If vectors of i1 are legal, turn (add (zext (vXi1 X)), Y) into
58126 // (sub Y, (sext (vXi1 X))).
58127 // FIXME: We have the (sub Y, (zext (vXi1 X))) -> (add (sext (vXi1 X)), Y)
58128 // in generic DAG combine without a legal type check, but adding this there
58129 // caused regressions.
58130 if (DAG.getTargetLoweringInfo().isTypeLegal(BoolVT) &&
58131 sd_match(N, m_Add(m_ZExt(m_AllOf(m_SpecificVT(BoolVT), m_Value(X))),
58132 m_Value(Y)))) {
58133 SDValue SExt = DAG.getNode(ISD::SIGN_EXTEND, DL, VT, X);
58134 return DAG.getNode(ISD::SUB, DL, VT, Y, SExt);
58135 }
58136
58137 // Fold (add X, (srl Y, 7)) -> (sub X, (icmp_sgt 0, Y)) to undo instcombine
58138 // canonicalisation as we don't have good vXi8 shifts.
58139 if (VT.getScalarType() == MVT::i8 &&
58140 sd_match(N, m_Add(m_Value(X), m_Srl(m_Value(Y), m_SpecificInt(7))))) {
58141 SDValue Cmp =
58142 DAG.getSetCC(DL, BoolVT, DAG.getConstant(0, DL, VT), Y, ISD::SETGT);
58143 return DAG.getNode(ISD::SUB, DL, VT, X, DAG.getSExtOrTrunc(Cmp, DL, VT));
58144 }
58145 }
58146
58147 // Peephole for 512-bit VPDPWSSD on non-VLX targets.
58148 // TODO: Should this be part of matchPMADDWD/matchPMADDWD_2?
58149 if (Subtarget.hasVNNI() && Subtarget.useAVX512Regs() && VT == MVT::v16i32) {
58150 SDValue Accum, Lo0, Lo1, Hi0, Hi1;
58151 if (sd_match(N, m_Add(m_Value(Accum),
58154 m_Value(Lo1)),
58156 m_Value(Hi1)))))) {
58157 return DAG.getNode(X86ISD::VPDPWSSD, DL, VT, Accum,
58158 concatSubVectors(Lo0, Hi0, DAG, DL),
58159 concatSubVectors(Lo1, Hi1, DAG, DL));
58160 }
58161 }
58162
58163 // Fold ADD(ADC(Y,0,W),X) -> ADC(X,Y,W)
58164 if (Op0.getOpcode() == X86ISD::ADC && Op0->hasOneUse() &&
58165 X86::isZeroNode(Op0.getOperand(1))) {
58166 assert(!Op0->hasAnyUseOfValue(1) && "Overflow bit in use");
58167 return DAG.getNode(X86ISD::ADC, SDLoc(Op0), Op0->getVTList(), Op1,
58168 Op0.getOperand(0), Op0.getOperand(2));
58169 }
58170
58171 if (SDValue IFMA52 = matchVPMADD52(N, DAG, DL, VT, Subtarget))
58172 return IFMA52;
58173
58174 return combineAddOrSubToADCOrSBB(N, DL, DAG);
58175}
58176
58177// Try to fold (sub Y, cmovns X, -X) -> (add Y, cmovns -X, X) if the cmov
58178// condition comes from the subtract node that produced -X. This matches the
58179// cmov expansion for absolute value. By swapping the operands we convert abs
58180// to nabs.
58181static SDValue combineSubABS(EVT VT, const SDLoc &DL, SDValue N0, SDValue N1,
58182 SelectionDAG &DAG) {
58183 if (N1.getOpcode() != X86ISD::CMOV || !N1.hasOneUse())
58184 return SDValue();
58185
58186 SDValue Cond = N1.getOperand(3);
58187 if (Cond.getOpcode() != X86ISD::SUB)
58188 return SDValue();
58189 assert(Cond.getResNo() == 1 && "Unexpected result number");
58190
58191 SDValue FalseOp = N1.getOperand(0);
58192 SDValue TrueOp = N1.getOperand(1);
58193 X86::CondCode CC = (X86::CondCode)N1.getConstantOperandVal(2);
58194
58195 // ABS condition should come from a negate operation.
58196 if ((CC == X86::COND_S || CC == X86::COND_NS) &&
58197 isNullConstant(Cond.getOperand(0))) {
58198 // Get the X and -X from the negate.
58199 SDValue NegX = Cond.getValue(0);
58200 SDValue X = Cond.getOperand(1);
58201
58202 // Cmov operands should be X and NegX. Order doesn't matter.
58203 if (!(TrueOp == X && FalseOp == NegX) && !(TrueOp == NegX && FalseOp == X))
58204 return SDValue();
58205
58206 // Build a new CMOV with the operands swapped.
58207 SDValue Cmov = DAG.getNode(X86ISD::CMOV, DL, VT, TrueOp, FalseOp,
58208 N1.getOperand(2), Cond);
58209 // Convert sub to add.
58210 return DAG.getNode(ISD::ADD, DL, VT, N0, Cmov);
58211 }
58212
58213 // Handle ABD special case:
58214 // NEG(ABD(X,Y)) -> NEG(CMOV(SUB(X,Y),SUB(Y,X))) -> CMOV(SUB(Y,X),SUB(X,Y)).
58215 // ABD condition should come from a pair of matching subtracts.
58216 if ((CC == X86::COND_L || CC == X86::COND_B) && isNullConstant(N0) &&
58217 (FalseOp == Cond.getValue(0) || TrueOp == Cond.getValue(0)) &&
58218 (TrueOp.getOpcode() == ISD::SUB || TrueOp.getOpcode() == X86ISD::SUB) &&
58219 (FalseOp.getOpcode() == ISD::SUB || FalseOp.getOpcode() == X86ISD::SUB) &&
58220 (TrueOp.getOperand(0) == FalseOp.getOperand(1)) &&
58221 (TrueOp.getOperand(1) == FalseOp.getOperand(0))) {
58222 // Build a new CMOV with the operands swapped.
58223 return DAG.getNode(X86ISD::CMOV, DL, VT, TrueOp, FalseOp, N1.getOperand(2),
58224 Cond);
58225 }
58226
58227 return SDValue();
58228}
58229
58230 static SDValue combineSubSetcc(SDNode *N, SelectionDAG &DAG) {
58231 SDValue Op0 = N->getOperand(0);
58232 SDValue Op1 = N->getOperand(1);
58233
58234 // (sub C (zero_extend (setcc)))
58235 // =>
58236 // (add (zero_extend (setcc inverted)) C-1) if C is a nonzero immediate
58237 // Don't disturb (sub 0 setcc), which is easily done with neg.
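// For example, (sub 5, (zext (setcc cc))) becomes (add (zext (setcc !cc)), 4):
// if cc was true the result is 5 - 1 == 4 + 0, otherwise it is 5 - 0 == 4 + 1.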
58238 EVT VT = N->getValueType(0);
58239 auto *Op0C = dyn_cast<ConstantSDNode>(Op0);
58240 if (Op1.getOpcode() == ISD::ZERO_EXTEND && Op1.hasOneUse() && Op0C &&
58241 !Op0C->isZero() && Op1.getOperand(0).getOpcode() == X86ISD::SETCC &&
58242 Op1.getOperand(0).hasOneUse()) {
58243 SDValue SetCC = Op1.getOperand(0);
58244 X86::CondCode CC = (X86::CondCode)SetCC.getConstantOperandVal(0);
58245 X86::CondCode NewCC = X86::GetOppositeBranchCondition(CC);
58246 APInt NewImm = Op0C->getAPIntValue() - 1;
58247 SDLoc DL(Op1);
58248 SDValue NewSetCC = getSETCC(NewCC, SetCC.getOperand(1), DL, DAG);
58249 NewSetCC = DAG.getNode(ISD::ZERO_EXTEND, DL, VT, NewSetCC);
58250 return DAG.getNode(X86ISD::ADD, DL, DAG.getVTList(VT, VT), NewSetCC,
58251 DAG.getConstant(NewImm, DL, VT));
58252 }
58253
58254 return SDValue();
58255}
58256
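// Simplify the EFLAGS input of a conditional load/store (cload/cstore): when
// the flags come from (sub 0, X) with COND_NE we can usually test X (or a
// simplified form of X) directly instead of materializing the negation.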
58257 static SDValue combineX86CloadCstore(SDNode *N, SelectionDAG &DAG) {
58258 if (N->getConstantOperandVal(3) != X86::COND_NE)
58259 return SDValue();
58260
58261 SDValue Sub = N->getOperand(4);
58262 if (Sub.getOpcode() != X86ISD::SUB)
58263 return SDValue();
58264
58265 SDValue Op1 = Sub.getOperand(1);
58266
58267 if (!X86::isZeroNode(Sub.getOperand(0)))
58268 return SDValue();
58269
58270 SDLoc DL(N);
58271 SmallVector<SDValue, 5> Ops(N->op_values());
58272 if (Op1.getOpcode() == X86ISD::SETCC) {
58273 // res, flags2 = sub 0, (setcc cc, flag)
58274 // cload/cstore ..., cond_ne, flag2
58275 // ->
58276 // cload/cstore cc, flag
58277 Ops[3] = Op1.getOperand(0);
58278 Ops[4] = Op1.getOperand(1);
58279 } else if (Op1.getOpcode() == ISD::AND && Sub.getValue(0).use_empty()) {
58280 SDValue Src = Op1;
58281 SDValue Op10 = Op1.getOperand(0);
58282 if (Op10.getOpcode() == ISD::XOR && isAllOnesConstant(Op10.getOperand(1))) {
58283 // res, flags2 = sub 0, (and (xor X, -1), Y)
58284 // cload/cstore ..., cond_ne, flag2
58285 // ->
58286 // res, flags2 = sub 0, (and X, Y)
58287 // cload/cstore ..., cond_e, flag2
58288 Src = DAG.getNode(ISD::AND, DL, Op1.getValueType(), Op10.getOperand(0),
58289 Op1.getOperand(1));
58290 Ops[3] = DAG.getTargetConstant(X86::COND_E, DL, MVT::i8);
58291 }
58292 // res, flags2 = sub 0, (and X, Y)
58293 // cload/cstore ..., cc, flag2
58294 // ->
58295 // res, flags2 = cmp (and X, Y), 0
58296 // cload/cstore ..., cc, flag2
58297 Ops[4] = DAG.getNode(X86ISD::CMP, DL, MVT::i32, Src, Sub.getOperand(0));
58298 } else {
58299 return SDValue();
58300 }
58301
58302 return DAG.getMemIntrinsicNode(N->getOpcode(), DL, N->getVTList(), Ops,
58303 cast<MemSDNode>(N)->getMemoryVT(),
58304 cast<MemSDNode>(N)->getMemOperand());
58305}
58306
58307 static SDValue combineSub(SDNode *N, SelectionDAG &DAG,
58308                           TargetLowering::DAGCombinerInfo &DCI,
58309 const X86Subtarget &Subtarget) {
58310 EVT VT = N->getValueType(0);
58311 SDValue Op0 = N->getOperand(0);
58312 SDValue Op1 = N->getOperand(1);
58313 SDLoc DL(N);
58314
58315 auto IsNonOpaqueConstant = [&](SDValue Op) {
58316 return DAG.isConstantIntBuildVectorOrConstantInt(Op,
58317 /*AllowOpaques*/ false);
58318 };
58319
58320 // X86 can't encode an immediate LHS of a sub. See if we can push the
58321 // negation into a preceding instruction. If the RHS of the sub is a XOR with
58322 // one use and a constant, invert the immediate, saving one register.
58323 // However, ignore cases where C1 is 0, as those will become a NEG.
58324 // sub(C1, xor(X, C2)) -> add(xor(X, ~C2), C1+1)
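  // This holds because -v == ~v + 1, so C1 - (X ^ C2) == ~(X ^ C2) + C1 + 1,
  // and ~(X ^ C2) == X ^ ~C2.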
58325 if (Op1.getOpcode() == ISD::XOR && IsNonOpaqueConstant(Op0) &&
58326 !isNullConstant(Op0) && IsNonOpaqueConstant(Op1.getOperand(1)) &&
58327 Op1->hasOneUse()) {
58328 SDValue NewXor = DAG.getNode(ISD::XOR, SDLoc(Op1), VT, Op1.getOperand(0),
58329 DAG.getNOT(SDLoc(Op1), Op1.getOperand(1), VT));
58330 SDValue NewAdd =
58331 DAG.getNode(ISD::ADD, DL, VT, Op0, DAG.getConstant(1, DL, VT));
58332 return DAG.getNode(ISD::ADD, DL, VT, NewXor, NewAdd);
58333 }
58334
58335 if (SDValue V = combineSubABS(VT, DL, Op0, Op1, DAG))
58336 return V;
58337
58338 // Try to synthesize horizontal subs from subs of shuffles.
58339 if (SDValue V = combineToHorizontalAddSub(N, DAG, Subtarget))
58340 return V;
58341
58342 // Fold SUB(X,ADC(Y,0,W)) -> SBB(X,Y,W)
58343 if (Op1.getOpcode() == X86ISD::ADC && Op1->hasOneUse() &&
58344 X86::isZeroNode(Op1.getOperand(1))) {
58345 assert(!Op1->hasAnyUseOfValue(1) && "Overflow bit in use");
58346 return DAG.getNode(X86ISD::SBB, SDLoc(Op1), Op1->getVTList(), Op0,
58347 Op1.getOperand(0), Op1.getOperand(2));
58348 }
58349
58350 // Fold SUB(X,SBB(Y,Z,W)) -> SUB(ADC(X,Z,W),Y)
58351 // Don't fold to ADC(0,0,W)/SETCC_CARRY pattern which will prevent more folds.
58352 if (Op1.getOpcode() == X86ISD::SBB && Op1->hasOneUse() &&
58353 !(X86::isZeroNode(Op0) && X86::isZeroNode(Op1.getOperand(1)))) {
58354 assert(!Op1->hasAnyUseOfValue(1) && "Overflow bit in use");
58355 SDValue ADC = DAG.getNode(X86ISD::ADC, SDLoc(Op1), Op1->getVTList(), Op0,
58356 Op1.getOperand(1), Op1.getOperand(2));
58357 return DAG.getNode(ISD::SUB, DL, VT, ADC.getValue(0), Op1.getOperand(0));
58358 }
58359
58360 if (SDValue V = combineXorSubCTLZ(N, DL, DAG, Subtarget))
58361 return V;
58362
58363 if (SDValue V = combineAddOrSubToADCOrSBB(N, DL, DAG))
58364 return V;
58365
58366 return combineSubSetcc(N, DAG);
58367}
58368
58369 static SDValue combineVectorCompare(SDNode *N, SelectionDAG &DAG,
58370 const X86Subtarget &Subtarget) {
58371 unsigned Opcode = N->getOpcode();
58372 assert((Opcode == X86ISD::PCMPEQ || Opcode == X86ISD::PCMPGT) &&
58373 "Unknown PCMP opcode");
58374
58375 SDValue LHS = N->getOperand(0);
58376 SDValue RHS = N->getOperand(1);
58377 MVT VT = N->getSimpleValueType(0);
58378 unsigned EltBits = VT.getScalarSizeInBits();
58379 unsigned NumElts = VT.getVectorNumElements();
58380 SDLoc DL(N);
58381
58382 if (LHS == RHS)
58383 return (Opcode == X86ISD::PCMPEQ) ? DAG.getAllOnesConstant(DL, VT)
58384 : DAG.getConstant(0, DL, VT);
58385
58386 // Constant Folding.
58387 // PCMPEQ(X,UNDEF) -> UNDEF
58388 // PCMPGT(X,UNDEF) -> 0
58389 // PCMPGT(UNDEF,X) -> 0
58390 APInt LHSUndefs, RHSUndefs;
58391 SmallVector<APInt> LHSBits, RHSBits;
58392 if (getTargetConstantBitsFromNode(LHS, EltBits, LHSUndefs, LHSBits) &&
58393 getTargetConstantBitsFromNode(RHS, EltBits, RHSUndefs, RHSBits)) {
58394 APInt Ones = APInt::getAllOnes(EltBits);
58395 APInt Zero = APInt::getZero(EltBits);
58396 SmallVector<APInt> Results(NumElts);
58397 for (unsigned I = 0; I != NumElts; ++I) {
58398 if (Opcode == X86ISD::PCMPEQ) {
58399 Results[I] = (LHSBits[I] == RHSBits[I]) ? Ones : Zero;
58400 } else {
58401 bool AnyUndef = LHSUndefs[I] || RHSUndefs[I];
58402 Results[I] = (!AnyUndef && LHSBits[I].sgt(RHSBits[I])) ? Ones : Zero;
58403 }
58404 }
58405 if (Opcode == X86ISD::PCMPEQ)
58406 return getConstVector(Results, LHSUndefs | RHSUndefs, VT, DAG, DL);
58407 return getConstVector(Results, VT, DAG, DL);
58408 }
58409
58410 return SDValue();
58411}
58412
58413// Helper to determine if we can convert an integer comparison to a float
58414 // comparison by casting the operands.
58415static std::optional<unsigned>
58416CastIntSETCCtoFP(MVT VT, ISD::CondCode CC, unsigned NumSignificantBitsLHS,
58417 unsigned NumSignificantBitsRHS) {
58418 MVT SVT = VT.getScalarType();
58419 assert(SVT == MVT::f32 && "Only tested for float so far");
58420 const fltSemantics &Sem = SVT.getFltSemantics();
58421 assert((CC == ISD::SETEQ || CC == ISD::SETGT) &&
58422 "Only PCMPEQ/PCMPGT currently supported");
58423
58424 // TODO: Handle bitcastable integers.
58425
58426 // For cvt + signed compare we need lhs and rhs to be exactly representable as
58427 // a fp value.
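// For f32 the significand holds 24 bits, so integers with at most 24
// significant bits convert exactly and the FP compare gives the same result
// as the integer compare.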
58428 unsigned FPPrec = APFloat::semanticsPrecision(Sem);
58429 if (FPPrec >= NumSignificantBitsLHS && FPPrec >= NumSignificantBitsRHS)
58430 return ISD::SINT_TO_FP;
58431
58432 return std::nullopt;
58433}
58434
58435/// Helper that combines an array of subvector ops as if they were the operands
58436 /// of an ISD::CONCAT_VECTORS node, but may have come from another source (e.g.
58437/// ISD::INSERT_SUBVECTOR). The ops are assumed to be of the same type.
58438 static SDValue combineConcatVectorOps(const SDLoc &DL, MVT VT,
58439                                       ArrayRef<SDValue> Ops, SelectionDAG &DAG,
58440 const X86Subtarget &Subtarget,
58441 unsigned Depth) {
58442 assert(Subtarget.hasAVX() && "AVX assumed for concat_vectors");
58443 unsigned EltSizeInBits = VT.getScalarSizeInBits();
58444
58445 if (llvm::all_of(Ops, [](SDValue Op) { return Op.isUndef(); }))
58446 return DAG.getUNDEF(VT);
58447
58448 if (llvm::all_of(Ops, [](SDValue Op) {
58449 return Op.isUndef() || ISD::isBuildVectorAllZeros(Op.getNode());
58450 }))
58451 return getZeroVector(VT, Subtarget, DAG, DL);
58452
58453 if (Depth >= SelectionDAG::MaxRecursionDepth)
58454 return SDValue(); // Limit search depth.
58455
58456 SDValue Op0 = Ops[0];
58457 bool IsSplat = llvm::all_equal(Ops);
58458 unsigned NumOps = Ops.size();
58459 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
58460 LLVMContext &Ctx = *DAG.getContext();
58461
58462 // Repeated subvectors.
58463 if (IsSplat &&
58464 (VT.is256BitVector() || (VT.is512BitVector() && Subtarget.hasAVX512()))) {
58465 // If this broadcast is inserted into both halves, use a larger broadcast.
58466 if (Op0.getOpcode() == X86ISD::VBROADCAST)
58467 return DAG.getNode(Op0.getOpcode(), DL, VT, Op0.getOperand(0));
58468
58469 // concat_vectors(movddup(x),movddup(x)) -> broadcast(x)
58470 if (Op0.getOpcode() == X86ISD::MOVDDUP && VT == MVT::v4f64 &&
58471 (Subtarget.hasAVX2() ||
58472 mayFoldLoadIntoBroadcastFromMem(Op0.getOperand(0),
58473 VT.getScalarType(), Subtarget)))
58474 return DAG.getNode(X86ISD::VBROADCAST, DL, VT,
58475 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f64,
58476 Op0.getOperand(0),
58477 DAG.getVectorIdxConstant(0, DL)));
58478
58479 // concat_vectors(scalar_to_vector(x),scalar_to_vector(x)) -> broadcast(x)
58480 if (Op0.getOpcode() == ISD::SCALAR_TO_VECTOR &&
58481 (Subtarget.hasAVX2() ||
58482 (EltSizeInBits >= 32 &&
58483 X86::mayFoldLoad(Op0.getOperand(0), Subtarget))) &&
58484 Op0.getOperand(0).getValueType() == VT.getScalarType())
58485 return DAG.getNode(X86ISD::VBROADCAST, DL, VT, Op0.getOperand(0));
58486
58487 // concat_vectors(extract_subvector(splat(x)),
58488 // extract_subvector(splat(x))) -> splat(x)
58489 // concat_vectors(extract_subvector(subv_broadcast(x)),
58490 // extract_subvector(subv_broadcast(x))) -> subv_broadcast(x)
58491 if (Op0.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
58492 Op0.getOperand(0).getValueType() == VT) {
58493 SDValue SrcVec = Op0.getOperand(0);
58494 if (DAG.isSplatValue(SrcVec, /*AllowUndefs*/ false))
58495 return SrcVec;
58496 if (SrcVec.getOpcode() == X86ISD::SUBV_BROADCAST_LOAD &&
58497 Op0.getValueType() == cast<MemSDNode>(SrcVec)->getMemoryVT())
58498 return SrcVec;
58499 }
58500
58501 // concat_vectors(permq(x),permq(x)) -> permq(concat_vectors(x,x))
58502 if (Op0.getOpcode() == X86ISD::VPERMI && Subtarget.useAVX512Regs() &&
58503 !X86::mayFoldLoad(Op0.getOperand(0), Subtarget))
58504 return DAG.getNode(Op0.getOpcode(), DL, VT,
58505 DAG.getNode(ISD::CONCAT_VECTORS, DL, VT,
58506 Op0.getOperand(0), Op0.getOperand(0)),
58507 Op0.getOperand(1));
58508 }
58509
58510 // TODO: This should go in combineX86ShufflesRecursively eventually.
58511 if (NumOps == 2) {
58512 SDValue Src0 = peekThroughBitcasts(Ops[0]);
58513 SDValue Src1 = peekThroughBitcasts(Ops[1]);
58514 if (Src0.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
58515 Src1.getOpcode() == ISD::EXTRACT_SUBVECTOR) {
58516 EVT SrcVT0 = Src0.getOperand(0).getValueType();
58517 EVT SrcVT1 = Src1.getOperand(0).getValueType();
58518 unsigned NumSrcElts0 = SrcVT0.getVectorNumElements();
58519 unsigned NumSrcElts1 = SrcVT1.getVectorNumElements();
58520 const APInt &SrcIdx0 = Src0.getConstantOperandAPInt(1);
58521 const APInt &SrcIdx1 = Src1.getConstantOperandAPInt(1);
58522 // concat(extract_subvector(v0), extract_subvector(v1)) -> vperm2x128.
58523 // Only concat of subvector high halves which vperm2x128 is best at or if
58524 // it should fold into a subvector broadcast.
58525 if (VT.is256BitVector() && SrcVT0.is256BitVector() &&
58526 SrcVT1.is256BitVector()) {
58527 assert((SrcIdx0 == 0 || SrcIdx0 == (NumSrcElts0 / 2)) &&
58528 (SrcIdx1 == 0 || SrcIdx1 == (NumSrcElts1 / 2)) &&
58529 "Bad subvector index");
58530 if ((SrcIdx0 == (NumSrcElts0 / 2) && SrcIdx1 == (NumSrcElts1 / 2)) ||
58531 (IsSplat && ISD::isNormalLoad(Src0.getOperand(0).getNode()))) {
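        // VPERM2X128 immediate: bits [1:0] select the 128-bit lane feeding the
        // low half of the result (0/1 = halves of the first source) and bits
        // [5:4] the lane feeding the high half (2/3 = halves of the second).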
58532 unsigned Index = 0;
58533 Index |= SrcIdx0 == 0 ? 0x00 : 0x01;
58534 Index |= SrcIdx1 == 0 ? 0x20 : 0x30;
58535 return DAG.getNode(X86ISD::VPERM2X128, DL, VT,
58536 DAG.getBitcast(VT, Src0.getOperand(0)),
58537 DAG.getBitcast(VT, Src1.getOperand(0)),
58538 DAG.getTargetConstant(Index, DL, MVT::i8));
58539 }
58540 }
58541 // Widen extract_subvector
58542 // concat(extract_subvector(x,lo), extract_subvector(x,hi))
58543 // --> extract_subvector(x,lo)
58544 unsigned NumSubElts0 = Src0.getValueType().getVectorNumElements();
58545 if (Src0.getOperand(0) == Src1.getOperand(0) &&
58546 (SrcIdx0 == 0 || SrcIdx0 == (NumSrcElts0 / 2)) &&
58547 SrcIdx1 == (SrcIdx0 + NumSubElts0)) {
58548 return DAG.getBitcast(VT,
58549 extractSubVector(Src0.getOperand(0),
58550 Src0.getConstantOperandVal(1),
58551 DAG, DL, VT.getSizeInBits()));
58552 }
58553 }
58554 }
58555
58556 // Repeated opcode.
58557 // TODO - combineX86ShufflesRecursively should handle shuffle concatenation
58558 // but it currently struggles with different vector widths.
58559 if (llvm::all_of(Ops, [Op0](SDValue Op) {
58560 return Op.getOpcode() == Op0.getOpcode() && Op.hasOneUse();
58561 })) {
58562 auto ConcatSubOperand = [&](EVT VT, ArrayRef<SDValue> SubOps, unsigned I) {
58563 SmallVector<SDValue> Subs;
58564 for (SDValue SubOp : SubOps)
58565 Subs.push_back(SubOp.getOperand(I));
58566 // Attempt to peek through bitcasts and concat the original subvectors.
58567 EVT SubVT = peekThroughBitcasts(Subs[0]).getValueType();
58568 if (SubVT.isSimple() && SubVT.isVector()) {
58569 MVT ConcatVT =
58570 MVT::getVectorVT(SubVT.getSimpleVT().getScalarType(),
58571 SubVT.getVectorElementCount() * Subs.size());
58572 for (SDValue &Sub : Subs)
58573 Sub = DAG.getBitcast(SubVT, Sub);
58574 if (SDValue ConcatSrc = combineConcatVectorOps(DL, ConcatVT, Subs, DAG,
58575 Subtarget, Depth + 1))
58576 return DAG.getBitcast(VT, ConcatSrc);
58577 return DAG.getBitcast(
58578 VT, DAG.getNode(ISD::CONCAT_VECTORS, DL, ConcatVT, Subs));
58579 }
58580 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Subs);
58581 };
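    // Returns true when rebuilding full-width operand 'Op' from the subvector
    // operands is free: they all peek through to the same load, they are all
    // constant build vectors, or they are in-order extract_subvectors that
    // together cover a single full-width source vector.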
58582 auto IsConcatFree = [](MVT VT, ArrayRef<SDValue> SubOps, unsigned Op) {
58583 bool AllConstants = true;
58584 bool AllSubs = true;
58585 unsigned VecSize = VT.getSizeInBits();
58586 SDValue BC0 = peekThroughBitcasts(SubOps[0].getOperand(Op));
58587 if (isa<LoadSDNode>(BC0) && all_of(SubOps, [&](SDValue SubOp) {
58588 return BC0 == peekThroughBitcasts(SubOp.getOperand(Op));
58589 }))
58590 return true;
58591 for (unsigned I = 0, E = SubOps.size(); I != E; ++I) {
58592 SDValue BC = peekThroughBitcasts(SubOps[I].getOperand(Op));
58593 unsigned SubSize = BC.getValueSizeInBits();
58594 unsigned EltSize = BC.getScalarValueSizeInBits();
58595 AllConstants &= ISD::isBuildVectorOfConstantSDNodes(BC.getNode()) ||
58596 ISD::isBuildVectorOfConstantFPSDNodes(BC.getNode());
58597 AllSubs &= BC.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
58598 BC.getOperand(0).getValueSizeInBits() == VecSize &&
58599 (BC.getConstantOperandVal(1) * EltSize) == (I * SubSize);
58600 }
58601 return AllConstants || AllSubs;
58602 };
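    // Attempt to concatenate operand 'I' of every subvector op. Constant
    // operands are concatenated directly; otherwise a value is only returned
    // if the recursive combine can build the wide operand profitably.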
58603 auto CombineSubOperand = [&](MVT VT, ArrayRef<SDValue> SubOps, unsigned I) {
58604 bool AllConstants = true;
58605 SmallVector<SDValue> Subs;
58606 for (SDValue SubOp : SubOps) {
58607 SDValue BC = peekThroughBitcasts(SubOp.getOperand(I));
58608 AllConstants &= ISD::isBuildVectorOfConstantSDNodes(BC.getNode()) ||
58609 ISD::isBuildVectorOfConstantFPSDNodes(BC.getNode());
58610 Subs.push_back(SubOp.getOperand(I));
58611 }
58612 if (AllConstants)
58613 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Subs);
58614 return combineConcatVectorOps(DL, VT, Subs, DAG, Subtarget, Depth + 1);
58615 };
58616
58617 unsigned Opcode = Op0.getOpcode();
58618 switch (Opcode) {
58619 case ISD::BITCAST: {
58620 // TODO: Support AVX1/AVX2 bitcasts.
58621 SmallVector<SDValue> SubOps;
58622 for (SDValue SubOp : Ops)
58623 SubOps.push_back(peekThroughBitcasts(SubOp.getOperand(0)));
58624 EVT InnerVT = SubOps[0].getValueType();
58625 unsigned InnerSizeInBits = InnerVT.getScalarSizeInBits();
58626 if (!IsSplat && InnerVT.isSimple() && InnerVT.isVector() &&
58627 (Subtarget.hasBWI() ||
58628 (EltSizeInBits >= 32 && InnerSizeInBits >= 32)) &&
58629 ((VT.is256BitVector() && Subtarget.hasVLX()) ||
58630 (VT.is512BitVector() && Subtarget.useAVX512Regs())) &&
58631 llvm::all_of(SubOps, [InnerVT](SDValue Op) {
58632 return Op.getValueType() == InnerVT;
58633 })) {
58634 MVT ConcatSVT = InnerVT.getScalarType().getSimpleVT();
58635 MVT ConcatVT = MVT::getVectorVT(
58636 ConcatSVT, VT.getSizeInBits() / ConcatSVT.getSizeInBits());
58637 if (SDValue ConcatSrc = combineConcatVectorOps(
58638 DL, ConcatVT, SubOps, DAG, Subtarget, Depth + 1))
58639 return DAG.getBitcast(VT, ConcatSrc);
58640 }
58641 break;
58642 }
58643 case ISD::VECTOR_SHUFFLE: {
58644 // TODO: Generalize NumOps support.
58645 if (!IsSplat && NumOps == 2 &&
58646 ((VT.is256BitVector() &&
58647 (EltSizeInBits >= 32 || Subtarget.hasInt256())) ||
58648 (VT.is512BitVector() && Subtarget.useAVX512Regs() &&
58649 (EltSizeInBits >= 32 || Subtarget.useBWIRegs())))) {
58650 SDValue Concat0 = CombineSubOperand(VT, Ops, 0);
58651 SDValue Concat1 = CombineSubOperand(VT, Ops, 1);
58652 if (Concat0 || Concat1 ||
58653 (Ops[0].getOperand(0) == Ops[1].getOperand(0) &&
58654 Ops[0].getOperand(1) == Ops[1].getOperand(1) &&
58655 Subtarget.hasVBMI())) {
58656 int NumSubElts = Op0.getValueType().getVectorNumElements();
58657 SmallVector<int> NewMask;
58658 for (int M : cast<ShuffleVectorSDNode>(Ops[0])->getMask()) {
58659 M = M >= NumSubElts ? M + NumSubElts : M;
58660 NewMask.push_back(M);
58661 }
58662 for (int M : cast<ShuffleVectorSDNode>(Ops[1])->getMask()) {
58663 if (0 <= M)
58664 M = (M >= NumSubElts ? M + NumSubElts : M) + NumSubElts;
58665 NewMask.push_back(M);
58666 }
58667 Concat0 = Concat0 ? Concat0 : ConcatSubOperand(VT, Ops, 0);
58668 Concat1 = Concat1 ? Concat1 : ConcatSubOperand(VT, Ops, 1);
58669 return DAG.getVectorShuffle(VT, DL, Concat0, Concat1, NewMask);
58670 }
58671 }
58672 break;
58673 }
58674 case X86ISD::VBROADCAST: {
58675 // TODO: 512-bit VBROADCAST concatenation.
58676 if (!IsSplat && llvm::all_of(Ops, [](SDValue Op) {
58677 return Op.getOperand(0).getValueType().is128BitVector();
58678 })) {
58679 if (VT == MVT::v4f64 || VT == MVT::v4i64)
58680 return DAG.getNode(X86ISD::UNPCKL, DL, VT,
58681 ConcatSubOperand(VT, Ops, 0),
58682 ConcatSubOperand(VT, Ops, 0));
58683 // TODO: Add pseudo v8i32 PSHUFD handling to AVX1Only targets.
58684 if (VT == MVT::v8f32 || (VT == MVT::v8i32 && Subtarget.hasInt256()))
58685 return DAG.getNode(VT == MVT::v8f32 ? X86ISD::VPERMILPI
58686 : X86ISD::PSHUFD,
58687 DL, VT, ConcatSubOperand(VT, Ops, 0),
58688 getV4X86ShuffleImm8ForMask({0, 0, 0, 0}, DL, DAG));
58689 }
58690 break;
58691 }
58692 case X86ISD::MOVDDUP:
58693 case X86ISD::MOVSHDUP:
58694 case X86ISD::MOVSLDUP: {
58695 if (!IsSplat && (VT.is256BitVector() ||
58696 (VT.is512BitVector() && Subtarget.useAVX512Regs())))
58697 return DAG.getNode(Opcode, DL, VT, ConcatSubOperand(VT, Ops, 0));
58698 break;
58699 }
58700 case X86ISD::SHUFP: {
58701 if (!IsSplat &&
58702 (VT == MVT::v8f32 ||
58703 (VT == MVT::v16f32 && Subtarget.useAVX512Regs())) &&
58704 llvm::all_of(Ops, [Op0](SDValue Op) {
58705 return Op.getOperand(2) == Op0.getOperand(2);
58706 })) {
58707 SDValue Concat0 = CombineSubOperand(VT, Ops, 0);
58708 SDValue Concat1 = CombineSubOperand(VT, Ops, 1);
58709 if (Concat0 || Concat1)
58710 return DAG.getNode(Opcode, DL, VT,
58711 Concat0 ? Concat0 : ConcatSubOperand(VT, Ops, 0),
58712 Concat1 ? Concat1 : ConcatSubOperand(VT, Ops, 1),
58713 Op0.getOperand(2));
58714 }
58715 break;
58716 }
58717 case X86ISD::UNPCKH:
58718 case X86ISD::UNPCKL: {
58719 // TODO: UNPCK should use CombineSubOperand
58720 // Don't concatenate build_vector patterns.
58721 if (!IsSplat &&
58722 ((VT.is256BitVector() &&
58723 (EltSizeInBits >= 32 || Subtarget.hasInt256())) ||
58724 (VT.is512BitVector() && Subtarget.useAVX512Regs() &&
58725 (EltSizeInBits >= 32 || Subtarget.useBWIRegs()))) &&
58726 none_of(Ops, [](SDValue Op) {
58727 return peekThroughBitcasts(Op.getOperand(0)).getOpcode() ==
58728 ISD::BUILD_VECTOR ||
58729 peekThroughBitcasts(Op.getOperand(1)).getOpcode() ==
58730 ISD::BUILD_VECTOR;
58731 })) {
58732 SDValue Concat0 = CombineSubOperand(VT, Ops, 0);
58733 SDValue Concat1 = CombineSubOperand(VT, Ops, 1);
58734 if (Concat0 || Concat1 ||
58735 (Subtarget.hasInt256() && EltSizeInBits == 64))
58736 return DAG.getNode(Opcode, DL, VT,
58737 Concat0 ? Concat0 : ConcatSubOperand(VT, Ops, 0),
58738 Concat1 ? Concat1 : ConcatSubOperand(VT, Ops, 1));
58739 }
58740 break;
58741 }
58742 case X86ISD::PSHUFHW:
58743 case X86ISD::PSHUFLW:
58744 case X86ISD::PSHUFD:
58745 if (!IsSplat &&
58746 ((VT.is256BitVector() && Subtarget.hasInt256()) ||
58747 (VT.is512BitVector() && Subtarget.useAVX512Regs() &&
58748 (EltSizeInBits >= 32 || Subtarget.useBWIRegs()))) &&
58749 llvm::all_of(Ops, [Op0](SDValue Op) {
58750 return Op.getOperand(1) == Op0.getOperand(1);
58751 })) {
58752 return DAG.getNode(Opcode, DL, VT, ConcatSubOperand(VT, Ops, 0),
58753 Op0.getOperand(1));
58754 }
58755 [[fallthrough]];
58756 case X86ISD::VPERMILPI:
58757 if (!IsSplat && EltSizeInBits == 32 &&
58758 (VT.is256BitVector() ||
58759 (VT.is512BitVector() && Subtarget.useAVX512Regs())) &&
58760 all_of(Ops, [&Op0](SDValue Op) {
58761 return Op0.getOperand(1) == Op.getOperand(1);
58762 })) {
58763 MVT FloatVT = VT.changeVectorElementType(MVT::f32);
58764 SDValue Res = DAG.getBitcast(FloatVT, ConcatSubOperand(VT, Ops, 0));
58765 Res =
58766 DAG.getNode(X86ISD::VPERMILPI, DL, FloatVT, Res, Op0.getOperand(1));
58767 return DAG.getBitcast(VT, Res);
58768 }
58769 break;
58770 case X86ISD::VPERMILPV:
58771 if (!IsSplat && (VT.is256BitVector() ||
58772 (VT.is512BitVector() && Subtarget.useAVX512Regs()))) {
58773 SDValue Concat0 = CombineSubOperand(VT, Ops, 0);
58774 SDValue Concat1 = CombineSubOperand(VT, Ops, 1);
58775 if (Concat0 || Concat1)
58776 return DAG.getNode(Opcode, DL, VT,
58777 Concat0 ? Concat0 : ConcatSubOperand(VT, Ops, 0),
58778 Concat1 ? Concat1 : ConcatSubOperand(VT, Ops, 1));
58779 }
58780 break;
58781 case X86ISD::PSHUFB:
58782 case X86ISD::PSADBW:
58783 case X86ISD::VPMADDUBSW:
58784 case X86ISD::VPMADDWD:
58785 if (!IsSplat && ((VT.is256BitVector() && Subtarget.hasInt256()) ||
58786 (VT.is512BitVector() && Subtarget.useBWIRegs()))) {
58787 MVT SrcVT = Op0.getOperand(0).getSimpleValueType();
58788 SrcVT = MVT::getVectorVT(SrcVT.getScalarType(),
58789 NumOps * SrcVT.getVectorNumElements());
58790 SDValue Concat0 = CombineSubOperand(SrcVT, Ops, 0);
58791 SDValue Concat1 = CombineSubOperand(SrcVT, Ops, 1);
58792 if (Concat0 || Concat1)
58793 return DAG.getNode(
58794 Opcode, DL, VT,
58795 Concat0 ? Concat0 : ConcatSubOperand(SrcVT, Ops, 0),
58796 Concat1 ? Concat1 : ConcatSubOperand(SrcVT, Ops, 1));
58797 }
58798 break;
58799 case X86ISD::VPERMV:
58800 // TODO: Handle 256-bit and NumOps == 4 cases.
58801 if (!IsSplat && NumOps == 2 &&
58802 (VT.is512BitVector() && Subtarget.useAVX512Regs())) {
58803 MVT OpVT = Op0.getSimpleValueType();
58804 int NumSrcElts = OpVT.getVectorNumElements();
58805 SmallVector<int, 64> ConcatMask;
58806 for (unsigned i = 0; i != NumOps; ++i) {
58807 SmallVector<int, 64> SubMask;
58808 SmallVector<SDValue, 2> SubOps;
58809 if (!getTargetShuffleMask(Ops[i], false, SubOps, SubMask))
58810 break;
58811 for (int M : SubMask) {
58812 if (0 <= M)
58813 M += i * NumSrcElts;
58814 ConcatMask.push_back(M);
58815 }
58816 }
58817 if (ConcatMask.size() == (NumOps * NumSrcElts))
58818 return lowerShuffleWithPERMV(DL, VT, ConcatMask,
58819 ConcatSubOperand(VT, Ops, 1),
58820 DAG.getUNDEF(VT), Subtarget, DAG);
58821 }
58822 break;
58823 case X86ISD::VPERMV3:
58824 // TODO: Handle 256-bit and NumOps == 4 cases.
58825 if (!IsSplat && NumOps == 2 &&
58826 (VT.is512BitVector() && Subtarget.useAVX512Regs())) {
58827 MVT OpVT = Op0.getSimpleValueType();
58828 int NumSrcElts = OpVT.getVectorNumElements();
58829 SmallVector<int, 64> ConcatMask;
58830 for (unsigned i = 0; i != NumOps; ++i) {
58831 SmallVector<int, 64> SubMask;
58832 SmallVector<SDValue, 2> SubOps;
58833 if (!getTargetShuffleMask(Ops[i], false, SubOps, SubMask))
58834 break;
58835 for (int M : SubMask) {
58836 if (0 <= M) {
58837 int Src = M < NumSrcElts ? 0 : 2;
58838 M += M < NumSrcElts ? 0 : NumSrcElts;
58839
58840 // Reference the lowest sub if the upper sub is the same.
58841 if (Ops[0].getOperand(Src) != Ops[i].getOperand(Src))
58842 M += i * NumSrcElts;
58843 }
58844 ConcatMask.push_back(M);
58845 }
58846 }
58847 if (ConcatMask.size() == (NumOps * NumSrcElts)) {
58848 SDValue Concat0 = CombineSubOperand(VT, Ops, 0);
58849 SDValue Concat1 = CombineSubOperand(VT, Ops, 2);
58850 if (Concat0 || Concat1)
58851 return lowerShuffleWithPERMV(
58852 DL, VT, ConcatMask,
58853 Concat0 ? Concat0 : ConcatSubOperand(VT, Ops, 0),
58854 Concat1 ? Concat1 : ConcatSubOperand(VT, Ops, 2), Subtarget,
58855 DAG);
58856 }
58857 }
58858 break;
58859 case X86ISD::VPERM2X128: {
58860 if (!IsSplat && VT.is512BitVector() && Subtarget.useAVX512Regs()) {
58861 assert(NumOps == 2 && "Bad concat_vectors operands");
58862 unsigned Imm0 = Ops[0].getConstantOperandVal(2);
58863 unsigned Imm1 = Ops[1].getConstantOperandVal(2);
58864 // TODO: Handle zero'd subvectors.
58865 if ((Imm0 & 0x88) == 0 && (Imm1 & 0x88) == 0) {
58866 int Mask[4] = {(int)(Imm0 & 0x03), (int)((Imm0 >> 4) & 0x3), (int)(Imm1 & 0x03),
58867 (int)((Imm1 >> 4) & 0x3)};
58868 MVT ShuffleVT = VT.isFloatingPoint() ? MVT::v8f64 : MVT::v8i64;
58869 SDValue LHS = concatSubVectors(Ops[0].getOperand(0),
58870 Ops[0].getOperand(1), DAG, DL);
58871 SDValue RHS = concatSubVectors(Ops[1].getOperand(0),
58872 Ops[1].getOperand(1), DAG, DL);
58873 SDValue Res = DAG.getNode(X86ISD::SHUF128, DL, ShuffleVT,
58874 DAG.getBitcast(ShuffleVT, LHS),
58875 DAG.getBitcast(ShuffleVT, RHS),
58876 getV4X86ShuffleImm8ForMask(Mask, DL, DAG));
58877 return DAG.getBitcast(VT, Res);
58878 }
58879 }
58880 break;
58881 }
58882 case X86ISD::SHUF128: {
58883 if (!IsSplat && NumOps == 2 && VT.is512BitVector()) {
58884 unsigned Imm0 = Ops[0].getConstantOperandVal(2);
58885 unsigned Imm1 = Ops[1].getConstantOperandVal(2);
58886 unsigned Imm = ((Imm0 & 1) << 0) | ((Imm0 & 2) << 1) | 0x08 |
58887 ((Imm1 & 1) << 4) | ((Imm1 & 2) << 5) | 0x80;
58888 SDValue LHS = concatSubVectors(Ops[0].getOperand(0),
58889 Ops[0].getOperand(1), DAG, DL);
58890 SDValue RHS = concatSubVectors(Ops[1].getOperand(0),
58891 Ops[1].getOperand(1), DAG, DL);
58892 return DAG.getNode(X86ISD::SHUF128, DL, VT, LHS, RHS,
58893 DAG.getTargetConstant(Imm, DL, MVT::i8));
58894 }
58895 break;
58896 }
58897 case ISD::TRUNCATE:
58898 if (!IsSplat && NumOps == 2 && VT.is256BitVector()) {
58899 EVT SrcVT = Ops[0].getOperand(0).getValueType();
58900 if (SrcVT.is256BitVector() && SrcVT.isSimple() &&
58901 SrcVT == Ops[1].getOperand(0).getValueType() &&
58902 Subtarget.useAVX512Regs() &&
58903 Subtarget.getPreferVectorWidth() >= 512 &&
58904 (SrcVT.getScalarSizeInBits() > 16 || Subtarget.useBWIRegs())) {
58905 EVT NewSrcVT = SrcVT.getDoubleNumVectorElementsVT(Ctx);
58906 return DAG.getNode(ISD::TRUNCATE, DL, VT,
58907 ConcatSubOperand(NewSrcVT, Ops, 0));
58908 }
58909 }
58910 break;
58911 case ISD::ANY_EXTEND:
58912 case ISD::SIGN_EXTEND:
58913 case ISD::ZERO_EXTEND:
58914 // TODO: Handle ANY_EXTEND combos with SIGN/ZERO_EXTEND.
58915 if (!IsSplat && NumOps == 2 &&
58916 ((VT.is256BitVector() && Subtarget.hasInt256()) ||
58917 (VT.is512BitVector() && Subtarget.useAVX512Regs() &&
58918 (EltSizeInBits >= 32 || Subtarget.useBWIRegs())))) {
58919 EVT SrcVT = Ops[0].getOperand(0).getValueType();
58920 if (SrcVT.isSimple() && SrcVT.is128BitVector() &&
58921 SrcVT == Ops[1].getOperand(0).getValueType()) {
58922 EVT NewSrcVT = SrcVT.getDoubleNumVectorElementsVT(Ctx);
58923 return DAG.getNode(Opcode, DL, VT,
58924 ConcatSubOperand(NewSrcVT, Ops, 0));
58925 }
58926 }
58927 break;
58928 case ISD::ANY_EXTEND_VECTOR_INREG:
58929 case ISD::SIGN_EXTEND_VECTOR_INREG:
58930 case ISD::ZERO_EXTEND_VECTOR_INREG: {
58931 // TODO: Handle ANY_EXTEND_INREG combos with SIGN/ZERO_EXTEND_INREG.
58932 if (!IsSplat && NumOps == 2 &&
58933 ((VT.is256BitVector() && Subtarget.hasInt256()) ||
58934 (VT.is512BitVector() && Subtarget.useAVX512Regs() &&
58935 (EltSizeInBits >= 32 || Subtarget.useBWIRegs()))) &&
58937 Op0.getOperand(0).getValueType() ==
58938 Ops[0].getOperand(0).getValueType()) {
58939 EVT SrcVT = Op0.getOperand(0).getValueType();
58940 unsigned NumElts = VT.getVectorNumElements();
58941 MVT UnpackSVT =
58942 MVT::getIntegerVT(SrcVT.getScalarSizeInBits() * (NumElts / 2));
58943 MVT UnpackVT =
58944 MVT::getVectorVT(UnpackSVT, 128 / UnpackSVT.getScalarSizeInBits());
58945 SDValue Unpack =
58946 DAG.getNode(X86ISD::UNPCKL, DL, UnpackVT,
58947 DAG.getBitcast(UnpackVT, Ops[0].getOperand(0)),
58948 DAG.getBitcast(UnpackVT, Ops[1].getOperand(0)));
58949 return getEXTEND_VECTOR_INREG(Opcode, DL, VT,
58950 DAG.getBitcast(SrcVT, Unpack), DAG);
58951 }
58952 break;
58953 }
58954 case X86ISD::VSHLI:
58955 case X86ISD::VSRLI:
58956 // Special case: SHL/SRL AVX1 V4i64 by 32-bits can lower as a shuffle.
58957 if (VT == MVT::v4i64 && !Subtarget.hasInt256() &&
58958 llvm::all_of(Ops, [](SDValue Op) {
58959 return Op.getConstantOperandAPInt(1) == 32;
58960 })) {
58961 if (SDValue Res = CombineSubOperand(VT, Ops, 0)) {
58962 SDValue Zero = getZeroVector(MVT::v8i32, Subtarget, DAG, DL);
58963 Res = DAG.getBitcast(MVT::v8i32, Res);
58964 if (Opcode == X86ISD::VSHLI) {
58965 Res = DAG.getVectorShuffle(MVT::v8i32, DL, Res, Zero,
58966 {8, 0, 8, 2, 8, 4, 8, 6});
58967 } else {
58968 Res = DAG.getVectorShuffle(MVT::v8i32, DL, Res, Zero,
58969 {1, 8, 3, 8, 5, 8, 7, 8});
58970 }
58971 return DAG.getBitcast(VT, Res);
58972 }
58973 }
58974 [[fallthrough]];
58975 case X86ISD::VSRAI:
58976 case X86ISD::VSHL:
58977 case X86ISD::VSRL:
58978 case X86ISD::VSRA:
58979 if (((VT.is256BitVector() && Subtarget.hasInt256()) ||
58980 (VT.is512BitVector() && Subtarget.useAVX512Regs() &&
58981 (EltSizeInBits >= 32 || Subtarget.useBWIRegs()))) &&
58982 llvm::all_of(Ops, [Op0](SDValue Op) {
58983 return Op0.getOperand(1) == Op.getOperand(1);
58984 })) {
58985 return DAG.getNode(Opcode, DL, VT, ConcatSubOperand(VT, Ops, 0),
58986 Op0.getOperand(1));
58987 }
58988 break;
58989 case X86ISD::VPERMI:
58990 case X86ISD::VROTLI:
58991 case X86ISD::VROTRI:
58992 if (!IsSplat &&
58993 ((VT.is256BitVector() && Subtarget.hasVLX()) ||
58994 (VT.is512BitVector() && Subtarget.useAVX512Regs())) &&
58995 llvm::all_of(Ops, [Op0](SDValue Op) {
58996 return Op0.getOperand(1) == Op.getOperand(1);
58997 })) {
58998 assert(!(Opcode == X86ISD::VPERMI &&
58999 Op0.getValueType().is128BitVector()) &&
59000 "Illegal 128-bit X86ISD::VPERMI nodes");
59001 return DAG.getNode(Opcode, DL, VT, ConcatSubOperand(VT, Ops, 0),
59002 Op0.getOperand(1));
59003 }
59004 break;
59005 case ISD::AND:
59006 case ISD::OR:
59007 case ISD::XOR:
59008 case X86ISD::ANDNP:
59009 // TODO: AVX512 targets should only use CombineSubOperand like AVX1/2.
59010 if (!IsSplat && (VT.is256BitVector() ||
59011 (VT.is512BitVector() && Subtarget.useAVX512Regs()))) {
59012 // Don't concatenate root AVX1 NOT patterns.
59013 // TODO: Allow NOT folding if Concat0 succeeds.
59014 if (Opcode == ISD::XOR && Depth == 0 && !Subtarget.hasInt256() &&
59015 llvm::all_of(Ops, [](SDValue X) {
59016 return ISD::isBuildVectorAllOnes(X.getOperand(1).getNode());
59017 }))
59018 break;
59019 SDValue Concat0 = CombineSubOperand(VT, Ops, 0);
59020 SDValue Concat1 = CombineSubOperand(VT, Ops, 1);
59021 if (Concat0 || Concat1 || Subtarget.useAVX512Regs())
59022 return DAG.getNode(Opcode, DL, VT,
59023 Concat0 ? Concat0 : ConcatSubOperand(VT, Ops, 0),
59024 Concat1 ? Concat1 : ConcatSubOperand(VT, Ops, 1));
59025 }
59026 break;
59027 case X86ISD::PCMPEQ:
59028 case X86ISD::PCMPGT:
59029 // TODO: 512-bit PCMPEQ/PCMPGT -> VPCMP+VPMOVM2 handling.
59030 if (!IsSplat && VT.is256BitVector() && Subtarget.hasInt256()) {
59031 SDValue Concat0 = CombineSubOperand(VT, Ops, 0);
59032 SDValue Concat1 = CombineSubOperand(VT, Ops, 1);
59033 if (Concat0 || Concat1)
59034 return DAG.getNode(Opcode, DL, VT,
59035 Concat0 ? Concat0 : ConcatSubOperand(VT, Ops, 0),
59036 Concat1 ? Concat1 : ConcatSubOperand(VT, Ops, 1));
59037 break;
59038 }
59039
59040 if (!IsSplat && VT == MVT::v8i32) {
59041 // Without AVX2, see if we can cast the values to v8f32 and use fcmp.
59042 // TODO: Handle v4f64 as well?
59043 unsigned MaxSigBitsLHS = 0, MaxSigBitsRHS = 0;
59044 for (unsigned I = 0; I != NumOps; ++I) {
59045 MaxSigBitsLHS =
59046 std::max(MaxSigBitsLHS,
59047 DAG.ComputeMaxSignificantBits(Ops[I].getOperand(0)));
59048 MaxSigBitsRHS =
59049 std::max(MaxSigBitsRHS,
59050 DAG.ComputeMaxSignificantBits(Ops[I].getOperand(1)));
59051 if (MaxSigBitsLHS == EltSizeInBits && MaxSigBitsRHS == EltSizeInBits)
59052 break;
59053 }
59054
59055 ISD::CondCode ICC =
59056 Opcode == X86ISD::PCMPEQ ? ISD::SETEQ : ISD::SETGT;
59057 ISD::CondCode FCC =
59058 Opcode == X86ISD::PCMPEQ ? ISD::SETOEQ : ISD::SETOGT;
59059
59060 MVT FpSVT = MVT::getFloatingPointVT(EltSizeInBits);
59061 MVT FpVT = VT.changeVectorElementType(FpSVT);
59062
59063 if (std::optional<unsigned> CastOpc =
59064 CastIntSETCCtoFP(FpVT, ICC, MaxSigBitsLHS, MaxSigBitsRHS)) {
59065 SDValue LHS = CombineSubOperand(VT, Ops, 0);
59066 SDValue RHS = CombineSubOperand(VT, Ops, 1);
59067 LHS = LHS ? LHS : ConcatSubOperand(VT, Ops, 0);
59068 RHS = RHS ? RHS : ConcatSubOperand(VT, Ops, 1);
59069 LHS = DAG.getNode(*CastOpc, DL, FpVT, LHS);
59070 RHS = DAG.getNode(*CastOpc, DL, FpVT, RHS);
59071
59072 bool IsAlwaysSignaling;
59073 unsigned FSETCC =
59074 translateX86FSETCC(FCC, LHS, RHS, IsAlwaysSignaling);
59075 return DAG.getBitcast(
59076 VT, DAG.getNode(X86ISD::CMPP, DL, FpVT, LHS, RHS,
59077 DAG.getTargetConstant(FSETCC, DL, MVT::i8)));
59078 }
59079 }
59080 break;
59081 case ISD::CTPOP:
59082 case ISD::CTTZ:
59083 case ISD::CTLZ:
59084 case ISD::CTTZ_ZERO_UNDEF:
59085 case ISD::CTLZ_ZERO_UNDEF:
59086 if (!IsSplat && ((VT.is256BitVector() && Subtarget.hasInt256()) ||
59087 (VT.is512BitVector() && Subtarget.useBWIRegs()))) {
59088 return DAG.getNode(Opcode, DL, VT, ConcatSubOperand(VT, Ops, 0));
59089 }
59090 break;
59091 case X86ISD::GF2P8AFFINEQB:
59092 // TODO: GF2P8AFFINEQB should use CombineSubOperand.
59093 if (!IsSplat &&
59094 (VT.is256BitVector() ||
59095 (VT.is512BitVector() && Subtarget.useAVX512Regs())) &&
59096 llvm::all_of(Ops, [Op0](SDValue Op) {
59097 return Op0.getOperand(2) == Op.getOperand(2);
59098 })) {
59099 return DAG.getNode(Opcode, DL, VT, ConcatSubOperand(VT, Ops, 0),
59100 ConcatSubOperand(VT, Ops, 1), Op0.getOperand(2));
59101 }
59102 break;
59103 case ISD::ADD:
59104 case ISD::SUB:
59105 case ISD::MUL:
59106 // TODO: Add more integer binops?
59107 if (!IsSplat && ((VT.is256BitVector() && Subtarget.hasInt256()) ||
59108 (VT.is512BitVector() && Subtarget.useAVX512Regs() &&
59109 (EltSizeInBits >= 32 || Subtarget.useBWIRegs())))) {
59110 SDValue Concat0 = CombineSubOperand(VT, Ops, 0);
59111 SDValue Concat1 = CombineSubOperand(VT, Ops, 1);
59112 if (Concat0 || Concat1 || llvm::all_of(Ops, [](SDValue Op) {
59113 return Op.getOperand(0) == Op.getOperand(1);
59114 }))
59115 return DAG.getNode(Opcode, DL, VT,
59116 Concat0 ? Concat0 : ConcatSubOperand(VT, Ops, 0),
59117 Concat1 ? Concat1 : ConcatSubOperand(VT, Ops, 1));
59118 }
59119 break;
59120 // VADD, VSUB and VMUL can execute on more ports than VINSERT and their
59121 // latency is short, so we only concatenate them here when doing so does
59122 // not introduce extra VINSERTs.
59123 case ISD::FADD:
59124 case ISD::FSUB:
59125 case ISD::FMUL:
59126 if (!IsSplat && (VT.is256BitVector() ||
59127 (VT.is512BitVector() && Subtarget.useAVX512Regs()))) {
59128 SDValue Concat0 = CombineSubOperand(VT, Ops, 0);
59129 SDValue Concat1 = CombineSubOperand(VT, Ops, 1);
59130 if (Concat0 || Concat1)
59131 return DAG.getNode(Opcode, DL, VT,
59132 Concat0 ? Concat0 : ConcatSubOperand(VT, Ops, 0),
59133 Concat1 ? Concat1 : ConcatSubOperand(VT, Ops, 1));
59134 }
59135 break;
59136 // Always prefer to concatenate high latency FDIV instructions.
59137 case ISD::FDIV:
59138 if (!IsSplat && (VT.is256BitVector() ||
59139 (VT.is512BitVector() && Subtarget.useAVX512Regs()))) {
59140 return DAG.getNode(Opcode, DL, VT, ConcatSubOperand(VT, Ops, 0),
59141 ConcatSubOperand(VT, Ops, 1));
59142 }
59143 break;
59144 case X86ISD::HADD:
59145 case X86ISD::HSUB:
59146 case X86ISD::FHADD:
59147 case X86ISD::FHSUB:
59148 if (!IsSplat && VT.is256BitVector() &&
59149 (VT.isFloatingPoint() || Subtarget.hasInt256())) {
59150 SDValue Concat0 = CombineSubOperand(VT, Ops, 0);
59151 SDValue Concat1 = CombineSubOperand(VT, Ops, 1);
59152 if (Concat0 || Concat1)
59153 return DAG.getNode(Opcode, DL, VT,
59154 Concat0 ? Concat0 : ConcatSubOperand(VT, Ops, 0),
59155 Concat1 ? Concat1 : ConcatSubOperand(VT, Ops, 1));
59156 }
59157 break;
59158 case X86ISD::PACKSS:
59159 case X86ISD::PACKUS:
59160 if (!IsSplat && ((VT.is256BitVector() && Subtarget.hasInt256()) ||
59161 (VT.is512BitVector() && Subtarget.useBWIRegs()))) {
59162 MVT SrcVT = Op0.getOperand(0).getSimpleValueType();
59163 SrcVT = MVT::getVectorVT(SrcVT.getScalarType(),
59164 NumOps * SrcVT.getVectorNumElements());
59165 SDValue Concat0 = CombineSubOperand(SrcVT, Ops, 0);
59166 SDValue Concat1 = CombineSubOperand(SrcVT, Ops, 1);
59167 if (Concat0 || Concat1)
59168 return DAG.getNode(
59169 Opcode, DL, VT,
59170 Concat0 ? Concat0 : ConcatSubOperand(SrcVT, Ops, 0),
59171 Concat1 ? Concat1 : ConcatSubOperand(SrcVT, Ops, 1));
59172 }
59173 break;
59174 case X86ISD::VSHLD:
59175 case X86ISD::VSHRD:
59176 case X86ISD::PALIGNR:
59177 if (!IsSplat &&
59178 ((VT.is256BitVector() && Subtarget.hasInt256()) ||
59179 (VT.is512BitVector() && Subtarget.useBWIRegs())) &&
59180 llvm::all_of(Ops, [Op0](SDValue Op) {
59181 return Op0.getOperand(2) == Op.getOperand(2);
59182 })) {
59183 SDValue Concat0 = CombineSubOperand(VT, Ops, 0);
59184 SDValue Concat1 = CombineSubOperand(VT, Ops, 1);
59185 if (Concat0 || Concat1)
59186 return DAG.getNode(Opcode, DL, VT,
59187 Concat0 ? Concat0 : ConcatSubOperand(VT, Ops, 0),
59188 Concat1 ? Concat1 : ConcatSubOperand(VT, Ops, 1),
59189 Op0.getOperand(2));
59190 }
59191 break;
59192 case X86ISD::BLENDI:
59193 if (VT.is256BitVector() && NumOps == 2 &&
59194 (EltSizeInBits >= 32 ||
59195 (Subtarget.hasInt256() &&
59196 Ops[0].getOperand(2) == Ops[1].getOperand(2)))) {
59197 SDValue Concat0 = CombineSubOperand(VT, Ops, 0);
59198 SDValue Concat1 = CombineSubOperand(VT, Ops, 1);
59199 if (Concat0 || Concat1) {
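          // Stitch the two 128-bit blend masks into one immediate: Ops[0]'s
          // mask covers the low elements and Ops[1]'s mask is shifted into
          // the high half.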
59200 unsigned NumElts = VT.getVectorNumElements();
59201 APInt Mask = getBLENDIBlendMask(Ops[0]).zext(NumElts);
59202 Mask.insertBits(getBLENDIBlendMask(Ops[1]), NumElts / 2);
59203 Mask = Mask.zextOrTrunc(8);
59204 return DAG.getNode(Opcode, DL, VT,
59205 Concat0 ? Concat0 : ConcatSubOperand(VT, Ops, 0),
59206 Concat1 ? Concat1 : ConcatSubOperand(VT, Ops, 1),
59207 DAG.getTargetConstant(Mask, DL, MVT::i8));
59208 }
59209 }
59210 // TODO: BWI targets should only use CombineSubOperand.
59211 if (((VT.is256BitVector() && Subtarget.hasVLX()) ||
59212 (VT.is512BitVector() && Subtarget.useAVX512Regs())) &&
59213 (EltSizeInBits >= 32 || Subtarget.useBWIRegs())) {
59214 SDValue Concat0 = CombineSubOperand(VT, Ops, 0);
59215 SDValue Concat1 = CombineSubOperand(VT, Ops, 1);
59216 if (Concat0 || Concat1 || Subtarget.useBWIRegs()) {
59217 unsigned NumElts = VT.getVectorNumElements();
59218 APInt Mask = getBLENDIBlendMask(Ops[0]).zext(NumElts);
59219 for (unsigned I = 1; I != NumOps; ++I)
59220 Mask.insertBits(getBLENDIBlendMask(Ops[I]), I * (NumElts / NumOps));
59221 unsigned NumMaskBits = NumElts >= 8 ? NumElts : 8;
59222 Mask = Mask.zextOrTrunc(NumMaskBits);
59223 MVT MaskSVT = MVT::getIntegerVT(NumMaskBits);
59224 MVT MaskVT = MVT::getVectorVT(MVT::i1, NumMaskBits);
59225 SDValue Sel =
59226 DAG.getBitcast(MaskVT, DAG.getConstant(Mask, DL, MaskSVT));
59227 Sel = extractSubVector(Sel, 0, DAG, DL, NumElts);
59228 Concat0 = Concat0 ? Concat0 : ConcatSubOperand(VT, Ops, 0);
59229 Concat1 = Concat1 ? Concat1 : ConcatSubOperand(VT, Ops, 1);
59230 return DAG.getSelect(DL, VT, Sel, Concat1, Concat0);
59231 }
59232 }
59233 break;
59234 case ISD::VSELECT:
59235 // TODO: VSELECT should use CombineSubOperand.
59236 if (!IsSplat && Subtarget.hasAVX512() &&
59237 (VT.is256BitVector() ||
59238 (VT.is512BitVector() && Subtarget.useAVX512Regs())) &&
59239 (EltSizeInBits >= 32 || Subtarget.hasBWI())) {
59240 EVT SelVT = Ops[0].getOperand(0).getValueType();
59241 if (SelVT.getVectorElementType() == MVT::i1) {
59242 SelVT = EVT::getVectorVT(Ctx, MVT::i1,
59243 NumOps * SelVT.getVectorNumElements());
59244 if (TLI.isTypeLegal(SelVT))
59245 return DAG.getNode(
59246 Opcode, DL, VT, ConcatSubOperand(SelVT.getSimpleVT(), Ops, 0),
59247 ConcatSubOperand(VT, Ops, 1), ConcatSubOperand(VT, Ops, 2));
59248 }
59249 }
59250 [[fallthrough]];
59251 case X86ISD::BLENDV:
59252 // TODO: BLENDV should use CombineSubOperand.
59253 if (!IsSplat && VT.is256BitVector() && NumOps == 2 &&
59254 (EltSizeInBits >= 32 || Subtarget.hasInt256()) &&
59255 IsConcatFree(VT, Ops, 1) && IsConcatFree(VT, Ops, 2)) {
59256 EVT SelVT = Ops[0].getOperand(0).getValueType();
59257 SelVT = SelVT.getDoubleNumVectorElementsVT(Ctx);
59258 if (TLI.isTypeLegal(SelVT))
59259 return DAG.getNode(
59260 Opcode, DL, VT, ConcatSubOperand(SelVT.getSimpleVT(), Ops, 0),
59261 ConcatSubOperand(VT, Ops, 1), ConcatSubOperand(VT, Ops, 2));
59262 }
59263 break;
59264 }
59265 }
59266
59267 // Fold subvector loads into one.
59268 // If needed, look through bitcasts to get to the load.
59269 if (auto *FirstLd = dyn_cast<LoadSDNode>(peekThroughBitcasts(Op0))) {
59270 unsigned Fast;
59271 const X86TargetLowering *TLI = Subtarget.getTargetLowering();
59272 if (TLI->allowsMemoryAccess(Ctx, DAG.getDataLayout(), VT,
59273 *FirstLd->getMemOperand(), &Fast) &&
59274 Fast) {
59275 if (SDValue Ld =
59276 EltsFromConsecutiveLoads(VT, Ops, DL, DAG, Subtarget, false))
59277 return Ld;
59278 }
59279 }
59280
59281 // Attempt to fold target constant loads.
59282 if (all_of(Ops, [](SDValue Op) { return getTargetConstantFromNode(Op); })) {
59283 SmallVector<APInt> EltBits;
59284 APInt UndefElts = APInt::getZero(VT.getVectorNumElements());
59285 for (unsigned I = 0; I != NumOps; ++I) {
59286 APInt OpUndefElts;
59287 SmallVector<APInt> OpEltBits;
59288 if (!getTargetConstantBitsFromNode(Ops[I], EltSizeInBits, OpUndefElts,
59289 OpEltBits, /*AllowWholeUndefs*/ true,
59290 /*AllowPartialUndefs*/ false))
59291 break;
59292 EltBits.append(OpEltBits);
59293 UndefElts.insertBits(OpUndefElts, I * OpUndefElts.getBitWidth());
59294 }
59295 if (EltBits.size() == VT.getVectorNumElements()) {
59296 Constant *C = getConstantVector(VT, EltBits, UndefElts, Ctx);
59297 MVT PVT = TLI.getPointerTy(DAG.getDataLayout());
59298 SDValue CV = DAG.getConstantPool(C, PVT);
59299 MachinePointerInfo MPI =
59300 MachinePointerInfo::getConstantPool(DAG.getMachineFunction());
59301 SDValue Ld = DAG.getLoad(VT, DL, DAG.getEntryNode(), CV, MPI);
59302 SDValue Sub = extractSubVector(Ld, 0, DAG, DL, Op0.getValueSizeInBits());
59303 DAG.ReplaceAllUsesOfValueWith(Op0, Sub);
59304 return Ld;
59305 }
59306 }
59307
59308 // If this simple subvector or scalar/subvector broadcast_load is inserted
59309 // into both halves, use a larger broadcast_load. Update other uses to use
59310 // an extracted subvector.
59311 if (IsSplat &&
59312 (VT.is256BitVector() || (VT.is512BitVector() && Subtarget.hasAVX512()))) {
59313 if (ISD::isNormalLoad(Op0.getNode()) ||
59314 Op0.getOpcode() == X86ISD::VBROADCAST_LOAD ||
59315 Op0.getOpcode() == X86ISD::SUBV_BROADCAST_LOAD) {
59316 auto *Mem = cast<MemSDNode>(Op0);
59317 unsigned Opc = Op0.getOpcode() == X86ISD::VBROADCAST_LOAD
59318 ? X86ISD::VBROADCAST_LOAD
59319 : X86ISD::SUBV_BROADCAST_LOAD;
59320 if (SDValue BcastLd =
59321 getBROADCAST_LOAD(Opc, DL, VT, Mem->getMemoryVT(), Mem, 0, DAG)) {
59322 SDValue BcastSrc =
59323 extractSubVector(BcastLd, 0, DAG, DL, Op0.getValueSizeInBits());
59324 DAG.ReplaceAllUsesOfValueWith(Op0, BcastSrc);
59325 return BcastLd;
59326 }
59327 }
59328 }
59329
59330 // If we're splatting a 128-bit subvector to 512-bits, use SHUF128 directly.
59331 if (IsSplat && NumOps == 4 && VT.is512BitVector() &&
59332 Subtarget.useAVX512Regs()) {
59333 MVT ShuffleVT = VT.isFloatingPoint() ? MVT::v8f64 : MVT::v8i64;
59334 SDValue Res = widenSubVector(Op0, false, Subtarget, DAG, DL, 512);
59335 Res = DAG.getBitcast(ShuffleVT, Res);
59336 Res = DAG.getNode(X86ISD::SHUF128, DL, ShuffleVT, Res, Res,
59337 getV4X86ShuffleImm8ForMask({0, 0, 0, 0}, DL, DAG));
59338 return DAG.getBitcast(VT, Res);
59339 }
59340
59341 // We can always convert per-lane vXf64 shuffles into VSHUFPD.
59342 if (!IsSplat &&
59343 ((NumOps == 2 && VT == MVT::v4f64) ||
59344 (NumOps == 4 && VT == MVT::v8f64 && Subtarget.useAVX512Regs())) &&
59345 all_of(Ops, [](SDValue Op) { return Op.hasOneUse(); })) {
59346 // Collect the individual per-lane v2f64/v4f64 shuffles.
59347 MVT OpVT = Ops[0].getSimpleValueType();
59348 unsigned NumOpElts = OpVT.getVectorNumElements();
59351 if (all_of(seq<int>(NumOps), [&](int I) {
59352 return getTargetShuffleInputs(Ops[I], SrcOps[I], SrcMasks[I], DAG,
59353 Depth + 1) &&
59354 !is128BitLaneCrossingShuffleMask(OpVT, SrcMasks[I]) &&
59355 none_of(SrcMasks[I], isUndefOrZero) &&
59356 SrcMasks[I].size() == NumOpElts &&
59357 all_of(SrcOps[I], [&OpVT](SDValue V) {
59358 return V.getValueType() == OpVT;
59359 });
59360 })) {
59361 // Concatenate the shuffle masks into SHUFPD mask and collect subops.
59362 bool Unary = true;
59363 unsigned SHUFPDMask = 0;
59364 SmallVector<SDValue, 4> LHS(NumOps), RHS(NumOps);
59365 for (unsigned I = 0; I != NumOps; ++I) {
59366 LHS[I] = SrcOps[I][SrcMasks[I][0] / NumOpElts];
59367 RHS[I] = SrcOps[I][SrcMasks[I][1] / NumOpElts];
59368 Unary &= LHS[I] == RHS[I];
59369 for (unsigned J = 0; J != NumOpElts; ++J)
59370 SHUFPDMask |= (SrcMasks[I][J] & 1) << ((I * NumOpElts) + J);
59371 }
59372 // Concat SHUFPD LHS/RHS operands - if they match then it will become a
59373 // PERMILPD mask and we can always profitably concatenate them.
59374 SDValue Concat0 =
59375 combineConcatVectorOps(DL, VT, LHS, DAG, Subtarget, Depth + 1);
59376 SDValue Concat1 =
59377 combineConcatVectorOps(DL, VT, RHS, DAG, Subtarget, Depth + 1);
59378 if (Unary || Concat0 || Concat1) {
59379 Concat0 =
59380 Concat0 ? Concat0 : DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, LHS);
59381 Concat1 =
59382 Concat1 ? Concat1 : DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, RHS);
59383 return DAG.getNode(X86ISD::SHUFP, DL, VT, Concat0, Concat1,
59384 DAG.getTargetConstant(SHUFPDMask, DL, MVT::i8));
59385 }
59386 }
59387 }
59388
59389 return SDValue();
59390}
59391
59392 static SDValue combineCONCAT_VECTORS(SDNode *N, SelectionDAG &DAG,
59393                                      TargetLowering::DAGCombinerInfo &DCI,
59394 const X86Subtarget &Subtarget) {
59395 EVT VT = N->getValueType(0);
59396 EVT SrcVT = N->getOperand(0).getValueType();
59397 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
59398 SmallVector<SDValue, 4> Ops(N->ops());
59399
59400 if (VT.getVectorElementType() == MVT::i1) {
59401 // Attempt to constant fold.
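  // Gather the constant bits of every operand into one wide integer and
  // bitcast it to the vXi1 result if that integer type is legal.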
59402 unsigned SubSizeInBits = SrcVT.getSizeInBits();
59403 APInt Constant = APInt::getZero(VT.getSizeInBits());
59404 for (unsigned I = 0, E = Ops.size(); I != E; ++I) {
59405 auto *C = dyn_cast<ConstantSDNode>(peekThroughBitcasts(Ops[I]));
59406 if (!C) break;
59407 Constant.insertBits(C->getAPIntValue(), I * SubSizeInBits);
59408 if (I == (E - 1)) {
59409 EVT IntVT = EVT::getIntegerVT(*DAG.getContext(), VT.getSizeInBits());
59410 if (TLI.isTypeLegal(IntVT))
59411 return DAG.getBitcast(VT, DAG.getConstant(Constant, SDLoc(N), IntVT));
59412 }
59413 }
59414
59415 // Don't do anything else for i1 vectors.
59416 return SDValue();
59417 }
59418
59419 if (Subtarget.hasAVX() && TLI.isTypeLegal(VT) && TLI.isTypeLegal(SrcVT)) {
59420 if (SDValue R = combineConcatVectorOps(SDLoc(N), VT.getSimpleVT(), Ops, DAG,
59421 Subtarget))
59422 return R;
59423 }
59424
59425 return SDValue();
59426}
59427
59428 static SDValue combineINSERT_SUBVECTOR(SDNode *N, SelectionDAG &DAG,
59429                                        TargetLowering::DAGCombinerInfo &DCI,
59430 const X86Subtarget &Subtarget) {
59431 if (DCI.isBeforeLegalizeOps())
59432 return SDValue();
59433
59434 MVT OpVT = N->getSimpleValueType(0);
59435
59436 bool IsI1Vector = OpVT.getVectorElementType() == MVT::i1;
59437
59438 SDLoc dl(N);
59439 SDValue Vec = N->getOperand(0);
59440 SDValue SubVec = N->getOperand(1);
59441
59442 uint64_t IdxVal = N->getConstantOperandVal(2);
59443 MVT SubVecVT = SubVec.getSimpleValueType();
59444 int VecNumElts = OpVT.getVectorNumElements();
59445 int SubVecNumElts = SubVecVT.getVectorNumElements();
59446
59447 if (Vec.isUndef() && SubVec.isUndef())
59448 return DAG.getUNDEF(OpVT);
59449
59450 // Inserting undefs/zeros into zeros/undefs is a zero vector.
59451 if ((Vec.isUndef() || ISD::isBuildVectorAllZeros(Vec.getNode())) &&
59452 (SubVec.isUndef() || ISD::isBuildVectorAllZeros(SubVec.getNode())))
59453 return getZeroVector(OpVT, Subtarget, DAG, dl);
59454
59455 if (ISD::isBuildVectorAllZeros(Vec.getNode())) {
59456 // If we're inserting into a zero vector and then into a larger zero vector,
59457 // just insert into the larger zero vector directly.
59458 if (SubVec.getOpcode() == ISD::INSERT_SUBVECTOR &&
59459 ISD::isBuildVectorAllZeros(SubVec.getOperand(0).getNode())) {
59460 uint64_t Idx2Val = SubVec.getConstantOperandVal(2);
59461 return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, OpVT,
59462 getZeroVector(OpVT, Subtarget, DAG, dl),
59463 SubVec.getOperand(1),
59464 DAG.getVectorIdxConstant(IdxVal + Idx2Val, dl));
59465 }
59466
59467 // If we're inserting into a zero vector and our input was extracted from an
59468 // insert into a zero vector of the same type and the extraction was at
59469 // least as large as the original insertion. Just insert the original
59470 // subvector into a zero vector.
59471 if (SubVec.getOpcode() == ISD::EXTRACT_SUBVECTOR && IdxVal == 0 &&
59472 isNullConstant(SubVec.getOperand(1)) &&
59473 SubVec.getOperand(0).getOpcode() == ISD::INSERT_SUBVECTOR) {
59474 SDValue Ins = SubVec.getOperand(0);
59475 if (isNullConstant(Ins.getOperand(2)) &&
59476 ISD::isBuildVectorAllZeros(Ins.getOperand(0).getNode()) &&
59477 Ins.getOperand(1).getValueSizeInBits().getFixedValue() <=
59478 SubVecVT.getFixedSizeInBits())
59479 return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, OpVT,
59480 getZeroVector(OpVT, Subtarget, DAG, dl),
59481 Ins.getOperand(1), N->getOperand(2));
59482 }
59483 }
59484
59485 // Stop here if this is an i1 vector.
59486 if (IsI1Vector)
59487 return SDValue();
59488
59489 // Eliminate an intermediate vector widening:
59490 // insert_subvector X, (insert_subvector undef, Y, 0), Idx -->
59491 // insert_subvector X, Y, Idx
59492 // TODO: This is a more general version of a DAGCombiner fold, can we move it
59493 // there?
59494 if (SubVec.getOpcode() == ISD::INSERT_SUBVECTOR &&
59495 SubVec.getOperand(0).isUndef() && isNullConstant(SubVec.getOperand(2)))
59496 return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, OpVT, Vec,
59497 SubVec.getOperand(1), N->getOperand(2));
59498
59499 // If this is an insert of an extract, combine to a shuffle. Don't do this
59500 // if the insert or extract can be represented with a subregister operation.
59501 if (SubVec.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
59502 SubVec.getOperand(0).getSimpleValueType() == OpVT &&
59503 (IdxVal != 0 ||
59504 !(Vec.isUndef() || ISD::isBuildVectorAllZeros(Vec.getNode())))) {
59505 SDValue ExtSrc = SubVec.getOperand(0);
59506 int ExtIdxVal = SubVec.getConstantOperandVal(1);
59507 // Create a shuffle mask matching the extraction and insertion.
59508 SmallVector<int, 64> Mask(VecNumElts);
59509 std::iota(Mask.begin(), Mask.end(), 0);
59510 std::iota(Mask.begin() + IdxVal, Mask.begin() + IdxVal + SubVecNumElts,
59511 ExtIdxVal + VecNumElts);
59512 if (ExtIdxVal != 0)
59513 return DAG.getVectorShuffle(OpVT, dl, Vec, ExtSrc, Mask);
59514 // See if we can use a blend instead of extract/insert pair.
59515 SmallVector<int, 64> BlendMask(VecNumElts);
59516 std::iota(BlendMask.begin(), BlendMask.end(), 0);
59517 std::iota(BlendMask.begin() + IdxVal,
59518 BlendMask.begin() + IdxVal + SubVecNumElts, VecNumElts + IdxVal);
59519 if (isShuffleEquivalent(Mask, BlendMask, Vec, ExtSrc) &&
59520 VecNumElts == (2 * SubVecNumElts)) {
59521 assert((IdxVal % SubVecNumElts) == 0 && "Unaligned subvector insertion");
59522 if (OpVT.is256BitVector() && SubVecVT.is128BitVector()) {
59523 SDValue Blend = DAG.getNode(
59524 X86ISD::BLENDI, dl, MVT::v8f32, DAG.getBitcast(MVT::v8f32, Vec),
59525 DAG.getBitcast(MVT::v8f32, ExtSrc),
59526 DAG.getTargetConstant(IdxVal == 0 ? 0x0F : 0xF0, dl, MVT::i8));
59527 return DAG.getBitcast(OpVT, Blend);
59528 } else if (OpVT.is512BitVector() && SubVecVT.is256BitVector()) {
59529 MVT ShufVT = OpVT.isInteger() ? MVT::v8i64 : MVT::v8f64;
59530 SDValue Lo = DAG.getBitcast(ShufVT, IdxVal == 0 ? ExtSrc : Vec);
59531 SDValue Hi = DAG.getBitcast(ShufVT, IdxVal == 0 ? Vec : ExtSrc);
59532 SDValue Shuffle =
59533 DAG.getNode(X86ISD::SHUF128, dl, ShufVT, Lo, Hi,
59534 getV4X86ShuffleImm8ForMask({0, 1, 2, 3}, dl, DAG));
59535 return DAG.getBitcast(OpVT, Shuffle);
59536 }
59537 }
59538 }
59539
59540 // Match concat_vector style patterns.
59541 SmallVector<SDValue, 2> SubVectorOps;
59542 if (collectConcatOps(N, SubVectorOps, DAG)) {
59543 if (SDValue Fold =
59544 combineConcatVectorOps(dl, OpVT, SubVectorOps, DAG, Subtarget))
59545 return Fold;
59546
59547 // If we're inserting all zeros into the upper half, change this to
59548 // a concat with zero. We will match this to a move
59549 // with implicit upper bit zeroing during isel.
59550 // We do this here because we don't want combineConcatVectorOps to
59551 // create INSERT_SUBVECTOR from CONCAT_VECTORS.
59552 if (SubVectorOps.size() == 2 &&
59553 ISD::isBuildVectorAllZeros(SubVectorOps[1].getNode()))
59554 return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, OpVT,
59555 getZeroVector(OpVT, Subtarget, DAG, dl),
59556 SubVectorOps[0], DAG.getVectorIdxConstant(0, dl));
59557
59558 // Attempt to recursively combine to a shuffle.
59559 if (all_of(SubVectorOps, [](SDValue SubOp) {
59561 })) {
59562 SDValue Op(N, 0);
59563 if (SDValue Res = combineX86ShufflesRecursively(Op, DAG, Subtarget))
59564 return Res;
59565 }
59566 }
59567
59568 // If this is a broadcast insert into an upper undef, use a larger broadcast.
59569 if (Vec.isUndef() && IdxVal != 0 && SubVec.getOpcode() == X86ISD::VBROADCAST)
59570 return DAG.getNode(X86ISD::VBROADCAST, dl, OpVT, SubVec.getOperand(0));
59571
59572 // If this is a broadcast load inserted into an upper undef, use a larger
59573 // broadcast load.
59574 if (Vec.isUndef() && IdxVal != 0 && SubVec.hasOneUse() &&
59575 SubVec.getOpcode() == X86ISD::VBROADCAST_LOAD) {
59576 auto *MemIntr = cast<MemIntrinsicSDNode>(SubVec);
59577 return getBROADCAST_LOAD(X86ISD::VBROADCAST_LOAD, dl, OpVT,
59578 MemIntr->getMemoryVT(), MemIntr, 0, DAG);
59579 }
59580
59581 // If we're splatting the lower half subvector of a full vector load into the
59582 // upper half, attempt to create a subvector broadcast.
59583 if ((int)IdxVal == (VecNumElts / 2) &&
59584 Vec.getValueSizeInBits() == (2 * SubVec.getValueSizeInBits())) {
59585 auto *VecLd = dyn_cast<LoadSDNode>(Vec);
59586 auto *SubLd = dyn_cast<LoadSDNode>(SubVec);
59587 if (VecLd && SubLd &&
59588 DAG.areNonVolatileConsecutiveLoads(
59589 SubLd, VecLd, SubVec.getValueSizeInBits() / 8, 0)) {
59590 SDValue BcastLd = getBROADCAST_LOAD(X86ISD::SUBV_BROADCAST_LOAD, dl, OpVT,
59591 SubVecVT, SubLd, 0, DAG);
59592 SDValue NewSubVec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, SubVecVT,
59593 BcastLd, DAG.getVectorIdxConstant(0, dl));
59594 DCI.CombineTo(SubLd, NewSubVec, BcastLd.getValue(1));
59595 return BcastLd;
59596 }
59597 }
59598
59599 // Attempt to constant fold (if we're not widening).
59600 if (!Vec.isUndef() && !ISD::isBuildVectorAllZeros(Vec.getNode())) {
59601 unsigned EltSizeInBits = OpVT.getScalarSizeInBits();
59602 APInt VecUndefElts, SubUndefElts;
59603 SmallVector<APInt, 16> VecEltBits, SubEltBits;
59604 if (getTargetConstantBitsFromNode(Vec, EltSizeInBits, VecUndefElts,
59605 VecEltBits) &&
59606 getTargetConstantBitsFromNode(SubVec, EltSizeInBits, SubUndefElts,
59607 SubEltBits)) {
59608 VecUndefElts.insertBits(SubUndefElts, IdxVal);
59609 llvm::copy(SubEltBits, VecEltBits.begin() + IdxVal);
59610 return getConstVector(VecEltBits, VecUndefElts, OpVT, DAG, dl);
59611 }
59612 }
59613
59614 // Attempt to recursively combine to a shuffle.
59617 SDValue Op(N, 0);
59618 if (SDValue Res = combineX86ShufflesRecursively(Op, DAG, Subtarget))
59619 return Res;
59620 }
59621
59622 // Match insertion of subvector load that perfectly aliases a base load.
59623 if ((IdxVal % SubVecNumElts) == 0 && ISD::isNormalLoad(Vec.getNode()) &&
59624 ISD::isNormalLoad(SubVec.getNode()) &&
59625 DAG.areNonVolatileConsecutiveLoads(
59626 cast<LoadSDNode>(SubVec), cast<LoadSDNode>(Vec),
59627 SubVec.getValueSizeInBits() / 8, IdxVal / SubVecNumElts))
59628 return Vec;
59629
59630 return SDValue();
59631}
59632
59633/// If we are extracting a subvector of a vector select and the select condition
59634/// is composed of concatenated vectors, try to narrow the select width. This
59635/// is a common pattern for AVX1 integer code because 256-bit selects may be
59636/// legal, but there is almost no integer math/logic available for 256-bit.
59637/// This function should only be called with legal types (otherwise, the calls
59638/// to get simple value types will assert).
59639static SDValue narrowExtractedVectorSelect(SDNode *Ext, const SDLoc &DL,
59640 SelectionDAG &DAG) {
59641 SDValue Sel = Ext->getOperand(0);
59642 if (Sel.getOpcode() != ISD::VSELECT ||
59643 !isFreeToSplitVector(Sel.getOperand(0), DAG))
59644 return SDValue();
59645
59646 // Note: We assume simple value types because this should only be called with
59647 // legal operations/types.
59648 // TODO: This can be extended to handle extraction to 256-bits.
59649 MVT VT = Ext->getSimpleValueType(0);
59650 if (!VT.is128BitVector())
59651 return SDValue();
59652
59653 MVT SelCondVT = Sel.getOperand(0).getSimpleValueType();
59654 if (!SelCondVT.is256BitVector() && !SelCondVT.is512BitVector())
59655 return SDValue();
59656
59657 MVT WideVT = Ext->getOperand(0).getSimpleValueType();
59658 MVT SelVT = Sel.getSimpleValueType();
59659 assert((SelVT.is256BitVector() || SelVT.is512BitVector()) &&
59660 "Unexpected vector type with legal operations");
59661
59662 unsigned SelElts = SelVT.getVectorNumElements();
59663 unsigned CastedElts = WideVT.getVectorNumElements();
59664 unsigned ExtIdx = Ext->getConstantOperandVal(1);
59665 if (SelElts % CastedElts == 0) {
59666 // The select has the same or more (narrower) elements than the extract
59667 // operand. The extraction index gets scaled by that factor.
59668 ExtIdx *= (SelElts / CastedElts);
59669 } else if (CastedElts % SelElts == 0) {
59670 // The select has fewer (wider) elements than the extract operand. Make sure
59671 // that the extraction index can be divided evenly.
59672 unsigned IndexDivisor = CastedElts / SelElts;
59673 if (ExtIdx % IndexDivisor != 0)
59674 return SDValue();
59675 ExtIdx /= IndexDivisor;
59676 } else {
59677 llvm_unreachable("Element count of simple vector types are not divisible?");
59678 }
59679
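// Extract the matching 128-bit piece of the condition and of both select
// operands at the rescaled index, then build the narrower select.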
59680 unsigned NarrowingFactor = WideVT.getSizeInBits() / VT.getSizeInBits();
59681 unsigned NarrowElts = SelElts / NarrowingFactor;
59682 MVT NarrowSelVT = MVT::getVectorVT(SelVT.getVectorElementType(), NarrowElts);
59683 SDValue ExtCond = extract128BitVector(Sel.getOperand(0), ExtIdx, DAG, DL);
59684 SDValue ExtT = extract128BitVector(Sel.getOperand(1), ExtIdx, DAG, DL);
59685 SDValue ExtF = extract128BitVector(Sel.getOperand(2), ExtIdx, DAG, DL);
59686 SDValue NarrowSel = DAG.getSelect(DL, NarrowSelVT, ExtCond, ExtT, ExtF);
59687 return DAG.getBitcast(VT, NarrowSel);
59688}
59689
59690static SDValue combineEXTRACT_SUBVECTOR(SDNode *N, SelectionDAG &DAG,
59691 TargetLowering::DAGCombinerInfo &DCI,
59692 const X86Subtarget &Subtarget) {
59693 if (!N->getValueType(0).isSimple())
59694 return SDValue();
59695
59696 MVT VT = N->getSimpleValueType(0);
59697 SDValue InVec = N->getOperand(0);
59698 unsigned IdxVal = N->getConstantOperandVal(1);
59699 EVT InVecVT = InVec.getValueType();
59700 unsigned SizeInBits = VT.getSizeInBits();
59701 unsigned InSizeInBits = InVecVT.getSizeInBits();
59702 unsigned NumSubElts = VT.getVectorNumElements();
59703 unsigned NumInElts = InVecVT.getVectorNumElements();
59704 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
59705 SDLoc DL(N);
59706
59707 // For AVX1 only, if we are extracting from a 256-bit and+not (which will
59708 // eventually get combined/lowered into ANDNP) with a concatenated operand,
59709 // split the 'and' into 128-bit ops to avoid the concatenate and extract.
59710 // We let generic combining take over from there to simplify the
59711 // insert/extract and 'not'.
59712 // This pattern emerges during AVX1 legalization. We handle it before lowering
59713 // to avoid complications like splitting constant vector loads.
59714 if (Subtarget.hasAVX() && !Subtarget.hasAVX2() && TLI.isTypeLegal(InVecVT) &&
59715 InSizeInBits == 256 && InVec.getOpcode() == ISD::AND) {
59716 auto isConcatenatedNot = [](SDValue V) {
59717 V = peekThroughBitcasts(V);
59718 if (!isBitwiseNot(V))
59719 return false;
59720 SDValue NotOp = V->getOperand(0);
59721 return peekThroughBitcasts(NotOp).getOpcode() == ISD::CONCAT_VECTORS;
59722 };
59723 if (isConcatenatedNot(InVec.getOperand(0)) ||
59724 isConcatenatedNot(InVec.getOperand(1))) {
59725 // extract (and v4i64 X, (not (concat Y1, Y2))), n -> andnp v2i64 X(n), Y1
59726 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT,
59727 splitVectorIntBinary(InVec, DAG, DL),
59728 N->getOperand(1));
59729 }
59730 }
59731
59732 if (DCI.isBeforeLegalizeOps())
59733 return SDValue();
59734
59735 if (SDValue V = narrowExtractedVectorSelect(N, DL, DAG))
59736 return V;
59737
59738 if (ISD::isBuildVectorAllZeros(InVec.getNode()))
59739 return getZeroVector(VT, Subtarget, DAG, DL);
59740
59741 if (ISD::isBuildVectorAllOnes(InVec.getNode())) {
59742 if (VT.getScalarType() == MVT::i1)
59743 return DAG.getConstant(1, DL, VT);
59744 return getOnesVector(VT, DAG, DL);
59745 }
59746
59747 if (InVec.getOpcode() == ISD::BUILD_VECTOR)
59748 return DAG.getBuildVector(VT, DL, InVec->ops().slice(IdxVal, NumSubElts));
59749
59750 // EXTRACT_SUBVECTOR(EXTRACT_SUBVECTOR(V,C1)),C2) - EXTRACT_SUBVECTOR(V,C1+C2)
59751 if (IdxVal != 0 && InVec.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
59752 InVec.hasOneUse() && TLI.isTypeLegal(VT) &&
59753 TLI.isTypeLegal(InVec.getOperand(0).getValueType())) {
59754 unsigned NewIdx = IdxVal + InVec.getConstantOperandVal(1);
59755 return extractSubVector(InVec.getOperand(0), NewIdx, DAG, DL, SizeInBits);
59756 }
59757
59758 // EXTRACT_SUBVECTOR(INSERT_SUBVECTOR(SRC,SUB,C1),C2)
59759 // --> INSERT_SUBVECTOR(EXTRACT_SUBVECTOR(SRC,C2),SUB,C1-C2)
59760 // iff SUB is entirely contained in the extraction.
59761 if (VT.getVectorElementType() != MVT::i1 && TLI.isTypeLegal(VT) &&
59762 InVec.getOpcode() == ISD::INSERT_SUBVECTOR && InVec.hasOneUse()) {
59763 SDValue Src = InVec.getOperand(0);
59764 SDValue Sub = InVec.getOperand(1);
59765 EVT SubVT = Sub.getValueType();
59766 uint64_t InsIdx = InVec.getConstantOperandVal(2);
59767 if (IdxVal <= InsIdx &&
59768 (IdxVal + NumSubElts) >= (InsIdx + SubVT.getVectorNumElements())) {
59769 SDValue NewSrc = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Src,
59770 DAG.getVectorIdxConstant(IdxVal, DL));
59771 return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, NewSrc, Sub,
59772 DAG.getVectorIdxConstant(InsIdx - IdxVal, DL));
59773 }
59774 }
59775
59776 // If we're extracting an upper subvector, see if we'd get the same elements
59777 // if we extracted the lowest subvector instead, which should allow
59778 // SimplifyDemandedVectorElts to do more simplifications.
59779 if (IdxVal != 0) {
59780 bool AllEquiv = all_of(seq<unsigned>(NumSubElts), [&](unsigned I) {
59781 return IsElementEquivalent(NumInElts, InVec, InVec, I, I + IdxVal);
59782 });
59783 if (AllEquiv)
59784 return extractSubVector(InVec, 0, DAG, DL, SizeInBits);
59785 }
59786
59787 // Check if we're extracting a whole broadcasted subvector.
59788 if (InVec.getOpcode() == X86ISD::SUBV_BROADCAST_LOAD) {
59789 auto *MemIntr = cast<MemIntrinsicSDNode>(InVec);
59790 EVT MemVT = MemIntr->getMemoryVT();
59791 if (MemVT == VT) {
59792 // If this is the only use, we can replace with a regular load (this may
59793 // have been missed by SimplifyDemandedVectorElts due to extra uses of the
59794 // memory chain).
59795 if (InVec.hasOneUse()) {
59796 SDValue Ld =
59797 DAG.getLoad(MemVT, DL, MemIntr->getChain(), MemIntr->getBasePtr(),
59798 MemIntr->getMemOperand());
59799 DAG.makeEquivalentMemoryOrdering(SDValue(MemIntr, 1), Ld.getValue(1));
59800 return Ld;
59801 }
59802 }
59803 }
59804
59805 // Attempt to extract from the source of a shuffle vector.
59806 if ((InSizeInBits % SizeInBits) == 0 && (IdxVal % NumSubElts) == 0) {
59807 SmallVector<int, 32> ShuffleMask;
59808 SmallVector<int, 32> ScaledMask;
59809 SmallVector<SDValue, 2> ShuffleInputs;
59810 unsigned NumSubVecs = InSizeInBits / SizeInBits;
59811 // Decode the shuffle mask and scale it so it's shuffling subvectors.
59812 if (getTargetShuffleInputs(InVec, ShuffleInputs, ShuffleMask, DAG) &&
59813 scaleShuffleElements(ShuffleMask, NumSubVecs, ScaledMask)) {
59814 unsigned SubVecIdx = IdxVal / NumSubElts;
59815 if (ScaledMask[SubVecIdx] == SM_SentinelUndef)
59816 return DAG.getUNDEF(VT);
59817 if (ScaledMask[SubVecIdx] == SM_SentinelZero)
59818 return getZeroVector(VT, Subtarget, DAG, DL);
59819 SDValue Src = ShuffleInputs[ScaledMask[SubVecIdx] / NumSubVecs];
59820 if (Src.getValueSizeInBits() == InSizeInBits) {
59821 unsigned SrcSubVecIdx = ScaledMask[SubVecIdx] % NumSubVecs;
59822 unsigned SrcEltIdx = SrcSubVecIdx * NumSubElts;
59823 return extractSubVector(DAG.getBitcast(InVecVT, Src), SrcEltIdx, DAG,
59824 DL, SizeInBits);
59825 }
59826 }
59827 }
59828
59829 auto IsExtractFree = [](SDValue V) {
59830 if (V.hasOneUse()) {
59831 V = peekThroughOneUseBitcasts(V);
59832 if (V.getOpcode() == ISD::LOAD)
59833 return true;
59834 }
59835 V = peekThroughBitcasts(V);
59836 if (ISD::isBuildVectorOfConstantSDNodes(V.getNode()))
59837 return true;
59838 if (ISD::isBuildVectorOfConstantFPSDNodes(V.getNode()))
59839 return true;
59840 return V.isUndef();
59841 };
59842
59843 // If we're extracting the lowest subvector and we're the only user,
59844 // we may be able to perform this with a smaller vector width.
59845 unsigned InOpcode = InVec.getOpcode();
59846 if (InVec.hasOneUse()) {
59847 if (IdxVal == 0 && VT == MVT::v2f64 && InVecVT == MVT::v4f64) {
59848 // v2f64 CVTDQ2PD(v4i32).
59849 if (InOpcode == ISD::SINT_TO_FP &&
59850 InVec.getOperand(0).getValueType() == MVT::v4i32) {
59851 return DAG.getNode(X86ISD::CVTSI2P, DL, VT, InVec.getOperand(0));
59852 }
59853 // v2f64 CVTUDQ2PD(v4i32).
59854 if (InOpcode == ISD::UINT_TO_FP && Subtarget.hasVLX() &&
59855 InVec.getOperand(0).getValueType() == MVT::v4i32) {
59856 return DAG.getNode(X86ISD::CVTUI2P, DL, VT, InVec.getOperand(0));
59857 }
59858 // v2f64 CVTPS2PD(v4f32).
59859 if (InOpcode == ISD::FP_EXTEND &&
59860 InVec.getOperand(0).getValueType() == MVT::v4f32) {
59861 return DAG.getNode(X86ISD::VFPEXT, DL, VT, InVec.getOperand(0));
59862 }
59863 }
59864 // v4i32 CVTPS2DQ(v4f32) / CVTPS2UDQ(v4f32).
59865 // v4f32 CVTDQ2PS(v4i32) / CVTUDQ2PS(v4i32).
59866 if ((InOpcode == ISD::FP_TO_SINT || InOpcode == ISD::SINT_TO_FP ||
59867 ((InOpcode == ISD::FP_TO_UINT || InOpcode == ISD::UINT_TO_FP) &&
59868 Subtarget.hasVLX())) &&
59869 (VT == MVT::v4i32 || VT == MVT::v4f32)) {
59870 SDValue Src = InVec.getOperand(0);
59871 if (Src.getValueType().getScalarSizeInBits() == 32)
59872 return DAG.getNode(InOpcode, DL, VT,
59873 extractSubVector(Src, IdxVal, DAG, DL, SizeInBits));
59874 }
59875 if (IdxVal == 0 &&
59876 (ISD::isExtOpcode(InOpcode) || ISD::isExtVecInRegOpcode(InOpcode)) &&
59877 (SizeInBits == 128 || SizeInBits == 256) &&
59878 InVec.getOperand(0).getValueSizeInBits() >= SizeInBits) {
59879 SDValue Ext = InVec.getOperand(0);
59880 if (Ext.getValueSizeInBits() > SizeInBits)
59881 Ext = extractSubVector(Ext, 0, DAG, DL, SizeInBits);
59882 unsigned ExtOp = DAG.getOpcode_EXTEND_VECTOR_INREG(InOpcode);
59883 return DAG.getNode(ExtOp, DL, VT, Ext);
59884 }
59885 if (IdxVal == 0 && InOpcode == ISD::VSELECT &&
59886 InVec.getOperand(0).getValueType().is256BitVector() &&
59887 InVec.getOperand(1).getValueType().is256BitVector() &&
59888 InVec.getOperand(2).getValueType().is256BitVector()) {
59889 SDValue Ext0 = extractSubVector(InVec.getOperand(0), 0, DAG, DL, 128);
59890 SDValue Ext1 = extractSubVector(InVec.getOperand(1), 0, DAG, DL, 128);
59891 SDValue Ext2 = extractSubVector(InVec.getOperand(2), 0, DAG, DL, 128);
59892 return DAG.getNode(InOpcode, DL, VT, Ext0, Ext1, Ext2);
59893 }
59894 if (IdxVal == 0 && InOpcode == ISD::TRUNCATE && Subtarget.hasVLX() &&
59895 (SizeInBits == 128 || SizeInBits == 256)) {
59896 SDValue InVecSrc = InVec.getOperand(0);
59897 unsigned Scale = InVecSrc.getValueSizeInBits() / InSizeInBits;
59898 SDValue Ext = extractSubVector(InVecSrc, 0, DAG, DL, Scale * SizeInBits);
59899 return DAG.getNode(InOpcode, DL, VT, Ext);
59900 }
59901
59902 if (SizeInBits == 128 || SizeInBits == 256) {
59903 switch (InOpcode) {
59904 case X86ISD::MOVDDUP:
59905 return DAG.getNode(
59906 InOpcode, DL, VT,
59907 extractSubVector(InVec.getOperand(0), IdxVal, DAG, DL, SizeInBits));
59908 case X86ISD::PSHUFD:
59909 case X86ISD::VPERMILPI:
59910 if (InVec.getOperand(0).hasOneUse()) {
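// A 32-bit-element shuffle immediate repeats per 128-bit lane and can be
// reused as-is; with 64-bit elements each element has its own bit, so
// shift the bits of the extracted lane down by IdxVal.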
59911 uint64_t M = InVec.getConstantOperandVal(1) & 255;
59912 M = VT.getScalarSizeInBits() < 64 ? M : (M >> IdxVal);
59913 return DAG.getNode(InOpcode, DL, VT,
59914 extractSubVector(InVec.getOperand(0), IdxVal, DAG,
59915 DL, SizeInBits),
59916 DAG.getTargetConstant(M, DL, MVT::i8));
59917 }
59918 break;
59919 case X86ISD::PCMPEQ:
59920 case X86ISD::PCMPGT:
59921 case X86ISD::UNPCKH:
59922 case X86ISD::UNPCKL:
59923 if (IsExtractFree(InVec.getOperand(0)) ||
59924 IsExtractFree(InVec.getOperand(1)))
59925 return DAG.getNode(InOpcode, DL, VT,
59926 extractSubVector(InVec.getOperand(0), IdxVal, DAG,
59927 DL, SizeInBits),
59928 extractSubVector(InVec.getOperand(1), IdxVal, DAG,
59929 DL, SizeInBits));
59930 break;
59931 case X86ISD::CMPP:
59932 if (IsExtractFree(InVec.getOperand(0)) ||
59933 IsExtractFree(InVec.getOperand(1)))
59934 return DAG.getNode(InOpcode, DL, VT,
59935 extractSubVector(InVec.getOperand(0), IdxVal, DAG,
59936 DL, SizeInBits),
59937 extractSubVector(InVec.getOperand(1), IdxVal, DAG,
59938 DL, SizeInBits),
59939 InVec.getOperand(2));
59940 break;
59941 case X86ISD::BLENDI:
59942 if (IsExtractFree(InVec.getOperand(0)) ||
59943 IsExtractFree(InVec.getOperand(1))) {
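// A 16-bit blend immediate repeats per 128-bit lane and can be reused;
// wider blends use one bit per element over the whole vector, so shift
// the mask down to the extracted subvector.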
59944 uint64_t M = InVec.getConstantOperandVal(2) & 255;
59945 M = VT.getScalarType() == MVT::i16 ? M : (M >> IdxVal);
59946 return DAG.getNode(InOpcode, DL, VT,
59947 extractSubVector(InVec.getOperand(0), IdxVal, DAG,
59948 DL, SizeInBits),
59949 extractSubVector(InVec.getOperand(1), IdxVal, DAG,
59950 DL, SizeInBits),
59951 DAG.getTargetConstant(M, DL, MVT::i8));
59952 }
59953 break;
59954 case X86ISD::VPERMV:
59955 if (IdxVal != 0) {
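// Extract the indices that produce the requested subvector, widen them
// back to full width, shuffle the whole source, and take the low
// subvector of the result.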
59956 SDValue Mask = InVec.getOperand(0);
59957 SDValue Src = InVec.getOperand(1);
59958 Mask = extractSubVector(Mask, IdxVal, DAG, DL, SizeInBits);
59959 Mask = widenSubVector(Mask, /*ZeroNewElements=*/false, Subtarget, DAG,
59960 DL, InSizeInBits);
59961 SDValue Shuffle = DAG.getNode(InOpcode, DL, InVecVT, Mask, Src);
59962 return extractSubVector(Shuffle, 0, DAG, DL, SizeInBits);
59963 }
59964 break;
59965 case X86ISD::VPERMV3:
59966 if (IdxVal != 0) {
59967 SDValue Src0 = InVec.getOperand(0);
59968 SDValue Mask = InVec.getOperand(1);
59969 SDValue Src1 = InVec.getOperand(2);
59970 Mask = extractSubVector(Mask, IdxVal, DAG, DL, SizeInBits);
59971 Mask = widenSubVector(Mask, /*ZeroNewElements=*/false, Subtarget, DAG,
59972 DL, InSizeInBits);
59973 SDValue Shuffle =
59974 DAG.getNode(InOpcode, DL, InVecVT, Src0, Mask, Src1);
59975 return extractSubVector(Shuffle, 0, DAG, DL, SizeInBits);
59976 }
59977 break;
59978 }
59979 }
59980 }
59981
59982 // Always split vXi64 logical shifts where we're extracting the upper 32-bits
59983 // as this is very likely to fold into a shuffle/truncation.
59984 if ((InOpcode == X86ISD::VSHLI || InOpcode == X86ISD::VSRLI) &&
59985 InVecVT.getScalarSizeInBits() == 64 &&
59986 InVec.getConstantOperandAPInt(1) == 32) {
59987 SDValue Ext =
59988 extractSubVector(InVec.getOperand(0), IdxVal, DAG, DL, SizeInBits);
59989 return DAG.getNode(InOpcode, DL, VT, Ext, InVec.getOperand(1));
59990 }
59991
59992 return SDValue();
59993}
59994
59995static SDValue combineSCALAR_TO_VECTOR(SDNode *N, SelectionDAG &DAG,
59996 const X86Subtarget &Subtarget) {
59997 using namespace SDPatternMatch;
59998 EVT VT = N->getValueType(0);
59999 SDValue Src = N->getOperand(0);
60000 SDLoc DL(N);
60001
60002 // If this is a scalar to vector to v1i1 from an AND with 1, bypass the and.
60003 // This occurs frequently in our masked scalar intrinsic code and our
60004 // floating point select lowering with AVX512.
60005 // TODO: SimplifyDemandedBits instead?
60006 if (VT == MVT::v1i1 && Src.getOpcode() == ISD::AND && Src.hasOneUse() &&
60007 isOneConstant(Src.getOperand(1)))
60008 return DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v1i1, Src.getOperand(0));
60009
60010 // Combine scalar_to_vector of an extract_vector_elt into an extract_subvec.
60011 if (VT == MVT::v1i1 && Src.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
60012 Src.hasOneUse() && Src.getOperand(0).getValueType().isVector() &&
60013 Src.getOperand(0).getValueType().getVectorElementType() == MVT::i1 &&
60014 isNullConstant(Src.getOperand(1)))
60015 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Src.getOperand(0),
60016 Src.getOperand(1));
60017
60018 // Reduce v2i64 to v4i32 if we don't need the upper bits or they are known zero.
60019 // TODO: Move to DAGCombine/SimplifyDemandedBits?
60020 if ((VT == MVT::v2i64 || VT == MVT::v2f64) && Src.hasOneUse()) {
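// IsExt64 matches an i64 whose upper 32 bits are either don't-care
// (any-extend) or known zero, so the scalar_to_vector can be built from a
// 32-bit element instead (with VZEXT_MOVL covering the zero-extend case).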
60021 auto IsExt64 = [&DAG](SDValue Op, bool IsZeroExt) {
60022 if (Op.getValueType() != MVT::i64)
60023 return SDValue();
60024 unsigned Opc = IsZeroExt ? ISD::ZERO_EXTEND : ISD::ANY_EXTEND;
60025 if (Op.getOpcode() == Opc &&
60026 Op.getOperand(0).getScalarValueSizeInBits() <= 32)
60027 return Op.getOperand(0);
60028 unsigned Ext = IsZeroExt ? ISD::ZEXTLOAD : ISD::EXTLOAD;
60029 if (auto *Ld = dyn_cast<LoadSDNode>(Op))
60030 if (Ld->getExtensionType() == Ext &&
60031 Ld->getMemoryVT().getScalarSizeInBits() <= 32)
60032 return Op;
60033 if (IsZeroExt) {
60034 KnownBits Known = DAG.computeKnownBits(Op);
60035 if (!Known.isConstant() && Known.countMinLeadingZeros() >= 32)
60036 return Op;
60037 }
60038 return SDValue();
60039 };
60040
60041 if (SDValue AnyExt = IsExt64(peekThroughOneUseBitcasts(Src), false))
60042 return DAG.getBitcast(
60043 VT, DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v4i32,
60044 DAG.getAnyExtOrTrunc(AnyExt, DL, MVT::i32)));
60045
60046 if (SDValue ZeroExt = IsExt64(peekThroughOneUseBitcasts(Src), true))
60047 return DAG.getBitcast(
60048 VT,
60049 DAG.getNode(X86ISD::VZEXT_MOVL, DL, MVT::v4i32,
60050 DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v4i32,
60051 DAG.getZExtOrTrunc(ZeroExt, DL, MVT::i32))));
60052 }
60053
60054 if (Src.getOpcode() == ISD::BITCAST) {
60055 SDValue SrcOp = Src.getOperand(0);
60056 // Combine (v4i32 (scalar_to_vector (i32 (bitcast (float))))) to MOVD.
60057 if (VT == MVT::v4i32 && SrcOp.getValueType() == MVT::f32)
60058 return DAG.getBitcast(
60059 VT, DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v4f32, SrcOp));
60060 // Combine (v2i64 (scalar_to_vector (i64 (bitcast (double))))) to MOVQ.
60061 if (VT == MVT::v2i64 && SrcOp.getValueType() == MVT::f64)
60062 return DAG.getBitcast(
60063 VT, DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v2f64, SrcOp));
60064 // Combine (v2i64 (scalar_to_vector (i64 (bitcast (mmx))))) to MOVQ2DQ.
60065 if (VT == MVT::v2i64 && SrcOp.getValueType() == MVT::x86mmx)
60066 return DAG.getNode(X86ISD::MOVQ2DQ, DL, VT, SrcOp);
60067 }
60068
60069 if (VT == MVT::v4i32) {
60070 SDValue HalfSrc;
60071 // Combine (v4i32 (scalar_to_vector (i32 (anyext (bitcast (f16))))))
60072 // to remove XMM->GPR->XMM moves.
60073 if (sd_match(Src, m_AnyExt(m_BitCast(
60074 m_AllOf(m_SpecificVT(MVT::f16), m_Value(HalfSrc))))))
60075 return DAG.getBitcast(
60076 VT, DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v8f16, HalfSrc));
60077 }
60078
60079 // See if we're broadcasting the scalar value, in which case just reuse that.
60081 // Ensure the broadcast uses this exact SDValue (same result number), not just the same node.
60081 if (VT.getScalarType() == Src.getValueType())
60082 for (SDNode *User : Src->users())
60083 if (User->getOpcode() == X86ISD::VBROADCAST &&
60084 Src == User->getOperand(0)) {
60085 unsigned SizeInBits = VT.getFixedSizeInBits();
60086 unsigned BroadcastSizeInBits =
60087 User->getValueSizeInBits(0).getFixedValue();
60088 if (BroadcastSizeInBits == SizeInBits)
60089 return SDValue(User, 0);
60090 if (BroadcastSizeInBits > SizeInBits)
60091 return extractSubVector(SDValue(User, 0), 0, DAG, DL, SizeInBits);
60092 // TODO: Handle BroadcastSizeInBits < SizeInBits when we have test
60093 // coverage.
60094 }
60095
60096 // Check for cases where we've ended up with a scalarized shift, typically
60097 // during type legalization.
60098 switch (Src.getOpcode()) {
60099 case ISD::SHL:
60100 case ISD::SRL:
60101 case ISD::SRA:
60102 if (auto *Amt = dyn_cast<ConstantSDNode>(Src.getOperand(1))) {
60103 if (supportedVectorShiftWithImm(VT, Subtarget, Src.getOpcode()) &&
60104 Src.hasOneUse()) {
60105 SDValue SrcVec =
60106 DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VT, Src.getOperand(0));
60107 unsigned Opc = getTargetVShiftUniformOpcode(Src.getOpcode(), false);
60108 return getTargetVShiftByConstNode(Opc, DL, VT.getSimpleVT(), SrcVec,
60109 Amt->getZExtValue(), DAG);
60110 }
60111 }
60112 break;
60113 case ISD::FSHL:
60114 case ISD::FSHR:
60115 if (auto *Amt = dyn_cast<ConstantSDNode>(Src.getOperand(2))) {
60116 if (supportedVectorShiftWithImm(VT, Subtarget, ISD::SHL) &&
60117 Src.getOperand(0).getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
60118 Src.getOperand(1).getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
60119 Src.hasOneUse()) {
60120 uint64_t AmtVal =
60121 Amt->getAPIntValue().urem(Src.getScalarValueSizeInBits());
60122 SDValue SrcVec0 =
60123 DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VT, Src.getOperand(0));
60124 SDValue SrcVec1 =
60125 DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VT, Src.getOperand(1));
60126 return DAG.getNode(Src.getOpcode(), DL, VT, SrcVec0, SrcVec1,
60127 DAG.getConstant(AmtVal, DL, VT));
60128 }
60129 }
60130 break;
60131 }
60132
60133 return SDValue();
60134}
60135
60136// Simplify PMULDQ and PMULUDQ operations.
60137static SDValue combinePMULDQ(SDNode *N, SelectionDAG &DAG,
60138 TargetLowering::DAGCombinerInfo &DCI,
60139 const X86Subtarget &Subtarget) {
60140 SDValue LHS = N->getOperand(0);
60141 SDValue RHS = N->getOperand(1);
60142
60143 // Canonicalize constant to RHS.
60144 if (DAG.isConstantIntBuildVectorOrConstantInt(LHS) &&
60145 !DAG.isConstantIntBuildVectorOrConstantInt(RHS))
60146 return DAG.getNode(N->getOpcode(), SDLoc(N), N->getValueType(0), RHS, LHS);
60147
60148 // Multiply by zero.
60149 // Don't return RHS as it may contain UNDEFs.
60150 if (ISD::isBuildVectorAllZeros(RHS.getNode()))
60151 return DAG.getConstant(0, SDLoc(N), N->getValueType(0));
60152
60153 // PMULDQ/PMULUDQ only uses lower 32 bits from each vector element.
60154 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
60155 if (TLI.SimplifyDemandedBits(SDValue(N, 0), APInt::getAllOnes(64), DCI))
60156 return SDValue(N, 0);
60157
60158 // If the input is an extend_invec and the SimplifyDemandedBits call didn't
60159 // convert it to any_extend_invec, due to the LegalOperations check, do the
60160 // conversion directly to a vector shuffle manually. This exposes combine
60161 // opportunities missed by combineEXTEND_VECTOR_INREG not calling
60162 // combineX86ShufflesRecursively on SSE4.1 targets.
60163 // FIXME: This is basically a hack around several other issues related to
60164 // ANY_EXTEND_VECTOR_INREG.
60165 if (N->getValueType(0) == MVT::v2i64 && LHS.hasOneUse() &&
60166 (LHS.getOpcode() == ISD::ZERO_EXTEND_VECTOR_INREG ||
60167 LHS.getOpcode() == ISD::SIGN_EXTEND_VECTOR_INREG) &&
60168 LHS.getOperand(0).getValueType() == MVT::v4i32) {
60169 SDLoc dl(N);
60170 LHS = DAG.getVectorShuffle(MVT::v4i32, dl, LHS.getOperand(0),
60171 LHS.getOperand(0), { 0, -1, 1, -1 });
60172 LHS = DAG.getBitcast(MVT::v2i64, LHS);
60173 return DAG.getNode(N->getOpcode(), dl, MVT::v2i64, LHS, RHS);
60174 }
60175 if (N->getValueType(0) == MVT::v2i64 && RHS.hasOneUse() &&
60176 (RHS.getOpcode() == ISD::ZERO_EXTEND_VECTOR_INREG ||
60177 RHS.getOpcode() == ISD::SIGN_EXTEND_VECTOR_INREG) &&
60178 RHS.getOperand(0).getValueType() == MVT::v4i32) {
60179 SDLoc dl(N);
60180 RHS = DAG.getVectorShuffle(MVT::v4i32, dl, RHS.getOperand(0),
60181 RHS.getOperand(0), { 0, -1, 1, -1 });
60182 RHS = DAG.getBitcast(MVT::v2i64, RHS);
60183 return DAG.getNode(N->getOpcode(), dl, MVT::v2i64, LHS, RHS);
60184 }
60185
60186 return SDValue();
60187}
60188
60189// Simplify VPMADDUBSW/VPMADDWD operations.
60190static SDValue combineVPMADD(SDNode *N, SelectionDAG &DAG,
60191 TargetLowering::DAGCombinerInfo &DCI) {
60192 MVT VT = N->getSimpleValueType(0);
60193 SDValue LHS = N->getOperand(0);
60194 SDValue RHS = N->getOperand(1);
60195 unsigned Opc = N->getOpcode();
60196 bool IsPMADDWD = Opc == X86ISD::VPMADDWD;
60198 "Unexpected PMADD opcode");
60199
60200 // Multiply by zero.
60201 // Don't return LHS/RHS as it may contain UNDEFs.
60202 if (ISD::isBuildVectorAllZeros(LHS.getNode()) ||
60203 ISD::isBuildVectorAllZeros(RHS.getNode()))
60204 return DAG.getConstant(0, SDLoc(N), VT);
60205
60206 // Constant folding.
60207 APInt LHSUndefs, RHSUndefs;
60208 SmallVector<APInt> LHSBits, RHSBits;
60209 unsigned SrcEltBits = LHS.getScalarValueSizeInBits();
60210 unsigned DstEltBits = VT.getScalarSizeInBits();
60211 if (getTargetConstantBitsFromNode(LHS, SrcEltBits, LHSUndefs, LHSBits) &&
60212 getTargetConstantBitsFromNode(RHS, SrcEltBits, RHSUndefs, RHSBits)) {
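// Each result element folds a pair of adjacent source elements: extend
// (signed for PMADDWD; unsigned LHS, signed RHS for PMADDUBSW), multiply,
// then add the two products (saturating for PMADDUBSW).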
60213 SmallVector<APInt> Result;
60214 for (unsigned I = 0, E = LHSBits.size(); I != E; I += 2) {
60215 APInt LHSLo = LHSBits[I + 0], LHSHi = LHSBits[I + 1];
60216 APInt RHSLo = RHSBits[I + 0], RHSHi = RHSBits[I + 1];
60217 LHSLo = IsPMADDWD ? LHSLo.sext(DstEltBits) : LHSLo.zext(DstEltBits);
60218 LHSHi = IsPMADDWD ? LHSHi.sext(DstEltBits) : LHSHi.zext(DstEltBits);
60219 APInt Lo = LHSLo * RHSLo.sext(DstEltBits);
60220 APInt Hi = LHSHi * RHSHi.sext(DstEltBits);
60221 APInt Res = IsPMADDWD ? (Lo + Hi) : Lo.sadd_sat(Hi);
60222 Result.push_back(Res);
60223 }
60224 return getConstVector(Result, VT, DAG, SDLoc(N));
60225 }
60226
60227 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
60228 APInt DemandedElts = APInt::getAllOnes(VT.getVectorNumElements());
60229 if (TLI.SimplifyDemandedVectorElts(SDValue(N, 0), DemandedElts, DCI))
60230 return SDValue(N, 0);
60231
60232 return SDValue();
60233}
60234
60235// Simplify VPMADD52L/VPMADD52H operations.
60236static SDValue combineVPMADD52LH(SDNode *N, SelectionDAG &DAG,
60237 TargetLowering::DAGCombinerInfo &DCI) {
60238 MVT VT = N->getSimpleValueType(0);
60239
60240 bool AddLow = N->getOpcode() == X86ISD::VPMADD52L;
60241 SDValue Op0 = N->getOperand(0);
60242 SDValue Op1 = N->getOperand(1);
60243 SDValue Op2 = N->getOperand(2);
60244 SDLoc DL(N);
60245
60246 APInt C0, C1;
60247 bool HasC0 = X86::isConstantSplat(Op0, C0),
60248 HasC1 = X86::isConstantSplat(Op1, C1);
60249
60250 // lo/hi(C * X) + Z --> lo/hi(X * C) + Z
60251 if (HasC0 && !HasC1)
60252 return DAG.getNode(N->getOpcode(), DL, VT, Op1, Op0, Op2);
60253
60254 // lo(X * 1) + Z --> lo(X) + Z iff X == lo(X)
60255 if (AddLow && HasC1 && C1.trunc(52).isOne()) {
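// At least 12 leading zero bits means Op0 already fits in 52 bits, so the
// low 52 bits of Op0 * 1 are just Op0 and the multiply can be dropped.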
60256 KnownBits KnownOp0 = DAG.computeKnownBits(Op0);
60257 if (KnownOp0.countMinLeadingZeros() >= 12)
60258 return DAG.getNode(ISD::ADD, DL, VT, Op0, Op2);
60259 }
60260
60261 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
60262 unsigned NumEltBits = VT.getScalarSizeInBits();
60263 if (TLI.SimplifyDemandedBits(SDValue(N, 0), APInt::getAllOnes(NumEltBits),
60264 DCI))
60265 return SDValue(N, 0);
60266
60267 return SDValue();
60268}
60269
60270static SDValue combineEXTEND_VECTOR_INREG(SDNode *N, SelectionDAG &DAG,
60271 TargetLowering::DAGCombinerInfo &DCI,
60272 const X86Subtarget &Subtarget) {
60273 EVT VT = N->getValueType(0);
60274 SDValue In = N->getOperand(0);
60275 unsigned Opcode = N->getOpcode();
60276 unsigned InOpcode = In.getOpcode();
60277 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
60278 SDLoc DL(N);
60279
60280 // Try to merge vector loads and extend_inreg to an extload.
60281 if (!DCI.isBeforeLegalizeOps() && ISD::isNormalLoad(In.getNode()) &&
60282 In.hasOneUse()) {
60283 auto *Ld = cast<LoadSDNode>(In);
60284 if (Ld->isSimple()) {
60285 MVT SVT = In.getSimpleValueType().getVectorElementType();
60286 ISD::LoadExtType Ext = Opcode == ISD::SIGN_EXTEND_VECTOR_INREG
60287 ? ISD::SEXTLOAD
60288 : ISD::ZEXTLOAD;
60289 EVT MemVT = VT.changeVectorElementType(SVT);
60290 if (TLI.isLoadExtLegal(Ext, VT, MemVT)) {
60291 SDValue Load = DAG.getExtLoad(
60292 Ext, DL, VT, Ld->getChain(), Ld->getBasePtr(), Ld->getPointerInfo(),
60293 MemVT, Ld->getBaseAlign(), Ld->getMemOperand()->getFlags());
60294 DAG.ReplaceAllUsesOfValueWith(SDValue(Ld, 1), Load.getValue(1));
60295 return Load;
60296 }
60297 }
60298 }
60299
60300 // Fold EXTEND_VECTOR_INREG(EXTEND_VECTOR_INREG(X)) -> EXTEND_VECTOR_INREG(X).
60301 if (Opcode == InOpcode)
60302 return DAG.getNode(Opcode, DL, VT, In.getOperand(0));
60303
60304 // Fold EXTEND_VECTOR_INREG(EXTRACT_SUBVECTOR(EXTEND(X),0))
60305 // -> EXTEND_VECTOR_INREG(X).
60306 // TODO: Handle non-zero subvector indices.
60307 if (InOpcode == ISD::EXTRACT_SUBVECTOR && In.getConstantOperandVal(1) == 0 &&
60308 In.getOperand(0).getOpcode() == DAG.getOpcode_EXTEND(Opcode) &&
60309 In.getOperand(0).getOperand(0).getValueSizeInBits() ==
60310 In.getValueSizeInBits())
60311 return DAG.getNode(Opcode, DL, VT, In.getOperand(0).getOperand(0));
60312
60313 // Fold EXTEND_VECTOR_INREG(BUILD_VECTOR(X,Y,?,?)) -> BUILD_VECTOR(X,0,Y,0).
60314 // TODO: Move to DAGCombine?
60315 if (!DCI.isBeforeLegalizeOps() && Opcode == ISD::ZERO_EXTEND_VECTOR_INREG &&
60316 In.getOpcode() == ISD::BUILD_VECTOR && In.hasOneUse() &&
60317 In.getValueSizeInBits() == VT.getSizeInBits()) {
60318 unsigned NumElts = VT.getVectorNumElements();
60319 unsigned Scale = VT.getScalarSizeInBits() / In.getScalarValueSizeInBits();
60320 EVT EltVT = In.getOperand(0).getValueType();
60321 SmallVector<SDValue> Elts(Scale * NumElts, DAG.getConstant(0, DL, EltVT));
60322 for (unsigned I = 0; I != NumElts; ++I)
60323 Elts[I * Scale] = In.getOperand(I);
60324 return DAG.getBitcast(VT, DAG.getBuildVector(In.getValueType(), DL, Elts));
60325 }
60326
60327 // Attempt to combine as a shuffle on SSE41+ targets.
60328 if (Subtarget.hasSSE41()) {
60329 SDValue Op(N, 0);
60330 if (TLI.isTypeLegal(VT) && TLI.isTypeLegal(In.getValueType()))
60331 if (SDValue Res = combineX86ShufflesRecursively(Op, DAG, Subtarget))
60332 return Res;
60333 }
60334
60335 return SDValue();
60336}
60337
60338static SDValue combineKSHIFT(SDNode *N, SelectionDAG &DAG,
60339 TargetLowering::DAGCombinerInfo &DCI) {
60340 EVT VT = N->getValueType(0);
60341 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
60342 if (ISD::isBuildVectorAllZeros(N->getOperand(0).getNode()))
60343 return DAG.getConstant(0, SDLoc(N), VT);
60344
60345 // Fold kshiftr(extract_subvector(X,C1),C2)
60346 // --> extract_subvector(kshiftr(X,C1+C2),0)
60347 // Fold kshiftr(kshiftr(X,C1),C2) --> kshiftr(X,C1+C2)
60348 if (N->getOpcode() == X86ISD::KSHIFTR) {
60349 SDLoc DL(N);
60350 if (N->getOperand(0).getOpcode() == ISD::EXTRACT_SUBVECTOR ||
60351 N->getOperand(0).getOpcode() == X86ISD::KSHIFTR) {
60352 SDValue Src = N->getOperand(0).getOperand(0);
60353 uint64_t Amt = N->getConstantOperandVal(1) +
60354 N->getOperand(0).getConstantOperandVal(1);
60355 EVT SrcVT = Src.getValueType();
60356 if (TLI.isTypeLegal(SrcVT) && Amt < SrcVT.getVectorNumElements()) {
60357 SDValue Shift = DAG.getNode(X86ISD::KSHIFTR, DL, SrcVT, Src,
60358 DAG.getTargetConstant(Amt, DL, MVT::i8));
60359 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Shift,
60360 DAG.getVectorIdxConstant(0, DL));
60361 }
60362 }
60363 }
60364
60365 APInt DemandedElts = APInt::getAllOnes(VT.getVectorNumElements());
60366 if (TLI.SimplifyDemandedVectorElts(SDValue(N, 0), DemandedElts, DCI))
60367 return SDValue(N, 0);
60368
60369 return SDValue();
60370}
60371
60372// Optimize (fp16_to_fp (fp_to_fp16 X)) to VCVTPS2PH followed by VCVTPH2PS.
60373 // Done as a combine because the lowerings for fp16_to_fp and fp_to_fp16
60374 // produce extra instructions between the conversions by going to scalar and back.
60375static SDValue combineFP16_TO_FP(SDNode *N, SelectionDAG &DAG,
60376 const X86Subtarget &Subtarget) {
60377 if (Subtarget.useSoftFloat() || !Subtarget.hasF16C())
60378 return SDValue();
60379
60380 if (N->getOperand(0).getOpcode() != ISD::FP_TO_FP16)
60381 return SDValue();
60382
60383 if (N->getValueType(0) != MVT::f32 ||
60384 N->getOperand(0).getOperand(0).getValueType() != MVT::f32)
60385 return SDValue();
60386
60387 SDLoc dl(N);
60388 SDValue Res = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4f32,
60389 N->getOperand(0).getOperand(0));
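// The CVTPS2PH rounding immediate 4 selects the current MXCSR rounding mode.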
60390 Res = DAG.getNode(X86ISD::CVTPS2PH, dl, MVT::v8i16, Res,
60391 DAG.getTargetConstant(4, dl, MVT::i32));
60392 Res = DAG.getNode(X86ISD::CVTPH2PS, dl, MVT::v4f32, Res);
60393 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f32, Res,
60394 DAG.getVectorIdxConstant(0, dl));
60395}
60396
60397static SDValue combineFP_EXTEND(SDNode *N, SelectionDAG &DAG,
60398 TargetLowering::DAGCombinerInfo &DCI,
60399 const X86Subtarget &Subtarget) {
60400 EVT VT = N->getValueType(0);
60401 bool IsStrict = N->isStrictFPOpcode();
60402 SDValue Src = N->getOperand(IsStrict ? 1 : 0);
60403 EVT SrcVT = Src.getValueType();
60404
60405 SDLoc dl(N);
60406 if (SrcVT.getScalarType() == MVT::bf16) {
60407 if (DCI.isAfterLegalizeDAG() && Src.getOpcode() == ISD::FP_ROUND &&
60408 !IsStrict && Src.getOperand(0).getValueType() == VT)
60409 return Src.getOperand(0);
60410
60411 if (!SrcVT.isVector())
60412 return SDValue();
60413
60414 assert(!IsStrict && "Strict FP doesn't support BF16");
60415 if (VT.getVectorElementType() == MVT::f64) {
60416 EVT TmpVT = VT.changeVectorElementType(MVT::f32);
60417 return DAG.getNode(ISD::FP_EXTEND, dl, VT,
60418 DAG.getNode(ISD::FP_EXTEND, dl, TmpVT, Src));
60419 }
60420 assert(VT.getVectorElementType() == MVT::f32 && "Unexpected fpext");
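// A bf16 value is the upper 16 bits of the equivalent f32, so extend by
// placing the bits into the top half of a 32-bit integer lane.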
60421 EVT NVT = SrcVT.changeVectorElementType(MVT::i32);
60422 Src = DAG.getBitcast(SrcVT.changeTypeToInteger(), Src);
60423 Src = DAG.getNode(ISD::ZERO_EXTEND, dl, NVT, Src);
60424 Src = DAG.getNode(ISD::SHL, dl, NVT, Src, DAG.getConstant(16, dl, NVT));
60425 return DAG.getBitcast(VT, Src);
60426 }
60427
60428 if (!Subtarget.hasF16C() || Subtarget.useSoftFloat())
60429 return SDValue();
60430
60431 if (Subtarget.hasFP16())
60432 return SDValue();
60433
60434 if (!SrcVT.isVector() || SrcVT.getVectorElementType() != MVT::f16)
60435 return SDValue();
60436
60437 if (VT.getVectorElementType() != MVT::f32 &&
60438 VT.getVectorElementType() != MVT::f64)
60439 return SDValue();
60440
60441 unsigned NumElts = VT.getVectorNumElements();
60442 if (NumElts == 1 || !isPowerOf2_32(NumElts))
60443 return SDValue();
60444
60445 // Convert the input to vXi16.
60446 EVT IntVT = SrcVT.changeVectorElementTypeToInteger();
60447 Src = DAG.getBitcast(IntVT, Src);
60448
60449 // Widen to at least 8 input elements.
60450 if (NumElts < 8) {
60451 unsigned NumConcats = 8 / NumElts;
60452 SDValue Fill = NumElts == 4 ? DAG.getUNDEF(IntVT)
60453 : DAG.getConstant(0, dl, IntVT);
60454 SmallVector<SDValue, 4> Ops(NumConcats, Fill);
60455 Ops[0] = Src;
60456 Src = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v8i16, Ops);
60457 }
60458
60459 // Destination is vXf32 with at least 4 elements.
60460 EVT CvtVT = EVT::getVectorVT(*DAG.getContext(), MVT::f32,
60461 std::max(4U, NumElts));
60462 SDValue Cvt, Chain;
60463 if (IsStrict) {
60464 Cvt = DAG.getNode(X86ISD::STRICT_CVTPH2PS, dl, {CvtVT, MVT::Other},
60465 {N->getOperand(0), Src});
60466 Chain = Cvt.getValue(1);
60467 } else {
60468 Cvt = DAG.getNode(X86ISD::CVTPH2PS, dl, CvtVT, Src);
60469 }
60470
60471 if (NumElts < 4) {
60472 assert(NumElts == 2 && "Unexpected size");
60473 Cvt = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v2f32, Cvt,
60474 DAG.getVectorIdxConstant(0, dl));
60475 }
60476
60477 if (IsStrict) {
60478 // Extend to the original VT if necessary.
60479 if (Cvt.getValueType() != VT) {
60480 Cvt = DAG.getNode(ISD::STRICT_FP_EXTEND, dl, {VT, MVT::Other},
60481 {Chain, Cvt});
60482 Chain = Cvt.getValue(1);
60483 }
60484 return DAG.getMergeValues({Cvt, Chain}, dl);
60485 }
60486
60487 // Extend to the original VT if necessary.
60488 return DAG.getNode(ISD::FP_EXTEND, dl, VT, Cvt);
60489}
60490
60491// Try to find a larger VBROADCAST_LOAD/SUBV_BROADCAST_LOAD that we can extract.
60492static SDValue combineBROADCAST_LOAD(SDNode *N, SelectionDAG &DAG,
60493 TargetLowering::DAGCombinerInfo &DCI) {
60494 assert((N->getOpcode() == X86ISD::VBROADCAST_LOAD ||
60495 N->getOpcode() == X86ISD::SUBV_BROADCAST_LOAD) &&
60496 "Unknown broadcast load type");
60497
60498 auto *MemIntrin = cast<MemIntrinsicSDNode>(N);
60499 SDValue Ptr = MemIntrin->getBasePtr();
60500 SDValue Chain = MemIntrin->getChain();
60501 EVT VT = N->getSimpleValueType(0);
60502 EVT MemVT = MemIntrin->getMemoryVT();
60503
60504 // Look at other users of our base pointer and try to find a wider broadcast.
60505 // The input chain and the size of the memory VT must match.
60506 for (SDNode *User : Ptr->users())
60507 if (User != N && User->getOpcode() == N->getOpcode() &&
60508 cast<MemIntrinsicSDNode>(User)->getBasePtr() == Ptr &&
60509 cast<MemIntrinsicSDNode>(User)->getChain() == Chain &&
60510 cast<MemIntrinsicSDNode>(User)->getMemoryVT().getSizeInBits() ==
60511 MemVT.getSizeInBits() &&
60512 User->getValueSizeInBits(0).getFixedValue() > VT.getFixedSizeInBits()) {
60514 MemIntrin->isSimple() && "Illegal broadcast load type");
60516 SDValue Extract = extractSubVector(SDValue(User, 0), 0, DAG, SDLoc(N),
60517 VT.getSizeInBits());
60518 Extract = DAG.getBitcast(VT, Extract);
60519 Extract = DCI.CombineTo(N, Extract, SDValue(User, 1));
60520 return Extract;
60521 }
60522
60523 return SDValue();
60524}
60525
60526static SDValue combineFP_ROUND(SDNode *N, SelectionDAG &DAG,
60527 const X86Subtarget &Subtarget) {
60528 if (!Subtarget.hasF16C() || Subtarget.useSoftFloat())
60529 return SDValue();
60530
60531 bool IsStrict = N->isStrictFPOpcode();
60532 EVT VT = N->getValueType(0);
60533 SDValue Src = N->getOperand(IsStrict ? 1 : 0);
60534 EVT SrcVT = Src.getValueType();
60535
60536 if (!VT.isVector() || VT.getVectorElementType() != MVT::f16 ||
60537 SrcVT.getVectorElementType() != MVT::f32)
60538 return SDValue();
60539
60540 SDLoc dl(N);
60541
60542 SDValue Cvt, Chain;
60543 unsigned NumElts = VT.getVectorNumElements();
60544 if (Subtarget.hasFP16()) {
60545 // Combine (v8f16 fp_round(concat_vectors(v4f32 (xint_to_fp v4i64),
60546 // v4f32 (xint_to_fp v4i64))))
60547 // into (v8f16 vector_shuffle(v8f16 (CVTXI2P v4i64),
60548 // v8f16 (CVTXI2P v4i64)))
60549 if (NumElts == 8 && Src.getOpcode() == ISD::CONCAT_VECTORS &&
60550 Src.getNumOperands() == 2) {
60551 SDValue Cvt0, Cvt1;
60552 SDValue Op0 = Src.getOperand(0);
60553 SDValue Op1 = Src.getOperand(1);
60554 bool IsOp0Strict = Op0->isStrictFPOpcode();
60555 if (Op0.getOpcode() != Op1.getOpcode() ||
60556 Op0.getOperand(IsOp0Strict ? 1 : 0).getValueType() != MVT::v4i64 ||
60557 Op1.getOperand(IsOp0Strict ? 1 : 0).getValueType() != MVT::v4i64) {
60558 return SDValue();
60559 }
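// Each CVTXI2P of a v4i64 yields four f16 results in the low half of a
// v8f16, so blend the two low halves together.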
60560 int Mask[8] = {0, 1, 2, 3, 8, 9, 10, 11};
60561 if (IsStrict) {
60562 assert(IsOp0Strict && "Op0 must be strict node");
60563 unsigned Opc = Op0.getOpcode() == ISD::STRICT_SINT_TO_FP
60564 ? X86ISD::STRICT_CVTSI2P
60565 : X86ISD::STRICT_CVTUI2P;
60566 Cvt0 = DAG.getNode(Opc, dl, {MVT::v8f16, MVT::Other},
60567 {Op0.getOperand(0), Op0.getOperand(1)});
60568 Cvt1 = DAG.getNode(Opc, dl, {MVT::v8f16, MVT::Other},
60569 {Op1.getOperand(0), Op1.getOperand(1)});
60570 Cvt = DAG.getVectorShuffle(MVT::v8f16, dl, Cvt0, Cvt1, Mask);
60571 return DAG.getMergeValues({Cvt, Cvt0.getValue(1)}, dl);
60572 }
60573 unsigned Opc = Op0.getOpcode() == ISD::SINT_TO_FP ? X86ISD::CVTSI2P
60574 : X86ISD::CVTUI2P;
60575 Cvt0 = DAG.getNode(Opc, dl, MVT::v8f16, Op0.getOperand(0));
60576 Cvt1 = DAG.getNode(Opc, dl, MVT::v8f16, Op1.getOperand(0));
60577 return Cvt = DAG.getVectorShuffle(MVT::v8f16, dl, Cvt0, Cvt1, Mask);
60578 }
60579 return SDValue();
60580 }
60581
60582 if (NumElts == 1 || !isPowerOf2_32(NumElts))
60583 return SDValue();
60584
60585 // Widen to at least 4 input elements.
60586 if (NumElts < 4)
60587 Src = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f32, Src,
60588 DAG.getConstantFP(0.0, dl, SrcVT));
60589
60590 // Destination is v8i16 with at least 8 elements.
60591 EVT CvtVT =
60592 EVT::getVectorVT(*DAG.getContext(), MVT::i16, std::max(8U, NumElts));
60593 SDValue Rnd = DAG.getTargetConstant(4, dl, MVT::i32);
60594 if (IsStrict) {
60595 Cvt = DAG.getNode(X86ISD::STRICT_CVTPS2PH, dl, {CvtVT, MVT::Other},
60596 {N->getOperand(0), Src, Rnd});
60597 Chain = Cvt.getValue(1);
60598 } else {
60599 Cvt = DAG.getNode(X86ISD::CVTPS2PH, dl, CvtVT, Src, Rnd);
60600 }
60601
60602 // Extract down to real number of elements.
60603 if (NumElts < 8) {
60604 EVT IntVT = EVT::getVectorVT(*DAG.getContext(), MVT::i16, NumElts);
60605 Cvt = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, IntVT, Cvt,
60606 DAG.getVectorIdxConstant(0, dl));
60607 }
60608
60609 Cvt = DAG.getBitcast(VT, Cvt);
60610
60611 if (IsStrict)
60612 return DAG.getMergeValues({Cvt, Chain}, dl);
60613
60614 return Cvt;
60615}
60616
60617static SDValue combineMOVDQ2Q(SDNode *N, SelectionDAG &DAG) {
60618 SDValue Src = N->getOperand(0);
60619
60620 // Turn MOVDQ2Q+simple_load into an mmx load.
60621 if (ISD::isNormalLoad(Src.getNode()) && Src.hasOneUse()) {
60622 LoadSDNode *LN = cast<LoadSDNode>(Src.getNode());
60623
60624 if (LN->isSimple()) {
60625 SDValue NewLd =
60626 DAG.getLoad(MVT::x86mmx, SDLoc(N), LN->getChain(), LN->getBasePtr(),
60627 LN->getPointerInfo(), LN->getBaseAlign(),
60628 LN->getMemOperand()->getFlags());
60629 DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), NewLd.getValue(1));
60630 return NewLd;
60631 }
60632 }
60633
60634 return SDValue();
60635}
60636
60637static SDValue combinePDEP(SDNode *N, SelectionDAG &DAG,
60638 TargetLowering::DAGCombinerInfo &DCI) {
60639 unsigned NumBits = N->getSimpleValueType(0).getSizeInBits();
60640 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
60641 if (TLI.SimplifyDemandedBits(SDValue(N, 0), APInt::getAllOnes(NumBits), DCI))
60642 return SDValue(N, 0);
60643
60644 return SDValue();
60645}
60646
60647// Fixup the MMX intrinsics' types: in IR they are expressed with <1 x i64>,
60648// and so SelectionDAGBuilder creates them with v1i64 types, but they need to
60649// use x86mmx instead.
60650static SDValue FixupMMXIntrinsicTypes(SDNode *N, SelectionDAG &DAG) {
60651 SDLoc dl(N);
60652
60653 bool MadeChange = false, CastReturnVal = false;
60654 SmallVector<SDValue, 8> Args;
60655 for (const SDValue &Arg : N->op_values()) {
60656 if (Arg.getValueType() == MVT::v1i64) {
60657 MadeChange = true;
60658 Args.push_back(DAG.getBitcast(MVT::x86mmx, Arg));
60659 } else
60660 Args.push_back(Arg);
60661 }
60662 SDVTList VTs = N->getVTList();
60663 SDVTList NewVTs = VTs;
60664 if (VTs.NumVTs > 0 && VTs.VTs[0] == MVT::v1i64) {
60665 SmallVector<EVT> NewVTArr(ArrayRef<EVT>(VTs.VTs, VTs.NumVTs));
60666 NewVTArr[0] = MVT::x86mmx;
60667 NewVTs = DAG.getVTList(NewVTArr);
60668 MadeChange = true;
60669 CastReturnVal = true;
60670 }
60671
60672 if (MadeChange) {
60673 SDValue Result = DAG.getNode(N->getOpcode(), dl, NewVTs, Args);
60674 if (CastReturnVal) {
60675 SmallVector<SDValue, 2> Returns;
60676 for (unsigned i = 0, e = Result->getNumValues(); i != e; ++i)
60677 Returns.push_back(Result.getValue(i));
60678 Returns[0] = DAG.getBitcast(MVT::v1i64, Returns[0]);
60679 return DAG.getMergeValues(Returns, dl);
60680 }
60681 return Result;
60682 }
60683 return SDValue();
60684}
60685static SDValue combineINTRINSIC_WO_CHAIN(SDNode *N, SelectionDAG &DAG,
60686 TargetLowering::DAGCombinerInfo &DCI) {
60687 if (!DCI.isBeforeLegalize())
60688 return SDValue();
60689
60690 unsigned IntNo = N->getConstantOperandVal(0);
60691 const IntrinsicData *IntrData = getIntrinsicWithoutChain(IntNo);
60692
60693 if (IntrData && IntrData->Type == INTR_TYPE_CAST_MMX)
60694 return FixupMMXIntrinsicTypes(N, DAG);
60695
60696 return SDValue();
60697}
60698
60699static SDValue combineINTRINSIC_W_CHAIN(SDNode *N, SelectionDAG &DAG,
60700 TargetLowering::DAGCombinerInfo &DCI) {
60701 if (!DCI.isBeforeLegalize())
60702 return SDValue();
60703
60704 unsigned IntNo = N->getConstantOperandVal(1);
60705 const IntrinsicData *IntrData = getIntrinsicWithChain(IntNo);
60706
60707 if (IntrData && IntrData->Type == INTR_TYPE_CAST_MMX)
60708 return FixupMMXIntrinsicTypes(N, DAG);
60709
60710 return SDValue();
60711}
60712
60713static SDValue combineINTRINSIC_VOID(SDNode *N, SelectionDAG &DAG,
60714 TargetLowering::DAGCombinerInfo &DCI) {
60715 if (!DCI.isBeforeLegalize())
60716 return SDValue();
60717
60718 unsigned IntNo = N->getConstantOperandVal(1);
60719 const IntrinsicData *IntrData = getIntrinsicWithChain(IntNo);
60720
60721 if (IntrData && IntrData->Type == INTR_TYPE_CAST_MMX)
60722 return FixupMMXIntrinsicTypes(N, DAG);
60723
60724 return SDValue();
60725}
60726
60727SDValue X86TargetLowering::PerformDAGCombine(SDNode *N,
60728 DAGCombinerInfo &DCI) const {
60729 SelectionDAG &DAG = DCI.DAG;
60730 switch (N->getOpcode()) {
60731 // clang-format off
60732 default: break;
60733 case ISD::SCALAR_TO_VECTOR:
60734 return combineSCALAR_TO_VECTOR(N, DAG, Subtarget);
60735 case ISD::EXTRACT_VECTOR_ELT:
60736 case X86ISD::PEXTRW:
60737 case X86ISD::PEXTRB:
60738 return combineExtractVectorElt(N, DAG, DCI, Subtarget);
60739 case ISD::CONCAT_VECTORS:
60740 return combineCONCAT_VECTORS(N, DAG, DCI, Subtarget);
60741 case ISD::INSERT_SUBVECTOR:
60742 return combineINSERT_SUBVECTOR(N, DAG, DCI, Subtarget);
60743 case ISD::EXTRACT_SUBVECTOR:
60744 return combineEXTRACT_SUBVECTOR(N, DAG, DCI, Subtarget);
60745 case ISD::VSELECT:
60746 case ISD::SELECT:
60747 case X86ISD::BLENDV: return combineSelect(N, DAG, DCI, Subtarget);
60748 case ISD::BITCAST: return combineBitcast(N, DAG, DCI, Subtarget);
60749 case X86ISD::CMOV: return combineCMov(N, DAG, DCI, Subtarget);
60750 case X86ISD::CMP: return combineCMP(N, DAG, DCI, Subtarget);
60751 case ISD::ADD: return combineAdd(N, DAG, DCI, Subtarget);
60752 case ISD::SUB: return combineSub(N, DAG, DCI, Subtarget);
60753 case X86ISD::ADD:
60754 case X86ISD::SUB: return combineX86AddSub(N, DAG, DCI, Subtarget);
60755 case X86ISD::CLOAD:
60756 case X86ISD::CSTORE: return combineX86CloadCstore(N, DAG);
60757 case X86ISD::SBB: return combineSBB(N, DAG);
60758 case X86ISD::ADC: return combineADC(N, DAG, DCI);
60759 case ISD::MUL: return combineMul(N, DAG, DCI, Subtarget);
60760 case ISD::SHL: return combineShiftLeft(N, DAG, Subtarget);
60761 case ISD::SRA: return combineShiftRightArithmetic(N, DAG, Subtarget);
60762 case ISD::SRL: return combineShiftRightLogical(N, DAG, DCI, Subtarget);
60763 case ISD::AND: return combineAnd(N, DAG, DCI, Subtarget);
60764 case ISD::OR: return combineOr(N, DAG, DCI, Subtarget);
60765 case ISD::XOR: return combineXor(N, DAG, DCI, Subtarget);
60766 case ISD::BITREVERSE: return combineBITREVERSE(N, DAG, DCI, Subtarget);
60767 case ISD::AVGCEILS:
60768 case ISD::AVGCEILU:
60769 case ISD::AVGFLOORS:
60770 case ISD::AVGFLOORU: return combineAVG(N, DAG, DCI, Subtarget);
60771 case X86ISD::BEXTR:
60772 case X86ISD::BEXTRI: return combineBEXTR(N, DAG, DCI, Subtarget);
60773 case ISD::LOAD: return combineLoad(N, DAG, DCI, Subtarget);
60774 case ISD::MLOAD: return combineMaskedLoad(N, DAG, DCI, Subtarget);
60775 case ISD::STORE: return combineStore(N, DAG, DCI, Subtarget);
60776 case ISD::MSTORE: return combineMaskedStore(N, DAG, DCI, Subtarget);
60777 case X86ISD::VEXTRACT_STORE:
60778 return combineVEXTRACT_STORE(N, DAG, DCI, Subtarget);
60779 case ISD::SINT_TO_FP:
60780 case ISD::STRICT_SINT_TO_FP:
60781 return combineSIntToFP(N, DAG, DCI, Subtarget);
60782 case ISD::UINT_TO_FP:
60783 case ISD::STRICT_UINT_TO_FP:
60784 return combineUIntToFP(N, DAG, Subtarget);
60785 case ISD::FP_TO_SINT: return combineFPToSInt(N, DAG, Subtarget);
60786 case ISD::LRINT:
60787 case ISD::LLRINT: return combineLRINT_LLRINT(N, DAG, Subtarget);
60788 case ISD::FADD:
60789 case ISD::FSUB: return combineFaddFsub(N, DAG, Subtarget);
60790 case X86ISD::VFCMULC:
60791 case X86ISD::VFMULC: return combineFMulcFCMulc(N, DAG, Subtarget);
60792 case ISD::FNEG: return combineFneg(N, DAG, DCI, Subtarget);
60793 case ISD::TRUNCATE: return combineTruncate(N, DAG, Subtarget);
60794 case X86ISD::VTRUNC: return combineVTRUNC(N, DAG, DCI);
60795 case X86ISD::ANDNP: return combineAndnp(N, DAG, DCI, Subtarget);
60796 case X86ISD::FAND: return combineFAnd(N, DAG, Subtarget);
60797 case X86ISD::FANDN: return combineFAndn(N, DAG, Subtarget);
60798 case X86ISD::FXOR:
60799 case X86ISD::FOR: return combineFOr(N, DAG, DCI, Subtarget);
60800 case X86ISD::FMIN:
60801 case X86ISD::FMAX: return combineFMinFMax(N, DAG);
60802 case ISD::FMINNUM:
60803 case ISD::FMAXNUM: return combineFMinNumFMaxNum(N, DAG, Subtarget);
60804 case X86ISD::CVTSI2P:
60805 case X86ISD::CVTUI2P: return combineX86INT_TO_FP(N, DAG, DCI);
60806 case X86ISD::CVTP2SI:
60807 case X86ISD::CVTP2UI:
60809 case X86ISD::CVTTP2SI:
60811 case X86ISD::CVTTP2UI:
60812 return combineCVTP2I_CVTTP2I(N, DAG, DCI);
60813 case X86ISD::STRICT_CVTPH2PS:
60814 case X86ISD::CVTPH2PS: return combineCVTPH2PS(N, DAG, DCI);
60815 case X86ISD::BT: return combineBT(N, DAG, DCI);
60816 case ISD::ANY_EXTEND:
60817 case ISD::ZERO_EXTEND: return combineZext(N, DAG, DCI, Subtarget);
60818 case ISD::SIGN_EXTEND: return combineSext(N, DAG, DCI, Subtarget);
60819 case ISD::SIGN_EXTEND_INREG: return combineSignExtendInReg(N, DAG, Subtarget);
60820 case ISD::ANY_EXTEND_VECTOR_INREG:
60821 case ISD::SIGN_EXTEND_VECTOR_INREG:
60822 case ISD::ZERO_EXTEND_VECTOR_INREG:
60823 return combineEXTEND_VECTOR_INREG(N, DAG, DCI, Subtarget);
60824 case ISD::SETCC: return combineSetCC(N, DAG, DCI, Subtarget);
60825 case X86ISD::SETCC: return combineX86SetCC(N, DAG, Subtarget);
60826 case X86ISD::BRCOND: return combineBrCond(N, DAG, Subtarget);
60827 case X86ISD::PACKSS:
60828 case X86ISD::PACKUS: return combineVectorPack(N, DAG, DCI, Subtarget);
60829 case X86ISD::HADD:
60830 case X86ISD::HSUB:
60831 case X86ISD::FHADD:
60832 case X86ISD::FHSUB: return combineVectorHADDSUB(N, DAG, DCI, Subtarget);
60833 case X86ISD::VSHL:
60834 case X86ISD::VSRA:
60835 case X86ISD::VSRL:
60836 return combineVectorShiftVar(N, DAG, DCI, Subtarget);
60837 case X86ISD::VSHLI:
60838 case X86ISD::VSRAI:
60839 case X86ISD::VSRLI:
60840 return combineVectorShiftImm(N, DAG, DCI, Subtarget);
60841 case ISD::INSERT_VECTOR_ELT:
60842 case X86ISD::PINSRB:
60843 case X86ISD::PINSRW: return combineVectorInsert(N, DAG, DCI, Subtarget);
60844 case X86ISD::SHUFP: // Handle all target specific shuffles
60845 case X86ISD::INSERTPS:
60846 case X86ISD::EXTRQI:
60847 case X86ISD::INSERTQI:
60848 case X86ISD::VALIGN:
60849 case X86ISD::PALIGNR:
60850 case X86ISD::VSHLDQ:
60851 case X86ISD::VSRLDQ:
60852 case X86ISD::BLENDI:
60853 case X86ISD::UNPCKH:
60854 case X86ISD::UNPCKL:
60855 case X86ISD::MOVHLPS:
60856 case X86ISD::MOVLHPS:
60857 case X86ISD::PSHUFB:
60858 case X86ISD::PSHUFD:
60859 case X86ISD::PSHUFHW:
60860 case X86ISD::PSHUFLW:
60861 case X86ISD::MOVSHDUP:
60862 case X86ISD::MOVSLDUP:
60863 case X86ISD::MOVDDUP:
60864 case X86ISD::MOVSS:
60865 case X86ISD::MOVSD:
60866 case X86ISD::MOVSH:
60867 case X86ISD::VBROADCAST:
60868 case X86ISD::VPPERM:
60869 case X86ISD::VPERMI:
60870 case X86ISD::VPERMV:
60871 case X86ISD::VPERMV3:
60872 case X86ISD::VPERMIL2:
60873 case X86ISD::VPERMILPI:
60874 case X86ISD::VPERMILPV:
60875 case X86ISD::VPERM2X128:
60876 case X86ISD::SHUF128:
60877 case X86ISD::VZEXT_MOVL:
60878 case ISD::VECTOR_SHUFFLE: return combineShuffle(N, DAG, DCI,Subtarget);
60879 case X86ISD::FMADD_RND:
60880 case X86ISD::FMSUB:
60881 case X86ISD::STRICT_FMSUB:
60882 case X86ISD::FMSUB_RND:
60883 case X86ISD::FNMADD:
60884 case X86ISD::STRICT_FNMADD:
60885 case X86ISD::FNMADD_RND:
60886 case X86ISD::FNMSUB:
60887 case X86ISD::STRICT_FNMSUB:
60888 case X86ISD::FNMSUB_RND:
60889 case ISD::FMA:
60890 case ISD::STRICT_FMA: return combineFMA(N, DAG, DCI, Subtarget);
60891 case X86ISD::FMADDSUB_RND:
60892 case X86ISD::FMSUBADD_RND:
60893 case X86ISD::FMADDSUB:
60894 case X86ISD::FMSUBADD: return combineFMADDSUB(N, DAG, DCI);
60895 case X86ISD::MOVMSK: return combineMOVMSK(N, DAG, DCI, Subtarget);
60896 case X86ISD::TESTP: return combineTESTP(N, DAG, DCI, Subtarget);
60897 case X86ISD::MGATHER:
60898 case X86ISD::MSCATTER: return combineX86GatherScatter(N, DAG, DCI);
60899 case ISD::MGATHER:
60900 case ISD::MSCATTER: return combineGatherScatter(N, DAG, DCI);
60901 case X86ISD::PCMPEQ:
60902 case X86ISD::PCMPGT: return combineVectorCompare(N, DAG, Subtarget);
60903 case X86ISD::PMULDQ:
60904 case X86ISD::PMULUDQ: return combinePMULDQ(N, DAG, DCI, Subtarget);
60905 case X86ISD::VPMADDUBSW:
60906 case X86ISD::VPMADDWD: return combineVPMADD(N, DAG, DCI);
60907 case X86ISD::VPMADD52L:
60908 case X86ISD::VPMADD52H: return combineVPMADD52LH(N, DAG, DCI);
60909 case X86ISD::KSHIFTL:
60910 case X86ISD::KSHIFTR: return combineKSHIFT(N, DAG, DCI);
60911 case ISD::FP16_TO_FP: return combineFP16_TO_FP(N, DAG, Subtarget);
60912 case ISD::STRICT_FP_EXTEND:
60913 case ISD::FP_EXTEND: return combineFP_EXTEND(N, DAG, DCI, Subtarget);
60914 case ISD::STRICT_FP_ROUND:
60915 case ISD::FP_ROUND: return combineFP_ROUND(N, DAG, Subtarget);
60916 case X86ISD::VBROADCAST_LOAD:
60917 case X86ISD::SUBV_BROADCAST_LOAD: return combineBROADCAST_LOAD(N, DAG, DCI);
60918 case X86ISD::MOVDQ2Q: return combineMOVDQ2Q(N, DAG);
60919 case X86ISD::PDEP: return combinePDEP(N, DAG, DCI);
60920 case ISD::INTRINSIC_WO_CHAIN: return combineINTRINSIC_WO_CHAIN(N, DAG, DCI);
60921 case ISD::INTRINSIC_W_CHAIN: return combineINTRINSIC_W_CHAIN(N, DAG, DCI);
60922 case ISD::INTRINSIC_VOID: return combineINTRINSIC_VOID(N, DAG, DCI);
60923 case ISD::FP_TO_SINT_SAT:
60924 case ISD::FP_TO_UINT_SAT: return combineFP_TO_xINT_SAT(N, DAG, Subtarget);
60925 // clang-format on
60926 }
60927
60928 return SDValue();
60929}
60930
60932 return Subtarget.canUseCMOV() && (VT == MVT::i32 || VT == MVT::i64);
60933}
60934
60935// Prefer (non-AVX512) vector TRUNCATE(SIGN_EXTEND_INREG(X)) to use of PACKSS.
60936bool X86TargetLowering::preferSextInRegOfTruncate(EVT TruncVT, EVT VT,
60937 EVT ExtVT) const {
60938 return Subtarget.hasAVX512() || !VT.isVector();
60939}
60940
60941bool X86TargetLowering::isTypeDesirableForOp(unsigned Opc, EVT VT) const {
60942 if (!isTypeLegal(VT))
60943 return false;
60944
60945 // There are no vXi8 shifts.
60946 if (Opc == ISD::SHL && VT.isVector() && VT.getVectorElementType() == MVT::i8)
60947 return false;
60948
60949 // TODO: Almost no 8-bit ops are desirable because they have no actual
60950 // size/speed advantages vs. 32-bit ops, but they do have a major
60951 // potential disadvantage by causing partial register stalls.
60952 //
60953 // 8-bit multiply/shl is probably not cheaper than 32-bit multiply/shl, and
60954 // we have specializations to turn 32-bit multiply/shl into LEA or other ops.
60955 // Also, see the comment in "IsDesirableToPromoteOp" - where we additionally
60956 // check for a constant operand to the multiply.
60957 if ((Opc == ISD::MUL || Opc == ISD::SHL) && VT == MVT::i8)
60958 return false;
60959
60960 // i16 instruction encodings are longer and some i16 instructions are slow,
60961 // so those are not desirable.
60962 if (VT == MVT::i16) {
60963 switch (Opc) {
60964 default:
60965 break;
60966 case ISD::LOAD:
60967 case ISD::SIGN_EXTEND:
60968 case ISD::ZERO_EXTEND:
60969 case ISD::ANY_EXTEND:
60970 case ISD::MUL:
60971 return false;
60972 case ISD::SHL:
60973 case ISD::SRA:
60974 case ISD::SRL:
60975 case ISD::SUB:
60976 case ISD::ADD:
60977 case ISD::AND:
60978 case ISD::OR:
60979 case ISD::XOR:
60980 // NDD instructions never have the "partial register write" issue because the
60981 // destination register's upper bits [63:OSIZE] are zeroed even when
60982 // OSIZE=8/16.
60983 return Subtarget.hasNDD();
60984 }
60985 }
60986
60987 // Any legal type not explicitly accounted for above here is desirable.
60988 return true;
60989}
60990
60991SDValue X86TargetLowering::expandIndirectJTBranch(const SDLoc &dl,
60992 SDValue Value, SDValue Addr,
60993 int JTI,
60994 SelectionDAG &DAG) const {
60995 const Module *M = DAG.getMachineFunction().getFunction().getParent();
60996 Metadata *IsCFProtectionSupported = M->getModuleFlag("cf-protection-branch");
60997 if (IsCFProtectionSupported) {
60998 // When control-flow branch protection is enabled, we need to add a
60999 // notrack prefix to the indirect branch.
61000 // In order to do that we create an NT_BRIND SDNode.
61001 // Upon ISEL, the pattern will convert it to a jmp with a NoTrack prefix.
61002 SDValue Chain = Value;
61003 // Jump table debug info is only needed if CodeView is enabled.
61005 Chain = DAG.getJumpTableDebugInfo(JTI, Chain, dl);
61006 return DAG.getNode(X86ISD::NT_BRIND, dl, MVT::Other, Chain, Addr);
61007 }
61008
61009 return TargetLowering::expandIndirectJTBranch(dl, Value, Addr, JTI, DAG);
61010}
61011
61012TargetLowering::AndOrSETCCFoldKind
61013X86TargetLowering::isDesirableToCombineLogicOpOfSETCC(
61014 const SDNode *LogicOp, const SDNode *SETCC0, const SDNode *SETCC1) const {
61016 EVT VT = LogicOp->getValueType(0);
61017 EVT OpVT = SETCC0->getOperand(0).getValueType();
61018 if (!VT.isInteger())
61020
61021 if (VT.isVector())
61026
61027 // Don't use `NotAnd` as even though `not` is generally shorter code size than
61028 // `add`, `add` can lower to LEA which can save moves / spills. Any case where
61029 // `NotAnd` applies, `AddAnd` does as well.
61030 // TODO: Currently we lower (icmp eq/ne (and ~X, Y), 0) -> `test (not X), Y`,
61031 // if we change that to `andn Y, X` it may be worth preferring `NotAnd` here.
61033}
61034
61036 EVT VT = Op.getValueType();
61037 bool Is8BitMulByConstant = VT == MVT::i8 && Op.getOpcode() == ISD::MUL &&
61038 isa<ConstantSDNode>(Op.getOperand(1));
61039
61040 // i16 is legal, but undesirable since i16 instruction encodings are longer
61041 // and some i16 instructions are slow.
61042 // 8-bit multiply-by-constant can usually be expanded to something cheaper
61043 // using LEA and/or other ALU ops.
61044 if (VT != MVT::i16 && !Is8BitMulByConstant)
61045 return false;
61046
61047 auto IsFoldableRMW = [](SDValue Load, SDValue Op) {
61048 if (!Op.hasOneUse())
61049 return false;
61050 SDNode *User = *Op->user_begin();
61052 return false;
61053 auto *Ld = cast<LoadSDNode>(Load);
61054 auto *St = cast<StoreSDNode>(User);
61055 return Ld->getBasePtr() == St->getBasePtr();
61056 };
61057
61058 auto IsFoldableAtomicRMW = [](SDValue Load, SDValue Op) {
61059 if (!Load.hasOneUse() || Load.getOpcode() != ISD::ATOMIC_LOAD)
61060 return false;
61061 if (!Op.hasOneUse())
61062 return false;
61063 SDNode *User = *Op->user_begin();
61064 if (User->getOpcode() != ISD::ATOMIC_STORE)
61065 return false;
61066 auto *Ld = cast<AtomicSDNode>(Load);
61067 auto *St = cast<AtomicSDNode>(User);
61068 return Ld->getBasePtr() == St->getBasePtr();
61069 };
61070
61071 auto IsFoldableZext = [](SDValue Op) {
61072 if (!Op.hasOneUse())
61073 return false;
61074 SDNode *User = *Op->user_begin();
61075 EVT VT = User->getValueType(0);
61076 return (User->getOpcode() == ISD::ZERO_EXTEND &&
61077 (VT == MVT::i32 || VT == MVT::i64));
61078 };
61079
61080 bool Commute = false;
61081 switch (Op.getOpcode()) {
61082 default: return false;
61083 case ISD::SIGN_EXTEND:
61084 case ISD::ZERO_EXTEND:
61085 case ISD::ANY_EXTEND:
61086 break;
61087 case ISD::SHL:
61088 case ISD::SRA:
61089 case ISD::SRL: {
61090 SDValue N0 = Op.getOperand(0);
61091 // Look out for (store (shl (load), x)).
61092 if (X86::mayFoldLoad(N0, Subtarget) && IsFoldableRMW(N0, Op))
61093 return false;
61094 break;
61095 }
61096 case ISD::MUL:
61097 // When ZU is enabled, we prefer not to promote a MUL by a constant
61098 // when there is an opportunity to fold a zext with imulzu.
61099 if (Subtarget.hasZU() && IsFoldableZext(Op) &&
61100 (isa<ConstantSDNode>(Op.getOperand(0)) ||
61101 isa<ConstantSDNode>(Op.getOperand(1))))
61102 return false;
61103 [[fallthrough]];
61104 case ISD::ADD:
61105 case ISD::AND:
61106 case ISD::OR:
61107 case ISD::XOR:
61108 Commute = true;
61109 [[fallthrough]];
61110 case ISD::SUB: {
61111 SDValue N0 = Op.getOperand(0);
61112 SDValue N1 = Op.getOperand(1);
61113 // Avoid disabling potential load folding opportunities.
61114 if (X86::mayFoldLoad(N1, Subtarget) &&
61115 (!Commute || !isa<ConstantSDNode>(N0) ||
61116 (Op.getOpcode() != ISD::MUL && IsFoldableRMW(N1, Op))))
61117 return false;
61118 if (X86::mayFoldLoad(N0, Subtarget) &&
61119 ((Commute && !isa<ConstantSDNode>(N1)) ||
61120 (Op.getOpcode() != ISD::MUL && IsFoldableRMW(N0, Op))))
61121 return false;
61122 if (IsFoldableAtomicRMW(N0, Op) ||
61123 (Commute && IsFoldableAtomicRMW(N1, Op)))
61124 return false;
61125 }
61126 }
61127
61128 PVT = MVT::i32;
61129 return true;
61130}
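// Rough examples of the policy above (illustrative, not exhaustive): a plain
// i16 `add %x, %y` is promoted (PVT = i32); a `store (add (load p), 1), p`
// at i16 is left alone so it can fold into a single RMW `addw` on the memory
// location; and an i8 multiply by a constant is promoted so it can be
// expanded with LEA/shift tricks at 32 bits.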
61131
61132//===----------------------------------------------------------------------===//
61133// X86 Inline Assembly Support
61134//===----------------------------------------------------------------------===//
61135
61138 .Case("{@cca}", X86::COND_A)
61139 .Case("{@ccae}", X86::COND_AE)
61140 .Case("{@ccb}", X86::COND_B)
61141 .Case("{@ccbe}", X86::COND_BE)
61142 .Case("{@ccc}", X86::COND_B)
61143 .Case("{@cce}", X86::COND_E)
61144 .Case("{@ccz}", X86::COND_E)
61145 .Case("{@ccg}", X86::COND_G)
61146 .Case("{@ccge}", X86::COND_GE)
61147 .Case("{@ccl}", X86::COND_L)
61148 .Case("{@ccle}", X86::COND_LE)
61149 .Case("{@ccna}", X86::COND_BE)
61150 .Case("{@ccnae}", X86::COND_B)
61151 .Case("{@ccnb}", X86::COND_AE)
61152 .Case("{@ccnbe}", X86::COND_A)
61153 .Case("{@ccnc}", X86::COND_AE)
61154 .Case("{@ccne}", X86::COND_NE)
61155 .Case("{@ccnz}", X86::COND_NE)
61156 .Case("{@ccng}", X86::COND_LE)
61157 .Case("{@ccnge}", X86::COND_L)
61158 .Case("{@ccnl}", X86::COND_GE)
61159 .Case("{@ccnle}", X86::COND_G)
61160 .Case("{@ccno}", X86::COND_NO)
61161 .Case("{@ccnp}", X86::COND_NP)
61162 .Case("{@ccns}", X86::COND_NS)
61163 .Case("{@cco}", X86::COND_O)
61164 .Case("{@ccp}", X86::COND_P)
61165 .Case("{@ccs}", X86::COND_S)
61166 .Default(X86::COND_INVALID);
61167 return Cond;
61168}
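// These strings come from the GCC/Clang flag-output constraints, e.g.
// (variable names here are arbitrary):
//   int isEq;
//   asm("cmpl %2, %1" : "=@cce"(isEq) : "r"(a), "r"(b));
// "=@cce" parses to X86::COND_E, and the asm's output is the E flag as a
// 0/1 value.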
61169
61170/// Given a constraint letter, return the type of constraint for this target.
61173 if (Constraint.size() == 1) {
61174 switch (Constraint[0]) {
61175 case 'R':
61176 case 'q':
61177 case 'Q':
61178 case 'f':
61179 case 't':
61180 case 'u':
61181 case 'y':
61182 case 'x':
61183 case 'v':
61184 case 'l':
61185 case 'k': // AVX512 masking registers.
61186 return C_RegisterClass;
61187 case 'a':
61188 case 'b':
61189 case 'c':
61190 case 'd':
61191 case 'S':
61192 case 'D':
61193 case 'A':
61194 return C_Register;
61195 case 'I':
61196 case 'J':
61197 case 'K':
61198 case 'N':
61199 case 'G':
61200 case 'L':
61201 case 'M':
61202 return C_Immediate;
61203 case 'C':
61204 case 'e':
61205 case 'Z':
61206 return C_Other;
61207 default:
61208 break;
61209 }
61210 }
61211 else if (Constraint.size() == 2) {
61212 switch (Constraint[0]) {
61213 default:
61214 break;
61215 case 'W':
61216 if (Constraint[1] != 's')
61217 break;
61218 return C_Other;
61219 case 'Y':
61220 switch (Constraint[1]) {
61221 default:
61222 break;
61223 case 'z':
61224 return C_Register;
61225 case 'i':
61226 case 'm':
61227 case 'k':
61228 case 't':
61229 case '2':
61230 return C_RegisterClass;
61231 }
61232 break;
61233 case 'j':
61234 switch (Constraint[1]) {
61235 default:
61236 break;
61237 case 'r':
61238 case 'R':
61239 return C_RegisterClass;
61240 }
61241 }
61242 } else if (parseConstraintCode(Constraint) != X86::COND_INVALID)
61243 return C_Other;
61244 return TargetLowering::getConstraintType(Constraint);
61245}
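// A few examples of how the letters above classify (illustrative):
//   "a"  -> C_Register       (the AX/EAX/RAX register)
//   "x"  -> C_RegisterClass  (any SSE/AVX register)
//   "I"  -> C_Immediate      (an integer constant in [0,31])
//   "Ws" -> C_Other          (a symbol reference)
//   "Yz" -> C_Register       (XMM0)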
61246
61247/// Examine constraint type and operand type and determine a weight value.
61248/// This object must already have been set up with the operand type
61249/// and the current alternative constraint selected.
61252 AsmOperandInfo &Info, const char *Constraint) const {
61254 Value *CallOperandVal = Info.CallOperandVal;
61255 // If we don't have a value, we can't do a match,
61256 // but allow it at the lowest weight.
61257 if (!CallOperandVal)
61258 return CW_Default;
61259 Type *Ty = CallOperandVal->getType();
61260 // Look at the constraint type.
61261 switch (*Constraint) {
61262 default:
61264 [[fallthrough]];
61265 case 'R':
61266 case 'q':
61267 case 'Q':
61268 case 'a':
61269 case 'b':
61270 case 'c':
61271 case 'd':
61272 case 'S':
61273 case 'D':
61274 case 'A':
61275 if (CallOperandVal->getType()->isIntegerTy())
61276 Wt = CW_SpecificReg;
61277 break;
61278 case 'f':
61279 case 't':
61280 case 'u':
61281 if (Ty->isFloatingPointTy())
61282 Wt = CW_SpecificReg;
61283 break;
61284 case 'y':
61285 if (Ty->getPrimitiveSizeInBits() == 64 && Subtarget.hasMMX())
61286 Wt = CW_SpecificReg;
61287 break;
61288 case 'Y':
61289 if (StringRef(Constraint).size() != 2)
61290 break;
61291 switch (Constraint[1]) {
61292 default:
61293 return CW_Invalid;
61294 // XMM0
61295 case 'z':
61296 if (((Ty->getPrimitiveSizeInBits() == 128) && Subtarget.hasSSE1()) ||
61297 ((Ty->getPrimitiveSizeInBits() == 256) && Subtarget.hasAVX()) ||
61298 ((Ty->getPrimitiveSizeInBits() == 512) && Subtarget.hasAVX512()))
61299 return CW_SpecificReg;
61300 return CW_Invalid;
61301 // Conditional OpMask regs (AVX512)
61302 case 'k':
61303 if ((Ty->getPrimitiveSizeInBits() == 64) && Subtarget.hasAVX512())
61304 return CW_Register;
61305 return CW_Invalid;
61306 // Any MMX reg
61307 case 'm':
61308 if (Ty->getPrimitiveSizeInBits() == 64 && Subtarget.hasMMX())
61309 return CW_SpecificReg;
61310 return CW_Invalid;
61311 // Any SSE reg when ISA >= SSE2, same as 'x'
61312 case 'i':
61313 case 't':
61314 case '2':
61315 if (!Subtarget.hasSSE2())
61316 return CW_Invalid;
61317 break;
61318 }
61319 break;
61320 case 'j':
61321 if (StringRef(Constraint).size() != 2)
61322 break;
61323 switch (Constraint[1]) {
61324 default:
61325 return CW_Invalid;
61326 case 'r':
61327 case 'R':
61328 if (CallOperandVal->getType()->isIntegerTy())
61329 Wt = CW_SpecificReg;
61330 break;
61331 }
61332 break;
61333 case 'v':
61334 if ((Ty->getPrimitiveSizeInBits() == 512) && Subtarget.hasAVX512())
61335 Wt = CW_Register;
61336 [[fallthrough]];
61337 case 'x':
61338 if (((Ty->getPrimitiveSizeInBits() == 128) && Subtarget.hasSSE1()) ||
61339 ((Ty->getPrimitiveSizeInBits() == 256) && Subtarget.hasAVX()))
61340 Wt = CW_Register;
61341 break;
61342 case 'k':
61343 // Enable conditional vector operations using %k<#> registers.
61344 if ((Ty->getPrimitiveSizeInBits() == 64) && Subtarget.hasAVX512())
61345 Wt = CW_Register;
61346 break;
61347 case 'I':
61348 if (auto *C = dyn_cast<ConstantInt>(Info.CallOperandVal))
61349 if (C->getZExtValue() <= 31)
61350 Wt = CW_Constant;
61351 break;
61352 case 'J':
61353 if (auto *C = dyn_cast<ConstantInt>(CallOperandVal))
61354 if (C->getZExtValue() <= 63)
61355 Wt = CW_Constant;
61356 break;
61357 case 'K':
61358 if (auto *C = dyn_cast<ConstantInt>(CallOperandVal))
61359 if ((C->getSExtValue() >= -0x80) && (C->getSExtValue() <= 0x7f))
61360 Wt = CW_Constant;
61361 break;
61362 case 'L':
61363 if (auto *C = dyn_cast<ConstantInt>(CallOperandVal))
61364 if ((C->getZExtValue() == 0xff) || (C->getZExtValue() == 0xffff))
61365 Wt = CW_Constant;
61366 break;
61367 case 'M':
61368 if (auto *C = dyn_cast<ConstantInt>(CallOperandVal))
61369 if (C->getZExtValue() <= 3)
61370 Wt = CW_Constant;
61371 break;
61372 case 'N':
61373 if (auto *C = dyn_cast<ConstantInt>(CallOperandVal))
61374 if (C->getZExtValue() <= 0xff)
61375 Wt = CW_Constant;
61376 break;
61377 case 'G':
61378 case 'C':
61379 if (isa<ConstantFP>(CallOperandVal))
61380 Wt = CW_Constant;
61381 break;
61382 case 'e':
61383 if (auto *C = dyn_cast<ConstantInt>(CallOperandVal))
61384 if ((C->getSExtValue() >= -0x80000000LL) &&
61385 (C->getSExtValue() <= 0x7fffffffLL))
61386 Wt = CW_Constant;
61387 break;
61388 case 'Z':
61389 if (auto *C = dyn_cast<ConstantInt>(CallOperandVal))
61390 if (C->getZExtValue() <= 0xffffffff)
61391 Wt = CW_Constant;
61392 break;
61393 }
61394 return Wt;
61395}
61396
61397/// Try to replace an X constraint, which matches anything, with another that
61398/// has more specific requirements based on the type of the corresponding
61399/// operand.
61401LowerXConstraint(EVT ConstraintVT) const {
61402 // FP X constraints get lowered to SSE1/2 registers if available, otherwise
61403 // 'f' like normal targets.
61404 if (ConstraintVT.isFloatingPoint()) {
61405 if (Subtarget.hasSSE1())
61406 return "x";
61407 }
61408
61409 return TargetLowering::LowerXConstraint(ConstraintVT);
61410}
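// E.g. for `float f; asm("" : "=X"(f));` the "X" (anything) constraint is
// narrowed to "x" when SSE1 is available, so the operand lands in an XMM
// register rather than on the x87 stack.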
61411
61412// Lower @cc targets via setcc.
61414 SDValue &Chain, SDValue &Glue, const SDLoc &DL,
61415 const AsmOperandInfo &OpInfo, SelectionDAG &DAG) const {
61416 X86::CondCode Cond = parseConstraintCode(OpInfo.ConstraintCode);
61417 if (Cond == X86::COND_INVALID)
61418 return SDValue();
61419 // Check that return type is valid.
61420 if (OpInfo.ConstraintVT.isVector() || !OpInfo.ConstraintVT.isInteger() ||
61421 OpInfo.ConstraintVT.getSizeInBits() < 8)
61422 report_fatal_error("Glue output operand is of invalid type");
61423
61424 // Get EFLAGS register. Only update chain when copyfrom is glued.
61425 if (Glue.getNode()) {
61426 Glue = DAG.getCopyFromReg(Chain, DL, X86::EFLAGS, MVT::i32, Glue);
61427 Chain = Glue.getValue(1);
61428 } else
61429 Glue = DAG.getCopyFromReg(Chain, DL, X86::EFLAGS, MVT::i32);
61430 // Extract CC code.
61431 SDValue CC = getSETCC(Cond, Glue, DL, DAG);
61432 // Extend to 32-bits
61433 SDValue Result = DAG.getNode(ISD::ZERO_EXTEND, DL, OpInfo.ConstraintVT, CC);
61434
61435 return Result;
61436}
61437
61438/// Lower the specified operand into the Ops vector.
61439/// If it is invalid, don't add anything to Ops.
61441 StringRef Constraint,
61442 std::vector<SDValue> &Ops,
61443 SelectionDAG &DAG) const {
61444 SDValue Result;
61445 char ConstraintLetter = Constraint[0];
61446 switch (ConstraintLetter) {
61447 default: break;
61448 case 'I':
61449 if (auto *C = dyn_cast<ConstantSDNode>(Op)) {
61450 if (C->getZExtValue() <= 31) {
61451 Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op),
61452 Op.getValueType());
61453 break;
61454 }
61455 }
61456 return;
61457 case 'J':
61458 if (auto *C = dyn_cast<ConstantSDNode>(Op)) {
61459 if (C->getZExtValue() <= 63) {
61460 Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op),
61461 Op.getValueType());
61462 break;
61463 }
61464 }
61465 return;
61466 case 'K':
61467 if (auto *C = dyn_cast<ConstantSDNode>(Op)) {
61468 if (isInt<8>(C->getSExtValue())) {
61469 Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op),
61470 Op.getValueType());
61471 break;
61472 }
61473 }
61474 return;
61475 case 'L':
61476 if (auto *C = dyn_cast<ConstantSDNode>(Op)) {
61477 if (C->getZExtValue() == 0xff || C->getZExtValue() == 0xffff ||
61478 (Subtarget.is64Bit() && C->getZExtValue() == 0xffffffff)) {
61479 Result = DAG.getTargetConstant(C->getSExtValue(), SDLoc(Op),
61480 Op.getValueType());
61481 break;
61482 }
61483 }
61484 return;
61485 case 'M':
61486 if (auto *C = dyn_cast<ConstantSDNode>(Op)) {
61487 if (C->getZExtValue() <= 3) {
61488 Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op),
61489 Op.getValueType());
61490 break;
61491 }
61492 }
61493 return;
61494 case 'N':
61495 if (auto *C = dyn_cast<ConstantSDNode>(Op)) {
61496 if (C->getZExtValue() <= 255) {
61497 Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op),
61498 Op.getValueType());
61499 break;
61500 }
61501 }
61502 return;
61503 case 'O':
61504 if (auto *C = dyn_cast<ConstantSDNode>(Op)) {
61505 if (C->getZExtValue() <= 127) {
61506 Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op),
61507 Op.getValueType());
61508 break;
61509 }
61510 }
61511 return;
61512 case 'e': {
61513 // 32-bit signed value
61514 if (auto *C = dyn_cast<ConstantSDNode>(Op)) {
61516 C->getSExtValue())) {
61517 // Widen to 64 bits here to get it sign extended.
61518 Result = DAG.getTargetConstant(C->getSExtValue(), SDLoc(Op), MVT::i64);
61519 break;
61520 }
61521 // FIXME gcc accepts some relocatable values here too, but only in certain
61522 // memory models; it's complicated.
61523 }
61524 return;
61525 }
61526 case 'W': {
61527 assert(Constraint[1] == 's');
61528 // Op is a BlockAddressSDNode or a GlobalAddressSDNode with an optional
61529 // offset.
61530 if (const auto *BA = dyn_cast<BlockAddressSDNode>(Op)) {
61531 Ops.push_back(DAG.getTargetBlockAddress(BA->getBlockAddress(),
61532 BA->getValueType(0)));
61533 } else {
61534 int64_t Offset = 0;
61535 if (Op->getOpcode() == ISD::ADD &&
61536 isa<ConstantSDNode>(Op->getOperand(1))) {
61537 Offset = cast<ConstantSDNode>(Op->getOperand(1))->getSExtValue();
61538 Op = Op->getOperand(0);
61539 }
61540 if (const auto *GA = dyn_cast<GlobalAddressSDNode>(Op))
61541 Ops.push_back(DAG.getTargetGlobalAddress(GA->getGlobal(), SDLoc(Op),
61542 GA->getValueType(0), Offset));
61543 }
61544 return;
61545 }
61546 case 'Z': {
61547 // 32-bit unsigned value
61548 if (auto *C = dyn_cast<ConstantSDNode>(Op)) {
61550 C->getZExtValue())) {
61551 Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op),
61552 Op.getValueType());
61553 break;
61554 }
61555 }
61556 // FIXME gcc accepts some relocatable values here too, but only in certain
61557 // memory models; it's complicated.
61558 return;
61559 }
61560 case 'i': {
61561 // Literal immediates are always ok.
61562 if (auto *CST = dyn_cast<ConstantSDNode>(Op)) {
61563 bool IsBool = CST->getConstantIntValue()->getBitWidth() == 1;
61564 BooleanContent BCont = getBooleanContents(MVT::i64);
61565 ISD::NodeType ExtOpc = IsBool ? getExtendForContent(BCont)
61567 int64_t ExtVal = ExtOpc == ISD::ZERO_EXTEND ? CST->getZExtValue()
61568 : CST->getSExtValue();
61569 Result = DAG.getTargetConstant(ExtVal, SDLoc(Op), MVT::i64);
61570 break;
61571 }
61572
61573 // In any sort of PIC mode addresses need to be computed at runtime by
61574 // adding in a register or some sort of table lookup. These can't
61575 // be used as immediates. BlockAddresses and BasicBlocks are fine though.
61576 if ((Subtarget.isPICStyleGOT() || Subtarget.isPICStyleStubPIC()) &&
61578 return;
61579
61580 // If we are in non-pic codegen mode, we allow the address of a global (with
61581 // an optional displacement) to be used with 'i'.
61582 if (auto *GA = dyn_cast<GlobalAddressSDNode>(Op))
61583 // If we require an extra load to get this address, as in PIC mode, we
61584 // can't accept it.
61586 Subtarget.classifyGlobalReference(GA->getGlobal())))
61587 return;
61588 break;
61589 }
61590 }
61591
61592 if (Result.getNode()) {
61593 Ops.push_back(Result);
61594 return;
61595 }
61596 return TargetLowering::LowerAsmOperandForConstraint(Op, Constraint, Ops, DAG);
61597}
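// Typical uses of the immediate constraints handled above (sketches; the
// surrounding code is arbitrary):
//   asm("shll %1, %0" : "+r"(x) : "I"(5));            // 'I': constant in [0,31]
//   asm volatile("outb %0, %1" :: "a"(v), "N"(0x80)); // 'N': unsigned 8-bit port
//   asm("" :: "e"(-1000000LL), "Z"(0xC0000000U));     // 32-bit signed / unsigned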
61598
61599/// Check if \p RC is a general purpose register class.
61600/// I.e., GR* or one of their variant.
61601static bool isGRClass(const TargetRegisterClass &RC) {
61602 return RC.hasSuperClassEq(&X86::GR8RegClass) ||
61603 RC.hasSuperClassEq(&X86::GR16RegClass) ||
61604 RC.hasSuperClassEq(&X86::GR32RegClass) ||
61605 RC.hasSuperClassEq(&X86::GR64RegClass) ||
61606 RC.hasSuperClassEq(&X86::LOW32_ADDR_ACCESS_RBPRegClass);
61607}
61608
61609/// Check if \p RC is a vector register class.
61610/// I.e., FR* / VR* or one of their variant.
61611static bool isFRClass(const TargetRegisterClass &RC) {
61612 return RC.hasSuperClassEq(&X86::FR16XRegClass) ||
61613 RC.hasSuperClassEq(&X86::FR32XRegClass) ||
61614 RC.hasSuperClassEq(&X86::FR64XRegClass) ||
61615 RC.hasSuperClassEq(&X86::VR128XRegClass) ||
61616 RC.hasSuperClassEq(&X86::VR256XRegClass) ||
61617 RC.hasSuperClassEq(&X86::VR512RegClass);
61618}
61619
61620/// Check if \p RC is a mask register class.
61621/// I.e., VK* or one of their variant.
61622static bool isVKClass(const TargetRegisterClass &RC) {
61623 return RC.hasSuperClassEq(&X86::VK1RegClass) ||
61624 RC.hasSuperClassEq(&X86::VK2RegClass) ||
61625 RC.hasSuperClassEq(&X86::VK4RegClass) ||
61626 RC.hasSuperClassEq(&X86::VK8RegClass) ||
61627 RC.hasSuperClassEq(&X86::VK16RegClass) ||
61628 RC.hasSuperClassEq(&X86::VK32RegClass) ||
61629 RC.hasSuperClassEq(&X86::VK64RegClass);
61630}
61631
61632static bool useEGPRInlineAsm(const X86Subtarget &Subtarget) {
61633 return Subtarget.hasEGPR() && Subtarget.useInlineAsmGPR32();
61634}
61635
61636std::pair<unsigned, const TargetRegisterClass *>
61638 StringRef Constraint,
61639 MVT VT) const {
61640 // First, see if this is a constraint that directly corresponds to an LLVM
61641 // register class.
61642 if (Constraint.size() == 1) {
61643 // GCC Constraint Letters
61644 switch (Constraint[0]) {
61645 default: break;
61646 // 'A' means [ER]AX + [ER]DX.
61647 case 'A':
61648 if (Subtarget.is64Bit())
61649 return std::make_pair(X86::RAX, &X86::GR64_ADRegClass);
61650 assert((Subtarget.is32Bit() || Subtarget.is16Bit()) &&
61651 "Expecting 64, 32 or 16 bit subtarget");
61652 return std::make_pair(X86::EAX, &X86::GR32_ADRegClass);
61653
61654 // TODO: Slight differences here in allocation order and leaving
61655 // RIP in the class. Do they matter any more here than they do
61656 // in the normal allocation?
61657 case 'k':
61658 if (Subtarget.hasAVX512()) {
61659 if (VT == MVT::v1i1 || VT == MVT::i1)
61660 return std::make_pair(0U, &X86::VK1RegClass);
61661 if (VT == MVT::v8i1 || VT == MVT::i8)
61662 return std::make_pair(0U, &X86::VK8RegClass);
61663 if (VT == MVT::v16i1 || VT == MVT::i16)
61664 return std::make_pair(0U, &X86::VK16RegClass);
61665 }
61666 if (Subtarget.hasBWI()) {
61667 if (VT == MVT::v32i1 || VT == MVT::i32)
61668 return std::make_pair(0U, &X86::VK32RegClass);
61669 if (VT == MVT::v64i1 || VT == MVT::i64)
61670 return std::make_pair(0U, &X86::VK64RegClass);
61671 }
61672 break;
61673 case 'q': // GENERAL_REGS in 64-bit mode, Q_REGS in 32-bit mode.
61674 if (Subtarget.is64Bit()) {
61675 if (VT == MVT::i8 || VT == MVT::i1)
61676 return std::make_pair(0U, useEGPRInlineAsm(Subtarget)
61677 ? &X86::GR8RegClass
61678 : &X86::GR8_NOREX2RegClass);
61679 if (VT == MVT::i16)
61680 return std::make_pair(0U, useEGPRInlineAsm(Subtarget)
61681 ? &X86::GR16RegClass
61682 : &X86::GR16_NOREX2RegClass);
61683 if (VT == MVT::i32 || VT == MVT::f32)
61684 return std::make_pair(0U, useEGPRInlineAsm(Subtarget)
61685 ? &X86::GR32RegClass
61686 : &X86::GR32_NOREX2RegClass);
61687 if (VT != MVT::f80 && !VT.isVector())
61688 return std::make_pair(0U, useEGPRInlineAsm(Subtarget)
61689 ? &X86::GR64RegClass
61690 : &X86::GR64_NOREX2RegClass);
61691 break;
61692 }
61693 [[fallthrough]];
61694 // 32-bit fallthrough
61695 case 'Q': // Q_REGS
61696 if (VT == MVT::i8 || VT == MVT::i1)
61697 return std::make_pair(0U, &X86::GR8_ABCD_LRegClass);
61698 if (VT == MVT::i16)
61699 return std::make_pair(0U, &X86::GR16_ABCDRegClass);
61700 if (VT == MVT::i32 || VT == MVT::f32 ||
61701 (!VT.isVector() && !Subtarget.is64Bit()))
61702 return std::make_pair(0U, &X86::GR32_ABCDRegClass);
61703 if (VT != MVT::f80 && !VT.isVector())
61704 return std::make_pair(0U, &X86::GR64_ABCDRegClass);
61705 break;
61706 case 'r': // GENERAL_REGS
61707 case 'l': // INDEX_REGS
61708 if (VT == MVT::i8 || VT == MVT::i1)
61709 return std::make_pair(0U, useEGPRInlineAsm(Subtarget)
61710 ? &X86::GR8RegClass
61711 : &X86::GR8_NOREX2RegClass);
61712 if (VT == MVT::i16)
61713 return std::make_pair(0U, useEGPRInlineAsm(Subtarget)
61714 ? &X86::GR16RegClass
61715 : &X86::GR16_NOREX2RegClass);
61716 if (VT == MVT::i32 || VT == MVT::f32 ||
61717 (!VT.isVector() && !Subtarget.is64Bit()))
61718 return std::make_pair(0U, useEGPRInlineAsm(Subtarget)
61719 ? &X86::GR32RegClass
61720 : &X86::GR32_NOREX2RegClass);
61721 if (VT != MVT::f80 && !VT.isVector())
61722 return std::make_pair(0U, useEGPRInlineAsm(Subtarget)
61723 ? &X86::GR64RegClass
61724 : &X86::GR64_NOREX2RegClass);
61725 break;
61726 case 'R': // LEGACY_REGS
61727 if (VT == MVT::i8 || VT == MVT::i1)
61728 return std::make_pair(0U, &X86::GR8_NOREXRegClass);
61729 if (VT == MVT::i16)
61730 return std::make_pair(0U, &X86::GR16_NOREXRegClass);
61731 if (VT == MVT::i32 || VT == MVT::f32 ||
61732 (!VT.isVector() && !Subtarget.is64Bit()))
61733 return std::make_pair(0U, &X86::GR32_NOREXRegClass);
61734 if (VT != MVT::f80 && !VT.isVector())
61735 return std::make_pair(0U, &X86::GR64_NOREXRegClass);
61736 break;
61737 case 'f': // FP Stack registers.
61738 // If SSE is enabled for this VT, use f80 to ensure the isel moves the
61739 // value to the correct fpstack register class.
61740 if (VT == MVT::f32 && !isScalarFPTypeInSSEReg(VT))
61741 return std::make_pair(0U, &X86::RFP32RegClass);
61742 if (VT == MVT::f64 && !isScalarFPTypeInSSEReg(VT))
61743 return std::make_pair(0U, &X86::RFP64RegClass);
61744 if (VT == MVT::f32 || VT == MVT::f64 || VT == MVT::f80)
61745 return std::make_pair(0U, &X86::RFP80RegClass);
61746 break;
61747 case 'y': // MMX_REGS if MMX allowed.
61748 if (!Subtarget.hasMMX()) break;
61749 return std::make_pair(0U, &X86::VR64RegClass);
61750 case 'v':
61751 case 'x': // SSE_REGS if SSE1 allowed or AVX_REGS if AVX allowed
61752 if (!Subtarget.hasSSE1()) break;
61753 bool VConstraint = (Constraint[0] == 'v');
61754
61755 switch (VT.SimpleTy) {
61756 default: break;
61757 // Scalar SSE types.
61758 case MVT::f16:
61759 if (VConstraint && Subtarget.hasFP16())
61760 return std::make_pair(0U, &X86::FR16XRegClass);
61761 break;
61762 case MVT::f32:
61763 case MVT::i32:
61764 if (VConstraint && Subtarget.hasVLX())
61765 return std::make_pair(0U, &X86::FR32XRegClass);
61766 return std::make_pair(0U, &X86::FR32RegClass);
61767 case MVT::f64:
61768 case MVT::i64:
61769 if (VConstraint && Subtarget.hasVLX())
61770 return std::make_pair(0U, &X86::FR64XRegClass);
61771 return std::make_pair(0U, &X86::FR64RegClass);
61772 case MVT::i128:
61773 if (Subtarget.is64Bit()) {
61774 if (VConstraint && Subtarget.hasVLX())
61775 return std::make_pair(0U, &X86::VR128XRegClass);
61776 return std::make_pair(0U, &X86::VR128RegClass);
61777 }
61778 break;
61779 // Vector types and fp128.
61780 case MVT::v8f16:
61781 if (!Subtarget.hasFP16())
61782 break;
61783 if (VConstraint)
61784 return std::make_pair(0U, &X86::VR128XRegClass);
61785 return std::make_pair(0U, &X86::VR128RegClass);
61786 case MVT::v8bf16:
61787 if (!Subtarget.hasBF16() || !Subtarget.hasVLX())
61788 break;
61789 if (VConstraint)
61790 return std::make_pair(0U, &X86::VR128XRegClass);
61791 return std::make_pair(0U, &X86::VR128RegClass);
61792 case MVT::f128:
61793 if (!Subtarget.is64Bit())
61794 break;
61795 [[fallthrough]];
61796 case MVT::v16i8:
61797 case MVT::v8i16:
61798 case MVT::v4i32:
61799 case MVT::v2i64:
61800 case MVT::v4f32:
61801 case MVT::v2f64:
61802 if (VConstraint && Subtarget.hasVLX())
61803 return std::make_pair(0U, &X86::VR128XRegClass);
61804 return std::make_pair(0U, &X86::VR128RegClass);
61805 // AVX types.
61806 case MVT::v16f16:
61807 if (!Subtarget.hasFP16())
61808 break;
61809 if (VConstraint)
61810 return std::make_pair(0U, &X86::VR256XRegClass);
61811 return std::make_pair(0U, &X86::VR256RegClass);
61812 case MVT::v16bf16:
61813 if (!Subtarget.hasBF16() || !Subtarget.hasVLX())
61814 break;
61815 if (VConstraint)
61816 return std::make_pair(0U, &X86::VR256XRegClass);
61817 return std::make_pair(0U, &X86::VR256RegClass);
61818 case MVT::v32i8:
61819 case MVT::v16i16:
61820 case MVT::v8i32:
61821 case MVT::v4i64:
61822 case MVT::v8f32:
61823 case MVT::v4f64:
61824 if (VConstraint && Subtarget.hasVLX())
61825 return std::make_pair(0U, &X86::VR256XRegClass);
61826 if (Subtarget.hasAVX())
61827 return std::make_pair(0U, &X86::VR256RegClass);
61828 break;
61829 case MVT::v32f16:
61830 if (!Subtarget.hasFP16())
61831 break;
61832 if (VConstraint)
61833 return std::make_pair(0U, &X86::VR512RegClass);
61834 return std::make_pair(0U, &X86::VR512_0_15RegClass);
61835 case MVT::v32bf16:
61836 if (!Subtarget.hasBF16())
61837 break;
61838 if (VConstraint)
61839 return std::make_pair(0U, &X86::VR512RegClass);
61840 return std::make_pair(0U, &X86::VR512_0_15RegClass);
61841 case MVT::v64i8:
61842 case MVT::v32i16:
61843 case MVT::v8f64:
61844 case MVT::v16f32:
61845 case MVT::v16i32:
61846 case MVT::v8i64:
61847 if (!Subtarget.hasAVX512()) break;
61848 if (VConstraint)
61849 return std::make_pair(0U, &X86::VR512RegClass);
61850 return std::make_pair(0U, &X86::VR512_0_15RegClass);
61851 }
61852 break;
61853 }
61854 } else if (Constraint.size() == 2 && Constraint[0] == 'Y') {
61855 switch (Constraint[1]) {
61856 default:
61857 break;
61858 case 'i':
61859 case 't':
61860 case '2':
61861 return getRegForInlineAsmConstraint(TRI, "x", VT);
61862 case 'm':
61863 if (!Subtarget.hasMMX()) break;
61864 return std::make_pair(0U, &X86::VR64RegClass);
61865 case 'z':
61866 if (!Subtarget.hasSSE1()) break;
61867 switch (VT.SimpleTy) {
61868 default: break;
61869 // Scalar SSE types.
61870 case MVT::f16:
61871 if (!Subtarget.hasFP16())
61872 break;
61873 return std::make_pair(X86::XMM0, &X86::FR16XRegClass);
61874 case MVT::f32:
61875 case MVT::i32:
61876 return std::make_pair(X86::XMM0, &X86::FR32RegClass);
61877 case MVT::f64:
61878 case MVT::i64:
61879 return std::make_pair(X86::XMM0, &X86::FR64RegClass);
61880 case MVT::v8f16:
61881 if (!Subtarget.hasFP16())
61882 break;
61883 return std::make_pair(X86::XMM0, &X86::VR128RegClass);
61884 case MVT::v8bf16:
61885 if (!Subtarget.hasBF16() || !Subtarget.hasVLX())
61886 break;
61887 return std::make_pair(X86::XMM0, &X86::VR128RegClass);
61888 case MVT::f128:
61889 case MVT::v16i8:
61890 case MVT::v8i16:
61891 case MVT::v4i32:
61892 case MVT::v2i64:
61893 case MVT::v4f32:
61894 case MVT::v2f64:
61895 return std::make_pair(X86::XMM0, &X86::VR128RegClass);
61896 // AVX types.
61897 case MVT::v16f16:
61898 if (!Subtarget.hasFP16())
61899 break;
61900 return std::make_pair(X86::YMM0, &X86::VR256RegClass);
61901 case MVT::v16bf16:
61902 if (!Subtarget.hasBF16() || !Subtarget.hasVLX())
61903 break;
61904 return std::make_pair(X86::YMM0, &X86::VR256RegClass);
61905 case MVT::v32i8:
61906 case MVT::v16i16:
61907 case MVT::v8i32:
61908 case MVT::v4i64:
61909 case MVT::v8f32:
61910 case MVT::v4f64:
61911 if (Subtarget.hasAVX())
61912 return std::make_pair(X86::YMM0, &X86::VR256RegClass);
61913 break;
61914 case MVT::v32f16:
61915 if (!Subtarget.hasFP16())
61916 break;
61917 return std::make_pair(X86::ZMM0, &X86::VR512_0_15RegClass);
61918 case MVT::v32bf16:
61919 if (!Subtarget.hasBF16())
61920 break;
61921 return std::make_pair(X86::ZMM0, &X86::VR512_0_15RegClass);
61922 case MVT::v64i8:
61923 case MVT::v32i16:
61924 case MVT::v8f64:
61925 case MVT::v16f32:
61926 case MVT::v16i32:
61927 case MVT::v8i64:
61928 if (Subtarget.hasAVX512())
61929 return std::make_pair(X86::ZMM0, &X86::VR512_0_15RegClass);
61930 break;
61931 }
61932 break;
61933 case 'k':
61934 // This register class doesn't allocate k0 for masked vector operations.
61935 if (Subtarget.hasAVX512()) {
61936 if (VT == MVT::v1i1 || VT == MVT::i1)
61937 return std::make_pair(0U, &X86::VK1WMRegClass);
61938 if (VT == MVT::v8i1 || VT == MVT::i8)
61939 return std::make_pair(0U, &X86::VK8WMRegClass);
61940 if (VT == MVT::v16i1 || VT == MVT::i16)
61941 return std::make_pair(0U, &X86::VK16WMRegClass);
61942 }
61943 if (Subtarget.hasBWI()) {
61944 if (VT == MVT::v32i1 || VT == MVT::i32)
61945 return std::make_pair(0U, &X86::VK32WMRegClass);
61946 if (VT == MVT::v64i1 || VT == MVT::i64)
61947 return std::make_pair(0U, &X86::VK64WMRegClass);
61948 }
61949 break;
61950 }
61951 } else if (Constraint.size() == 2 && Constraint[0] == 'j') {
61952 switch (Constraint[1]) {
61953 default:
61954 break;
61955 case 'r':
61956 if (VT == MVT::i8 || VT == MVT::i1)
61957 return std::make_pair(0U, &X86::GR8_NOREX2RegClass);
61958 if (VT == MVT::i16)
61959 return std::make_pair(0U, &X86::GR16_NOREX2RegClass);
61960 if (VT == MVT::i32 || VT == MVT::f32)
61961 return std::make_pair(0U, &X86::GR32_NOREX2RegClass);
61962 if (VT != MVT::f80 && !VT.isVector())
61963 return std::make_pair(0U, &X86::GR64_NOREX2RegClass);
61964 break;
61965 case 'R':
61966 if (VT == MVT::i8 || VT == MVT::i1)
61967 return std::make_pair(0U, &X86::GR8RegClass);
61968 if (VT == MVT::i16)
61969 return std::make_pair(0U, &X86::GR16RegClass);
61970 if (VT == MVT::i32 || VT == MVT::f32)
61971 return std::make_pair(0U, &X86::GR32RegClass);
61972 if (VT != MVT::f80 && !VT.isVector())
61973 return std::make_pair(0U, &X86::GR64RegClass);
61974 break;
61975 }
61976 }
61977
61978 if (parseConstraintCode(Constraint) != X86::COND_INVALID)
61979 return std::make_pair(0U, &X86::GR32RegClass);
61980
61981 // Use the default implementation in TargetLowering to convert the register
61982 // constraint into a member of a register class.
61983 std::pair<Register, const TargetRegisterClass*> Res;
61985
61986 // Not found as a standard register?
61987 if (!Res.second) {
61988 // Only match x87 registers if the VT is one SelectionDAGBuilder can convert
61989 // to/from f80.
61990 if (VT == MVT::Other || VT == MVT::f32 || VT == MVT::f64 || VT == MVT::f80) {
61991 // Map st(0) -> st(7) -> ST0
61992 if (Constraint.size() == 7 && Constraint[0] == '{' &&
61993 tolower(Constraint[1]) == 's' && tolower(Constraint[2]) == 't' &&
61994 Constraint[3] == '(' &&
61995 (Constraint[4] >= '0' && Constraint[4] <= '7') &&
61996 Constraint[5] == ')' && Constraint[6] == '}') {
61997 // st(7) is not allocatable and thus not a member of RFP80. Return
61998 // singleton class in cases where we have a reference to it.
61999 if (Constraint[4] == '7')
62000 return std::make_pair(X86::FP7, &X86::RFP80_7RegClass);
62001 return std::make_pair(X86::FP0 + Constraint[4] - '0',
62002 &X86::RFP80RegClass);
62003 }
62004
62005 // GCC allows "st(0)" to be called just plain "st".
62006 if (StringRef("{st}").equals_insensitive(Constraint))
62007 return std::make_pair(X86::FP0, &X86::RFP80RegClass);
62008 }
62009
62010 // flags -> EFLAGS
62011 if (StringRef("{flags}").equals_insensitive(Constraint))
62012 return std::make_pair(X86::EFLAGS, &X86::CCRRegClass);
62013
62014 // dirflag -> DF
62015 // Only allow for clobber.
62016 if (StringRef("{dirflag}").equals_insensitive(Constraint) &&
62017 VT == MVT::Other)
62018 return std::make_pair(X86::DF, &X86::DFCCRRegClass);
62019
62020 // fpsr -> FPSW
62021 // Only allow for clobber.
62022 if (StringRef("{fpsr}").equals_insensitive(Constraint) && VT == MVT::Other)
62023 return std::make_pair(X86::FPSW, &X86::FPCCRRegClass);
62024
62025 return Res;
62026 }
62027
62028 // Make sure it isn't a register that requires 64-bit mode.
62029 if (!Subtarget.is64Bit() &&
62030 (isFRClass(*Res.second) || isGRClass(*Res.second)) &&
62031 TRI->getEncodingValue(Res.first) >= 8) {
62032 // Register requires REX prefix, but we're in 32-bit mode.
62033 return std::make_pair(0, nullptr);
62034 }
62035
62036 // Make sure it isn't a register that requires AVX512.
62037 if (!Subtarget.hasAVX512() && isFRClass(*Res.second) &&
62038 TRI->getEncodingValue(Res.first) & 0x10) {
62039 // Register requires EVEX prefix.
62040 return std::make_pair(0, nullptr);
62041 }
62042
62043 // Otherwise, check to see if this is a register class of the wrong value
62044 // type. For example, we want to map "{ax},i32" -> {eax}; we don't want it to
62045 // turn into {ax},{dx}.
62046 // MVT::Other is used to specify clobber names.
62047 if (TRI->isTypeLegalForClass(*Res.second, VT) || VT == MVT::Other)
62048 return Res; // Correct type already, nothing to do.
62049
62050 // Get a matching integer of the correct size, i.e. "ax" with MVT::i32 should
62051 // return "eax". This should even work for things like getting 64-bit integer
62052 // registers when given an f64 type.
62053 const TargetRegisterClass *Class = Res.second;
62054 // The generic code will match the first register class that contains the
62055 // given register. Thus, based on the ordering of the tablegened file,
62056 // the "plain" GR classes might not come first.
62057 // Therefore, use a helper method.
62058 if (isGRClass(*Class)) {
62059 unsigned Size = VT.getSizeInBits();
62060 if (Size == 1) Size = 8;
62061 if (Size != 8 && Size != 16 && Size != 32 && Size != 64)
62062 return std::make_pair(0, nullptr);
62063 Register DestReg = getX86SubSuperRegister(Res.first, Size);
62064 if (DestReg.isValid()) {
62065 bool is64Bit = Subtarget.is64Bit();
62066 const TargetRegisterClass *RC =
62067 Size == 8 ? (is64Bit ? &X86::GR8RegClass : &X86::GR8_NOREXRegClass)
62068 : Size == 16 ? (is64Bit ? &X86::GR16RegClass : &X86::GR16_NOREXRegClass)
62069 : Size == 32 ? (is64Bit ? &X86::GR32RegClass : &X86::GR32_NOREXRegClass)
62070 : /*Size == 64*/ (is64Bit ? &X86::GR64RegClass : nullptr);
62071 if (Size == 64 && !is64Bit) {
62072 // Model GCC's behavior here and select a fixed pair of 32-bit
62073 // registers.
62074 switch (DestReg) {
62075 case X86::RAX:
62076 return std::make_pair(X86::EAX, &X86::GR32_ADRegClass);
62077 case X86::RDX:
62078 return std::make_pair(X86::EDX, &X86::GR32_DCRegClass);
62079 case X86::RCX:
62080 return std::make_pair(X86::ECX, &X86::GR32_CBRegClass);
62081 case X86::RBX:
62082 return std::make_pair(X86::EBX, &X86::GR32_BSIRegClass);
62083 case X86::RSI:
62084 return std::make_pair(X86::ESI, &X86::GR32_SIDIRegClass);
62085 case X86::RDI:
62086 return std::make_pair(X86::EDI, &X86::GR32_DIBPRegClass);
62087 case X86::RBP:
62088 return std::make_pair(X86::EBP, &X86::GR32_BPSPRegClass);
62089 default:
62090 return std::make_pair(0, nullptr);
62091 }
62092 }
62093 if (RC && RC->contains(DestReg))
62094 return std::make_pair(DestReg, RC);
62095 return Res;
62096 }
62097 // No register found/type mismatch.
62098 return std::make_pair(0, nullptr);
62099 } else if (isFRClass(*Class)) {
62100 // Handle references to XMM physical registers that got mapped into the
62101 // wrong class. This can happen with constraints like {xmm0} where the
62102 // target independent register mapper will just pick the first match it can
62103 // find, ignoring the required type.
62104
62105 // TODO: Handle f128 and i128 in FR128RegClass after it is tested well.
62106 if (VT == MVT::f16)
62107 Res.second = &X86::FR16XRegClass;
62108 else if (VT == MVT::f32 || VT == MVT::i32)
62109 Res.second = &X86::FR32XRegClass;
62110 else if (VT == MVT::f64 || VT == MVT::i64)
62111 Res.second = &X86::FR64XRegClass;
62112 else if (TRI->isTypeLegalForClass(X86::VR128XRegClass, VT))
62113 Res.second = &X86::VR128XRegClass;
62114 else if (TRI->isTypeLegalForClass(X86::VR256XRegClass, VT))
62115 Res.second = &X86::VR256XRegClass;
62116 else if (TRI->isTypeLegalForClass(X86::VR512RegClass, VT))
62117 Res.second = &X86::VR512RegClass;
62118 else {
62119 // Type mismatch and not a clobber: return an error.
62120 Res.first = 0;
62121 Res.second = nullptr;
62122 }
62123 } else if (isVKClass(*Class)) {
62124 if (VT == MVT::v1i1 || VT == MVT::i1)
62125 Res.second = &X86::VK1RegClass;
62126 else if (VT == MVT::v8i1 || VT == MVT::i8)
62127 Res.second = &X86::VK8RegClass;
62128 else if (VT == MVT::v16i1 || VT == MVT::i16)
62129 Res.second = &X86::VK16RegClass;
62130 else if (VT == MVT::v32i1 || VT == MVT::i32)
62131 Res.second = &X86::VK32RegClass;
62132 else if (VT == MVT::v64i1 || VT == MVT::i64)
62133 Res.second = &X86::VK64RegClass;
62134 else {
62135 // Type mismatch and not a clobber: return an error.
62136 Res.first = 0;
62137 Res.second = nullptr;
62138 }
62139 }
62140
62141 return Res;
62142}
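// A few concrete mappings produced above (illustrative; assumes the listed
// ISA features are available):
//   ("x",  v4f32)     -> VR128 (VR128X with the "v" letter plus AVX512VL)
//   ("k",  v16i1)     -> VK16 (AVX512 mask registers)
//   ("{xmm0}", v4f32) -> XMM0, re-homed into a 128-bit class by the FR fixup
//   ("{ax}", i32)     -> EAX via getX86SubSuperRegister in the GR-class fixup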
62143
62144bool X86TargetLowering::isIntDivCheap(EVT VT, AttributeList Attr) const {
62145 // Integer division on x86 is expensive. However, when aggressively optimizing
62146 // for code size, we prefer to use a div instruction, as it is usually smaller
62147 // than the alternative sequence.
62148 // The exception to this is vector division. Since x86 doesn't have vector
62149 // integer division, leaving the division as-is is a loss even in terms of
62150 // size, because it will have to be scalarized, while the alternative code
62151 // sequence can be performed in vector form.
62152 bool OptSize = Attr.hasFnAttr(Attribute::MinSize);
62153 return OptSize && !VT.isVector();
62154}
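// E.g. with the minsize attribute a scalar `x / 10` keeps the small
// `idiv`-based sequence, while without it (or for any vector type) a division
// by a constant is expanded into a multiply-by-magic-constant plus shifts.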
62155
62156void X86TargetLowering::initializeSplitCSR(MachineBasicBlock *Entry) const {
62157 if (!Subtarget.is64Bit())
62158 return;
62159
62160 // Update IsSplitCSR in X86MachineFunctionInfo.
62162 Entry->getParent()->getInfo<X86MachineFunctionInfo>();
62163 AFI->setIsSplitCSR(true);
62164}
62165
62166void X86TargetLowering::insertCopiesSplitCSR(
62167 MachineBasicBlock *Entry,
62168 const SmallVectorImpl<MachineBasicBlock *> &Exits) const {
62169 const X86RegisterInfo *TRI = Subtarget.getRegisterInfo();
62170 const MCPhysReg *IStart = TRI->getCalleeSavedRegsViaCopy(Entry->getParent());
62171 if (!IStart)
62172 return;
62173
62174 const TargetInstrInfo *TII = Subtarget.getInstrInfo();
62175 MachineRegisterInfo *MRI = &Entry->getParent()->getRegInfo();
62176 MachineBasicBlock::iterator MBBI = Entry->begin();
62177 for (const MCPhysReg *I = IStart; *I; ++I) {
62178 const TargetRegisterClass *RC = nullptr;
62179 if (X86::GR64RegClass.contains(*I))
62180 RC = &X86::GR64RegClass;
62181 else
62182 llvm_unreachable("Unexpected register class in CSRsViaCopy!");
62183
62184 Register NewVR = MRI->createVirtualRegister(RC);
62185 // Create copy from CSR to a virtual register.
62186 // FIXME: this currently does not emit CFI pseudo-instructions; it works
62187 // fine for CXX_FAST_TLS since the C++-style TLS access functions should be
62188 // nounwind. If we want to generalize this later, we may need to emit
62189 // CFI pseudo-instructions.
62190 assert(
62191 Entry->getParent()->getFunction().hasFnAttribute(Attribute::NoUnwind) &&
62192 "Function should be nounwind in insertCopiesSplitCSR!");
62193 Entry->addLiveIn(*I);
62194 BuildMI(*Entry, MBBI, MIMetadata(), TII->get(TargetOpcode::COPY), NewVR)
62195 .addReg(*I);
62196
62197 // Insert the copy-back instructions right before the terminator.
62198 for (auto *Exit : Exits)
62199 BuildMI(*Exit, Exit->getFirstTerminator(), MIMetadata(),
62200 TII->get(TargetOpcode::COPY), *I)
62201 .addReg(NewVR);
62202 }
62203}
62204
62206 return Subtarget.is64Bit();
62207}
62208
62212 const TargetInstrInfo *TII) const {
62213 assert(MBBI->isCall() && MBBI->getCFIType() &&
62214 "Invalid call instruction for a KCFI check");
62215
62216 MachineFunction &MF = *MBB.getParent();
62217 // If the call target is a memory operand, unfold it and use R11 for the
62218 // call, so KCFI_CHECK won't have to recompute the address.
62219 switch (MBBI->getOpcode()) {
62220 case X86::CALL64m:
62221 case X86::CALL64m_NT:
62222 case X86::TAILJMPm64:
62223 case X86::TAILJMPm64_REX: {
62226 if (!TII->unfoldMemoryOperand(MF, *OrigCall, X86::R11, /*UnfoldLoad=*/true,
62227 /*UnfoldStore=*/false, NewMIs))
62228 report_fatal_error("Failed to unfold memory operand for a KCFI check");
62229 for (auto *NewMI : NewMIs)
62230 MBBI = MBB.insert(OrigCall, NewMI);
62231 assert(MBBI->isCall() &&
62232 "Unexpected instruction after memory operand unfolding");
62233 if (OrigCall->shouldUpdateAdditionalCallInfo())
62234 MF.moveAdditionalCallInfo(&*OrigCall, &*MBBI);
62235 MBBI->setCFIType(MF, OrigCall->getCFIType());
62236 OrigCall->eraseFromParent();
62237 break;
62238 }
62239 default:
62240 break;
62241 }
62242
62243 MachineOperand &Target = MBBI->getOperand(0);
62244 Register TargetReg;
62245 switch (MBBI->getOpcode()) {
62246 case X86::CALL64r:
62247 case X86::CALL64r_ImpCall:
62248 case X86::CALL64r_NT:
62249 case X86::TAILJMPr64:
62250 case X86::TAILJMPr64_REX:
62251 assert(Target.isReg() && "Unexpected target operand for an indirect call");
62252 Target.setIsRenamable(false);
62253 TargetReg = Target.getReg();
62254 break;
62255 case X86::CALL64pcrel32:
62256 case X86::TAILJMPd64:
62257 assert(Target.isSymbol() && "Unexpected target operand for a direct call");
62258 // X86TargetLowering::EmitLoweredIndirectThunk always uses r11 for
62259 // 64-bit indirect thunk calls.
62260 assert(StringRef(Target.getSymbolName()).ends_with("_r11") &&
62261 "Unexpected register for an indirect thunk call");
62262 TargetReg = X86::R11;
62263 break;
62264 default:
62265 llvm_unreachable("Unexpected CFI call opcode");
62266 break;
62267 }
62268
62269 return BuildMI(MBB, MBBI, MIMetadata(*MBBI), TII->get(X86::KCFI_CHECK))
62270 .addReg(TargetReg)
62271 .addImm(MBBI->getCFIType())
62272 .getInstr();
62273}
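// Sketch of the input this hook handles: an indirect call carrying a KCFI
// type id, e.g. `call void %fp() [ "kcfi"(i32 12345678) ]`. If the call
// target was a memory operand it is first unfolded into R11, and a
// KCFI_CHECK of (target register, type id) is emitted right before the call.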
62274
62275/// Returns true if stack probing through a function call is requested.
62279
62280/// Returns true if stack probing through inline assembly is requested.
62282
62283 // No inline stack probe for Windows, they have their own mechanism.
62284 if (Subtarget.isOSWindows() || Subtarget.isUEFI() ||
62285 MF.getFunction().hasFnAttribute("no-stack-arg-probe"))
62286 return false;
62287
62288 // If the function specifically requests inline stack probes, emit them.
62289 if (MF.getFunction().hasFnAttribute("probe-stack"))
62290 return MF.getFunction().getFnAttribute("probe-stack").getValueAsString() ==
62291 "inline-asm";
62292
62293 return false;
62294}
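// For example, a function carrying the IR attribute
// "probe-stack"="inline-asm" gets inline stack probes here, while
// "probe-stack"="__some_probe_fn" (name arbitrary) routes through
// getStackProbeSymbolName() below and emits a call instead.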
62295
62296/// Returns the name of the symbol used to emit stack probes or the empty
62297/// string if not applicable.
62300 // Inline stack probes disable the stack probe call.
62301 if (hasInlineStackProbe(MF))
62302 return "";
62303
62304 // If the function specifically requests stack probes, emit them.
62305 if (MF.getFunction().hasFnAttribute("probe-stack"))
62306 return MF.getFunction().getFnAttribute("probe-stack").getValueAsString();
62307
62308 // Generally, if we aren't on Windows, the platform ABI does not include
62309 // support for stack probes, so don't emit them.
62310 if ((!Subtarget.isOSWindows() && !Subtarget.isUEFI()) ||
62311 Subtarget.isTargetMachO() ||
62312 MF.getFunction().hasFnAttribute("no-stack-arg-probe"))
62313 return "";
62314
62315 // We need a stack probe to conform to the Windows ABI. Choose the right
62316 // symbol.
62317 if (Subtarget.is64Bit())
62318 return Subtarget.isTargetCygMing() ? "___chkstk_ms" : "__chkstk";
62319 return Subtarget.isTargetCygMing() ? "_alloca" : "_chkstk";
62320}
62321
62322unsigned
62324 // The default stack probe size is 4096 if the function has no
62325 // "stack-probe-size" attribute.
62326 return MF.getFunction().getFnAttributeAsParsedInteger("stack-probe-size",
62327 4096);
62328}
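// E.g. `attributes #0 = { "stack-probe-size"="8192" }` raises the probe
// interval for that function to 8 KiB; functions without the attribute use
// the 4096-byte default above.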
62329
62331 if (ML && ML->isInnermost() &&
62332 ExperimentalPrefInnermostLoopAlignment.getNumOccurrences())
62335}
static SDValue LowerANY_EXTEND(SDValue Op, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static cl::opt< int > BrMergingCcmpBias("x86-br-merging-ccmp-bias", cl::init(6), cl::desc("Increases 'x86-br-merging-base-cost' in cases that the target " "supports conditional compare instructions."), cl::Hidden)
static APInt getExtractedDemandedElts(SDNode *N)
static SDValue combineAndMaskToShift(SDNode *N, const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget)
If this is a zero/all-bits result that is bitwise-anded with a low bits mask.
static SDValue LowerADJUST_TRAMPOLINE(SDValue Op, SelectionDAG &DAG)
static SDValue lowerV8I32Shuffle(const SDLoc &DL, ArrayRef< int > Mask, const APInt &Zeroable, SDValue V1, SDValue V2, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Handle lowering of 8-lane 32-bit integer shuffles.
static SDValue LowerVectorCTLZInRegLUT(SDValue Op, const SDLoc &DL, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static SDValue combineX86ShufflesConstants(MVT VT, ArrayRef< SDValue > Ops, ArrayRef< int > Mask, ArrayRef< const SDNode * > SrcNodes, SelectionDAG &DAG, const SDLoc &DL, const X86Subtarget &Subtarget)
static SDValue combinePTESTCC(SDValue EFLAGS, X86::CondCode &CC, SelectionDAG &DAG, const X86Subtarget &Subtarget)
If we are inverting a PTEST/TESTP operand, attempt to adjust the CC to avoid the inversion.
static unsigned getAltBitOpcode(unsigned Opcode)
static Constant * getConstantVector(MVT VT, ArrayRef< APInt > Bits, const APInt &Undefs, LLVMContext &C)
static SDValue LowerABD(SDValue Op, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static SDValue LowerREADCYCLECOUNTER(SDValue Op, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static SDValue combineBitcast(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
static SDValue promoteXINT_TO_FP(SDValue Op, const SDLoc &dl, SelectionDAG &DAG)
static SDValue combineCastedMaskArithmetic(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
static SDValue insert1BitVector(SDValue Op, SelectionDAG &DAG, const X86Subtarget &Subtarget)
Insert i1-subvector to i1-vector.
static SDValue materializeVectorConstant(SDValue Op, const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget)
Create a vector constant without a load.
static SDValue lowerShuffleWithPSHUFB(const SDLoc &DL, MVT VT, ArrayRef< int > Mask, SDValue V1, SDValue V2, const APInt &Zeroable, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Try to lower a shuffle with a single PSHUFB of V1 or V2.
static SDValue combineFP16_TO_FP(SDNode *N, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue combineConcatVectorOps(const SDLoc &DL, MVT VT, ArrayRef< SDValue > Ops, SelectionDAG &DAG, const X86Subtarget &Subtarget, unsigned Depth=0)
Helper that combines an array of subvector ops as if they were the operands of a ISD::CONCAT_VECTORS ...
static SDValue combineBMILogicOp(SDNode *N, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue LowerUINT_TO_FP_i64(SDValue Op, const SDLoc &dl, SelectionDAG &DAG, const X86Subtarget &Subtarget)
64-bit unsigned integer to double expansion.
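For orientation, the expansion is based on the classic two-magic-constant trick: OR the low and high 32-bit halves into the mantissas of 2^52 and 2^84, subtract those constants, and add the results. The scalar sketch below illustrates that idea only; the lowering itself builds the equivalent SSE2 DAG nodes, and the helper name here is invented for the example.

#include <cstdint>
#include <cstring>
#include <cstdio>

static double u64ToDouble(uint64_t X) {
  // Bit-or the halves into the mantissas of 2^52 (low) and 2^84 (high).
  uint64_t LoBits = 0x4330000000000000ULL | (X & 0xffffffffULL); // 2^52 + lo
  uint64_t HiBits = 0x4530000000000000ULL | (X >> 32);           // 2^84 + hi * 2^32
  double Lo, Hi;
  std::memcpy(&Lo, &LoBits, sizeof(double));
  std::memcpy(&Hi, &HiBits, sizeof(double));
  // The subtractions are exact; the final add performs the single rounding.
  return (Hi - 0x1.0p84) + (Lo - 0x1.0p52);
}

int main() {
  uint64_t X = 0xfedcba9876543210ULL;
  std::printf("%d\n", u64ToDouble(X) == (double)X); // prints 1
}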
static bool useVectorCast(unsigned Opcode, MVT FromVT, MVT ToVT, const X86Subtarget &Subtarget)
static SDValue LowerFP_TO_FP16(SDValue Op, SelectionDAG &DAG)
static SDValue lowerV4X128Shuffle(const SDLoc &DL, MVT VT, ArrayRef< int > Mask, const APInt &Zeroable, SDValue V1, SDValue V2, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Try to lower a vector shuffle as 128-bit shuffles.
static SDValue LowerTruncateVecPackWithSignBits(MVT DstVT, SDValue In, const SDLoc &DL, const X86Subtarget &Subtarget, SelectionDAG &DAG, const SDNodeFlags Flags=SDNodeFlags())
This function lowers a vector truncation of 'extended sign-bits' or 'extended zero-bits' values.
static SDValue matchPMADDWD(SelectionDAG &DAG, SDNode *N, const SDLoc &DL, EVT VT, const X86Subtarget &Subtarget)
static SDValue combineSelect(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
Do target-specific dag combines on SELECT and VSELECT nodes.
static bool isUndefOrZeroInRange(ArrayRef< int > Mask, unsigned Pos, unsigned Size)
Return true if every element in Mask, beginning from position Pos and ending in Pos+Size is undef or ...
static SDValue combineToConsecutiveLoads(EVT VT, SDValue Op, const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget, bool IsAfterLegalize)
static SDValue getConstVector(ArrayRef< int > Values, MVT VT, SelectionDAG &DAG, const SDLoc &dl, bool IsMask=false)
static SDValue commuteSelect(SDNode *N, SelectionDAG &DAG, const SDLoc &DL, const X86Subtarget &Subtarget)
static MachineInstrBuilder createPHIsForCMOVsInSinkBB(MachineBasicBlock::iterator MIItBegin, MachineBasicBlock::iterator MIItEnd, MachineBasicBlock *TrueMBB, MachineBasicBlock *FalseMBB, MachineBasicBlock *SinkMBB)
static SDValue insert128BitVector(SDValue Result, SDValue Vec, unsigned IdxVal, SelectionDAG &DAG, const SDLoc &dl)
Generate a DAG to put 128-bits into a vector > 128 bits.
static bool onlyZeroFlagUsed(SDValue Flags)
static SDValue extract256BitVector(SDValue Vec, unsigned IdxVal, SelectionDAG &DAG, const SDLoc &dl)
Generate a DAG to grab 256-bits from a 512-bit vector.
static SDValue combineFAndFNotToFAndn(SDNode *N, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue combineGatherScatter(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI)
static SDValue combineMulToPMADDWD(SDNode *N, const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue LowerINTRINSIC_W_CHAIN(SDValue Op, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static bool isFreeToSplitVector(SDValue V, SelectionDAG &DAG)
static SDValue lowerShuffleAsLanePermuteAndShuffle(const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef< int > Mask, SelectionDAG &DAG, const X86Subtarget &Subtarget)
Lower a vector shuffle crossing multiple 128-bit lanes by shuffling one source with a lane permutatio...
static SDValue checkSignTestSetCCCombine(SDValue Cmp, X86::CondCode &CC, SelectionDAG &DAG)
static bool isFoldableUseOfShuffle(SDNode *N)
static bool getTargetShuffleInputs(SDValue Op, const APInt &DemandedElts, SmallVectorImpl< SDValue > &Inputs, SmallVectorImpl< int > &Mask, const SelectionDAG &DAG, unsigned Depth, bool ResolveKnownElts)
static SDValue getVectorMaskingNode(SDValue Op, SDValue Mask, SDValue PreservedSrc, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Return (and Op, Mask) for compare instructions or (vselect Mask, Op, PreservedSrc) for others along w...
static SDValue lowerAddSub(SDValue Op, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue truncateVectorWithPACKSS(EVT DstVT, SDValue In, const SDLoc &DL, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Truncate using inreg sign extension and X86ISD::PACKSS.
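The reason PACKSS can act as a truncation here: once each wide lane is the in-register sign extension of its low half, the signed saturation performed by the pack is lossless. A minimal scalar check of that property (the function name and the i16-to-i8 width are chosen for illustration, not taken from the source):

#include <algorithm>
#include <cassert>
#include <cstdint>

static int8_t packssLane(int16_t V) {
  // Signed saturation of one lane, as PACKSSWB does per element.
  return (int8_t)std::min<int16_t>(std::max<int16_t>(V, INT8_MIN), INT8_MAX);
}

int main() {
  for (int B = -128; B < 128; ++B) {
    int16_t Extended = (int8_t)B; // i16 value whose top 9 bits are sign copies
    assert(packssLane(Extended) == (int8_t)B); // saturation degenerates to truncation
  }
}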
static SDValue combineMaskedLoadConstantMask(MaskedLoadSDNode *ML, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI)
static bool isShuffleMaskInputInPlace(int Input, ArrayRef< int > Mask)
Test whether the specified input (0 or 1) is in-place blended by the given mask.
static bool isMultiLaneShuffleMask(unsigned LaneSizeInBits, unsigned ScalarSizeInBits, ArrayRef< int > Mask)
Test whether elements in each LaneSizeInBits lane in this shuffle mask come from multiple lanes - thi...
static SDValue LowerVSETCCWithSUBUS(SDValue Op0, SDValue Op1, MVT VT, ISD::CondCode Cond, const SDLoc &dl, const X86Subtarget &Subtarget, SelectionDAG &DAG)
As another special case, use PSUBUS[BW] when it's profitable.
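The scalar identity behind this special case is that an unsigned saturating subtract is zero exactly when the first operand is less than or equal to the second, so an unsigned-compare mask can be formed from PSUBUS plus a compare against zero. A small exhaustive check over 8-bit values (names are illustrative, not the source's):

#include <cassert>
#include <cstdint>

static uint8_t satSubU8(uint8_t A, uint8_t B) { return A > B ? A - B : 0; }

int main() {
  for (unsigned A = 0; A < 256; ++A)
    for (unsigned B = 0; B < 256; ++B)
      assert((satSubU8(A, B) == 0) == (A <= B));
}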
static SDValue combineFP_EXTEND(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
static APInt getBLENDIBlendMask(SDValue V)
Get the expanded blend mask from a BLENDI node.
static SDValue EmitTest(SDValue Op, X86::CondCode X86CC, const SDLoc &dl, SelectionDAG &DAG, const X86Subtarget &Subtarget)
Emit nodes that will be selected as "test Op0,Op0", or something equivalent.
static bool is128BitLaneRepeatedShuffleMask(MVT VT, ArrayRef< int > Mask, SmallVectorImpl< int > &RepeatedMask)
Test whether a shuffle mask is equivalent within each 128-bit lane.
static SDValue getPMOVMSKB(const SDLoc &DL, SDValue V, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue combineSCALAR_TO_VECTOR(SDNode *N, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue combineUIntToFP(SDNode *N, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static void getPackDemandedElts(EVT VT, const APInt &DemandedElts, APInt &DemandedLHS, APInt &DemandedRHS)
static SDValue LowerVectorCTPOPInRegLUT(SDValue Op, const SDLoc &DL, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static SDValue LowerSELECTWithCmpZero(SDValue CmpVal, SDValue LHS, SDValue RHS, unsigned X86CC, const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue combineADC(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI)
static std::optional< unsigned > CastIntSETCCtoFP(MVT VT, ISD::CondCode CC, unsigned NumSignificantBitsLHS, unsigned NumSignificantBitsRHS)
static SDValue lowerShuffleAsVTRUNCAndUnpack(const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef< int > Mask, const APInt &Zeroable, SelectionDAG &DAG)
static bool isShuffleFoldableLoad(SDValue)
Helper to test for a load that can be folded with x86 shuffles.
static SDValue narrowVectorSelect(SDNode *N, SelectionDAG &DAG, const SDLoc &DL, const X86Subtarget &Subtarget)
If both arms of a vector select are concatenated vectors, split the select, and concatenate the resul...
static SDValue lowerShuffleAsElementInsertion(const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef< int > Mask, const APInt &Zeroable, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Try to lower insertion of a single element into a zero vector.
static SDValue combineXor(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
static bool isUnpackWdShuffleMask(ArrayRef< int > Mask, MVT VT, const SelectionDAG &DAG)
static SDValue LowerTruncateVecPack(MVT DstVT, SDValue In, const SDLoc &DL, const X86Subtarget &Subtarget, SelectionDAG &DAG)
This function lowers a vector truncation from vXi32/vXi64 to vXi8/vXi16 into X86ISD::PACKUS/X86ISD::PAC...
static SDValue lowerShuffleAsRepeatedMaskAndLanePermute(const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef< int > Mask, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Handle case where shuffle sources are coming from the same 128-bit lane and every lane can be represe...
static SDValue getSHUFPDImmForMask(ArrayRef< int > Mask, const SDLoc &DL, SelectionDAG &DAG)
static void computeKnownBitsForPSADBW(SDValue LHS, SDValue RHS, KnownBits &Known, const APInt &DemandedElts, const SelectionDAG &DAG, unsigned Depth)
static int getSEHRegistrationNodeSize(const Function *Fn)
static SDValue getScalarMaskingNode(SDValue Op, SDValue Mask, SDValue PreservedSrc, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Creates an SDNode for a predicated scalar operation.
static SDValue buildFromShuffleMostly(SDValue Op, const SDLoc &DL, SelectionDAG &DAG)
static SDValue lowerBuildVectorToBitOp(BuildVectorSDNode *Op, const SDLoc &DL, const X86Subtarget &Subtarget, SelectionDAG &DAG)
If a BUILD_VECTOR's source elements all apply the same bit operation and one of their operands is con...
static SDValue LowerZERO_EXTEND(SDValue Op, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static SDValue isFNEG(SelectionDAG &DAG, SDNode *N, unsigned Depth=0)
Returns the negated value if the node N flips sign of FP value.
static SDValue lowerShuffleWithPERMV(const SDLoc &DL, MVT VT, ArrayRef< int > OriginalMask, SDValue V1, SDValue V2, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static SDValue lowerV16I16Shuffle(const SDLoc &DL, ArrayRef< int > Mask, const APInt &Zeroable, SDValue V1, SDValue V2, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Handle lowering of 16-lane 16-bit integer shuffles.
static SDValue lowerShuffleWithUndefHalf(const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef< int > Mask, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Lower shuffles where an entire half of a 256- or 512-bit vector is UNDEF.
static SDValue lowerAtomicArith(SDValue N, SelectionDAG &DAG, const X86Subtarget &Subtarget)
Lower atomic_load_ops into LOCK-prefixed operations.
static SDValue convertShiftLeftToScale(SDValue Amt, const SDLoc &dl, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static SDValue lowerV32I8Shuffle(const SDLoc &DL, ArrayRef< int > Mask, const APInt &Zeroable, SDValue V1, SDValue V2, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Handle lowering of 32-lane 8-bit integer shuffles.
static bool checkAndUpdateEFLAGSKill(MachineBasicBlock::iterator SelectItr, MachineBasicBlock *BB, const TargetRegisterInfo *TRI)
static void computeKnownBitsForPMADDWD(SDValue LHS, SDValue RHS, KnownBits &Known, const APInt &DemandedElts, const SelectionDAG &DAG, unsigned Depth)
static SDValue LowerBITCAST(SDValue Op, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static SDValue LowerEXTRACT_VECTOR_ELT_SSE4(SDValue Op, SelectionDAG &DAG)
static SDValue lowerShuffleAsTruncBroadcast(const SDLoc &DL, MVT VT, SDValue V0, int BroadcastIdx, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Try to lower broadcast of a single - truncated - integer element, coming from a scalar_to_vector/buil...
static bool isAddSubOrSubAdd(const BuildVectorSDNode *BV, const X86Subtarget &Subtarget, SelectionDAG &DAG, SDValue &Opnd0, SDValue &Opnd1, unsigned &NumExtracts, bool &IsSubAdd, bool &HasAllowContract)
Returns true iff BV builds a vector with the result equivalent to the result of ADDSUB/SUBADD operati...
static SDValue ExpandHorizontalBinOp(const SDValue &V0, const SDValue &V1, const SDLoc &DL, SelectionDAG &DAG, unsigned X86Opcode, bool Mode, bool isUndefLO, bool isUndefHI)
Emit a sequence of two 128-bit horizontal add/sub followed by a concat_vector.
static SDValue combineBitOpWithPACK(unsigned Opc, const SDLoc &DL, EVT VT, SDValue N0, SDValue N1, SelectionDAG &DAG)
static SDValue LowerToHorizontalOp(const BuildVectorSDNode *BV, const SDLoc &DL, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Lower BUILD_VECTOR to a horizontal add/sub operation if possible.
SDValue getGFNICtrlMask(unsigned Opcode, SelectionDAG &DAG, const SDLoc &DL, MVT VT, unsigned Amt=0)
static SDValue combineFP_ROUND(SDNode *N, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue combineAndShuffleNot(SDNode *N, SelectionDAG &DAG, const X86Subtarget &Subtarget)
Try to fold: and (vector_shuffle<Z,...,Z> (insert_vector_elt undef, (xor X, -1), Z),...
static SDValue lowerShuffleAsBitMask(const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef< int > Mask, const APInt &Zeroable, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Try to emit a bitmask instruction for a shuffle.
static SDValue lowerShuffleWithUNPCK256(const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef< int > Mask, SelectionDAG &DAG)
Check if the mask can be mapped to a preliminary 64-bit permute (vperm) followed by a 256-bit unpack.
static bool is256BitLaneRepeatedShuffleMask(MVT VT, ArrayRef< int > Mask, SmallVectorImpl< int > &RepeatedMask)
Test whether a shuffle mask is equivalent within each 256-bit lane.
static SDValue LowerMSCATTER(SDValue Op, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static SDValue LowerShiftByScalarVariable(SDValue Op, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue LowerSIGN_EXTEND_Mask(SDValue Op, const SDLoc &dl, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static SDValue getVectorShuffle(SelectionDAG &DAG, EVT VT, const SDLoc &dl, SDValue V1, SDValue V2, ArrayRef< int > Mask)
static SDValue combineMaskedStore(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
static SDValue combineCommutableSHUFP(SDValue N, MVT VT, const SDLoc &DL, SelectionDAG &DAG)
static SDValue LowerUINT_TO_FP_i32(SDValue Op, const SDLoc &dl, SelectionDAG &DAG, const X86Subtarget &Subtarget)
32-bit unsigned integer to float expansion.
static SDValue combineAdd(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
static SDValue LowerTruncateVecI1(SDValue Op, const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue combineVTRUNC(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI)
static SDValue LowerLoad(SDValue Op, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static cl::opt< int > ExperimentalPrefInnermostLoopAlignment("x86-experimental-pref-innermost-loop-alignment", cl::init(4), cl::desc("Sets the preferable loop alignment for experiments (as log2 bytes) " "for innermost loops only. If specified, this option overrides " "alignment set by x86-experimental-pref-loop-alignment."), cl::Hidden)
static SDValue createVariablePermute(MVT VT, SDValue SrcVec, SDValue IndicesVec, const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget)
Look for opportunities to create a VPERMV/VPERMILPV/PSHUFB variable permute from a vector of source v...
static SDValue getHopForBuildVector(const BuildVectorSDNode *BV, const SDLoc &DL, SelectionDAG &DAG, unsigned HOpcode, SDValue V0, SDValue V1)
static SDValue LowerRotate(SDValue Op, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static bool needCarryOrOverflowFlag(SDValue Flags)
static SDValue combineCVTPH2PS(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI)
static SDValue getOnesVector(EVT VT, SelectionDAG &DAG, const SDLoc &dl)
Returns a vector of specified type with all bits set.
static SDValue combineMaskedLoad(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
static bool isUndefLowerHalf(ArrayRef< int > Mask)
Return true if the mask creates a vector whose lower half is undefined.
static SDValue LowerMULH(SDValue Op, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static SDValue combineOrXorWithSETCC(unsigned Opc, const SDLoc &DL, EVT VT, SDValue N0, SDValue N1, SelectionDAG &DAG)
static SDValue combineRedundantDWordShuffle(SDValue N, MutableArrayRef< int > Mask, const SDLoc &DL, SelectionDAG &DAG)
Search for a combinable shuffle across a chain ending in pshufd.
static SDValue getBMIMatchingOp(unsigned Opc, SelectionDAG &DAG, SDValue OpMustEq, SDValue Op, unsigned Depth)
static SDValue createPSADBW(SelectionDAG &DAG, SDValue N0, SDValue N1, const SDLoc &DL, const X86Subtarget &Subtarget)
static SDValue lowerBuildVectorAsBlend(BuildVectorSDNode *BVOp, SDLoc const &DL, X86Subtarget const &Subtarget, SelectionDAG &DAG)
Attempt to lower a BUILD_VECTOR of scalar values to a shuffle of splats representing a blend.
static SDValue getTargetVShiftByConstNode(unsigned Opc, const SDLoc &dl, MVT VT, SDValue SrcOp, uint64_t ShiftAmt, SelectionDAG &DAG)
Handle vector element shifts where the shift amount is a constant.
static SDValue getPack(SelectionDAG &DAG, const X86Subtarget &Subtarget, const SDLoc &dl, MVT VT, SDValue LHS, SDValue RHS, bool PackHiHalf=false)
Returns a node that packs the LHS + RHS nodes together at half width.
static SDValue combineMOVDQ2Q(SDNode *N, SelectionDAG &DAG)
static bool matchUnaryShuffle(MVT MaskVT, ArrayRef< int > Mask, bool AllowFloatDomain, bool AllowIntDomain, SDValue V1, const SelectionDAG &DAG, const X86Subtarget &Subtarget, unsigned &Shuffle, MVT &SrcVT, MVT &DstVT)
static bool isConstantPowerOf2(SDValue V, unsigned EltSizeInBIts, bool AllowUndefs)
static SDValue lowerFPToIntToFP(SDValue CastToFP, const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget)
Given a scalar cast to FP with a cast to integer operand (almost an ftrunc), try to vectorize the cas...
static SDValue combineAndXorSubWithBMI(SDNode *And, const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget)
Fold AND(Y, XOR(X, NEG(X))) -> ANDN(Y, BLSMSK(X)) if BMI is available.
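The fold rests on the bit identity X ^ -X == ~BLSMSK(X), where BLSMSK(X) = X ^ (X - 1), which puts the whole expression into ANDN's ~first-operand & second-operand form. A scalar sketch that exhaustively checks the identity on 8-bit values (not the combine itself, just its justification):

#include <cassert>
#include <cstdint>

int main() {
  for (unsigned I = 0; I < 256; ++I)
    for (unsigned J = 0; J < 256; ++J) {
      uint8_t X = I, Y = J;
      uint8_t Lhs = Y & (uint8_t)(X ^ (uint8_t)-X);   // AND(Y, XOR(X, NEG(X)))
      uint8_t Blsmsk = X ^ (uint8_t)(X - 1);          // BLSMSK(X)
      uint8_t Rhs = (uint8_t)~Blsmsk & Y;             // ANDN(BLSMSK(X), Y)
      assert(Lhs == Rhs);
    }
}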
static SDValue combineX86SubCmpForFlags(SDNode *N, SDValue Flag, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &ST)
static SDValue LowerVectorCTLZ_GFNI(SDValue Op, const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static bool getHalfShuffleMask(ArrayRef< int > Mask, MutableArrayRef< int > HalfMask, int &HalfIdx1, int &HalfIdx2)
If the input shuffle mask results in a vector that is undefined in all upper or lower half elements a...
static cl::opt< int > BrMergingBaseCostThresh("x86-br-merging-base-cost", cl::init(2), cl::desc("Sets the cost threshold for when multiple conditionals will be merged " "into one branch versus be split in multiple branches. Merging " "conditionals saves branches at the cost of additional instructions. " "This value sets the instruction cost limit, below which conditionals " "will be merged, and above which conditionals will be split. Set to -1 " "to never merge branches."), cl::Hidden)
static bool getFauxShuffleMask(SDValue N, const APInt &DemandedElts, SmallVectorImpl< int > &Mask, SmallVectorImpl< SDValue > &Ops, const SelectionDAG &DAG, unsigned Depth, bool ResolveKnownElts)
static bool hasBZHI(const X86Subtarget &Subtarget, MVT VT)
static SDValue emitLockedStackOp(SelectionDAG &DAG, const X86Subtarget &Subtarget, SDValue Chain, const SDLoc &DL)
Emit a locked operation on a stack location which does not change any memory location,...
static bool matchShuffleWithSHUFPD(MVT VT, SDValue &V1, SDValue &V2, bool &ForceV1Zero, bool &ForceV2Zero, unsigned &ShuffleImm, ArrayRef< int > Mask, const APInt &Zeroable)
static SDValue lowerV8F16Shuffle(const SDLoc &DL, ArrayRef< int > Mask, const APInt &Zeroable, SDValue V1, SDValue V2, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Lower 8-lane 16-bit floating point shuffles.
static SDValue LowerFSINCOS(SDValue Op, const X86Subtarget &Subtarget, SelectionDAG &DAG)
SDValue SplitOpsAndApply(SelectionDAG &DAG, const X86Subtarget &Subtarget, const SDLoc &DL, EVT VT, ArrayRef< SDValue > Ops, F Builder, bool CheckBWI=true, bool AllowAVX512=true)
static SDValue lowerAtomicArithWithLOCK(SDValue N, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue lowerShuffleAsBitBlend(const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef< int > Mask, SelectionDAG &DAG)
Try to emit a blend instruction for a shuffle using bit math.
static SDValue reduceMaskedLoadToScalarLoad(MaskedLoadSDNode *ML, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
If exactly one element of the mask is set for a non-extending masked load, it is a scalar load and ve...
static SDValue lower1BitShuffleAsKSHIFTR(const SDLoc &DL, ArrayRef< int > Mask, MVT VT, SDValue V1, SDValue V2, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static SDValue combineVectorPack(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
static SDValue expandIntrinsicWChainHelper(SDNode *N, const SDLoc &DL, SelectionDAG &DAG, unsigned TargetOpcode, unsigned SrcReg, const X86Subtarget &Subtarget, SmallVectorImpl< SDValue > &Results)
Handles the lowering of builtin intrinsics with chain that return their value into registers EDX:EAX.
static bool matchUnaryPermuteShuffle(MVT MaskVT, ArrayRef< int > Mask, const APInt &Zeroable, bool AllowFloatDomain, bool AllowIntDomain, const SelectionDAG &DAG, const X86Subtarget &Subtarget, unsigned &Shuffle, MVT &ShuffleVT, unsigned &PermuteImm)
static bool shouldExpandCmpArithRMWInIR(AtomicRMWInst *AI)
static SDValue combineVSelectToBLENDV(SDNode *N, SelectionDAG &DAG, const SDLoc &DL, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
If this is a dynamic select (non-constant condition) and we can match this node with one of the varia...
static SDValue combineVectorCompareAndMaskUnaryOp(SDNode *N, SelectionDAG &DAG)
static SDValue LowerBuildVectorAsInsert(SDValue Op, const SDLoc &DL, const APInt &NonZeroMask, unsigned NumNonZero, unsigned NumZero, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static bool isRepeatedTargetShuffleMask(unsigned LaneSizeInBits, unsigned EltSizeInBits, ArrayRef< int > Mask, SmallVectorImpl< int > &RepeatedMask)
Test whether a target shuffle mask is equivalent within each sub-lane.
static const char * getIndirectThunkSymbol(const X86Subtarget &Subtarget, Register Reg)
static bool isLaneCrossingShuffleMask(unsigned LaneSizeInBits, unsigned ScalarSizeInBits, ArrayRef< int > Mask)
Test whether there are elements crossing LaneSizeInBits lanes in this shuffle mask.
static SDValue FixupMMXIntrinsicTypes(SDNode *N, SelectionDAG &DAG)
static bool isShuffleMaskInputBroadcastable(int Input, ArrayRef< int > Mask, int BroadcastableElement=0)
Test whether the specified input (0 or 1) is a broadcast/splat blended by the given mask.
static SDValue getScatterNode(unsigned Opc, SDValue Op, SelectionDAG &DAG, SDValue Src, SDValue Mask, SDValue Base, SDValue Index, SDValue ScaleOp, SDValue Chain, const X86Subtarget &Subtarget)
static SDValue LowerAndToBT(SDValue And, ISD::CondCode CC, const SDLoc &dl, SelectionDAG &DAG, X86::CondCode &X86CC)
Result of 'and' is compared against zero.
static SDValue combinePMULH(SDValue Src, EVT VT, const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue lowerShuffleAsZeroOrAnyExtend(const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef< int > Mask, const APInt &Zeroable, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Try to lower a vector shuffle as a zero extension on any microarch.
static SDValue combineBasicSADPattern(SDNode *Extract, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static bool supportedVectorShiftWithBaseAmnt(EVT VT, const X86Subtarget &Subtarget, unsigned Opcode)
static SDValue combineVPMADD(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI)
static SDValue LowerMULO(SDValue Op, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static SDValue lowerShuffleWithSHUFPD(const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef< int > Mask, const APInt &Zeroable, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static SDValue combineBitOpWithShift(unsigned Opc, const SDLoc &DL, EVT VT, SDValue N0, SDValue N1, SelectionDAG &DAG)
static SDValue LowerHorizontalByteSum(SDValue V, MVT VT, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Compute the horizontal sum of bytes in V for the elements of VT.
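Horizontal byte sums on x86 are commonly built on PSADBW against an all-zero vector, since the sum of absolute differences against zero is simply the sum of the bytes in each 64-bit chunk. The scalar model below illustrates that behaviour only; it is an assumption about the general technique, not a transcription of this helper.

#include <cstdint>
#include <cstdio>

static uint64_t psadbwVsZero(uint64_t Lane) {
  uint64_t Sum = 0;
  for (int I = 0; I != 8; ++I)
    Sum += (Lane >> (8 * I)) & 0xff; // |byte - 0| == byte
  return Sum; // at most 8 * 255, so it fits in the low 16 bits of the lane
}

int main() {
  std::printf("%llu\n", (unsigned long long)psadbwVsZero(0x0101010101010101ULL)); // 8
}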
static SDValue LowerFP16_TO_FP(SDValue Op, SelectionDAG &DAG)
static SDValue lowerV32I16Shuffle(const SDLoc &DL, ArrayRef< int > Mask, const APInt &Zeroable, SDValue V1, SDValue V2, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Handle lowering of 32-lane 16-bit integer shuffles.
static SDValue combineBitcastToBoolVector(EVT VT, SDValue V, const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget, unsigned Depth=0)
static SDValue getPrefetchNode(unsigned Opc, SDValue Op, SelectionDAG &DAG, SDValue Mask, SDValue Base, SDValue Index, SDValue ScaleOp, SDValue Chain, const X86Subtarget &Subtarget)
static SDValue MarkEHGuard(SDValue Op, SelectionDAG &DAG)
static SDValue combineX86CloadCstore(SDNode *N, SelectionDAG &DAG)
static void growShuffleMask(ArrayRef< int > SrcMask, SmallVectorImpl< int > &DstMask, unsigned SrcSizeInBits, unsigned DstSizeInBits)
static SDValue lowerShuffleWithEXPAND(const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef< int > Mask, const APInt &Zeroable, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static void computeInLaneShuffleMask(const ArrayRef< int > &Mask, int LaneSize, SmallVector< int > &InLaneMask)
Helper to compute the in-lane shuffle mask for a complete shuffle mask.
static SDValue lowerX86CmpEqZeroToCtlzSrl(SDValue Op, SelectionDAG &DAG)
static SDValue combineVectorInsert(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
static bool isX86CCSigned(X86::CondCode X86CC)
Return true if the condition is a signed comparison operation.
static SDValue combineTESTP(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
static SDValue combineAnd(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
static SDValue getBROADCAST_LOAD(unsigned Opcode, const SDLoc &DL, EVT VT, EVT MemVT, MemSDNode *Mem, unsigned Offset, SelectionDAG &DAG)
static bool isUndefUpperHalf(ArrayRef< int > Mask)
Return true if the mask creates a vector whose upper half is undefined.
static SDValue LowerATOMIC_STORE(SDValue Op, SelectionDAG &DAG, const X86Subtarget &Subtarget)
uint64_t getGFNICtrlImm(unsigned Opcode, unsigned Amt=0)
static SDValue lowerShuffleWithPACK(const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef< int > Mask, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static SDValue lowerShuffleAsSpecificExtension(const SDLoc &DL, MVT VT, int Scale, int Offset, unsigned ExtOpc, SDValue InputV, ArrayRef< int > Mask, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Lower a vector shuffle as an any/signed/zero extension.
static SDValue LowerShiftParts(SDValue Op, SelectionDAG &DAG)
Lower SRA_PARTS and friends, which return two i32 values and take a 2 x i32 value to shift plus a shi...
static SDValue combineFMulcFCMulc(SDNode *N, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static bool canReduceVMulWidth(SDNode *N, SelectionDAG &DAG, ShrinkMode &Mode)
static std::pair< SDValue, SDValue > getX86XALUOOp(X86::CondCode &Cond, SDValue Op, SelectionDAG &DAG)
static SDValue combineVectorShiftVar(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
static SDValue LowerAVXCONCAT_VECTORS(SDValue Op, const SDLoc &dl, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue LowerAVG(SDValue Op, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static SDValue combineSetCC(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
static SDValue combineCompareEqual(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
Recognize the distinctive (AND (setcc ...) (setcc ..)) where both setccs reference the same FP CMP,...
static bool isVKClass(const TargetRegisterClass &RC)
Check if RC is a mask register class.
static int canLowerByDroppingElements(ArrayRef< int > Mask, bool MatchEven, bool IsSingleInput)
Check whether a compaction lowering can be done by dropping even/odd elements and compute how many ti...
static SDValue combineTruncatedArithmetic(SDNode *N, SelectionDAG &DAG, const X86Subtarget &Subtarget, const SDLoc &DL)
Attempt to pre-truncate inputs to arithmetic ops if it will simplify the codegen.
static SDValue lowerShuffleAsBroadcast(const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef< int > Mask, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Try to lower broadcast of a single element.
static SDValue combineCVTP2I_CVTTP2I(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI)
static void resolveTargetShuffleInputsAndMask(SmallVectorImpl< SDValue > &Inputs, SmallVectorImpl< int > &Mask)
Removes unused/repeated shuffle source inputs and adjusts the shuffle mask.
static SDValue lowerV64I8Shuffle(const SDLoc &DL, ArrayRef< int > Mask, const APInt &Zeroable, SDValue V1, SDValue V2, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Handle lowering of 64-lane 8-bit integer shuffles.
static SDValue combineBitOpWithMOVMSK(unsigned Opc, const SDLoc &DL, SDValue N0, SDValue N1, SelectionDAG &DAG)
static SDValue combineAndNotIntoANDNP(SDNode *N, const SDLoc &DL, SelectionDAG &DAG)
Try to fold: (and (xor X, -1), Y) -> (andnp X, Y).
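The underlying bit identity is trivial but worth stating: XOR with all-ones is bitwise NOT, so the pattern is exactly ANDN/ANDNP's ~first & second form. A quick exhaustive scalar check on 8-bit values (illustrative only):

#include <cassert>
#include <cstdint>

int main() {
  for (unsigned X = 0; X < 256; ++X)
    for (unsigned Y = 0; Y < 256; ++Y)
      assert(((uint8_t)(X ^ 0xff) & Y) == ((uint8_t)~X & (uint8_t)Y));
}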
static SDValue combineShuffleToAddSubOrFMAddSub(SDNode *N, const SDLoc &DL, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Try to combine a shuffle into a target-specific add-sub or mul-add-sub node.
static SDValue lowerShuffleAsLanePermuteAndPermute(const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef< int > Mask, SelectionDAG &DAG, const X86Subtarget &Subtarget)
Lower a vector shuffle crossing multiple 128-bit lanes as a lane permutation followed by a per-lane p...
static SDValue lowerV8I16Shuffle(const SDLoc &DL, ArrayRef< int > Mask, const APInt &Zeroable, SDValue V1, SDValue V2, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Generic lowering of 8-lane i16 shuffles.
static SDValue getEXTEND_VECTOR_INREG(unsigned Opcode, const SDLoc &DL, EVT VT, SDValue In, SelectionDAG &DAG)
static bool canonicalizeShuffleMaskWithCommute(ArrayRef< int > Mask)
Helper function that returns true if the shuffle mask should be commuted to improve canonicalization.
static SDValue lowerUINT_TO_FP_v2i32(SDValue Op, const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue getV4X86ShuffleImm8ForMask(ArrayRef< int > Mask, const SDLoc &DL, SelectionDAG &DAG)
static SDValue splitVSETCC(EVT VT, SDValue LHS, SDValue RHS, ISD::CondCode Cond, SelectionDAG &DAG, const SDLoc &dl)
Break a VSETCC 256/512-bit vector into two new 128/256 ones and then concatenate the result back.
static SDValue splitVectorStore(StoreSDNode *Store, SelectionDAG &DAG)
Change a vector store into a pair of half-size vector stores.
static SDValue widenSubVector(MVT VT, SDValue Vec, bool ZeroNewElements, const X86Subtarget &Subtarget, SelectionDAG &DAG, const SDLoc &dl)
Widen a vector to a larger size with the same scalar type, with the new elements either zero or undef...
static bool supportedVectorVarShift(EVT VT, const X86Subtarget &Subtarget, unsigned Opcode)
static bool isUndefInRange(ArrayRef< int > Mask, unsigned Pos, unsigned Size)
Return true if every element in Mask, beginning from position Pos and ending in Pos+Size is the undef...
static SDValue LowerToTLSGeneralDynamicModelX32(GlobalAddressSDNode *GA, SelectionDAG &DAG, const EVT PtrVT)
static SDValue combineFAndn(SDNode *N, SelectionDAG &DAG, const X86Subtarget &Subtarget)
Do target-specific dag combines on X86ISD::FANDN nodes.
static SDValue LowerToTLSExecModel(GlobalAddressSDNode *GA, SelectionDAG &DAG, const EVT PtrVT, TLSModel::Model model, bool is64Bit, bool isPIC)
static bool supportedVectorShiftWithImm(EVT VT, const X86Subtarget &Subtarget, unsigned Opcode)
static SDValue lower1BitShuffle(const SDLoc &DL, ArrayRef< int > Mask, MVT VT, SDValue V1, SDValue V2, const APInt &Zeroable, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static SDValue combineToExtendBoolVectorInReg(unsigned Opcode, const SDLoc &DL, EVT VT, SDValue N0, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
static SDValue splitVectorIntBinary(SDValue Op, SelectionDAG &DAG, const SDLoc &dl)
Break a binary integer operation into two half-sized ops and then concatenate the result back.
static SDValue createMMXBuildVector(BuildVectorSDNode *BV, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static LLVM_ATTRIBUTE_UNUSED bool isBlendOrUndef(ArrayRef< int > Mask)
Return true if every element in Mask is an in-place blend/select mask or is undef.
static SDValue LowerADDSUBO_CARRY(SDValue Op, SelectionDAG &DAG)
static unsigned getV4X86ShuffleImm(ArrayRef< int > Mask)
Get a 4-lane 8-bit shuffle immediate for a mask.
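This is the standard PSHUFD/SHUFPS-style encoding: each of the four result lanes takes its 2-bit source index from bits [2*i+1 : 2*i] of the immediate. A self-contained sketch of that encoding (the function name here is invented, not the helper's own):

#include <cassert>
#include <cstdio>

static unsigned shuffleImm8(const int (&Mask)[4]) {
  unsigned Imm = 0;
  for (int I = 0; I != 4; ++I) {
    assert(Mask[I] >= 0 && Mask[I] < 4 && "expects an in-range 4-lane mask");
    Imm |= (unsigned)Mask[I] << (2 * I);
  }
  return Imm;
}

int main() {
  int Reverse[4] = {3, 2, 1, 0};
  std::printf("0x%02x\n", shuffleImm8(Reverse)); // 0x1b, the lane-reverse immediate
}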
static SDValue combineShiftLeft(SDNode *N, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static void resolveTargetShuffleFromZeroables(SmallVectorImpl< int > &Mask, const APInt &KnownUndef, const APInt &KnownZero, bool ResolveKnownZeros=true)
static SDValue LowerBUILD_VECTORvXi1(SDValue Op, const SDLoc &dl, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue InsertBitToMaskVector(SDValue Op, SelectionDAG &DAG, const X86Subtarget &Subtarget)
Insert one bit into a mask vector, such as v16i1 or v8i1.
static SDValue getGatherNode(SDValue Op, SelectionDAG &DAG, SDValue Src, SDValue Mask, SDValue Base, SDValue Index, SDValue ScaleOp, SDValue Chain, const X86Subtarget &Subtarget)
static SDValue lowerShuffleAsLanePermuteAndRepeatedMask(const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef< int > Mask, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Lower a vector shuffle by first fixing the 128-bit lanes and then shuffling each lane.
static bool isSoftF16(T VT, const X86Subtarget &Subtarget)
static SDValue lowerV16I32Shuffle(const SDLoc &DL, ArrayRef< int > Mask, const APInt &Zeroable, SDValue V1, SDValue V2, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Handle lowering of 16-lane 32-bit integer shuffles.
static SDValue combineExtractVectorElt(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
Detect vector gather/scatter index generation and convert it from being a bunch of shuffles and extra...
static bool isSingleSHUFPSMask(ArrayRef< int > Mask)
Test whether this can be lowered with a single SHUFPS instruction.
static SDValue LowerFCanonicalize(SDValue Op, SelectionDAG &DAG)
static bool checkBoolTestAndOrSetCCCombine(SDValue Cond, X86::CondCode &CC0, X86::CondCode &CC1, SDValue &Flags, bool &isAnd)
Check whether Cond is an AND/OR of SETCCs off of the same EFLAGS.
static bool isX86LogicalCmp(SDValue Op)
Return true if opcode is a X86 logical comparison.
static bool isAnyInRange(ArrayRef< int > Mask, int Low, int Hi)
Return true if the value of any element in Mask falls within the specified range (L,...
static SDValue combineEXTRACT_SUBVECTOR(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
static cl::opt< bool > WidenShift("x86-widen-shift", cl::init(true), cl::desc("Replace narrow shifts with wider shifts."), cl::Hidden)
static SDValue combineSextInRegCmov(SDNode *N, SelectionDAG &DAG)
static SDValue detectSSatPattern(SDValue In, EVT VT, bool MatchPackUS=false)
Detect patterns of truncation with signed saturation: (truncate (smin ((smax (x, signed_min_of_dest_t...
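In scalar terms the pattern is: clamp the value to the destination type's signed range with smax/smin, then truncate, at which point the truncation is lossless. A minimal sketch for an i32-to-i16 saturation (widths chosen for illustration):

#include <algorithm>
#include <cassert>
#include <cstdint>

static int16_t truncSSat(int32_t X) {
  int32_t Clamped = std::min<int32_t>(std::max<int32_t>(X, INT16_MIN), INT16_MAX);
  return (int16_t)Clamped; // now a lossless truncation
}

int main() {
  assert(truncSSat(100000) == INT16_MAX);
  assert(truncSSat(-100000) == INT16_MIN);
  assert(truncSSat(-42) == -42);
}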
const unsigned FPStateSize
static bool matchShuffleWithUNPCK(MVT VT, SDValue &V1, SDValue &V2, unsigned &UnpackOpcode, bool IsUnary, ArrayRef< int > TargetMask, const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue combineFneg(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
Do target-specific dag combines on floating point negations.
static SDValue combineLoad(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
static SDValue combineToHorizontalAddSub(SDNode *N, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue combineXorSubCTLZ(SDNode *N, const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue insertSubVector(SDValue Result, SDValue Vec, unsigned IdxVal, SelectionDAG &DAG, const SDLoc &dl, unsigned vectorWidth)
static bool isHopBuildVector(const BuildVectorSDNode *BV, SelectionDAG &DAG, unsigned &HOpcode, SDValue &V0, SDValue &V1)
static SDValue combineSelectOfTwoConstants(SDNode *N, SelectionDAG &DAG, const SDLoc &DL)
static SDValue combineFOr(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
Do target-specific dag combines on X86ISD::FOR and X86ISD::FXOR nodes.
static SDValue LowerINSERT_SUBVECTOR(SDValue Op, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static SDValue combineINTRINSIC_VOID(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI)
static bool createShuffleMaskFromVSELECT(SmallVectorImpl< int > &Mask, SDValue Cond, bool IsBLENDV=false)
static SDValue getMaskNode(SDValue Mask, MVT MaskVT, const X86Subtarget &Subtarget, SelectionDAG &DAG, const SDLoc &dl)
Return Mask with the necessary casting or extending for Mask according to MaskVT when lowering maskin...
static SDValue lowerV8F64Shuffle(const SDLoc &DL, ArrayRef< int > Mask, const APInt &Zeroable, SDValue V1, SDValue V2, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Handle lowering of 8-lane 64-bit floating point shuffles.
static bool shouldUseHorizontalOp(bool IsSingleSource, SelectionDAG &DAG, const X86Subtarget &Subtarget)
Horizontal vector math instructions may be slower than normal math with shuffles.
static bool isFRClass(const TargetRegisterClass &RC)
Check if RC is a vector register class.
static SDValue splitAndLowerShuffle(const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef< int > Mask, SelectionDAG &DAG, bool SimpleOnly)
Generic routine to split vector shuffle into half-sized shuffles.
static SDValue combineAVX512SetCCToKMOV(EVT VT, SDValue Op0, ISD::CondCode CC, const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue LowerToTLSGeneralDynamicModel64(GlobalAddressSDNode *GA, SelectionDAG &DAG, const EVT PtrVT)
static SDValue IsNOT(SDValue V, SelectionDAG &DAG)
static SDValue combineFMinFMax(SDNode *N, SelectionDAG &DAG)
Do target-specific dag combines on X86ISD::FMIN and X86ISD::FMAX nodes.
static SDValue combineOr(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
static SDValue LowerMGATHER(SDValue Op, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static SDValue getVShift(bool isLeft, EVT VT, SDValue SrcOp, unsigned NumBits, SelectionDAG &DAG, const TargetLowering &TLI, const SDLoc &dl)
Return a vector logical shift node.
static SDValue combineVPDPBUSDPattern(SDNode *Extract, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue LowerVACOPY(SDValue Op, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static SDValue combineINTRINSIC_WO_CHAIN(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI)
static SDValue lowerV4I32Shuffle(const SDLoc &DL, ArrayRef< int > Mask, const APInt &Zeroable, SDValue V1, SDValue V2, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Lower 4-lane i32 vector shuffles.
static SDValue widenMaskVector(SDValue Vec, bool ZeroNewElements, const X86Subtarget &Subtarget, SelectionDAG &DAG, const SDLoc &dl)
Widen a mask vector to a minimum of v8i1/v16i1 to allow use of KSHIFT and bitcast with integer types.
static SDValue LowerAsSplatVectorLoad(SDValue SrcOp, MVT VT, const SDLoc &dl, SelectionDAG &DAG)
static bool isInRange(int Val, int Low, int Hi)
Return true if Val falls within the specified range (L, H].
static SDValue combineKSHIFT(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI)
static SDValue combineTargetShuffle(SDValue N, const SDLoc &DL, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
Try to combine x86 target specific shuffles.
static SDValue LowerSIGN_EXTEND(SDValue Op, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static std::pair< SDValue, SDValue > splitVector(SDValue Op, SelectionDAG &DAG, const SDLoc &dl)
static SDValue getBT(SDValue Src, SDValue BitNo, const SDLoc &DL, SelectionDAG &DAG)
Helper for attempting to create a X86ISD::BT node.
static SDValue EmitTruncSStore(bool SignedSat, SDValue Chain, const SDLoc &DL, SDValue Val, SDValue Ptr, EVT MemVT, MachineMemOperand *MMO, SelectionDAG &DAG)
Emit Truncating Store with signed or unsigned saturation.
static SDValue ExtendToType(SDValue InOp, MVT NVT, SelectionDAG &DAG, bool FillWithZeroes=false)
Widen a vector input to a vector of NVT.
static void getHorizDemandedElts(EVT VT, const APInt &DemandedElts, APInt &DemandedLHS, APInt &DemandedRHS)
static SDValue LowerMSTORE(SDValue Op, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static SDValue combineFMA(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
static SDValue lowerShuffleAsBlendAndPermute(const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef< int > Mask, SelectionDAG &DAG, bool ImmBlends=false)
Try to lower as a blend of elements from two inputs followed by a single-input permutation.
static bool matchShuffleAsEXTRQ(MVT VT, SDValue &V1, SDValue &V2, ArrayRef< int > Mask, uint64_t &BitLen, uint64_t &BitIdx, const APInt &Zeroable)
const unsigned X87StateSize
static SDValue lowerV8I64Shuffle(const SDLoc &DL, ArrayRef< int > Mask, const APInt &Zeroable, SDValue V1, SDValue V2, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Handle lowering of 8-lane 64-bit integer shuffles.
static bool isUndefOrEqual(int Val, int CmpVal)
Val is the undef sentinel value or equal to the specified value.
static SDValue lowerShuffleAsVTRUNC(const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef< int > Mask, const APInt &Zeroable, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static bool isTargetShuffle(unsigned Opcode)
static bool isSingleElementRepeatedMask(ArrayRef< int > Mask)
Check if the Mask consists of the same element repeated multiple times.
static SDValue LowerCVTPS2PH(SDValue Op, SelectionDAG &DAG)
static SDValue combineShiftRightArithmetic(SDNode *N, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue combinePMULDQ(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
static SDValue combineX86ShufflesRecursively(ArrayRef< SDValue > SrcOps, int SrcOpIndex, unsigned RootOpc, MVT RootVT, ArrayRef< int > RootMask, ArrayRef< const SDNode * > SrcNodes, unsigned Depth, unsigned MaxDepth, bool AllowVariableCrossLaneMask, bool AllowVariablePerLaneMask, bool IsMaskedShuffle, SelectionDAG &DAG, const SDLoc &DL, const X86Subtarget &Subtarget)
Fully generic combining of x86 shuffle instructions.
static SDValue LowerIntVSETCC_AVX512(SDValue Op, const SDLoc &dl, SelectionDAG &DAG)
static SDValue lowerShuffleWithSSE4A(const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef< int > Mask, const APInt &Zeroable, SelectionDAG &DAG)
Try to lower a vector shuffle using SSE4a EXTRQ/INSERTQ.
static SDValue lowerShuffleOfExtractsAsVperm(const SDLoc &DL, SDValue N0, SDValue N1, ArrayRef< int > Mask, SelectionDAG &DAG)
If we are extracting two 128-bit halves of a vector and shuffling the result, match that to a 256-bit...
static SDValue lowerV4F64Shuffle(const SDLoc &DL, ArrayRef< int > Mask, const APInt &Zeroable, SDValue V1, SDValue V2, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Handle lowering of 4-lane 64-bit floating point shuffles.
static SDValue getAVX512Node(unsigned Opcode, const SDLoc &DL, MVT VT, ArrayRef< SDValue > Ops, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue lowerToAddSubOrFMAddSub(const BuildVectorSDNode *BV, const SDLoc &DL, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Try to fold a build_vector that performs an 'addsub' or 'fmaddsub' or 'fsubadd' operation accordingly...
static SDValue lowerV8I16GeneralSingleInputShuffle(const SDLoc &DL, MVT VT, SDValue V, MutableArrayRef< int > Mask, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Lowering of single-input v8i16 shuffles is the cornerstone of SSE2 shuffle lowering,...
static SDValue lowerV2F64Shuffle(const SDLoc &DL, ArrayRef< int > Mask, const APInt &Zeroable, SDValue V1, SDValue V2, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Handle lowering of 2-lane 64-bit floating point shuffles.
static SDValue isUpperSubvectorUndef(SDValue V, const SDLoc &DL, SelectionDAG &DAG)
static cl::opt< int > BrMergingLikelyBias("x86-br-merging-likely-bias", cl::init(0), cl::desc("Increases 'x86-br-merging-base-cost' in cases that it is likely " "that all conditionals will be executed. For example for merging " "the conditionals (a == b && c > d), if its known that a == b is " "likely, then it is likely that if the conditionals are split " "both sides will be executed, so it may be desirable to increase " "the instruction cost threshold. Set to -1 to never merge likely " "branches."), cl::Hidden)
static SDValue getInvertedVectorForFMA(SDValue V, SelectionDAG &DAG)
static bool IsElementEquivalent(int MaskSize, SDValue Op, SDValue ExpectedOp, int Idx, int ExpectedIdx)
Checks whether the vector elements referenced by two shuffle masks are equivalent.
static int matchShuffleAsElementRotate(SDValue &V1, SDValue &V2, ArrayRef< int > Mask)
Try to match a vector shuffle as an element rotation.
static SDValue combineVEXTRACT_STORE(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
static bool isUndefOrZeroOrInRange(int Val, int Low, int Hi)
Return true if Val is undef, zero or if its value falls within the specified range (L,...
static const Constant * getTargetConstantFromBasePtr(SDValue Ptr)
static SDValue lowerShuffleAsBlend(const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef< int > Original, const APInt &Zeroable, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Try to emit a blend instruction for a shuffle.
static bool findEltLoadSrc(SDValue Elt, LoadSDNode *&Ld, int64_t &ByteOffset)
static bool isUndefOrInRange(int Val, int Low, int Hi)
Return true if Val is undef or if its value falls within the specified range (L, H].
static SDValue combineAddOfPMADDWD(SelectionDAG &DAG, SDValue N0, SDValue N1, const SDLoc &DL, EVT VT)
static bool collectConcatOps(SDNode *N, SmallVectorImpl< SDValue > &Ops, SelectionDAG &DAG)
static SDValue truncateVectorWithPACK(unsigned Opcode, EVT DstVT, SDValue In, const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget)
Helper to recursively truncate vector elements in half with PACKSS/PACKUS.
static SDValue LowerFGETSIGN(SDValue Op, SelectionDAG &DAG)
static SDValue combineSBB(SDNode *N, SelectionDAG &DAG)
static void computeKnownBitsForPMADDUBSW(SDValue LHS, SDValue RHS, KnownBits &Known, const APInt &DemandedElts, const SelectionDAG &DAG, unsigned Depth)
static SDValue combineSIntToFP(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
static std::pair< Value *, BitTestKind > FindSingleBitChange(Value *V)
static SDValue combineToFPTruncExtElt(SDNode *N, SelectionDAG &DAG)
If we are converting a value to floating-point, try to replace scalar truncate of an extracted vector...
static bool is128BitLaneCrossingShuffleMask(MVT VT, ArrayRef< int > Mask)
Test whether there are elements crossing 128-bit lanes in this shuffle mask.
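A mask entry crosses lanes when the source element it names lives in a different 128-bit lane than the destination slot. The sketch below shows that test for a single-input mask; it is a simplification (the real helper also folds two-input indices into range), and the names and constants are illustrative.

#include <cstdio>
#include <vector>

static bool crossesLanes(const std::vector<int> &Mask, unsigned EltBits) {
  unsigned LaneElts = 128 / EltBits; // elements per 128-bit lane
  for (unsigned I = 0, E = Mask.size(); I != E; ++I)
    if (Mask[I] >= 0 && (unsigned)Mask[I] / LaneElts != I / LaneElts)
      return true;
  return false;
}

int main() {
  std::printf("%d\n", crossesLanes({4, 5, 6, 7, 0, 1, 2, 3}, 32)); // 1: swaps the two halves of a v8i32
  std::printf("%d\n", crossesLanes({1, 0, 3, 2, 5, 4, 7, 6}, 32)); // 0: every element stays in its lane
}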
static SDValue EmitCmp(SDValue Op0, SDValue Op1, X86::CondCode X86CC, const SDLoc &dl, SelectionDAG &DAG, const X86Subtarget &Subtarget)
Emit nodes that will be selected as "cmp Op0,Op1", or something equivalent.
static SDValue LowerI64IntToFP16(SDValue Op, const SDLoc &dl, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue lowerV4I64Shuffle(const SDLoc &DL, ArrayRef< int > Mask, const APInt &Zeroable, SDValue V1, SDValue V2, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Handle lowering of 4-lane 64-bit integer shuffles.
static SDValue combinevXi1ConstantToInteger(SDValue Op, SelectionDAG &DAG)
const unsigned FPStateSizeInBits
static SDValue reduceMaskedStoreToScalarStore(MaskedStoreSDNode *MS, SelectionDAG &DAG, const X86Subtarget &Subtarget)
If exactly one element of the mask is set for a non-truncating masked store, it is a vector extract a...
static unsigned convertIntLogicToFPLogicOpcode(unsigned Opcode)
static SDValue lowerX86FPLogicOp(SDNode *N, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue narrowExtractedVectorSelect(SDNode *Ext, const SDLoc &DL, SelectionDAG &DAG)
If we are extracting a subvector of a vector select and the select condition is composed of concatena...
static SDValue combineScalarAndWithMaskSetcc(SDNode *N, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue lowerShuffleAsLanePermuteAndSHUFP(const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef< int > Mask, SelectionDAG &DAG)
static bool isNoopShuffleMask(ArrayRef< int > Mask)
Tiny helper function to identify a no-op mask.
static SDValue getUnpackh(SelectionDAG &DAG, const SDLoc &dl, EVT VT, SDValue V1, SDValue V2)
Returns a vector_shuffle node for an unpackh operation.
static SDValue combineExtractFromVectorLoad(SDNode *N, EVT VecVT, SDValue SrcVec, uint64_t Idx, const SDLoc &dl, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI)
static SDValue lowerShuffleAsByteShiftMask(const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef< int > Mask, const APInt &Zeroable, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Try to lower a vector shuffle as a byte shift sequence.
static SDValue combineFP_TO_xINT_SAT(SDNode *N, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static bool isTargetShuffleVariableMask(unsigned Opcode)
static bool isLogicOp(unsigned Opcode)
static SDValue lowerShuffleAsShift(const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef< int > Mask, const APInt &Zeroable, const X86Subtarget &Subtarget, SelectionDAG &DAG, bool BitwiseOnly)
static SDValue LowerBuildVectorv8i16(SDValue Op, const SDLoc &DL, const APInt &NonZeroMask, unsigned NumNonZero, unsigned NumZero, SelectionDAG &DAG, const X86Subtarget &Subtarget)
Custom lower build_vector of v8i16.
static bool matchBinaryShuffle(MVT MaskVT, ArrayRef< int > Mask, bool AllowFloatDomain, bool AllowIntDomain, SDValue &V1, SDValue &V2, const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget, unsigned &Shuffle, MVT &SrcVT, MVT &DstVT, bool IsUnary)
static SDValue lowerShuffleAsUNPCKAndPermute(const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef< int > Mask, SelectionDAG &DAG)
Try to lower as an unpack of elements from two inputs followed by a single-input permutation.
static bool canScaleShuffleElements(ArrayRef< int > Mask, unsigned NumDstElts)
static SDValue LowerBITREVERSE_XOP(SDValue Op, SelectionDAG &DAG)
static SDValue getShuffleVectorZeroOrUndef(SDValue V2, int Idx, bool IsZero, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Return a vector_shuffle of the specified vector and a zero or undef vector.
static SDValue lowerBuildVectorAsBroadcast(BuildVectorSDNode *BVOp, const SDLoc &dl, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Attempt to use the vbroadcast instruction to generate a splat value from a splat BUILD_VECTOR which u...
static SDValue combineMulToPMULDQ(SDNode *N, const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue LowerPARITY(SDValue Op, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static SDValue lowerV16F32Shuffle(const SDLoc &DL, ArrayRef< int > Mask, const APInt &Zeroable, SDValue V1, SDValue V2, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Handle lowering of 16-lane 32-bit floating point shuffles.
static SDValue LowerMINMAX(SDValue Op, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static SDValue combineToExtendCMOV(SDNode *Extend, SelectionDAG &DAG)
static bool isHorizontalBinOp(unsigned HOpcode, SDValue &LHS, SDValue &RHS, SelectionDAG &DAG, const X86Subtarget &Subtarget, bool IsCommutative, SmallVectorImpl< int > &PostShuffleMask, bool ForceHorizOp)
Return 'true' if this vector operation is "horizontal" and return the operands for the horizontal ope...
static bool getTargetShuffleMaskIndices(SDValue MaskNode, unsigned MaskEltSizeInBits, SmallVectorImpl< uint64_t > &RawMask, APInt &UndefElts)
static SDValue promoteExtBeforeAdd(SDNode *Ext, SelectionDAG &DAG, const X86Subtarget &Subtarget)
sext(add_nsw(x, C)) --> add(sext(x), C_sext) zext(add_nuw(x, C)) --> add(zext(x), C_zext) Promoting a...
static const Constant * getTargetConstantFromNode(LoadSDNode *Load)
static SDValue canonicalizeBitSelect(SDNode *N, const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static bool canCombineAsMaskOperation(SDValue V, const X86Subtarget &Subtarget)
static SDValue lowerShuffleAsVALIGN(const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef< int > Mask, const APInt &Zeroable, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Try to lower a vector shuffle as a dword/qword rotation.
static SDValue lowerVECTOR_COMPRESS(SDValue Op, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static bool isProfitableToUseFlagOp(SDValue Op)
static SDValue LowerBITREVERSE(SDValue Op, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static SDValue LowerFROUND(SDValue Op, SelectionDAG &DAG)
ISD::FROUND is defined to round to nearest with ties rounding away from 0.
static SDValue detectUSatPattern(SDValue In, EVT VT, SelectionDAG &DAG, const SDLoc &DL)
Detect patterns of truncation with unsigned saturation:
static SDValue narrowShuffle(ShuffleVectorSDNode *Shuf, SelectionDAG &DAG)
If we have a shuffle of AVX/AVX512 (256/512 bit) vectors that only uses the low half of each source v...
static X86::CondCode TranslateX86CC(ISD::CondCode SetCCOpcode, const SDLoc &DL, bool isFP, SDValue &LHS, SDValue &RHS, SelectionDAG &DAG)
Do a one-to-one translation of an ISD::CondCode to the X86-specific condition code,...
static SDValue getFlagsOfCmpZeroFori1(SelectionDAG &DAG, const SDLoc &DL, SDValue Mask)
static SDValue lower512BitShuffle(const SDLoc &DL, ArrayRef< int > Mask, MVT VT, SDValue V1, SDValue V2, const APInt &Zeroable, const X86Subtarget &Subtarget, SelectionDAG &DAG)
High-level routine to lower various 512-bit x86 vector shuffles.
static SDValue LowerBuildVectorv16i8(SDValue Op, const SDLoc &DL, const APInt &NonZeroMask, unsigned NumNonZero, unsigned NumZero, SelectionDAG &DAG, const X86Subtarget &Subtarget)
Custom lower build_vector of v16i8.
static bool getTargetConstantBitsFromNode(SDValue Op, unsigned EltSizeInBits, APInt &UndefElts, SmallVectorImpl< APInt > &EltBits, bool AllowWholeUndefs=true, bool AllowPartialUndefs=false)
static bool detectExtMul(SelectionDAG &DAG, const SDValue &Mul, SDValue &Op0, SDValue &Op1)
static SDValue combineMOVMSK(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
static SDValue combineLRINT_LLRINT(SDNode *N, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue lowerAddSubToHorizontalOp(SDValue Op, const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget)
Depending on uarch and/or optimizing for size, we might prefer to use a vector operation in place of ...
static SDValue combineShiftToPMULH(SDNode *N, SelectionDAG &DAG, const SDLoc &DL, const X86Subtarget &Subtarget)
static bool getParamsForOneTrueMaskedElt(MaskedLoadStoreSDNode *MaskedOp, SelectionDAG &DAG, SDValue &Addr, SDValue &Index, Align &Alignment, unsigned &Offset)
Given a masked memory load/store operation, return true if it has one mask bit set.
static SDValue reduceVMULWidth(SDNode *N, const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget)
When the operands of vector mul are extended from smaller size values, like i8 and i16,...
static X86::CondCode TranslateIntegerX86CC(ISD::CondCode SetCCOpcode)
static SDValue MarkEHRegistrationNode(SDValue Op, SelectionDAG &DAG)
static SDValue combineBROADCAST_LOAD(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI)
static SDValue combineFMinNumFMaxNum(SDNode *N, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue combineCMP(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
static SDValue combinei64TruncSrlConstant(SDValue N, EVT VT, SelectionDAG &DAG, const SDLoc &DL)
static bool isLegalConversion(MVT VT, MVT FloatVT, bool IsSigned, const X86Subtarget &Subtarget)
static SDValue combineX86AddSub(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &ST)
static SDValue LowerFCOPYSIGN(SDValue Op, SelectionDAG &DAG)
static SDValue createVPDPBUSD(SelectionDAG &DAG, SDValue LHS, SDValue RHS, unsigned &LogBias, const SDLoc &DL, const X86Subtarget &Subtarget)
static SDValue LowerFMINIMUM_FMAXIMUM(SDValue Op, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static SDValue lowerV2X128Shuffle(const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef< int > Mask, const APInt &Zeroable, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Handle lowering 2-lane 128-bit shuffles.
static SDValue lowerUINT_TO_FP_vec(SDValue Op, const SDLoc &dl, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue getSplitVectorSrc(SDValue LHS, SDValue RHS, bool AllowCommute)
static SDValue LowerFABSorFNEG(SDValue Op, SelectionDAG &DAG)
The only differences between FABS and FNEG are the mask and the logic op.
ShrinkMode
Different mul shrinking modes.
static SDValue combineVPMADD52LH(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI)
static SDValue concatSubVectors(SDValue V1, SDValue V2, SelectionDAG &DAG, const SDLoc &dl)
static SDValue combineINTRINSIC_W_CHAIN(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI)
static SDValue canonicalizeShuffleMaskWithHorizOp(MutableArrayRef< SDValue > Ops, MutableArrayRef< int > Mask, unsigned RootSizeInBits, const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue combineConstantPoolLoads(SDNode *N, const SDLoc &dl, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
static SDValue EmitMaskedTruncSStore(bool SignedSat, SDValue Chain, const SDLoc &DL, SDValue Val, SDValue Ptr, SDValue Mask, EVT MemVT, MachineMemOperand *MMO, SelectionDAG &DAG)
Emit Masked Truncating Store with signed or unsigned saturation.
static SDValue lowerVSELECTtoVectorShuffle(SDValue Op, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Try to lower a VSELECT instruction to a vector shuffle.
static bool matchShuffleAsBlend(MVT VT, SDValue V1, SDValue V2, MutableArrayRef< int > Mask, const APInt &Zeroable, bool &ForceV1Zero, bool &ForceV2Zero, uint64_t &BlendMask)
static SDValue adjustBitcastSrcVectorSSE1(SelectionDAG &DAG, SDValue Src, const SDLoc &DL)
static SDValue combineMulSpecial(uint64_t MulAmt, SDNode *N, SelectionDAG &DAG, EVT VT, const SDLoc &DL)
static SDValue combineBitcastvxi1(SelectionDAG &DAG, EVT VT, SDValue Src, const SDLoc &DL, const X86Subtarget &Subtarget)
static SDValue getUnpackl(SelectionDAG &DAG, const SDLoc &dl, EVT VT, SDValue V1, SDValue V2)
Returns a vector_shuffle node for an unpackl operation.
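The two helpers above build the interleaving shuffles behind the x86 UNPCKL/UNPCKH instructions. The following is only a hypothetical usage sketch, assuming it sits next to these static helpers in X86ISelLowering.cpp and that V1 and V2 are v4i32 operands built elsewhere:

#include "llvm/CodeGen/SelectionDAG.h"
using namespace llvm;

static SDValue buildUnpackPairSketch(SelectionDAG &DAG, const SDLoc &dl,
                                     SDValue V1, SDValue V2) {
  // Interleave the low and high halves of the two inputs, then add them.
  // For v4i32: Lo = {V1[0], V2[0], V1[1], V2[1]},
  //            Hi = {V1[2], V2[2], V1[3], V2[3]}.
  SDValue Lo = getUnpackl(DAG, dl, MVT::v4i32, V1, V2);
  SDValue Hi = getUnpackh(DAG, dl, MVT::v4i32, V1, V2);
  return DAG.getNode(ISD::ADD, dl, MVT::v4i32, Lo, Hi);
}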
static SDValue getScalarValueForVectorElement(SDValue V, int Idx, SelectionDAG &DAG)
Try to get a scalar value for a specific element of a vector.
static SDValue LowerZERO_EXTEND_Mask(SDValue Op, const SDLoc &DL, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static unsigned getOpcodeForIndirectThunk(unsigned RPOpc)
static SDValue lowerV16I8Shuffle(const SDLoc &DL, ArrayRef< int > Mask, const APInt &Zeroable, SDValue V1, SDValue V2, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Generic lowering of v16i8 shuffles.
static SDValue matchTruncateWithPACK(unsigned &PackOpcode, EVT DstVT, SDValue In, const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget, const SDNodeFlags Flags=SDNodeFlags())
Helper to determine if In truncated to DstVT has the necessary signbits / leading zero bits to be tru...
static unsigned getSHUFPDImm(ArrayRef< int > Mask)
static bool isNullFPScalarOrVectorConst(SDValue V)
static bool hasIdenticalHalvesShuffleMask(ArrayRef< int > Mask)
Return true if a shuffle mask chooses elements identically in its top and bottom halves.
static bool matchShuffleWithPACK(MVT VT, MVT &SrcVT, SDValue &V1, SDValue &V2, unsigned &PackOpcode, ArrayRef< int > TargetMask, const SelectionDAG &DAG, const X86Subtarget &Subtarget, unsigned MaxStages=1)
static SDValue combineVectorCompare(SDNode *N, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static bool matchShuffleAsVTRUNC(MVT &SrcVT, MVT &DstVT, MVT VT, ArrayRef< int > Mask, const APInt &Zeroable, const X86Subtarget &Subtarget)
static SDValue combineBITREVERSE(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
static SDValue PromoteMaskArithmetic(SDValue N, const SDLoc &DL, EVT VT, SelectionDAG &DAG, const X86Subtarget &Subtarget, unsigned Depth)
static SDValue combineArithReduction(SDNode *ExtElt, SelectionDAG &DAG, const X86Subtarget &Subtarget)
Try to convert a vector reduction sequence composed of binops and shuffles into horizontal ops.
static SDValue combineINSERT_SUBVECTOR(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
static SDValue lowerShuffleAsBitRotate(const SDLoc &DL, MVT VT, SDValue V1, ArrayRef< int > Mask, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Lower shuffle using X86ISD::VROTLI rotations.
static SDValue lowerShuffleAsDecomposedShuffleMerge(const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef< int > Mask, const APInt &Zeroable, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Generic routine to decompose a shuffle and blend into independent blends and permutes.
static SDValue combineEXTEND_VECTOR_INREG(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
static bool useVPTERNLOG(const X86Subtarget &Subtarget, MVT VT)
static SDValue combineBlendOfPermutes(MVT VT, SDValue N0, SDValue N1, ArrayRef< int > BlendMask, const APInt &DemandedElts, SelectionDAG &DAG, const X86Subtarget &Subtarget, const SDLoc &DL)
static SDValue combineSetCCAtomicArith(SDValue Cmp, X86::CondCode &CC, SelectionDAG &DAG, const X86Subtarget &Subtarget)
Combine: (brcond/cmov/setcc .., (cmp (atomic_load_add x, 1), 0), COND_S) to: (brcond/cmov/setcc ....
static SDValue combineSetCCEFLAGS(SDValue EFLAGS, X86::CondCode &CC, SelectionDAG &DAG, const X86Subtarget &Subtarget)
Optimize an EFLAGS definition used according to the condition code CC into a simpler EFLAGS value,...
static bool isBroadcastShuffleMask(ArrayRef< int > Mask)
static SDValue LowerEXTEND_VECTOR_INREG(SDValue Op, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static SDValue combinePDEP(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI)
static SDValue combineExtSetcc(SDNode *N, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue canonicalizeShuffleWithOp(SDValue N, SelectionDAG &DAG, const SDLoc &DL)
static SDValue matchPMADDWD_2(SelectionDAG &DAG, SDNode *N, const SDLoc &DL, EVT VT, const X86Subtarget &Subtarget)
static SDValue LowerVectorCTLZ(SDValue Op, const SDLoc &DL, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static SDValue foldXor1SetCC(SDNode *N, const SDLoc &DL, SelectionDAG &DAG)
Fold a xor(setcc cond, val), 1 --> setcc (inverted(cond), val)
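The in-tree combine operates on the target's own SETCC node form; the sketch below only illustrates the underlying idea on a generic ISD::SETCC and is not the actual implementation:

#include "llvm/CodeGen/SelectionDAG.h"
using namespace llvm;

static SDValue foldXor1SetCCSketch(SDNode *N, const SDLoc &DL,
                                   SelectionDAG &DAG) {
  // Match (xor (setcc LHS, RHS, CC), 1) and rebuild the setcc with the
  // inverted condition code instead of emitting the xor.
  if (N->getOpcode() != ISD::XOR)
    return SDValue();
  SDValue SetCC = N->getOperand(0);
  auto *One = dyn_cast<ConstantSDNode>(N->getOperand(1));
  if (!One || !One->isOne() || SetCC.getOpcode() != ISD::SETCC)
    return SDValue();
  ISD::CondCode CC = cast<CondCodeSDNode>(SetCC.getOperand(2))->get();
  ISD::CondCode InvCC =
      ISD::getSetCCInverse(CC, SetCC.getOperand(0).getValueType());
  return DAG.getSetCC(DL, SetCC.getValueType(), SetCC.getOperand(0),
                      SetCC.getOperand(1), InvCC);
}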
static SDValue MatchVectorAllEqualTest(SDValue OrigLHS, SDValue OrigRHS, ISD::CondCode CC, const SDLoc &DL, const X86Subtarget &Subtarget, SelectionDAG &DAG, X86::CondCode &X86CC)
static SDValue lowerShuffleAsByteRotate(const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef< int > Mask, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static StringRef getInstrStrFromOpNo(const SmallVectorImpl< StringRef > &AsmStrs, unsigned OpNo)
static bool isSequentialOrUndefOrZeroInRange(ArrayRef< int > Mask, unsigned Pos, unsigned Size, int Low, int Step=1)
Return true if every element in Mask, beginning from position Pos and ending in Pos+Size,...
static SDValue lowerShuffleAsSplitOrBlend(const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef< int > Mask, const APInt &Zeroable, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Either split a vector in halves or decompose the shuffles and the blend/unpack.
static SDValue widenBuildVec(SDNode *Extend, SelectionDAG &DAG)
static bool canWidenShuffleElements(ArrayRef< int > Mask, SmallVectorImpl< int > &WidenedMask)
Helper function to test whether a shuffle mask could be simplified by widening the elements being shu...
static SDValue splitVectorIntUnary(SDValue Op, SelectionDAG &DAG, const SDLoc &dl)
Break a unary integer operation into two half-sized ops and then concatenate the result back.
static SDValue combineSext(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
static SDValue lowerV2I64Shuffle(const SDLoc &DL, ArrayRef< int > Mask, const APInt &Zeroable, SDValue V1, SDValue V2, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Handle lowering of 2-lane 64-bit integer shuffles.
static SDValue LowerADDSAT_SUBSAT(SDValue Op, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue combineLogicBlendIntoConditionalNegate(EVT VT, SDValue Mask, SDValue X, SDValue Y, const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue getShuffleScalarElt(SDValue Op, unsigned Index, SelectionDAG &DAG, unsigned Depth)
Returns the scalar element that will make up the i'th element of the result of the vector shuffle.
static unsigned getTargetVShiftUniformOpcode(unsigned Opc, bool IsVariable)
static bool matchShuffleAsInsertPS(SDValue &V1, SDValue &V2, unsigned &InsertPSMask, const APInt &Zeroable, ArrayRef< int > Mask, SelectionDAG &DAG)
static bool isNonZeroElementsInOrder(const APInt &Zeroable, ArrayRef< int > Mask, const EVT &VectorType, bool &IsZeroSideLeft)
static SDValue lowerUINT_TO_FP_vXi32(SDValue Op, const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue combineMul(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
static SDValue emitOrXorXorTree(SDValue X, const SDLoc &DL, SelectionDAG &DAG, EVT VecVT, EVT CmpVT, bool HasPT, F SToV)
Recursive helper for combineVectorSizedSetCCEquality() to emit the memcmp expansion.
static SDValue truncateAVX512SetCCNoBWI(EVT VT, EVT OpVT, SDValue LHS, SDValue RHS, ISD::CondCode CC, const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget)
If we have AVX512, but not BWI and this is a vXi16/vXi8 setcc, just pre-promote its result type since...
static SDValue lowerShuffleAsPermuteAndUnpack(const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef< int > Mask, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Try to lower a shuffle as a permute of the inputs followed by an UNPCK instruction.
static SDValue combineAndOrForCcmpCtest(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &ST)
static SDValue narrowLoadToVZLoad(LoadSDNode *LN, MVT MemVT, MVT VT, SelectionDAG &DAG)
static SDValue scalarizeExtEltFP(SDNode *ExtElt, SelectionDAG &DAG, const X86Subtarget &Subtarget, TargetLowering::DAGCombinerInfo &DCI)
Extracting a scalar FP value from vector element 0 is free, so extract each operand first,...
static SDValue combineSetCCMOVMSK(SDValue EFLAGS, X86::CondCode &CC, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static bool isAddSubOrSubAddMask(ArrayRef< int > Mask, bool &Op0Even)
Checks if the shuffle mask takes subsequent elements alternately from two vectors.
static bool isCompletePermute(ArrayRef< int > Mask)
Return true if every element of a single input is referenced by the shuffle mask.
static SDValue recoverFramePointer(SelectionDAG &DAG, const Function *Fn, SDValue EntryEBP)
When the MSVC runtime transfers control to us, either to an outlined function or when returning to a ...
static SDValue combineX86INT_TO_FP(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI)
static void getReadTimeStampCounter(SDNode *N, const SDLoc &DL, unsigned Opcode, SelectionDAG &DAG, const X86Subtarget &Subtarget, SmallVectorImpl< SDValue > &Results)
Handles the lowering of builtin intrinsics that read the time stamp counter (x86_rdtsc and x86_rdtscp...
static SDValue LowerShiftByScalarImmediate(SDValue Op, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue LowerVectorAllEqual(const SDLoc &DL, SDValue LHS, SDValue RHS, ISD::CondCode CC, const APInt &OriginalMask, const X86Subtarget &Subtarget, SelectionDAG &DAG, X86::CondCode &X86CC)
static bool is128BitUnpackShuffleMask(ArrayRef< int > Mask, const SelectionDAG &DAG)
static bool isOrXorXorTree(SDValue X, bool Root=true)
Recursive helper for combineVectorSizedSetCCEquality() to see if we have a recognizable memcmp expans...
static SDValue LowerAVXExtend(SDValue Op, const SDLoc &dl, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue LowerSCALAR_TO_VECTOR(SDValue Op, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static SDValue combineTruncate(SDNode *N, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue combineFAnd(SDNode *N, SelectionDAG &DAG, const X86Subtarget &Subtarget)
Do target-specific dag combines on X86ISD::FAND nodes.
static SDValue combineFaddCFmul(SDNode *N, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue combineCONCAT_VECTORS(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
static ConstantPoolSDNode * getTargetConstantPoolFromBasePtr(SDValue Ptr)
static SDValue canonicalizeLaneShuffleWithRepeatedOps(SDValue V, SelectionDAG &DAG, const SDLoc &DL)
Attempt to fold vpermf128(op(),op()) -> op(vpermf128(),vpermf128()).
static bool isShuffleEquivalent(ArrayRef< int > Mask, ArrayRef< int > ExpectedMask, SDValue V1=SDValue(), SDValue V2=SDValue())
Checks whether a shuffle mask is equivalent to an explicit list of arguments.
static SDValue lowerV8F32Shuffle(const SDLoc &DL, ArrayRef< int > Mask, const APInt &Zeroable, SDValue V1, SDValue V2, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Handle lowering of 8-lane 32-bit floating point shuffles.
static SDValue LowerI64IntToFP_AVX512DQ(SDValue Op, const SDLoc &dl, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue LowerBUILD_VECTORAsVariablePermute(SDValue V, const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue lowerShuffleAsByteRotateAndPermute(const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef< int > Mask, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Helper to form a PALIGNR-based rotate+permute, merging 2 inputs and then permuting the elements of th...
static SDValue combineX86GatherScatter(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI)
static SDValue combineVectorHADDSUB(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
static SDValue LowerVectorCTPOP(SDValue Op, const SDLoc &DL, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static SDValue combineX86ShuffleChain(ArrayRef< SDValue > Inputs, unsigned RootOpc, MVT RootVT, ArrayRef< int > BaseMask, int Depth, ArrayRef< const SDNode * > SrcNodes, bool AllowVariableCrossLaneMask, bool AllowVariablePerLaneMask, bool IsMaskedShuffle, SelectionDAG &DAG, const SDLoc &DL, const X86Subtarget &Subtarget)
Combine an arbitrary chain of shuffles into a single instruction if possible.
static SDValue getAVX512TruncNode(const SDLoc &DL, MVT DstVT, SDValue Src, const X86Subtarget &Subtarget, SelectionDAG &DAG, bool ZeroUppers)
static bool isFMAddSubOrFMSubAdd(const X86Subtarget &Subtarget, SelectionDAG &DAG, SDValue &Opnd0, SDValue &Opnd1, SDValue &Opnd2, unsigned ExpectedUses, bool AllowSubAddOrAddSubContract)
Returns true if it is possible to fold MUL and an idiom that has already been recognized as ADDSUB/SUBAD...
static void createPackShuffleMask(MVT VT, SmallVectorImpl< int > &Mask, bool Unary, unsigned NumStages=1)
Create a shuffle mask that matches the PACKSS/PACKUS truncation.
static bool isUndefOrEqualInRange(ArrayRef< int > Mask, int CmpVal, unsigned Pos, unsigned Size)
Return true if every element in Mask, beginning from position Pos and ending in Pos+Size is the undef...
static SDValue combineFaddFsub(SDNode *N, SelectionDAG &DAG, const X86Subtarget &Subtarget)
Do target-specific dag combines on floating-point adds/subs.
static SDValue LowerToTLSGeneralDynamicModel32(GlobalAddressSDNode *GA, SelectionDAG &DAG, const EVT PtrVT)
static SDValue splitVectorOp(SDValue Op, SelectionDAG &DAG, const SDLoc &dl)
Break an operation into two half-sized ops and then concatenate the results.
static cl::opt< bool > MulConstantOptimization("mul-constant-optimization", cl::init(true), cl::desc("Replace 'mul x, Const' with more effective instructions like " "SHIFT, LEA, etc."), cl::Hidden)
static SDValue getIndexFromUnindexedLoad(LoadSDNode *Ld)
static bool isAnyZero(ArrayRef< int > Mask)
Return true if the value of any element in Mask is the zero sentinel value.
static SDValue combineShuffle(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
static SDValue truncateVectorWithPACKUS(EVT DstVT, SDValue In, const SDLoc &DL, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Truncate using inreg zero extension (AND mask) and X86ISD::PACKUS.
static SDValue lowerINT_TO_FP_vXi64(SDValue Op, const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static bool isMaskableNode(SDValue V, const X86Subtarget &Subtarget)
static void resolveZeroablesFromTargetShuffle(const SmallVectorImpl< int > &Mask, APInt &KnownUndef, APInt &KnownZero)
static SDValue rebuildGatherScatter(MaskedGatherScatterSDNode *GorS, SDValue Index, SDValue Base, SDValue Scale, SelectionDAG &DAG)
static SDValue matchVPMADD52(SDNode *N, SelectionDAG &DAG, const SDLoc &DL, EVT VT, const X86Subtarget &Subtarget)
static SDValue combineSubABS(EVT VT, const SDLoc &DL, SDValue N0, SDValue N1, SelectionDAG &DAG)
static SmallVector< int, 4 > getPSHUFShuffleMask(SDValue N)
Get the PSHUF-style mask from PSHUF node.
static SDValue scalarizeVectorStore(StoreSDNode *Store, MVT StoreVT, SelectionDAG &DAG)
Scalarize a vector store, bitcasting to StoreVT to determine the scalar type.
static SDValue LowerBUILD_VECTORvXbf16(SDValue Op, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue combineShuffleToFMAddSub(SDNode *N, const SDLoc &DL, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Combine shuffle of two fma nodes into FMAddSub or FMSubAdd.
static SDValue getAVX2GatherNode(unsigned Opc, SDValue Op, SelectionDAG &DAG, SDValue Src, SDValue Mask, SDValue Base, SDValue Index, SDValue ScaleOp, SDValue Chain, const X86Subtarget &Subtarget)
static SDValue lowerShufflePairAsUNPCKAndPermute(const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef< int > Mask, SelectionDAG &DAG)
static bool isUndefOrZero(int Val)
Val is either the undef or zero sentinel value.
static SDValue combineAndNotOrIntoAndNotAnd(SDNode *N, const SDLoc &DL, SelectionDAG &DAG)
Folds (and X, (or Y, ~Z)) --> (and X, ~(and ~Y, Z)). This undoes the inverse fold performed in InstCom...
static SDValue combineBEXTR(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
static SDValue combineFMADDSUB(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI)
static SDValue combineCMov(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
Optimize X86ISD::CMOV [LHS, RHS, CONDCODE (e.g. X86::COND_NE), CONDVAL].
static SDValue extract128BitVector(SDValue Vec, unsigned IdxVal, SelectionDAG &DAG, const SDLoc &dl)
Generate a DAG to grab 128 bits from a vector larger than 128 bits.
static SDValue EmitAVX512Test(SDValue Op0, SDValue Op1, ISD::CondCode CC, const SDLoc &dl, SelectionDAG &DAG, const X86Subtarget &Subtarget, SDValue &X86CC)
static SDValue lowerShuffleWithSHUFPS(const SDLoc &DL, MVT VT, ArrayRef< int > Mask, SDValue V1, SDValue V2, SelectionDAG &DAG)
Lower a vector shuffle using the SHUFPS instruction.
static SDValue combineStore(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
static SDValue combineX86ShuffleChainWithExtract(ArrayRef< SDValue > Inputs, unsigned RootOpcode, MVT RootVT, ArrayRef< int > BaseMask, int Depth, ArrayRef< const SDNode * > SrcNodes, bool AllowVariableCrossLaneMask, bool AllowVariablePerLaneMask, bool IsMaskedShuffle, SelectionDAG &DAG, const SDLoc &DL, const X86Subtarget &Subtarget)
static SDValue combineX86SetCC(SDNode *N, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue combineMinMaxReduction(SDNode *Extract, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static LLVM_ATTRIBUTE_UNUSED bool isHorizOp(unsigned Opcode)
static SDValue combineHorizOpWithShuffle(SDNode *N, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue LowerVectorCTLZ_AVX512CDI(SDValue Op, SelectionDAG &DAG, const X86Subtarget &Subtarget)
Lower a vector CTLZ using native supported vector CTLZ instruction.
static SDValue ExtractBitFromMaskVector(SDValue Op, SelectionDAG &DAG, const X86Subtarget &Subtarget)
Extract one bit from a mask vector, such as v16i1 or v8i1.
static SDValue LowervXi8MulWithUNPCK(SDValue A, SDValue B, const SDLoc &dl, MVT VT, bool IsSigned, const X86Subtarget &Subtarget, SelectionDAG &DAG, SDValue *Low=nullptr)
static SDValue lowerShuffleAsBlendOfPSHUFBs(const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef< int > Mask, const APInt &Zeroable, SelectionDAG &DAG, bool &V1InUse, bool &V2InUse)
Helper to form a PSHUFB-based shuffle+blend, opportunistically avoiding the blend if only one input i...
static bool matchShuffleAsINSERTQ(MVT VT, SDValue &V1, SDValue &V2, ArrayRef< int > Mask, uint64_t &BitLen, uint64_t &BitIdx)
static SDValue getBitSelect(const SDLoc &DL, MVT VT, SDValue LHS, SDValue RHS, SDValue Mask, SelectionDAG &DAG)
static SDValue combineAVG(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
static bool isSequentialOrUndefInRange(ArrayRef< int > Mask, unsigned Pos, unsigned Size, int Low, int Step=1)
Return true if every element in Mask, beginning from position Pos and ending in Pos + Size,...
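As a minimal sketch of the predicate just described, assuming the usual convention that -1 is the undef sentinel in a shuffle mask (the in-tree helper goes through isUndefOrEqual rather than comparing against -1 directly):

#include "llvm/ADT/ArrayRef.h"
using namespace llvm;

static bool isSequentialOrUndefInRangeSketch(ArrayRef<int> Mask, unsigned Pos,
                                             unsigned Size, int Low,
                                             int Step = 1) {
  // Every element in [Pos, Pos + Size) must be undef (-1) or equal to the
  // expected sequential value, which advances by Step per element.
  for (unsigned I = Pos, E = Pos + Size; I != E; ++I, Low += Step)
    if (Mask[I] != -1 && Mask[I] != Low)
      return false;
  return true;
}
// For example, Mask = {0, 1, -1, 3} satisfies the predicate for
// Pos = 0, Size = 4, Low = 0, Step = 1.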
static cl::opt< int > BrMergingUnlikelyBias("x86-br-merging-unlikely-bias", cl::init(-1), cl::desc("Decreases 'x86-br-merging-base-cost' in cases that it is unlikely " "that all conditionals will be executed. For example for merging " "the conditionals (a == b && c > d), if its known that a == b is " "unlikely, then it is unlikely that if the conditionals are split " "both sides will be executed, so it may be desirable to decrease " "the instruction cost threshold. Set to -1 to never merge unlikely " "branches."), cl::Hidden)
static SDValue createSetFPEnvNodes(SDValue Ptr, SDValue Chain, const SDLoc &DL, EVT MemVT, MachineMemOperand *MMO, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static bool getTargetShuffleAndZeroables(SDValue N, SmallVectorImpl< int > &Mask, SmallVectorImpl< SDValue > &Ops, APInt &KnownUndef, APInt &KnownZero)
Decode a target shuffle mask and inputs and see if any values are known to be undef or zero from thei...
static SDValue combinePredicateReduction(SDNode *Extract, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue LowerBuildVectorv4x32(SDValue Op, const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget)
Custom lower build_vector of v4i32 or v4f32.
static bool isTargetShuffleEquivalent(MVT VT, ArrayRef< int > Mask, ArrayRef< int > ExpectedMask, const SelectionDAG &DAG, SDValue V1=SDValue(), SDValue V2=SDValue())
Checks whether a target shuffle mask is equivalent to an explicit pattern.
static SDValue LowerABS(SDValue Op, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static SDValue pushAddIntoCmovOfConsts(SDNode *N, const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget)
CMOV of constants requires materializing constant operands in registers.
static SDValue LowerCMP_SWAP(SDValue Op, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static SDValue LowerToTLSLocalDynamicModel(GlobalAddressSDNode *GA, SelectionDAG &DAG, const EVT PtrVT, bool Is64Bit, bool Is64BitLP64)
static SDValue combineTruncateWithSat(SDValue In, EVT VT, const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue combineBT(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI)
static SDValue expandFP_TO_UINT_SSE(MVT VT, SDValue Src, const SDLoc &dl, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static int getUnderlyingExtractedFromVec(SDValue &ExtractedFromVec, SDValue ExtIdx)
For an EXTRACT_VECTOR_ELT with a constant index return the real underlying vector and index.
static SDValue combineVectorShiftImm(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
static bool isUnaryOp(unsigned Opcode)
static SDValue LowerCONCAT_VECTORSvXi1(SDValue Op, const SDLoc &dl, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static SDValue combineBrCond(SDNode *N, SelectionDAG &DAG, const X86Subtarget &Subtarget)
Optimize branch condition evaluation.
static bool hasFPCMov(unsigned X86CC)
Is there a floating point cmov for the specific X86 condition code?
static int getOneTrueElt(SDValue V)
If V is a build vector of boolean constants and exactly one of those constants is true,...
static SDValue lowerShuffleWithVPMOV(const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef< int > Mask, const APInt &Zeroable, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static SDValue foldXorTruncShiftIntoCmp(SDNode *N, const SDLoc &DL, SelectionDAG &DAG)
Try to turn tests against the signbit in the form of: XOR(TRUNCATE(SRL(X, size(X)-1)),...
static SDValue lowerShuffleWithUNPCK(const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef< int > Mask, SelectionDAG &DAG)
static constexpr int Concat[]
Value * RHS
Value * LHS
BinaryOperator * Mul
auto IsFreeTruncation
static const unsigned FramePtr
The Input class is used to parse a yaml document into in-memory structs and vectors.
LLVM_ABI opStatus convert(const fltSemantics &ToSemantics, roundingMode RM, bool *losesInfo)
Definition APFloat.cpp:6057
static LLVM_ABI APFloat getAllOnesValue(const fltSemantics &Semantics)
Returns a float which is bitcast from an all-ones integer value.
Definition APFloat.cpp:6082
void clearSign()
Definition APFloat.h:1298
opStatus next(bool nextDown)
Definition APFloat.h:1254
void changeSign()
Definition APFloat.h:1297
static APFloat getZero(const fltSemantics &Sem, bool Negative=false)
Factory for Positive and Negative Zero.
Definition APFloat.h:1079
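A short, self-contained usage sketch for the APFloat members listed above; the values are chosen only for illustration:

#include "llvm/ADT/APFloat.h"
#include <cassert>
using namespace llvm;

void apfloatUsageSketch() {
  // Negative zero, then clear the sign bit to get +0.0f.
  APFloat Zero = APFloat::getZero(APFloat::IEEEsingle(), /*Negative=*/true);
  assert(Zero.isNegative() && Zero.isZero());
  Zero.clearSign();
  assert(!Zero.isNegative());

  // Flip the sign of 1.0f, then widen it to double; widening is exact.
  APFloat X(1.0f);
  X.changeSign();
  bool LosesInfo = false;
  X.convert(APFloat::IEEEdouble(), APFloat::rmNearestTiesToEven, &LosesInfo);
  assert(!LosesInfo);
}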
Class for arbitrary precision integers.
Definition APInt.h:78
static APInt getAllOnes(unsigned numBits)
Return an APInt of a specified width with all bits set.
Definition APInt.h:234
void clearBit(unsigned BitPosition)
Set a given bit to 0.
Definition APInt.h:1406
bool isNegatedPowerOf2() const
Check if this APInt's negated value is a power of two greater than zero.
Definition APInt.h:449
LLVM_ABI APInt zext(unsigned width) const
Zero extend to a new width.
Definition APInt.cpp:1012
static APInt getSignMask(unsigned BitWidth)
Get the SignMask for a specific bit width.
Definition APInt.h:229
bool isMinSignedValue() const
Determine if this is the smallest signed value.
Definition APInt.h:423
uint64_t getZExtValue() const
Get zero extended value.
Definition APInt.h:1540
void setHighBits(unsigned hiBits)
Set the top hiBits bits.
Definition APInt.h:1391
unsigned popcount() const
Count the number of bits set.
Definition APInt.h:1670
void setBitsFrom(unsigned loBit)
Set the top bits starting from loBit.
Definition APInt.h:1385
LLVM_ABI uint64_t extractBitsAsZExtValue(unsigned numBits, unsigned bitPosition) const
Definition APInt.cpp:520
LLVM_ABI APInt zextOrTrunc(unsigned width) const
Zero extend or truncate to width.
Definition APInt.cpp:1033
unsigned getActiveBits() const
Compute the number of active bits in the value.
Definition APInt.h:1512
LLVM_ABI APInt trunc(unsigned width) const
Truncate to new width.
Definition APInt.cpp:936
static APInt getMaxValue(unsigned numBits)
Gets maximum unsigned value of APInt for specific bit width.
Definition APInt.h:206
void setBit(unsigned BitPosition)
Set the bit whose position is given as "bitPosition" to 1.
Definition APInt.h:1330
APInt abs() const
Get the absolute value.
Definition APInt.h:1795
bool isAllOnes() const
Determine if all bits are set. This is true for zero-width values.
Definition APInt.h:371
static APInt getBitsSet(unsigned numBits, unsigned loBit, unsigned hiBit)
Get a value with a block of bits set.
Definition APInt.h:258
bool isZero() const
Determine if this value is zero, i.e. all bits are clear.
Definition APInt.h:380
bool isSignMask() const
Check if the APInt's value is returned by getSignMask.
Definition APInt.h:466
LLVM_ABI APInt urem(const APInt &RHS) const
Unsigned remainder operation.
Definition APInt.cpp:1666
void setSignBit()
Set the sign bit to 1.
Definition APInt.h:1340
unsigned getBitWidth() const
Return the number of bits in the APInt.
Definition APInt.h:1488
bool ult(const APInt &RHS) const
Unsigned less than comparison.
Definition APInt.h:1111
static APInt getSignedMaxValue(unsigned numBits)
Gets maximum signed value of APInt for a specific bit width.
Definition APInt.h:209
static APInt getMinValue(unsigned numBits)
Gets minimum unsigned value of APInt for a specific bit width.
Definition APInt.h:216
bool isNegative() const
Determine sign of this APInt.
Definition APInt.h:329
bool intersects(const APInt &RHS) const
This operation tests if there are any pairs of corresponding bits between this APInt and RHS that are...
Definition APInt.h:1249
bool eq(const APInt &RHS) const
Equality comparison.
Definition APInt.h:1079
int32_t exactLogBase2() const
Definition APInt.h:1783
void clearAllBits()
Set every bit to 0.
Definition APInt.h:1396
void ashrInPlace(unsigned ShiftAmt)
Arithmetic right-shift this APInt by ShiftAmt in place.
Definition APInt.h:834
unsigned countr_zero() const
Count the number of trailing zero bits.
Definition APInt.h:1639
bool isSignedIntN(unsigned N) const
Check if this APInt has an N-bits signed integer value.
Definition APInt.h:435
unsigned getNumSignBits() const
Computes the number of leading bits of this APInt that are equal to its sign bit.
Definition APInt.h:1628
unsigned countl_zero() const
The APInt version of std::countl_zero.
Definition APInt.h:1598
static LLVM_ABI APInt getSplat(unsigned NewLen, const APInt &V)
Return a value containing V broadcasted over NewLen bits.
Definition APInt.cpp:651
static APInt getSignedMinValue(unsigned numBits)
Gets minimum signed value of APInt for a specific bit width.
Definition APInt.h:219
unsigned countTrailingZeros() const
Definition APInt.h:1647
unsigned getSignificantBits() const
Get the minimum bit size for this signed APInt.
Definition APInt.h:1531
void flipAllBits()
Toggle every bit to its opposite value.
Definition APInt.h:1452
unsigned countl_one() const
Count the number of leading one bits.
Definition APInt.h:1615
LLVM_ABI void insertBits(const APInt &SubBits, unsigned bitPosition)
Insert the bits from a smaller APInt starting at bitPosition.
Definition APInt.cpp:397
void clearLowBits(unsigned loBits)
Set bottom loBits bits to 0.
Definition APInt.h:1435
unsigned logBase2() const
Definition APInt.h:1761
APInt ashr(unsigned ShiftAmt) const
Arithmetic right-shift function.
Definition APInt.h:827
void setAllBits()
Set every bit to 1.
Definition APInt.h:1319
bool getBoolValue() const
Convert APInt to a boolean value.
Definition APInt.h:471
bool isMask(unsigned numBits) const
Definition APInt.h:488
bool isMaxSignedValue() const
Determine if this is the largest signed value.
Definition APInt.h:405
bool isNonNegative() const
Determine if this APInt Value is non-negative (>= 0)
Definition APInt.h:334
bool ule(const APInt &RHS) const
Unsigned less or equal comparison.
Definition APInt.h:1150
LLVM_ABI APInt sext(unsigned width) const
Sign extend to a new width.
Definition APInt.cpp:985
void setBits(unsigned loBit, unsigned hiBit)
Set the bits from loBit (inclusive) to hiBit (exclusive) to 1.
Definition APInt.h:1367
APInt shl(unsigned shiftAmt) const
Left-shift function.
Definition APInt.h:873
bool isSubsetOf(const APInt &RHS) const
This operation checks that all bits set in this APInt are also set in RHS.
Definition APInt.h:1257
bool isPowerOf2() const
Check if this APInt's value is a power of two greater than zero.
Definition APInt.h:440
static APInt getLowBitsSet(unsigned numBits, unsigned loBitsSet)
Constructs an APInt value that has the bottom loBitsSet bits set.
Definition APInt.h:306
bool isSignBitSet() const
Determine if sign bit of this APInt is set.
Definition APInt.h:341
static APInt getHighBitsSet(unsigned numBits, unsigned hiBitsSet)
Constructs an APInt value that has the top hiBitsSet bits set.
Definition APInt.h:296
static APInt getZero(unsigned numBits)
Get the '0' value for the specified bit-width.
Definition APInt.h:200
void setLowBits(unsigned loBits)
Set the bottom loBits bits.
Definition APInt.h:1388
LLVM_ABI APInt extractBits(unsigned numBits, unsigned bitPosition) const
Return an APInt with the extracted bits [bitPosition,bitPosition+numBits).
Definition APInt.cpp:482
bool isIntN(unsigned N) const
Check if this APInt has an N-bits unsigned integer value.
Definition APInt.h:432
bool isOne() const
Determine if this is a value of 1.
Definition APInt.h:389
static APInt getBitsSetFrom(unsigned numBits, unsigned loBit)
Constructs an APInt value that has a contiguous range of bits set.
Definition APInt.h:286
static APInt getOneBitSet(unsigned numBits, unsigned BitNo)
Return an APInt with exactly one bit set in the result.
Definition APInt.h:239
void lshrInPlace(unsigned ShiftAmt)
Logical right-shift this APInt by ShiftAmt in place.
Definition APInt.h:858
APInt lshr(unsigned shiftAmt) const
Logical right-shift function.
Definition APInt.h:851
unsigned countr_one() const
Count the number of trailing one bits.
Definition APInt.h:1656
bool uge(const APInt &RHS) const
Unsigned greater or equal comparison.
Definition APInt.h:1221
bool isMaxValue() const
Determine if this is the largest unsigned value.
Definition APInt.h:399
LLVM_ABI APInt truncSSat(unsigned width) const
Truncate to new width with signed saturation.
Definition APInt.cpp:973
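A compact usage sketch exercising several of the APInt members documented above; the constants are arbitrary and serve only as an illustration:

#include "llvm/ADT/APInt.h"
#include <cassert>
using namespace llvm;

void apintUsageSketch() {
  APInt AllOnes = APInt::getAllOnes(16);
  assert(AllOnes.isAllOnes() && AllOnes.popcount() == 16);

  APInt X(32, 0x80);
  assert(X.isPowerOf2() && X.logBase2() == 7 && X.countr_zero() == 7);

  // Zero- vs. sign-extension of an 8-bit all-ones value.
  APInt Byte(8, 0xFF);
  assert(Byte.zext(16).getZExtValue() == 0xFF);
  assert(Byte.sext(16).isAllOnes());

  // In-place bit manipulation.
  APInt M = APInt::getLowBitsSet(32, 8); // 0x000000FF
  M.clearBit(0);                         // 0x000000FE
  M.setHighBits(4);                      // 0xF00000FE
  assert(M.extractBitsAsZExtValue(8, 0) == 0xFE);
}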
ArrayRef - Represent a constant reference to an array (0 or more elements consecutively in memory),...
Definition ArrayRef.h:41
bool equals(ArrayRef RHS) const
equals - Check for element-wise equality.
Definition ArrayRef.h:183
iterator end() const
Definition ArrayRef.h:136
size_t size() const
size - Get the array size.
Definition ArrayRef.h:147
ArrayRef< T > drop_back(size_t N=1) const
Drop the last N elements of the array.
Definition ArrayRef.h:206
iterator begin() const
Definition ArrayRef.h:135
bool empty() const
empty - Check if the array is empty.
Definition ArrayRef.h:142
ArrayRef< T > slice(size_t N, size_t M) const
slice(n, m) - Chop off the first N elements of the array, and keep M elements in the array.
Definition ArrayRef.h:191
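A small usage sketch for the ArrayRef members above; ArrayRef never owns its storage, so the underlying array must outlive the view:

#include "llvm/ADT/ArrayRef.h"
#include <cassert>
using namespace llvm;

void arrayRefUsageSketch() {
  int Storage[] = {0, 1, 2, 3, 4, 5};
  ArrayRef<int> A(Storage);             // non-owning view over Storage
  assert(A.size() == 6 && !A.empty());

  ArrayRef<int> Mid = A.slice(1, 3);    // {1, 2, 3}
  assert(Mid.equals({1, 2, 3}));

  ArrayRef<int> Front = A.drop_back(2); // {0, 1, 2, 3}
  assert(Front.size() == 4);

  int Sum = 0;
  for (int V : A)                       // begin()/end() enable range-for
    Sum += V;
  assert(Sum == 15);
}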
static LLVM_ABI ArrayType * get(Type *ElementType, uint64_t NumElements)
This static method is the primary way to construct an ArrayType.
static AtomicOrdering getStrongestFailureOrdering(AtomicOrdering SuccessOrdering)
Returns the strongest permitted ordering on failure, given the desired ordering on success.
an instruction that atomically reads a memory location, combines it with another value,...
Align getAlign() const
Return the alignment of the memory that is being allocated by the instruction.
BinOp
This enumeration lists the possible modifications atomicrmw can make.
@ Add
*p = old + v
@ FAdd
*p = old + v
@ USubCond
Subtract only if no unsigned overflow.
@ Min
*p = old <signed v ? old : v
@ Sub
*p = old - v
@ And
*p = old & v
@ Xor
*p = old ^ v
@ USubSat
*p = usub.sat(old, v) usub.sat matches the behavior of llvm.usub.sat.
@ FSub
*p = old - v
@ UIncWrap
Increment one up to a maximum value.
@ Max
*p = old >signed v ? old : v
@ UMin
*p = old <unsigned v ? old : v
@ FMin
*p = minnum(old, v) minnum matches the behavior of llvm.minnum.
@ UMax
*p = old >unsigned v ? old : v
@ FMax
*p = maxnum(old, v) maxnum matches the behavior of llvm.maxnum.
@ UDecWrap
Decrement one until a minimum value or zero.
@ Nand
*p = ~(old & v)
Value * getPointerOperand()
BinOp getOperation() const
SyncScope::ID getSyncScopeID() const
Returns the synchronization scope ID of this rmw instruction.
AtomicOrdering getOrdering() const
Returns the ordering constraint of this rmw instruction.
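As an illustration of the BinOp enumeration above, the hypothetical helper below maps a few atomicrmw operations to the semantics strings used in this listing:

#include "llvm/IR/Instructions.h"
using namespace llvm;

static const char *describeRMW(const AtomicRMWInst &RMW) {
  switch (RMW.getOperation()) {
  case AtomicRMWInst::Add:  return "*p = old + v";
  case AtomicRMWInst::Sub:  return "*p = old - v";
  case AtomicRMWInst::And:  return "*p = old & v";
  case AtomicRMWInst::Nand: return "*p = ~(old & v)";
  case AtomicRMWInst::UMax: return "*p = old >unsigned v ? old : v";
  default:                  return "other atomicrmw operation";
  }
}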
This is an SDNode representing atomic operations.
LLVM_ABI StringRef getValueAsString() const
Return the attribute's value as a string.
LLVM Basic Block Representation.
Definition BasicBlock.h:62
size_type count() const
count - Returns the number of bits which are set.
Definition BitVector.h:181
bool any() const
any - Returns true if any bit is set.
Definition BitVector.h:189
bool none() const
none - Returns true if none of the bits are set.
Definition BitVector.h:207
A "pseudo-class" with methods for operating on BUILD_VECTORs.
LLVM_ABI bool getRepeatedSequence(const APInt &DemandedElts, SmallVectorImpl< SDValue > &Sequence, BitVector *UndefElements=nullptr) const
Find the shortest repeating sequence of values in the build vector.
LLVM_ABI SDValue getSplatValue(const APInt &DemandedElts, BitVector *UndefElements=nullptr) const
Returns the demanded splatted value or a null value if this is not a splat.
LLVM_ABI bool isConstantSplat(APInt &SplatValue, APInt &SplatUndef, unsigned &SplatBitSize, bool &HasAnyUndefs, unsigned MinSplatBits=0, bool isBigEndian=false) const
Check if this is a constant splat, and if so, find the smallest element size that splats the vector.
LLVM_ABI bool isConstant() const
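A hedged sketch of the typical isConstantSplat query pattern; the helper name and the 64-bit / no-undef restrictions are illustrative choices, not taken from this file:

#include "llvm/CodeGen/SelectionDAGNodes.h"
using namespace llvm;

static bool getSmallSplat(const BuildVectorSDNode *BV, APInt &SplatValue) {
  APInt SplatUndef;
  unsigned SplatBitSize;
  bool HasAnyUndefs;
  // Find the smallest element size that splats the whole vector; reject
  // splats wider than 64 bits or containing undef lanes.
  return BV->isConstantSplat(SplatValue, SplatUndef, SplatBitSize,
                             HasAnyUndefs) &&
         SplatBitSize <= 64 && !HasAnyUndefs;
}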
This class represents a function call, abstracting a target machine's calling convention.
Predicate
This enumeration lists the possible predicates for CmpInst subclasses.
Definition InstrTypes.h:678
@ ICMP_SLT
signed less than
Definition InstrTypes.h:707
@ ICMP_SGT
signed greater than
Definition InstrTypes.h:705
@ ICMP_NE
not equal
Definition InstrTypes.h:700
Predicate getPredicate() const
Return the predicate for this instruction.
Definition InstrTypes.h:767
An abstraction over a floating-point predicate, and a pack of an integer predicate with samesign info...
static LLVM_ABI Constant * get(ArrayType *T, ArrayRef< Constant * > V)
static LLVM_ABI Constant * get(LLVMContext &Context, ArrayRef< uint8_t > Elts)
get() constructors - Return a constant with vector type with an element count and element type matchi...
This is the shared class of boolean and integer constants.
Definition Constants.h:87
static LLVM_ABI bool isValueValidForType(Type *Ty, uint64_t V)
This static method returns true if the type Ty is big enough to represent the value V.
const Constant * getConstVal() const
uint64_t getZExtValue() const
const APInt & getAPIntValue() const
static LLVM_ABI Constant * get(ArrayRef< Constant * > V)
This is an important base class in LLVM.
Definition Constant.h:43
static LLVM_ABI Constant * getIntegerValue(Type *Ty, const APInt &V)
Return the value for an integer or pointer constant, or a vector thereof, with the given scalar value...
static LLVM_ABI Constant * getNullValue(Type *Ty)
Constructor to create a '0' constant of arbitrary type.
LLVM_ABI Constant * getAggregateElement(unsigned Elt) const
For aggregates (struct/array/vector) return the constant that corresponds to the specified element if...
A parsed version of the target data layout string in and methods for querying it.
Definition DataLayout.h:63
TypeSize getTypeAllocSize(Type *Ty) const
Returns the offset in bytes between successive objects of the specified type, including alignment pad...
A debug info location.
Definition DebugLoc.h:124
iterator find(const_arg_type_t< KeyT > Val)
Definition DenseMap.h:167
std::pair< iterator, bool > try_emplace(KeyT &&Key, Ts &&...Args)
Definition DenseMap.h:237
unsigned size() const
Definition DenseMap.h:110
bool empty() const
Definition DenseMap.h:109
iterator begin()
Definition DenseMap.h:78
iterator end()
Definition DenseMap.h:81
std::pair< iterator, bool > insert(const std::pair< KeyT, ValueT > &KV)
Definition DenseMap.h:222
Tagged union holding either a T or a Error.
Definition Error.h:485
This is a fast-path instruction selection class that generates poor code and doesn't support illegal ...
Definition FastISel.h:66
static LLVM_ABI FixedVectorType * get(Type *ElementType, unsigned NumElts)
Definition Type.cpp:803
FunctionLoweringInfo - This contains information that is global to a function that is used when lower...
Type::subtype_iterator param_iterator
bool hasOptSize() const
Optimize this function for size (-Os) or minimum size (-Oz).
Definition Function.h:706
Attribute getFnAttribute(Attribute::AttrKind Kind) const
Return the attribute for the given attribute kind.
Definition Function.cpp:762
uint64_t getFnAttributeAsParsedInteger(StringRef Kind, uint64_t Default=0) const
For a string attribute Kind, parse attribute as an integer.
Definition Function.cpp:774
bool hasMinSize() const
Optimize this function for minimum size (-Oz).
Definition Function.h:703
CallingConv::ID getCallingConv() const
getCallingConv()/setCallingConv(CC) - These methods get and set the calling convention of this functio...
Definition Function.h:270
bool hasPersonalityFn() const
Check whether this function has a personality function.
Definition Function.h:903
Constant * getPersonalityFn() const
Get the personality function associated with this function.
AttributeList getAttributes() const
Return the attribute list for this Function.
Definition Function.h:352
LLVMContext & getContext() const
getContext - Return a reference to the LLVMContext associated with this function.
Definition Function.cpp:359
bool hasFnAttribute(Attribute::AttrKind Kind) const
Return true if the function has the attribute.
Definition Function.cpp:727
const GlobalValue * getGlobal() const
static StringRef dropLLVMManglingEscape(StringRef Name)
If the given string begins with the GlobalValue name mangling escape character '\1',...
LLVM_ABI bool isAbsoluteSymbolRef() const
Returns whether this is a reference to an absolute symbol.
Definition Globals.cpp:436
ThreadLocalMode getThreadLocalMode() const
Module * getParent()
Get the module that this global value is contained inside of...
This class is used to form a handle around another node that is persistent and is updated across invo...
LLVM_ABI InstListType::iterator eraseFromParent()
This method unlinks 'this' from the containing basic block and deletes it.
Instruction * user_back()
Specialize the methods defined in Value, as we know that an instruction can only be used by other ins...
LLVM_ABI const Function * getFunction() const
Return the function this instruction belongs to.
Class to represent integer types.
This is an important class for using LLVM in a threaded context.
Definition LLVMContext.h:68
LLVM_ABI void diagnose(const DiagnosticInfo &DI)
Report a message to the currently installed diagnostic handler.
bool isIndexed() const
Return true if this is a pre/post inc/dec load/store.
An instruction for reading from memory.
void setAtomic(AtomicOrdering Ordering, SyncScope::ID SSID=SyncScope::System)
Sets the ordering constraint and the synchronization scope ID of this load instruction.
This class is used to represent ISD::LOAD nodes.
const SDValue & getBasePtr() const
bool usesWindowsCFI() const
Definition MCAsmInfo.h:652
LLVM_ABI MCSymbol * getOrCreateParentFrameOffsetSymbol(const Twine &FuncName)
LLVM_ABI MCSymbol * getOrCreateLSDASymbol(const Twine &FuncName)
MCSymbol - Instances of this class represent a symbol name in the MC file, and MCSymbols are created ...
Definition MCSymbol.h:42
Set of metadata that should be preserved when using BuildMI().
Machine Value Type.
static MVT getFloatingPointVT(unsigned BitWidth)
bool is128BitVector() const
Return true if this is a 128-bit vector type.
@ INVALID_SIMPLE_VALUE_TYPE
SimpleValueType SimpleTy
uint64_t getScalarSizeInBits() const
MVT changeVectorElementType(MVT EltVT) const
Return a VT for a vector type whose attributes match ourselves with the exception of the element type...
bool bitsLE(MVT VT) const
Return true if this has no more bits than VT.
unsigned getVectorNumElements() const
bool isVector() const
Return true if this is a vector value type.
bool isInteger() const
Return true if this is an integer or a vector integer type.
bool isScalableVector() const
Return true if this is a vector value type where the runtime length is machine dependent.
bool is32BitVector() const
Return true if this is a 32-bit vector type.
MVT changeTypeToInteger()
Return the type converted to an equivalently sized integer or vector with integer element type.
static LLVM_ABI MVT getVT(Type *Ty, bool HandleUnknown=false)
Return the value type corresponding to the specified type.
bool bitsLT(MVT VT) const
Return true if this has less bits than VT.
bool is512BitVector() const
Return true if this is a 512-bit vector type.
static auto integer_valuetypes()
TypeSize getSizeInBits() const
Returns the size of the specified MVT in bits.
static auto fixedlen_vector_valuetypes()
uint64_t getFixedSizeInBits() const
Return the size of the specified fixed width value type in bits.
LLVM_ABI const fltSemantics & getFltSemantics() const
Returns an APFloat semantics tag appropriate for the value type.
bool bitsGT(MVT VT) const
Return true if this has more bits than VT.
bool is256BitVector() const
Return true if this is a 256-bit vector type.
TypeSize getStoreSize() const
Return the number of bytes overwritten by a store of the specified value type.
bool bitsGE(MVT VT) const
Return true if this has no less bits than VT.
bool isScalarInteger() const
Return true if this is an integer, not including vectors.
static MVT getVectorVT(MVT VT, unsigned NumElements)
MVT getVectorElementType() const
bool isFloatingPoint() const
Return true if this is a FP or a vector FP type.
static MVT getIntegerVT(unsigned BitWidth)
MVT getDoubleNumVectorElementsVT() const
MVT getHalfNumVectorElementsVT() const
Return a VT for a vector type with the same element type but half the number of elements.
MVT getScalarType() const
If this is a vector, return the element type, otherwise return this.
bool is64BitVector() const
Return true if this is a 64-bit vector type.
MVT changeVectorElementTypeToInteger() const
Return a vector with the same number of elements as this vector, but with the element type converted ...
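A self-contained sketch exercising a few of the MVT queries above (current trees place MVT in llvm/CodeGenTypes/MachineValueType.h):

#include "llvm/CodeGenTypes/MachineValueType.h"
#include <cassert>
using namespace llvm;

void mvtUsageSketch() {
  MVT V8I16 = MVT::getVectorVT(MVT::i16, 8); // v8i16, a 128-bit vector
  assert(V8I16.isVector() && V8I16.is128BitVector());
  assert(V8I16.getVectorNumElements() == 8);
  assert(V8I16.getScalarSizeInBits() == 16);
  assert(V8I16.getVectorElementType() == MVT::i16);

  // Halve the element count, then switch the element type to i32.
  MVT V4I32 =
      V8I16.getHalfNumVectorElementsVT().changeVectorElementType(MVT::i32);
  assert(V4I32 == MVT::v4i32 && V4I32.getFixedSizeInBits() == 128);

  MVT F32 = MVT::getFloatingPointVT(32);
  assert(F32.isFloatingPoint() && !F32.isInteger());
}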
LLVM_ABI void transferSuccessorsAndUpdatePHIs(MachineBasicBlock *FromMBB)
Transfers all the successors, as in transferSuccessors, and update PHI operands in the successor bloc...
bool isEHPad() const
Returns true if the block is a landing pad.
LLVM_ABI instr_iterator insert(instr_iterator I, MachineInstr *M)
Insert MI into the instruction list before I, possibly inside a bundle.
void push_back(MachineInstr *MI)
void setCallFrameSize(unsigned N)
Set the call frame size on entry to this basic block.
const BasicBlock * getBasicBlock() const
Return the LLVM basic block that this instance corresponded to originally.
LLVM_ABI void addSuccessor(MachineBasicBlock *Succ, BranchProbability Prob=BranchProbability::getUnknown())
Add Succ as a successor of this MachineBasicBlock.
LLVM_ABI void removeSuccessor(MachineBasicBlock *Succ, bool NormalizeSuccProbs=false)
Remove successor from the successors list of this MachineBasicBlock.
Instructions::iterator instr_iterator
MachineInstrBundleIterator< MachineInstr, true > reverse_iterator
succ_reverse_iterator succ_rbegin()
void addLiveIn(MCRegister PhysReg, LaneBitmask LaneMask=LaneBitmask::getAll())
Adds the specified register as a live in.
const MachineFunction * getParent() const
Return the MachineFunction containing this basic block.
LLVM_ABI instr_iterator erase(instr_iterator I)
Remove an instruction from the instruction list and delete it.
iterator insertAfter(iterator I, MachineInstr *MI)
Insert MI into the instruction list after I.
void splice(iterator Where, MachineBasicBlock *Other, iterator From)
Take an instruction from MBB 'Other' at the position From, and insert it into this MBB right before '...
MachineInstrBundleIterator< MachineInstr > iterator
succ_reverse_iterator succ_rend()
void setMachineBlockAddressTaken()
Set this block to indicate that its address is used as something other than the target of a terminato...
LLVM_ABI bool isLiveIn(MCRegister Reg, LaneBitmask LaneMask=LaneBitmask::getAll()) const
Return true if the specified register is in the live in set.
void setIsEHPad(bool V=true)
Indicates the block is a landing pad.
The MachineFrameInfo class represents an abstract stack frame until prolog/epilog code is inserted.
LLVM_ABI int CreateFixedObject(uint64_t Size, int64_t SPOffset, bool IsImmutable, bool isAliased=false)
Create a new object at a fixed location on the stack.
LLVM_ABI int CreateStackObject(uint64_t Size, Align Alignment, bool isSpillSlot, const AllocaInst *Alloca=nullptr, uint8_t ID=0)
Create a new statically sized stack object, returning a nonnegative identifier to represent it.
void setFrameAddressIsTaken(bool T)
void setReturnAddressIsTaken(bool s)
void setHasCopyImplyingStackAdjustment(bool B)
bool isFixedObjectIndex(int ObjectIdx) const
Returns true if the specified index corresponds to a fixed stack object.
void setObjectAlignment(int ObjectIdx, Align Alignment)
setObjectAlignment - Change the alignment of the specified stack object.
int getFunctionContextIndex() const
Return the index for the function context object.
const WinEHFuncInfo * getWinEHFuncInfo() const
getWinEHFuncInfo - Return information about how the current function uses Windows exception handling.
void moveAdditionalCallInfo(const MachineInstr *Old, const MachineInstr *New)
Move the call site info from the Old call site to the New one.
unsigned getFunctionNumber() const
getFunctionNumber - Return a unique ID for the current function.
const TargetSubtargetInfo & getSubtarget() const
getSubtarget - Return the subtarget for which this machine code is being compiled.
MachineMemOperand * getMachineMemOperand(MachinePointerInfo PtrInfo, MachineMemOperand::Flags f, LLT MemTy, Align base_alignment, const AAMDNodes &AAInfo=AAMDNodes(), const MDNode *Ranges=nullptr, SyncScope::ID SSID=SyncScope::System, AtomicOrdering Ordering=AtomicOrdering::NotAtomic, AtomicOrdering FailureOrdering=AtomicOrdering::NotAtomic)
getMachineMemOperand - Allocate a new MachineMemOperand.
MachineFrameInfo & getFrameInfo()
getFrameInfo - Return the frame info object for the current function.
void push_back(MachineBasicBlock *MBB)
MCContext & getContext() const
MachineRegisterInfo & getRegInfo()
getRegInfo - Return information about the registers currently in use.
const DataLayout & getDataLayout() const
Return the DataLayout attached to the Module associated to this MF.
Function & getFunction()
Return the LLVM function that this machine code represents.
BasicBlockListType::iterator iterator
bool shouldSplitStack() const
Should we be emitting segmented stack stuff for the function.
Ty * getInfo()
getInfo - Keep track of various per-function pieces of information for backends that would like to do...
MachineBasicBlock * CreateMachineBasicBlock(const BasicBlock *BB=nullptr, std::optional< UniqueBBID > BBID=std::nullopt)
CreateMachineBasicBlock - Allocate a new MachineBasicBlock.
void insert(iterator MBBI, MachineBasicBlock *MBB)
const TargetMachine & getTarget() const
getTarget - Return the target machine this machine code is compiled with
const MachineInstrBuilder & setMemRefs(ArrayRef< MachineMemOperand * > MMOs) const
const MachineInstrBuilder & addExternalSymbol(const char *FnName, unsigned TargetFlags=0) const
const MachineInstrBuilder & setMIFlag(MachineInstr::MIFlag Flag) const
const MachineInstrBuilder & addImm(int64_t Val) const
Add a new immediate operand.
const MachineInstrBuilder & add(const MachineOperand &MO) const
const MachineInstrBuilder & addRegMask(const uint32_t *Mask) const
const MachineInstrBuilder & addGlobalAddress(const GlobalValue *GV, int64_t Offset=0, unsigned TargetFlags=0) const
const MachineInstrBuilder & addDisp(const MachineOperand &Disp, int64_t off, unsigned char TargetFlags=0) const
const MachineInstrBuilder & addReg(Register RegNo, unsigned flags=0, unsigned SubReg=0) const
Add a new virtual register operand.
const MachineInstrBuilder & addJumpTableIndex(unsigned Idx, unsigned TargetFlags=0) const
const MachineInstrBuilder & addMBB(MachineBasicBlock *MBB, unsigned TargetFlags=0) const
const MachineInstrBuilder & addUse(Register RegNo, unsigned Flags=0, unsigned SubReg=0) const
Add a virtual register use operand.
MachineInstr * getInstr() const
If conversion operators fail, use this method to get the MachineInstr explicitly.
const MachineInstrBuilder & addDef(Register RegNo, unsigned Flags=0, unsigned SubReg=0) const
Add a virtual register definition operand.
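A short sketch of the MachineInstrBuilder chaining pattern these entries describe; the helper name, opcodes and registers are placeholders chosen for illustration, not taken from this file.
  #include "X86InstrInfo.h"
  #include "llvm/CodeGen/MachineInstrBuilder.h"
  using namespace llvm;
  // Hypothetical: materialize an immediate into DstReg, then branch to
  // TargetMBB when the equal flag is set.
  static void emitExampleSequence(MachineBasicBlock &MBB,
                                  MachineBasicBlock::iterator InsertPt,
                                  const DebugLoc &DL, const X86InstrInfo *TII,
                                  Register DstReg,
                                  MachineBasicBlock *TargetMBB) {
    BuildMI(MBB, InsertPt, DL, TII->get(X86::MOV32ri), DstReg).addImm(42);
    BuildMI(MBB, InsertPt, DL, TII->get(X86::JCC_1))
        .addMBB(TargetMBB)
        .addImm(X86::COND_E);
  }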
Representation of each machine instruction.
bool killsRegister(Register Reg, const TargetRegisterInfo *TRI) const
Return true if the MachineInstr kills the specified register.
LLVM_ABI void eraseFromParent()
Unlink 'this' from the containing basic block and delete it.
const MachineOperand & getOperand(unsigned i) const
LLVM_ABI unsigned createJumpTableIndex(const std::vector< MachineBasicBlock * > &DestBBs)
createJumpTableIndex - Create a new jump table.
@ EK_LabelDifference32
EK_LabelDifference32 - Each entry is the address of the block minus the address of the jump table.
@ EK_BlockAddress
EK_BlockAddress - Each entry is a plain address of a block, e.g.: .word LBB123.
A description of a memory reference used in the backend.
Flags
Flags values. These may be or'd together.
@ MOVolatile
The memory access is volatile.
@ MOLoad
The memory access reads data.
@ MOStore
The memory access writes data.
Flags getFlags() const
Return the raw flags of the source value.
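As a small sketch of how the flag and allocation machinery above combines, this hypothetical helper builds a MachineMemOperand describing a 32-bit load from a fixed stack slot; the sizes and alignment are assumptions made for the example.
  #include "llvm/CodeGen/MachineFunction.h"
  #include "llvm/CodeGen/MachineMemOperand.h"
  #include "llvm/CodeGenTypes/LowLevelType.h"
  using namespace llvm;
  // Hypothetical: a 4-byte, 4-byte-aligned load from frame index FI.
  static MachineMemOperand *makeExampleMMO(MachineFunction &MF, int FI) {
    MachinePointerInfo PtrInfo = MachinePointerInfo::getFixedStack(MF, FI);
    return MF.getMachineMemOperand(PtrInfo, MachineMemOperand::MOLoad,
                                   LLT::scalar(32), Align(4));
  }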
MachineOperand class - Representation of each machine instruction operand.
int64_t getImm() const
bool isReg() const
isReg - Tests if this is a MO_Register operand.
Register getReg() const
getReg - Returns the register number.
MachineRegisterInfo - Keep track of information for virtual and physical registers,...
LLVM_ABI Register createVirtualRegister(const TargetRegisterClass *RegClass, StringRef Name="")
createVirtualRegister - Create and return a new virtual register in the function with the specified r...
An SDNode that represents everything that will be needed to construct a MachineInstr.
This class is used to represent an MGATHER node.
This is a base class used to represent MGATHER and MSCATTER nodes.
This class is used to represent an MLOAD node.
This base class is used to represent MLOAD and MSTORE nodes.
const SDValue & getMask() const
ISD::MemIndexedMode getAddressingMode() const
Return the addressing mode for this load or store: unindexed, pre-inc, pre-dec, post-inc,...
This class is used to represent an MSCATTER node.
This class is used to represent an MSTORE node.
bool isCompressingStore() const
Returns true if the op does a compression to the vector before storing.
const SDValue & getOffset() const
const SDValue & getBasePtr() const
const SDValue & getMask() const
const SDValue & getValue() const
bool isTruncatingStore() const
Return true if the op does a truncation before store.
This SDNode is used for target intrinsics that touch memory and need an associated MachineMemOperand.
This is an abstract virtual class for memory operations.
unsigned getAddressSpace() const
Return the address space for the associated pointer.
Align getBaseAlign() const
Returns alignment and volatility of the memory access.
Align getAlign() const
bool isVolatile() const
AAMDNodes getAAInfo() const
Returns the AA info that describes the dereference.
SyncScope::ID getSyncScopeID() const
Returns the synchronization scope ID for this memory operation.
bool isSimple() const
Returns true if the memory operation is neither atomic nor volatile.
AtomicOrdering getSuccessOrdering() const
Return the atomic ordering requirements for this memory operation.
MachineMemOperand * getMemOperand() const
Return a MachineMemOperand object describing the memory reference performed by operation.
const SDValue & getBasePtr() const
const MachinePointerInfo & getPointerInfo() const
const SDValue & getChain() const
bool isNonTemporal() const
EVT getMemoryVT() const
Return the type of the in-memory value.
Root of the metadata hierarchy.
Definition Metadata.h:63
A Module instance is used to store all the information related to an LLVM module.
Definition Module.h:67
Metadata * getModuleFlag(StringRef Key) const
Return the corresponding value if Key appears in module flags, otherwise return null.
Definition Module.cpp:353
MutableArrayRef - Represent a mutable reference to an array (0 or more elements consecutively in memo...
Definition ArrayRef.h:303
static PointerType * getUnqual(Type *ElementType)
This constructs a pointer to an object of the specified type in the default address space (address sp...
static LLVM_ABI PointerType * get(Type *ElementType, unsigned AddressSpace)
This constructs a pointer to an object of the specified type in a numbered address space.
Wrapper class representing virtual and physical registers.
Definition Register.h:19
constexpr bool isValid() const
Definition Register.h:107
Wrapper class for IR location info (IR ordering and DebugLoc) to be passed into SDNode creation funct...
Represents one node in the SelectionDAG.
bool isStrictFPOpcode()
Test if this node is a strict floating point pseudo-op.
ArrayRef< SDUse > ops() const
const APInt & getAsAPIntVal() const
Helper method returns the APInt value of a ConstantSDNode.
unsigned getOpcode() const
Return the SelectionDAG opcode value for this node.
SDNode * getGluedUser() const
If this node has a glue value with a user, return the user (there is at most one).
bool hasOneUse() const
Return true if there is exactly one use of this node.
LLVM_ABI bool isOnlyUserOf(const SDNode *N) const
Return true if this node is the only use of N.
iterator_range< value_op_iterator > op_values() const
SDNodeFlags getFlags() const
TypeSize getValueSizeInBits(unsigned ResNo) const
Returns MVT::getSizeInBits(getValueType(ResNo)).
MVT getSimpleValueType(unsigned ResNo) const
Return the type of a specified result as a simple type.
uint64_t getAsZExtVal() const
Helper method returns the zero-extended integer value of a ConstantSDNode.
unsigned getNumValues() const
Return the number of values defined/returned by this operator.
unsigned getNumOperands() const
Return the number of values used by this operation.
SDVTList getVTList() const
const SDValue & getOperand(unsigned Num) const
uint64_t getConstantOperandVal(unsigned Num) const
Helper method returns the integer value of a ConstantSDNode operand.
static LLVM_ABI bool areOnlyUsersOf(ArrayRef< const SDNode * > Nodes, const SDNode *N)
Return true if all the users of N are contained in Nodes.
bool hasNUsesOfValue(unsigned NUses, unsigned Value) const
Return true if there are exactly NUSES uses of the indicated value.
LLVM_ABI bool hasAnyUseOfValue(unsigned Value) const
Return true if there is any use of the indicated value.
EVT getValueType(unsigned ResNo) const
Return the type of a specified result.
bool isUndef() const
Returns true if the node type is UNDEF or POISON.
iterator_range< user_iterator > users()
void setFlags(SDNodeFlags NewFlags)
user_iterator user_begin() const
Provide iteration support to walk over all users of an SDNode.
op_iterator op_end() const
op_iterator op_begin() const
Represents a use of a SDNode.
Unlike LLVM values, Selection DAG nodes may return multiple values as the result of a computation.
bool isUndef() const
SDNode * getNode() const
Get the SDNode which holds the desired result.
bool hasOneUse() const
Return true if there is exactly one node using value ResNo of Node.
SDValue getValue(unsigned R) const
EVT getValueType() const
Return the ValueType of the referenced return value.
TypeSize getValueSizeInBits() const
Returns the size of the value in bits.
const SDValue & getOperand(unsigned i) const
const APInt & getConstantOperandAPInt(unsigned i) const
uint64_t getScalarValueSizeInBits() const
unsigned getResNo() const
Get the index which selects a specific result in the SDNode.
uint64_t getConstantOperandVal(unsigned i) const
MVT getSimpleValueType() const
Return the simple ValueType of the referenced return value.
unsigned getOpcode() const
unsigned getNumOperands() const
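A tiny sketch of the SDValue inspection pattern these accessors support; the predicate itself is hypothetical, not code from this file.
  #include "llvm/CodeGen/SelectionDAGNodes.h"
  using namespace llvm;
  // Hypothetical predicate: does V compute (add X, 1) and have a single use?
  static bool isAddOfOne(SDValue V) {
    return V.getOpcode() == ISD::ADD && V.hasOneUse() &&
           isa<ConstantSDNode>(V.getOperand(1)) &&
           V.getConstantOperandVal(1) == 1;
  }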
Targets can subclass this to parameterize the SelectionDAG lowering and instruction selection process...
virtual bool isTargetStrictFPOpcode(unsigned Opcode) const
Returns true if a node with the given target-specific opcode has strict floating-point semantics.
Help to insert SDNodeFlags automatically in transforming.
This is used to represent a portion of an LLVM function in a low-level Data Dependence DAG representa...
bool willNotOverflowAdd(bool IsSigned, SDValue N0, SDValue N1) const
Determine if the result of the addition of 2 nodes can never overflow.
static unsigned getOpcode_EXTEND_VECTOR_INREG(unsigned Opcode)
Convert *_EXTEND to *_EXTEND_VECTOR_INREG opcode.
LLVM_ABI SDValue getShiftAmountOperand(EVT LHSTy, SDValue Op)
Return the specified value casted to the target's desired shift amount type.
LLVM_ABI SDValue getExtLoad(ISD::LoadExtType ExtType, const SDLoc &dl, EVT VT, SDValue Chain, SDValue Ptr, MachinePointerInfo PtrInfo, EVT MemVT, MaybeAlign Alignment=MaybeAlign(), MachineMemOperand::Flags MMOFlags=MachineMemOperand::MONone, const AAMDNodes &AAInfo=AAMDNodes())
SDValue getTargetGlobalAddress(const GlobalValue *GV, const SDLoc &DL, EVT VT, int64_t offset=0, unsigned TargetFlags=0)
SDValue getExtOrTrunc(SDValue Op, const SDLoc &DL, EVT VT, unsigned Opcode)
Convert Op, which must be of integer type, to the integer type VT, by either any/sign/zero-extending ...
SDValue getExtractVectorElt(const SDLoc &DL, EVT VT, SDValue Vec, unsigned Idx)
Extract element at Idx from Vec.
LLVM_ABI SDValue getSplatSourceVector(SDValue V, int &SplatIndex)
If V is a splatted value, return the source vector and its splat index.
LLVM_ABI unsigned ComputeMaxSignificantBits(SDValue Op, unsigned Depth=0) const
Get the upper bound on bit size for this Value Op as a signed integer.
LLVM_ABI SDValue getMaskedGather(SDVTList VTs, EVT MemVT, const SDLoc &dl, ArrayRef< SDValue > Ops, MachineMemOperand *MMO, ISD::MemIndexType IndexType, ISD::LoadExtType ExtTy)
LLVM_ABI SDValue getAddrSpaceCast(const SDLoc &dl, EVT VT, SDValue Ptr, unsigned SrcAS, unsigned DestAS)
Return an AddrSpaceCastSDNode.
const TargetSubtargetInfo & getSubtarget() const
SDValue getCopyToReg(SDValue Chain, const SDLoc &dl, Register Reg, SDValue N)
LLVM_ABI SDValue getMergeValues(ArrayRef< SDValue > Ops, const SDLoc &dl)
Create a MERGE_VALUES node from the given operands.
LLVM_ABI SDVTList getVTList(EVT VT)
Return an SDVTList that represents the list of values specified.
LLVM_ABI SDValue getShiftAmountConstant(uint64_t Val, EVT VT, const SDLoc &DL)
LLVM_ABI SDValue getAllOnesConstant(const SDLoc &DL, EVT VT, bool IsTarget=false, bool IsOpaque=false)
LLVM_ABI MachineSDNode * getMachineNode(unsigned Opcode, const SDLoc &dl, EVT VT)
These are used for target selectors to create a new node with specified return type(s),...
LLVM_ABI SDValue getFreeze(SDValue V)
Return a freeze using the SDLoc of the value operand.
LLVM_ABI SDValue getConstantPool(const Constant *C, EVT VT, MaybeAlign Align=std::nullopt, int Offs=0, bool isT=false, unsigned TargetFlags=0)
LLVM_ABI SDValue makeEquivalentMemoryOrdering(SDValue OldChain, SDValue NewMemOpChain)
If an existing load has uses of its chain, create a token factor node with that chain and the new mem...
LLVM_ABI bool isConstantIntBuildVectorOrConstantInt(SDValue N, bool AllowOpaques=true) const
Test whether the given value is a constant int or similar node.
LLVM_ABI SDValue getJumpTableDebugInfo(int JTI, SDValue Chain, const SDLoc &DL)
SDValue getSetCC(const SDLoc &DL, EVT VT, SDValue LHS, SDValue RHS, ISD::CondCode Cond, SDValue Chain=SDValue(), bool IsSignaling=false)
Helper function to make it easier to build SetCC's if you just have an ISD::CondCode instead of an SD...
LLVM_ABI SDValue getConstantFP(double Val, const SDLoc &DL, EVT VT, bool isTarget=false)
Create a ConstantFPSDNode wrapping a constant value.
SDValue getExtractSubvector(const SDLoc &DL, EVT VT, SDValue Vec, unsigned Idx)
Return the VT typed sub-vector of Vec at Idx.
LLVM_ABI SDValue getRegister(Register Reg, EVT VT)
LLVM_ABI SDValue getLoad(EVT VT, const SDLoc &dl, SDValue Chain, SDValue Ptr, MachinePointerInfo PtrInfo, MaybeAlign Alignment=MaybeAlign(), MachineMemOperand::Flags MMOFlags=MachineMemOperand::MONone, const AAMDNodes &AAInfo=AAMDNodes(), const MDNode *Ranges=nullptr)
Loads are not normal binary operators: their result type is not determined by their operands,...
LLVM_ABI SDValue getMemIntrinsicNode(unsigned Opcode, const SDLoc &dl, SDVTList VTList, ArrayRef< SDValue > Ops, EVT MemVT, MachinePointerInfo PtrInfo, Align Alignment, MachineMemOperand::Flags Flags=MachineMemOperand::MOLoad|MachineMemOperand::MOStore, LocationSize Size=LocationSize::precise(0), const AAMDNodes &AAInfo=AAMDNodes())
Creates a MemIntrinsicNode that may produce a result and takes a list of operands.
SDValue getInsertSubvector(const SDLoc &DL, SDValue Vec, SDValue SubVec, unsigned Idx)
Insert SubVec at the Idx element of Vec.
LLVM_ABI SDValue getAtomic(unsigned Opcode, const SDLoc &dl, EVT MemVT, SDValue Chain, SDValue Ptr, SDValue Val, MachineMemOperand *MMO)
Gets a node for an atomic op, produces result (if relevant) and chain and takes 2 operands.
LLVM_ABI SDValue getMemcpy(SDValue Chain, const SDLoc &dl, SDValue Dst, SDValue Src, SDValue Size, Align Alignment, bool isVol, bool AlwaysInline, const CallInst *CI, std::optional< bool > OverrideTailCall, MachinePointerInfo DstPtrInfo, MachinePointerInfo SrcPtrInfo, const AAMDNodes &AAInfo=AAMDNodes(), BatchAAResults *BatchAA=nullptr)
LLVM_ABI bool shouldOptForSize() const
std::pair< SDValue, SDValue > SplitVectorOperand(const SDNode *N, unsigned OpNo)
Split the node's operand with EXTRACT_SUBVECTOR and return the low/high part.
LLVM_ABI SDValue getNOT(const SDLoc &DL, SDValue Val, EVT VT)
Create a bitwise NOT operation as (XOR Val, -1).
const TargetLowering & getTargetLoweringInfo() const
LLVM_ABI bool isEqualTo(SDValue A, SDValue B) const
Test whether two SDValues are known to compare equal.
static constexpr unsigned MaxRecursionDepth
LLVM_ABI SDValue expandVACopy(SDNode *Node)
Expand the specified ISD::VACOPY node as the Legalize pass would.
LLVM_ABI std::pair< EVT, EVT > GetSplitDestVTs(const EVT &VT) const
Compute the VTs needed for the low/hi parts of a type which is split (or expanded) into two not neces...
SDValue getTargetJumpTable(int JTI, EVT VT, unsigned TargetFlags=0)
SDValue getUNDEF(EVT VT)
Return an UNDEF node. UNDEF does not have a useful SDLoc.
SDValue getCALLSEQ_END(SDValue Chain, SDValue Op1, SDValue Op2, SDValue InGlue, const SDLoc &DL)
Return a new CALLSEQ_END node, which always must have a glue result (to ensure it's not CSE'd).
SDValue getBuildVector(EVT VT, const SDLoc &DL, ArrayRef< SDValue > Ops)
Return an ISD::BUILD_VECTOR node.
LLVM_ABI bool isSplatValue(SDValue V, const APInt &DemandedElts, APInt &UndefElts, unsigned Depth=0) const
Test whether V has a splatted value for all the demanded elements.
LLVM_ABI SDValue getBitcast(EVT VT, SDValue V)
Return a bitcast using the SDLoc of the value operand, and casting to the provided type.
SDValue getCopyFromReg(SDValue Chain, const SDLoc &dl, Register Reg, EVT VT)
SDValue getSelect(const SDLoc &DL, EVT VT, SDValue Cond, SDValue LHS, SDValue RHS, SDNodeFlags Flags=SDNodeFlags())
Helper function to make it easier to build Select's if you just have operands and don't want to check...
LLVM_ABI SDValue getNegative(SDValue Val, const SDLoc &DL, EVT VT)
Create negative operation as (SUB 0, Val).
LLVM_ABI std::optional< unsigned > getValidShiftAmount(SDValue V, const APInt &DemandedElts, unsigned Depth=0) const
If a SHL/SRA/SRL node V has a uniform shift amount that is less than the element bit-width of the shi...
LLVM_ABI SDValue simplifySelect(SDValue Cond, SDValue TVal, SDValue FVal)
Try to simplify a select/vselect into 1 of its operands or a constant.
LLVM_ABI SDValue getZeroExtendInReg(SDValue Op, const SDLoc &DL, EVT VT)
Return the expression required to zero extend the Op value assuming it was the smaller SrcTy value.
const DataLayout & getDataLayout() const
LLVM_ABI SDValue expandVAArg(SDNode *Node)
Expand the specified ISD::VAARG node as the Legalize pass would.
LLVM_ABI bool doesNodeExist(unsigned Opcode, SDVTList VTList, ArrayRef< SDValue > Ops)
Check if a node exists without modifying its flags.
const SelectionDAGTargetInfo & getSelectionDAGInfo() const
LLVM_ABI bool areNonVolatileConsecutiveLoads(LoadSDNode *LD, LoadSDNode *Base, unsigned Bytes, int Dist) const
Return true if loads are next to each other and can be merged.
LLVM_ABI SDValue getConstant(uint64_t Val, const SDLoc &DL, EVT VT, bool isTarget=false, bool isOpaque=false)
Create a ConstantSDNode wrapping a constant value.
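To illustrate node-building helpers such as getConstant and getNode, here is a hypothetical utility that masks a value down to its low byte; it is a sketch, not code from this file.
  #include "llvm/CodeGen/SelectionDAG.h"
  using namespace llvm;
  // Hypothetical: produce (and X, 0xFF) in X's own value type.
  static SDValue maskLowByte(SDValue X, const SDLoc &DL, SelectionDAG &DAG) {
    EVT VT = X.getValueType();
    return DAG.getNode(ISD::AND, DL, VT, X, DAG.getConstant(0xFF, DL, VT));
  }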
LLVM_ABI SDValue getMemBasePlusOffset(SDValue Base, TypeSize Offset, const SDLoc &DL, const SDNodeFlags Flags=SDNodeFlags())
Returns the sum of the base pointer and offset.
LLVM_ABI SDValue getTruncStore(SDValue Chain, const SDLoc &dl, SDValue Val, SDValue Ptr, MachinePointerInfo PtrInfo, EVT SVT, Align Alignment, MachineMemOperand::Flags MMOFlags=MachineMemOperand::MONone, const AAMDNodes &AAInfo=AAMDNodes())
LLVM_ABI void ReplaceAllUsesWith(SDValue From, SDValue To)
Modify anything using 'From' to use 'To' instead.
LLVM_ABI SDValue getCommutedVectorShuffle(const ShuffleVectorSDNode &SV)
Returns an ISD::VECTOR_SHUFFLE node semantically equivalent to the shuffle node in input but with swa...
LLVM_ABI std::pair< SDValue, SDValue > SplitVector(const SDValue &N, const SDLoc &DL, const EVT &LoVT, const EVT &HiVT)
Split the vector with EXTRACT_SUBVECTOR using the provided VTs and return the low/high part.
LLVM_ABI bool isGuaranteedNotToBeUndefOrPoison(SDValue Op, bool PoisonOnly=false, unsigned Depth=0) const
Return true if this function can prove that Op is never poison and, if PoisonOnly is false,...
LLVM_ABI SDValue getStore(SDValue Chain, const SDLoc &dl, SDValue Val, SDValue Ptr, MachinePointerInfo PtrInfo, Align Alignment, MachineMemOperand::Flags MMOFlags=MachineMemOperand::MONone, const AAMDNodes &AAInfo=AAMDNodes())
Helper function to build ISD::STORE nodes.
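The load/store helpers above are often paired with a stack temporary; the following hypothetical round-trip shows the shape, with the pointer info left unknown and an arbitrary alignment, purely as a sketch.
  #include "llvm/CodeGen/SelectionDAG.h"
  using namespace llvm;
  // Hypothetical: store Val to a fresh stack slot and load it back.
  static SDValue roundTripThroughStack(SDValue Val, const SDLoc &DL,
                                       SelectionDAG &DAG) {
    EVT VT = Val.getValueType();
    SDValue Slot = DAG.CreateStackTemporary(VT.getStoreSize(), Align(16));
    SDValue Chain = DAG.getStore(DAG.getEntryNode(), DL, Val, Slot,
                                 MachinePointerInfo(), Align(16));
    return DAG.getLoad(VT, DL, Chain, Slot, MachinePointerInfo(), Align(16));
  }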
LLVM_ABI SDValue getSignedConstant(int64_t Val, const SDLoc &DL, EVT VT, bool isTarget=false, bool isOpaque=false)
LLVM_ABI MaybeAlign InferPtrAlign(SDValue Ptr) const
Infer alignment of a load / store address.
SDValue getCALLSEQ_START(SDValue Chain, uint64_t InSize, uint64_t OutSize, const SDLoc &DL)
Return a new CALLSEQ_START node, that starts new call frame, in which InSize bytes are set up inside ...
LLVM_ABI bool SignBitIsZero(SDValue Op, unsigned Depth=0) const
Return true if the sign bit of Op is known to be zero.
LLVM_ABI SDValue getTargetExtractSubreg(int SRIdx, const SDLoc &DL, EVT VT, SDValue Operand)
A convenience function for creating TargetInstrInfo::EXTRACT_SUBREG nodes.
SDValue getSelectCC(const SDLoc &DL, SDValue LHS, SDValue RHS, SDValue True, SDValue False, ISD::CondCode Cond, SDNodeFlags Flags=SDNodeFlags())
Helper function to make it easier to build SelectCC's if you just have an ISD::CondCode instead of an...
LLVM_ABI SDValue getSExtOrTrunc(SDValue Op, const SDLoc &DL, EVT VT)
Convert Op, which must be of integer type, to the integer type VT, by either sign-extending or trunca...
LLVM_ABI bool isKnownNeverZero(SDValue Op, unsigned Depth=0) const
Test whether the given SDValue is known to contain non-zero value(s).
LLVM_ABI SDValue FoldConstantArithmetic(unsigned Opcode, const SDLoc &DL, EVT VT, ArrayRef< SDValue > Ops, SDNodeFlags Flags=SDNodeFlags())
LLVM_ABI std::optional< unsigned > getValidMinimumShiftAmount(SDValue V, const APInt &DemandedElts, unsigned Depth=0) const
If a SHL/SRA/SRL node V has shift amounts that are all less than the element bit-width of the shift n...
LLVM_ABI SDValue getMaskedStore(SDValue Chain, const SDLoc &dl, SDValue Val, SDValue Base, SDValue Offset, SDValue Mask, EVT MemVT, MachineMemOperand *MMO, ISD::MemIndexedMode AM, bool IsTruncating=false, bool IsCompressing=false)
LLVM_ABI SDValue getExternalSymbol(const char *Sym, EVT VT)
const TargetMachine & getTarget() const
LLVM_ABI std::pair< SDValue, SDValue > getStrictFPExtendOrRound(SDValue Op, SDValue Chain, const SDLoc &DL, EVT VT)
Convert Op, which must be a STRICT operation of float type, to the float type VT, by either extending...
LLVM_ABI SDValue getAnyExtOrTrunc(SDValue Op, const SDLoc &DL, EVT VT)
Convert Op, which must be of integer type, to the integer type VT, by either any-extending or truncat...
LLVM_ABI bool isKnownNeverZeroFloat(SDValue Op) const
Test whether the given floating point SDValue is known to never be positive or negative zero.
LLVM_ABI SDValue getIntPtrConstant(uint64_t Val, const SDLoc &DL, bool isTarget=false)
LLVM_ABI SDValue getValueType(EVT)
LLVM_ABI SDValue getNode(unsigned Opcode, const SDLoc &DL, EVT VT, ArrayRef< SDUse > Ops)
Gets or creates the specified node.
LLVM_ABI SDValue getFPExtendOrRound(SDValue Op, const SDLoc &DL, EVT VT)
Convert Op, which must be of float type, to the float type VT, by either extending or rounding (by tr...
LLVM_ABI bool isKnownNeverNaN(SDValue Op, const APInt &DemandedElts, bool SNaN=false, unsigned Depth=0) const
Test whether the given SDValue (or all elements of it, if it is a vector) is known to never be NaN in...
SDValue getTargetConstant(uint64_t Val, const SDLoc &DL, EVT VT, bool isOpaque=false)
LLVM_ABI unsigned ComputeNumSignBits(SDValue Op, unsigned Depth=0) const
Return the number of times the sign bit of the register is replicated into the other bits.
LLVM_ABI bool MaskedVectorIsZero(SDValue Op, const APInt &DemandedElts, unsigned Depth=0) const
Return true if 'Op' is known to be zero in DemandedElts.
SDValue getTargetBlockAddress(const BlockAddress *BA, EVT VT, int64_t Offset=0, unsigned TargetFlags=0)
LLVM_ABI bool isBaseWithConstantOffset(SDValue Op) const
Return true if the specified operand is an ISD::ADD with a ConstantSDNode on the right-hand side,...
LLVM_ABI SDValue getVectorIdxConstant(uint64_t Val, const SDLoc &DL, bool isTarget=false)
LLVM_ABI void ReplaceAllUsesOfValueWith(SDValue From, SDValue To)
Replace any uses of From with To, leaving uses of other values produced by From.getNode() alone.
MachineFunction & getMachineFunction() const
SDValue getSplatBuildVector(EVT VT, const SDLoc &DL, SDValue Op)
Return a splat ISD::BUILD_VECTOR node, consisting of Op splatted to all elements.
LLVM_ABI SDValue getFrameIndex(int FI, EVT VT, bool isTarget=false)
LLVM_ABI KnownBits computeKnownBits(SDValue Op, unsigned Depth=0) const
Determine which bits of Op are known to be either zero or one and return them in Known.
LLVM_ABI SDValue getZExtOrTrunc(SDValue Op, const SDLoc &DL, EVT VT)
Convert Op, which must be of integer type, to the integer type VT, by either zero-extending or trunca...
LLVM_ABI SDValue getCondCode(ISD::CondCode Cond)
LLVM_ABI bool MaskedValueIsZero(SDValue Op, const APInt &Mask, unsigned Depth=0) const
Return true if 'Op & Mask' is known to be zero.
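A minimal sketch of using the known-bits queries above; the helper is hypothetical.
  #include "llvm/ADT/APInt.h"
  #include "llvm/CodeGen/SelectionDAG.h"
  using namespace llvm;
  // Hypothetical: is the lowest bit of Op provably zero (i.e. Op is even)?
  static bool lowBitIsZero(SDValue Op, const SelectionDAG &DAG) {
    APInt LowBit(Op.getScalarValueSizeInBits(), 1);
    return DAG.MaskedValueIsZero(Op, LowBit);
  }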
LLVMContext * getContext() const
LLVM_ABI SDValue getTargetExternalSymbol(const char *Sym, EVT VT, unsigned TargetFlags=0)
LLVM_ABI SDValue getMCSymbol(MCSymbol *Sym, EVT VT)
LLVM_ABI SDValue CreateStackTemporary(TypeSize Bytes, Align Alignment)
Create a stack temporary based on the size in bytes and the alignment.
LLVM_ABI SDNode * UpdateNodeOperands(SDNode *N, SDValue Op)
Mutate the specified node in-place to have the specified operands.
LLVM_ABI SDNode * getNodeIfExists(unsigned Opcode, SDVTList VTList, ArrayRef< SDValue > Ops, const SDNodeFlags Flags)
Get the specified node if it's already available, or else return NULL.
SDValue getTargetConstantPool(const Constant *C, EVT VT, MaybeAlign Align=std::nullopt, int Offset=0, unsigned TargetFlags=0)
SDValue getEntryNode() const
Return the token chain corresponding to the entry of the function.
LLVM_ABI SDValue getMaskedLoad(EVT VT, const SDLoc &dl, SDValue Chain, SDValue Base, SDValue Offset, SDValue Mask, SDValue Src0, EVT MemVT, MachineMemOperand *MMO, ISD::MemIndexedMode AM, ISD::LoadExtType, bool IsExpanding=false)
SDValue getSplat(EVT VT, const SDLoc &DL, SDValue Op)
Returns a node representing a splat of one value into all lanes of the provided vector type.
LLVM_ABI std::pair< SDValue, SDValue > SplitScalar(const SDValue &N, const SDLoc &DL, const EVT &LoVT, const EVT &HiVT)
Split the scalar node with EXTRACT_ELEMENT using the provided VTs and return the low/high part.
static unsigned getOpcode_EXTEND(unsigned Opcode)
Convert *_EXTEND_VECTOR_INREG to *_EXTEND opcode.
LLVM_ABI SDValue matchBinOpReduction(SDNode *Extract, ISD::NodeType &BinOp, ArrayRef< ISD::NodeType > CandidateBinOps, bool AllowPartials=false)
Match a binop + shuffle pyramid that represents a horizontal reduction over the elements of a vector ...
LLVM_ABI SDValue getVectorShuffle(EVT VT, const SDLoc &dl, SDValue N1, SDValue N2, ArrayRef< int > Mask)
Return an ISD::VECTOR_SHUFFLE node.
LLVM_ABI SDValue getMaskedScatter(SDVTList VTs, EVT MemVT, const SDLoc &dl, ArrayRef< SDValue > Ops, MachineMemOperand *MMO, ISD::MemIndexType IndexType, bool IsTruncating=false)
static LLVM_ABI bool isBitRotateMask(ArrayRef< int > Mask, unsigned EltSizeInBits, unsigned MinSubElts, unsigned MaxSubElts, unsigned &NumSubElts, unsigned &RotateAmt)
Checks if the shuffle is a bit rotation of the first operand across multiple subelements,...
This SDNode is used to implement the code generator support for the llvm IR shufflevector instruction...
int getMaskElt(unsigned Idx) const
static int getSplatMaskIndex(ArrayRef< int > Mask)
ArrayRef< int > getMask() const
static void commuteMask(MutableArrayRef< int > Mask)
Change values in a shuffle permute mask assuming the two vector operands have swapped position.
static LLVM_ABI bool isSplatMask(ArrayRef< int > Mask)
This is a 'bitvector' (really, a variable-sized bit array), optimized for the case when the array is ...
void resize(unsigned N, bool t=false)
Grow or shrink the bitvector.
void insert_range(Range &&R)
std::pair< iterator, bool > insert(PtrType Ptr)
Inserts Ptr if and only if there is no element in the container equal to Ptr.
SmallSet - This maintains a set of unique values, optimizing for the case when the set is small (less...
Definition SmallSet.h:133
std::pair< const_iterator, bool > insert(const T &V)
insert - Insert an element into the set if it isn't already there.
Definition SmallSet.h:183
size_type size() const
Definition SmallSet.h:170
This class consists of common code factored out of the SmallVector class to reduce code duplication b...
void assign(size_type NumElts, ValueParamT Elt)
iterator erase(const_iterator CI)
typename SuperClass::const_iterator const_iterator
void append(ItTy in_start, ItTy in_end)
Add the specified range to the end of the SmallVector.
void resize(size_type N)
void push_back(const T &Elt)
pointer data()
Return a pointer to the vector's buffer, even if empty().
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
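For completeness, a trivial sketch of the SmallVector pattern described above; the names and the inline capacity of 8 are illustrative.
  #include "llvm/ADT/ArrayRef.h"
  #include "llvm/ADT/SmallVector.h"
  using namespace llvm;
  // Hypothetical: keep only the even entries of In, avoiding heap allocation
  // for up to 8 results.
  static SmallVector<int, 8> keepEven(ArrayRef<int> In) {
    SmallVector<int, 8> Out;
    for (int V : In)
      if ((V & 1) == 0)
        Out.push_back(V);
    return Out;
  }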
An instruction for storing to memory.
This class is used to represent ISD::STORE nodes.
const SDValue & getBasePtr() const
const SDValue & getValue() const
bool isTruncatingStore() const
Return true if the op does a truncation before store.
StringRef - Represent a constant reference to a string, i.e.
Definition StringRef.h:55
constexpr bool empty() const
empty - Check if the string is empty.
Definition StringRef.h:143
constexpr size_t size() const
size - Get the string size.
Definition StringRef.h:146
bool ends_with(StringRef Suffix) const
Check if this string ends with the given Suffix.
Definition StringRef.h:273
static constexpr size_t npos
Definition StringRef.h:57
bool equals_insensitive(StringRef RHS) const
Check for string equality, ignoring case.
Definition StringRef.h:172
A switch()-like statement whose cases are string literals.
StringSwitch & Case(StringLiteral S, T Value)
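A short sketch of the StringSwitch idiom named above; the tags and return values are made up for the example.
  #include "llvm/ADT/StringRef.h"
  #include "llvm/ADT/StringSwitch.h"
  using namespace llvm;
  // Hypothetical: map a textual tag to a small numeric id.
  static unsigned classifyTag(StringRef Tag) {
    return StringSwitch<unsigned>(Tag)
        .Case("gr", 0)
        .Case("xmm", 1)
        .Case("ymm", 2)
        .Default(~0u);
  }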
static LLVM_ABI StructType * get(LLVMContext &Context, ArrayRef< Type * > Elements, bool isPacked=false)
This static method is the primary way to create a literal StructType.
Definition Type.cpp:414
Information about stack frame layout on the target.
bool hasFP(const MachineFunction &MF) const
hasFP - Return true if the specified function should have a dedicated frame pointer register.
Align getStackAlign() const
getStackAlignment - This method returns the number of bytes to which the stack pointer must be aligne...
TargetInstrInfo - Interface to description of machine instruction set.
Provides information about what library functions are available for the current target.
bool isOperationExpand(unsigned Op, EVT VT) const
Return true if the specified operation is illegal on this target or unlikely to be made legal with cu...
void setBooleanVectorContents(BooleanContent Ty)
Specify how the target extends the result of a vector boolean value from a vector of i1 to a wider ty...
void setOperationAction(unsigned Op, MVT VT, LegalizeAction Action)
Indicate that the specified operation does not work with the specified type and indicate what to do a...
void setMaxDivRemBitWidthSupported(unsigned SizeInBits)
Set the size in bits of the maximum div/rem the backend supports.
virtual bool hasAndNot(SDValue X) const
Return true if the target has a bitwise and-not operation: X = ~A & B This can be used to simplify se...
bool PredictableSelectIsExpensive
Tells the code generator that select is more expensive than a branch if the branch is usually predict...
EVT getValueType(const DataLayout &DL, Type *Ty, bool AllowUnknown=false) const
Return the EVT corresponding to this LLVM type.
CallingConv::ID getLibcallCallingConv(RTLIB::Libcall Call) const
Get the CallingConv that should be used for the specified libcall.
unsigned MaxStoresPerMemcpyOptSize
Likewise for functions with the OptSize attribute.
MachineBasicBlock * emitPatchPoint(MachineInstr &MI, MachineBasicBlock *MBB) const
Replace/modify any TargetFrameIndex operands with a target-dependent sequence of memory operands that...
virtual const TargetRegisterClass * getRegClassFor(MVT VT, bool isDivergent=false) const
Return the register class that should be used for the specified value type.
ShiftLegalizationStrategy
Return the preferred strategy to legalize this SHIFT instruction, with ExpansionFactor being the recu...
const TargetMachine & getTargetMachine() const
unsigned MaxLoadsPerMemcmp
Specify maximum number of load instructions per memcmp call.
void setOperationPromotedToType(unsigned Opc, MVT OrigVT, MVT DestVT)
Convenience method to set an operation to Promote and specify the type in a single call.
LegalizeTypeAction
This enum indicates whether types are legal for a target, and if not, what action should be used to...
virtual bool areJTsAllowed(const Function *Fn) const
Return true if lowering to a jump table is allowed.
bool isOperationLegalOrPromote(unsigned Op, EVT VT, bool LegalOnly=false) const
Return true if the specified operation is legal on this target or can be made legal using promotion.
void addBypassSlowDiv(unsigned int SlowBitWidth, unsigned int FastBitWidth)
Tells the code generator which bitwidths to bypass.
void setMaxLargeFPConvertBitWidthSupported(unsigned SizeInBits)
Set the size in bits of the maximum fp to/from int conversion the backend supports.
bool isTruncStoreLegal(EVT ValVT, EVT MemVT) const
Return true if the specified store with truncation is legal on this target.
virtual bool isLoadBitCastBeneficial(EVT LoadVT, EVT BitcastVT, const SelectionDAG &DAG, const MachineMemOperand &MMO) const
Return true if the following transform is beneficial: fold (conv (load x)) -> (load (conv*)x) On arch...
void setPrefLoopAlignment(Align Alignment)
Set the target's preferred loop alignment.
virtual bool isCommutativeBinOp(unsigned Opcode) const
Returns true if the opcode is a commutative binary operation.
void setMaxAtomicSizeInBitsSupported(unsigned SizeInBits)
Set the maximum atomic operation size supported by the backend.
virtual TargetLoweringBase::LegalizeTypeAction getPreferredVectorAction(MVT VT) const
Return the preferred vector type legalization action.
Register getStackPointerRegisterToSaveRestore() const
If a physical register, this specifies the register that llvm.stacksave/llvm.stackrestore should save...
bool isOperationCustom(unsigned Op, EVT VT) const
Return true if the operation uses custom lowering, regardless of whether the type is legal or not.
unsigned MaxStoresPerMemsetOptSize
Likewise for functions with the OptSize attribute.
void setBooleanContents(BooleanContent Ty)
Specify how the target extends the result of integer and floating point boolean values from i1 to a w...
unsigned MaxStoresPerMemmove
Specify maximum number of store instructions per memmove call.
virtual Align getPrefLoopAlignment(MachineLoop *ML=nullptr) const
Return the preferred loop alignment.
void computeRegisterProperties(const TargetRegisterInfo *TRI)
Once all of the register classes are added, this allows us to compute derived properties we expose.
virtual bool shouldFoldConstantShiftPairToMask(const SDNode *N, CombineLevel Level) const
Return true if it is profitable to fold a pair of shifts into a mask.
virtual EVT getSetCCResultType(const DataLayout &DL, LLVMContext &Context, EVT VT) const
Return the ValueType of the result of SETCC operations.
virtual EVT getTypeToTransformTo(LLVMContext &Context, EVT VT) const
For types supported by the target, this is an identity function.
unsigned MaxStoresPerMemmoveOptSize
Likewise for functions with the OptSize attribute.
BooleanContent getBooleanContents(bool isVec, bool isFloat) const
For targets without i1 registers, this gives the nature of the high-bits of boolean values held in ty...
virtual MVT getPreferredSwitchConditionType(LLVMContext &Context, EVT ConditionVT) const
Returns preferred type for switch condition.
void addRegisterClass(MVT VT, const TargetRegisterClass *RC)
Add the specified register class as an available regclass for the specified value type.
bool isTypeLegal(EVT VT) const
Return true if the target has native support for the specified value type.
virtual MVT getPointerTy(const DataLayout &DL, uint32_t AS=0) const
Return the pointer type for the given address space, defaults to the pointer type from the data layou...
void setPrefFunctionAlignment(Align Alignment)
Set the target's preferred function alignment.
bool isOperationLegal(unsigned Op, EVT VT) const
Return true if the specified operation is legal on this target.
unsigned MaxStoresPerMemset
Specify maximum number of store instructions per memset call.
virtual bool shouldProduceAndByConstByHoistingConstFromShiftsLHSOfAnd(SDValue X, ConstantSDNode *XC, ConstantSDNode *CC, SDValue Y, unsigned OldShiftOpcode, unsigned NewShiftOpcode, SelectionDAG &DAG) const
Given the pattern (X & (C l>>/<< Y)) ==/!= 0 return true if it should be transformed into: ((X <</l>>...
void setTruncStoreAction(MVT ValVT, MVT MemVT, LegalizeAction Action)
Indicate that the specified truncating store does not work with the specified type and indicate what ...
BooleanContent
Enum that describes how the target represents true/false values.
virtual ShiftLegalizationStrategy preferredShiftLegalizationStrategy(SelectionDAG &DAG, SDNode *N, unsigned ExpansionFactor) const
bool isOperationLegalOrCustom(unsigned Op, EVT VT, bool LegalOnly=false) const
Return true if the specified operation is legal on this target or can be made legal with custom lower...
virtual bool allowsMemoryAccess(LLVMContext &Context, const DataLayout &DL, EVT VT, unsigned AddrSpace=0, Align Alignment=Align(1), MachineMemOperand::Flags Flags=MachineMemOperand::MONone, unsigned *Fast=nullptr) const
Return true if the target supports a memory access of this type for the given address space and align...
unsigned MaxLoadsPerMemcmpOptSize
Likewise for functions with the OptSize attribute.
virtual bool isBinOp(unsigned Opcode) const
Return true if the node is a math/logic binary operator.
void setStackPointerRegisterToSaveRestore(Register R)
If set to a physical register, this specifies the register that llvm.stacksave/llvm....
bool isLoadExtLegal(unsigned ExtType, EVT ValVT, EVT MemVT) const
Return true if the specified load with extension is legal on this target.
AtomicExpansionKind
Enum that specifies what an atomic load/AtomicRMWInst is expanded to, if at all.
void setCondCodeAction(ArrayRef< ISD::CondCode > CCs, MVT VT, LegalizeAction Action)
Indicate that the specified condition code is or isn't supported on the target and indicate what to d...
AndOrSETCCFoldKind
Enum of different potentially desirable ways to fold (and/or (setcc ...), (setcc ....
void setTargetDAGCombine(ArrayRef< ISD::NodeType > NTs)
Targets should invoke this method for each target independent node that they want to provide a custom...
void setLoadExtAction(unsigned ExtType, MVT ValVT, MVT MemVT, LegalizeAction Action)
Indicate that the specified load with extension does not work with the specified type and indicate wh...
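The configuration hooks above are normally invoked from a target's TargetLowering constructor; the lines below sketch that shape. The particular types, actions and the Subtarget member are illustrative assumptions, not a copy of X86's actual setup.
  // Inside a hypothetical TargetLowering subclass constructor:
  addRegisterClass(MVT::i32, &X86::GR32RegClass);
  setOperationAction(ISD::CTPOP, MVT::i32, Expand);
  setTruncStoreAction(MVT::i64, MVT::i8, Expand);
  setLoadExtAction(ISD::SEXTLOAD, MVT::i64, MVT::i8, Legal);
  setBooleanContents(ZeroOrOneBooleanContent);
  computeRegisterProperties(Subtarget.getRegisterInfo());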
NegatibleCost
Enum that specifies when a float negation is beneficial.
LegalizeTypeAction getTypeAction(LLVMContext &Context, EVT VT) const
Return how we should legalize values of this type, either it is already legal (return 'Legal') or we ...
const char * getLibcallName(RTLIB::Libcall Call) const
Get the libcall routine name for the specified libcall.
std::vector< ArgListEntry > ArgListTy
unsigned MaxStoresPerMemcpy
Specify maximum number of store instructions per memcpy call.
void setSchedulingPreference(Sched::Preference Pref)
Specify the target scheduling preference.
virtual bool shouldConvertPhiType(Type *From, Type *To) const
Given a set of interconnected phis of type 'From' that are loaded/stored or bitcast to type 'To',...
bool isOperationLegalOrCustomOrPromote(unsigned Op, EVT VT, bool LegalOnly=false) const
Return true if the specified operation is legal on this target or can be made legal with custom lower...
static ISD::NodeType getExtendForContent(BooleanContent Content)
This class defines information used to lower LLVM code to legal SelectionDAG operators that the targe...
SDValue buildSDIVPow2WithCMov(SDNode *N, const APInt &Divisor, SelectionDAG &DAG, SmallVectorImpl< SDNode * > &Created) const
Build sdiv by power-of-2 with conditional move instructions. Ref: "Hacker's Delight" by Henry Warren 1...
bool SimplifyDemandedVectorElts(SDValue Op, const APInt &DemandedEltMask, APInt &KnownUndef, APInt &KnownZero, TargetLoweringOpt &TLO, unsigned Depth=0, bool AssumeSingleUse=false) const
Look at Vector Op.
void softenSetCCOperands(SelectionDAG &DAG, EVT VT, SDValue &NewLHS, SDValue &NewRHS, ISD::CondCode &CCCode, const SDLoc &DL, const SDValue OldLHS, const SDValue OldRHS) const
Soften the operands of a comparison.
std::pair< SDValue, SDValue > makeLibCall(SelectionDAG &DAG, RTLIB::Libcall LC, EVT RetVT, ArrayRef< SDValue > Ops, MakeLibCallOptions CallOptions, const SDLoc &dl, SDValue Chain=SDValue()) const
Returns a pair of (return value, chain).
virtual SDValue expandIndirectJTBranch(const SDLoc &dl, SDValue Value, SDValue Addr, int JTI, SelectionDAG &DAG) const
Expands target specific indirect branch for the case of JumpTable expansion.
SDValue getCheaperNegatedExpression(SDValue Op, SelectionDAG &DAG, bool LegalOps, bool OptForSize, unsigned Depth=0) const
This is the helper function to return the newly negated expression only when the cost is cheaper.
SDValue SimplifyMultipleUseDemandedBits(SDValue Op, const APInt &DemandedBits, const APInt &DemandedElts, SelectionDAG &DAG, unsigned Depth=0) const
More limited version of SimplifyDemandedBits that can be used to "lookthrough" ops that don't contrib...
SDValue SimplifyMultipleUseDemandedVectorElts(SDValue Op, const APInt &DemandedElts, SelectionDAG &DAG, unsigned Depth=0) const
Helper wrapper around SimplifyMultipleUseDemandedBits, demanding all bits from only some vector eleme...
virtual ConstraintType getConstraintType(StringRef Constraint) const
Given a constraint, return the type of constraint it is for this target.
bool ShrinkDemandedConstant(SDValue Op, const APInt &DemandedBits, const APInt &DemandedElts, TargetLoweringOpt &TLO) const
Check to see if the specified operand of the specified instruction is a constant integer.
virtual SDValue LowerToTLSEmulatedModel(const GlobalAddressSDNode *GA, SelectionDAG &DAG) const
Lower TLS global address SDNode for target independent emulated TLS model.
virtual const char * LowerXConstraint(EVT ConstraintVT) const
Try to replace an X constraint, which matches anything, with another that has more specific requireme...
std::pair< SDValue, SDValue > LowerCallTo(CallLoweringInfo &CLI) const
This function lowers an abstract call to a function into an actual call.
bool expandDIVREMByConstant(SDNode *N, SmallVectorImpl< SDValue > &Result, EVT HiLoVT, SelectionDAG &DAG, SDValue LL=SDValue(), SDValue LH=SDValue()) const
Attempt to expand an n-bit div/rem/divrem by constant using an n/2-bit urem by constant and other arit...
bool isPositionIndependent() const
virtual SDValue getNegatedExpression(SDValue Op, SelectionDAG &DAG, bool LegalOps, bool OptForSize, NegatibleCost &Cost, unsigned Depth=0) const
Return the newly negated expression if the cost is not expensive and set the cost in Cost to indicate...
virtual ConstraintWeight getSingleConstraintMatchWeight(AsmOperandInfo &info, const char *constraint) const
Examine constraint string and operand type and determine a weight value.
virtual SDValue SimplifyMultipleUseDemandedBitsForTargetNode(SDValue Op, const APInt &DemandedBits, const APInt &DemandedElts, SelectionDAG &DAG, unsigned Depth) const
More limited version of SimplifyDemandedBits that can be used to "lookthrough" ops that don't contrib...
virtual std::pair< unsigned, const TargetRegisterClass * > getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI, StringRef Constraint, MVT VT) const
Given a physical register constraint (e.g.
bool SimplifyDemandedBits(SDValue Op, const APInt &DemandedBits, const APInt &DemandedElts, KnownBits &Known, TargetLoweringOpt &TLO, unsigned Depth=0, bool AssumeSingleUse=false) const
Look at Op.
virtual bool SimplifyDemandedBitsForTargetNode(SDValue Op, const APInt &DemandedBits, const APInt &DemandedElts, KnownBits &Known, TargetLoweringOpt &TLO, unsigned Depth=0) const
Attempt to simplify any target nodes based on the demanded bits/elts, returning true on success.
TargetLowering(const TargetLowering &)=delete
virtual bool isSplatValueForTargetNode(SDValue Op, const APInt &DemandedElts, APInt &UndefElts, const SelectionDAG &DAG, unsigned Depth=0) const
Return true if vector Op has the same value across all DemandedElts, indicating any elements which ma...
SDValue getVectorElementPointer(SelectionDAG &DAG, SDValue VecPtr, EVT VecVT, SDValue Index) const
Get a pointer to vector element Idx located in memory for a vector of type VecVT starting at a base a...
virtual unsigned combineRepeatedFPDivisors() const
Indicate whether this target prefers to combine FDIVs with the same divisor.
virtual void LowerAsmOperandForConstraint(SDValue Op, StringRef Constraint, std::vector< SDValue > &Ops, SelectionDAG &DAG) const
Lower the specified operand into the Ops vector.
virtual bool isGuaranteedNotToBeUndefOrPoisonForTargetNode(SDValue Op, const APInt &DemandedElts, const SelectionDAG &DAG, bool PoisonOnly, unsigned Depth) const
Return true if this function can prove that Op is never poison and, if PoisonOnly is false,...
void expandShiftParts(SDNode *N, SDValue &Lo, SDValue &Hi, SelectionDAG &DAG) const
Expand shift-by-parts.
virtual bool canCreateUndefOrPoisonForTargetNode(SDValue Op, const APInt &DemandedElts, const SelectionDAG &DAG, bool PoisonOnly, bool ConsiderFlags, unsigned Depth) const
Return true if Op can create undef or poison from non-undef & non-poison operands.
Primary interface to the complete machine description for the target machine.
TLSModel::Model getTLSModel(const GlobalValue *GV) const
Returns the TLS model which should be used for the given global variable.
const Triple & getTargetTriple() const
bool useTLSDESC() const
Returns true if this target uses TLS Descriptors.
bool useEmulatedTLS() const
Returns true if this target uses emulated TLS.
TargetOptions Options
CodeModel::Model getCodeModel() const
Returns the code model.
const MCAsmInfo * getMCAsmInfo() const
Return target specific asm information.
unsigned NoSignedZerosFPMath
NoSignedZerosFPMath - This flag is enabled when the -enable-no-signed-zeros-fp-math is specified on t...
unsigned NoNaNsFPMath
NoNaNsFPMath - This flag is enabled when the -enable-no-nans-fp-math flag is specified on the command...
FPOpFusion::FPOpFusionMode AllowFPOpFusion
AllowFPOpFusion - This flag is set by the -fp-contract=xxx option.
bool contains(Register Reg) const
Return true if the specified register is included in this register class.
bool hasSuperClassEq(const TargetRegisterClass *RC) const
Returns true if RC is a super-class of or equal to this class.
TargetRegisterInfo base class - We assume that the target defines a static array of TargetRegisterDes...
virtual const TargetInstrInfo * getInstrInfo() const
Target - Wrapper for Target specific information.
bool isOSBinFormatCOFF() const
Tests whether the OS uses the COFF binary format.
Definition Triple.h:774
static constexpr TypeSize getFixed(ScalarTy ExactSize)
Definition TypeSize.h:343
The instances of the Type class are immutable: once they are created, they are never changed.
Definition Type.h:45
bool isVectorTy() const
True if this is an instance of VectorType.
Definition Type.h:273
static LLVM_ABI IntegerType * getInt32Ty(LLVMContext &C)
Definition Type.cpp:297
bool isFloatTy() const
Return true if this is 'float', a 32-bit IEEE fp type.
Definition Type.h:153
LLVM_ABI TypeSize getPrimitiveSizeInBits() const LLVM_READONLY
Return the basic size of this type if it is a primitive type.
Definition Type.cpp:198
bool isHalfTy() const
Return true if this is 'half', a 16-bit IEEE fp type.
Definition Type.h:142
LLVM_ABI unsigned getScalarSizeInBits() const LLVM_READONLY
If this is a vector type, return the getPrimitiveSizeInBits value for the element type.
Definition Type.cpp:231
bool isDoubleTy() const
Return true if this is 'double', a 64-bit IEEE fp type.
Definition Type.h:156
static LLVM_ABI IntegerType * getInt1Ty(LLVMContext &C)
Definition Type.cpp:294
bool isIntegerTy() const
True if this is an instance of IntegerType.
Definition Type.h:240
static LLVM_ABI IntegerType * getIntNTy(LLVMContext &C, unsigned N)
Definition Type.cpp:301
static LLVM_ABI UndefValue * get(Type *T)
Static factory methods - Return an 'undef' object of the specified type.
A Use represents the edge between a Value definition and its users.
Definition Use.h:35
User * getUser() const
Returns the User that contains this Use.
Definition Use.h:61
LLVM_ABI unsigned getOperandNo() const
Return the operand # of this use in its User.
Definition Use.cpp:35
Value * getOperand(unsigned i) const
Definition User.h:232
LLVM Value Representation.
Definition Value.h:75
Type * getType() const
All values are typed, get the type of this value.
Definition Value.h:256
user_iterator user_begin()
Definition Value.h:402
bool hasOneUse() const
Return true if there is exactly one use of this value.
Definition Value.h:439
LLVM_ABI void replaceAllUsesWith(Value *V)
Change all uses of this to point to a new Value.
Definition Value.cpp:546
iterator_range< user_iterator > users()
Definition Value.h:426
use_iterator use_begin()
Definition Value.h:364
bool use_empty() const
Definition Value.h:346
LLVM_ABI LLVMContext & getContext() const
All values hold a context through their type.
Definition Value.cpp:1101
iterator_range< use_iterator > uses()
Definition Value.h:380
LLVM_ABI StringRef getName() const
Return a constant reference to the value's name.
Definition Value.cpp:322
static LLVM_ABI bool isValidElementType(Type *ElemTy)
Return true if the specified type is valid as an element type.
bool has128ByteRedZone(const MachineFunction &MF) const
Return true if the function has a redzone (accessible bytes past the frame of the top of stack functi...
bool Uses64BitFramePtr
True if the 64-bit frame or stack pointer should be used.
Register getGlobalBaseReg(MachineFunction *MF) const
getGlobalBaseReg - Return a virtual register initialized with the global base register value.
X86MachineFunctionInfo - This class is derived from MachineFunction and contains private X86 target-s...
void setAMXProgModel(AMXProgModelEnum Model)
ArrayRef< size_t > getPreallocatedArgOffsets(const size_t Id)
void setRestoreBasePointer(const MachineFunction *MF)
size_t getPreallocatedStackSize(const size_t Id)
bool hasBasePointer(const MachineFunction &MF) const
Register getPtrSizedFrameRegister(const MachineFunction &MF) const
Register getFrameRegister(const MachineFunction &MF) const override
Register getPtrSizedStackRegister(const MachineFunction &MF) const
Register getStackRegister() const
unsigned getSlotSize() const
Register getBaseRegister() const
const uint32_t * getNoPreservedMask() const override
bool canExtendTo512BW() const
bool hasAnyFMA() const
bool hasSSE1() const
bool avoidMFence() const
Avoid use of mfence for fence seq_cst, and instead use lock or.
bool hasBitScanPassThrough() const
bool hasSSE42() const
const X86TargetLowering * getTargetLowering() const override
bool hasMFence() const
Use mfence if we have SSE2 or we're on x86-64 (even if we asked for no-sse2).
bool canUseCMOV() const
bool isTargetDarwin() const
bool isTarget64BitLP64() const
Is this x86_64 with the LP64 programming model (standard AMD64, no x32)?
const X86InstrInfo * getInstrInfo() const override
bool useAVX512Regs() const
bool hasSSE3() const
bool isCallingConvWin64(CallingConv::ID CC) const
bool hasAVX512() const
bool canExtendTo512DQ() const
bool hasSSE41() const
bool hasSSE2() const
bool hasSSSE3() const
bool hasInt256() const
const X86RegisterInfo * getRegisterInfo() const override
bool hasAVX() const
unsigned getPreferVectorWidth() const
const X86FrameLowering * getFrameLowering() const override
bool useBWIRegs() const
bool hasAVX2() const
bool shouldFormOverflowOp(unsigned Opcode, EVT VT, bool MathUsed) const override
Overflow nodes should get combined/lowered to optimal instructions (they should allow eliminating exp...
Align getPrefLoopAlignment(MachineLoop *ML) const override
Return the preferred loop alignment.
bool isLegalAddImmediate(int64_t Imm) const override
Return true if the specified immediate is legal add immediate, that is the target has add instruction...
bool preferSextInRegOfTruncate(EVT TruncVT, EVT VT, EVT ExtVT) const override
SDValue PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const override
This method will be invoked for all target nodes and for any target-independent nodes that the target...
bool preferABDSToABSWithNSW(EVT VT) const override
bool isCheapToSpeculateCtlz(Type *Ty) const override
Return true if it is cheap to speculate a call to intrinsic ctlz.
unsigned getJumpTableEncoding() const override
Return the entry encoding for a jump table in the current function.
std::pair< SDValue, SDValue > BuildFILD(EVT DstVT, EVT SrcVT, const SDLoc &DL, SDValue Chain, SDValue Pointer, MachinePointerInfo PtrInfo, Align Alignment, SelectionDAG &DAG) const
bool SimplifyDemandedVectorEltsForTargetNode(SDValue Op, const APInt &DemandedElts, APInt &KnownUndef, APInt &KnownZero, TargetLoweringOpt &TLO, unsigned Depth) const override
Attempt to simplify any target nodes based on the demanded vector elements, returning true on success...
SDValue LowerAsmOutputForConstraint(SDValue &Chain, SDValue &Flag, const SDLoc &DL, const AsmOperandInfo &Constraint, SelectionDAG &DAG) const override
Handle Lowering flag assembly outputs.
const char * LowerXConstraint(EVT ConstraintVT) const override
Try to replace an X constraint, which matches anything, with another that has more specific requireme...
SDValue SimplifyMultipleUseDemandedBitsForTargetNode(SDValue Op, const APInt &DemandedBits, const APInt &DemandedElts, SelectionDAG &DAG, unsigned Depth) const override
More limited version of SimplifyDemandedBits that can be used to "lookthrough" ops that don't contrib...
bool useLoadStackGuardNode(const Module &M) const override
If this function returns true, SelectionDAGBuilder emits a LOAD_STACK_GUARD node when it is lowering ...
bool isSplatValueForTargetNode(SDValue Op, const APInt &DemandedElts, APInt &UndefElts, const SelectionDAG &DAG, unsigned Depth) const override
Return true if vector Op has the same value across all DemandedElts, indicating any elements which ma...
bool convertSelectOfConstantsToMath(EVT VT) const override
Return true if a select of constants (select Cond, C1, C2) should be transformed into simple math ops...
ConstraintType getConstraintType(StringRef Constraint) const override
Given a constraint letter, return the type of constraint for this target.
Register getExceptionSelectorRegister(const Constant *PersonalityFn) const override
If a physical register, this returns the register that receives the exception typeid on entry to a la...
ShiftLegalizationStrategy preferredShiftLegalizationStrategy(SelectionDAG &DAG, SDNode *N, unsigned ExpansionFactor) const override
SDValue LowerOperation(SDValue Op, SelectionDAG &DAG) const override
Provide custom lowering hooks for some operations.
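The following schematic shows the typical shape of such a custom-lowering hook; it is a hypothetical free function, not the X86 implementation, and the sub-to-add rewrite exists only to keep the example self-contained.
  #include "llvm/CodeGen/SelectionDAG.h"
  using namespace llvm;
  static SDValue exampleLowerOperation(SDValue Op, SelectionDAG &DAG) {
    SDLoc DL(Op);
    EVT VT = Op.getValueType();
    switch (Op.getOpcode()) {
    case ISD::SUB:
      // Rewrite (sub a, b) as (add a, (neg b)), just to show the pattern.
      return DAG.getNode(ISD::ADD, DL, VT, Op.getOperand(0),
                         DAG.getNegative(Op.getOperand(1), DL, VT));
    default:
      return SDValue(); // Defer everything else to the default handling.
    }
  }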
bool isLegalStoreImmediate(int64_t Imm) const override
Return true if the specified immediate is legal for the value input of a store instruction.
SDValue visitMaskedStore(SelectionDAG &DAG, const SDLoc &DL, SDValue Chain, MachineMemOperand *MMO, SDValue Ptr, SDValue Val, SDValue Mask) const override
SDValue getNegatedExpression(SDValue Op, SelectionDAG &DAG, bool LegalOperations, bool ForCodeSize, NegatibleCost &Cost, unsigned Depth) const override
Return the newly negated expression if the cost is not expensive and set the cost in Cost to indicate...
bool isTypeDesirableForOp(unsigned Opc, EVT VT) const override
Return true if the target has native support for the specified value type and it is 'desirable' to us...
bool isCtlzFast() const override
Return true if ctlz instruction is fast.
Register getRegisterByName(const char *RegName, LLT VT, const MachineFunction &MF) const override
Return the register ID of the name passed in.
bool shouldProduceAndByConstByHoistingConstFromShiftsLHSOfAnd(SDValue X, ConstantSDNode *XC, ConstantSDNode *CC, SDValue Y, unsigned OldShiftOpcode, unsigned NewShiftOpcode, SelectionDAG &DAG) const override
Given the pattern (X & (C l>>/<< Y)) ==/!= 0 return true if it should be transformed into: ((X <</l>>...
bool supportSwiftError() const override
Return true if the target supports swifterror attribute.
bool isCheapToSpeculateCttz(Type *Ty) const override
Return true if it is cheap to speculate a call to intrinsic cttz.
bool shouldSplatInsEltVarIndex(EVT VT) const override
Return true if inserting a scalar into a variable element of an undef vector is more efficiently hand...
bool isInlineAsmTargetBranch(const SmallVectorImpl< StringRef > &AsmStrs, unsigned OpNo) const override
On x86, return true if the operand with index OpNo is a CALL or JUMP instruction, which can use eithe...
MVT hasFastEqualityCompare(unsigned NumBits) const override
Vector-sized comparisons are fast using PCMPEQ + PMOVMSK or PTEST.
bool SimplifyDemandedVectorEltsForTargetShuffle(SDValue Op, const APInt &DemandedElts, unsigned MaskIndex, TargetLoweringOpt &TLO, unsigned Depth) const
bool isLegalICmpImmediate(int64_t Imm) const override
Return true if the specified immediate is legal icmp immediate, that is the target has icmp instructi...
bool hasInlineStackProbe(const MachineFunction &MF) const override
Returns true if stack probing through inline assembly is requested.
MachineBasicBlock * EmitInstrWithCustomInserter(MachineInstr &MI, MachineBasicBlock *MBB) const override
This method should be implemented by targets that mark instructions with the 'usesCustomInserter' fla...
unsigned preferedOpcodeForCmpEqPiecesOfOperand(EVT VT, unsigned ShiftOpc, bool MayTransformRotate, const APInt &ShiftOrRotateAmt, const std::optional< APInt > &AndMask) const override
bool isXAndYEqZeroPreferableToXAndYEqY(ISD::CondCode Cond, EVT VT) const override
bool canMergeStoresTo(unsigned AddressSpace, EVT MemVT, const MachineFunction &MF) const override
Returns true if it's reasonable to merge stores to MemVT size.
bool hasAndNot(SDValue Y) const override
Return true if the target has a bitwise and-not operation: X = ~A & B This can be used to simplify se...
bool SimplifyDemandedBitsForTargetNode(SDValue Op, const APInt &DemandedBits, const APInt &DemandedElts, KnownBits &Known, TargetLoweringOpt &TLO, unsigned Depth) const override
Attempt to simplify any target nodes based on the demanded bits/elts, returning true on success.
bool shouldConvertConstantLoadToIntImm(const APInt &Imm, Type *Ty) const override
Returns true if it is beneficial to convert a load of a constant to just the constant itself.
bool shouldReduceLoadWidth(SDNode *Load, ISD::LoadExtType ExtTy, EVT NewVT, std::optional< unsigned > ByteOffset) const override
Return true if we believe it is correct and profitable to reduce the load node to a smaller type.
bool preferScalarizeSplat(SDNode *N) const override
std::pair< unsigned, const TargetRegisterClass * > getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI, StringRef Constraint, MVT VT) const override
Given a physical register constraint (e.g.
bool isNarrowingProfitable(SDNode *N, EVT SrcVT, EVT DestVT) const override
Return true if it's profitable to narrow operations of type SrcVT to DestVT.
bool isFPImmLegal(const APFloat &Imm, EVT VT, bool ForCodeSize) const override
Returns true if the target can instruction select the specified FP immediate natively.
bool shouldFoldConstantShiftPairToMask(const SDNode *N, CombineLevel Level) const override
Return true if it is profitable to fold a pair of shifts into a mask.
MachineInstr * EmitKCFICheck(MachineBasicBlock &MBB, MachineBasicBlock::instr_iterator &MBBI, const TargetInstrInfo *TII) const override
bool isLoadBitCastBeneficial(EVT LoadVT, EVT BitcastVT, const SelectionDAG &DAG, const MachineMemOperand &MMO) const override
Return true if the following transform is beneficial: fold (conv (load x)) -> (load (conv*)x) On arch...
bool hasAndNotCompare(SDValue Y) const override
Return true if the target should transform: (X & Y) == Y ---> (~X & Y) == 0 (X & Y) !...
bool reduceSelectOfFPConstantLoads(EVT CmpOpVT) const override
Return true if it is profitable to convert a select of FP constants into a constant pool load whose a...
StringRef getStackProbeSymbolName(const MachineFunction &MF) const override
Returns the name of the symbol used to emit stack probes or the empty string if not applicable.
bool hasBitTest(SDValue X, SDValue Y) const override
Return true if the target has a bit-test instruction: (X & (1 << Y)) ==/!= 0 This knowledge can be us...
bool allowTruncateForTailCall(Type *Ty1, Type *Ty2) const override
Return true if a truncation from FromTy to ToTy is permitted when deciding whether a call is in tail ...
bool isShuffleMaskLegal(ArrayRef< int > Mask, EVT VT) const override
Targets can use this to indicate that they only support some VECTOR_SHUFFLE operations,...
bool useStackGuardXorFP() const override
If this function returns true, stack protection checks should XOR the frame pointer (or whichever poi...
unsigned ComputeNumSignBitsForTargetNode(SDValue Op, const APInt &DemandedElts, const SelectionDAG &DAG, unsigned Depth) const override
Determine the number of bits in the operation that are sign bits.
bool shouldScalarizeBinop(SDValue) const override
Scalar ops always have equal or better analysis/performance/power than the vector equivalent,...
bool isTruncateFree(Type *Ty1, Type *Ty2) const override
Return true if it's free to truncate a value of type Ty1 to type Ty2.
bool decomposeMulByConstant(LLVMContext &Context, EVT VT, SDValue C) const override
Return true if it is profitable to transform an integer multiplication-by-constant into simpler opera...
bool areJTsAllowed(const Function *Fn) const override
Returns true if lowering to a jump table is allowed.
bool isCommutativeBinOp(unsigned Opcode) const override
Returns true if the opcode is a commutative binary operation.
bool isScalarFPTypeInSSEReg(EVT VT) const
Return true if the specified scalar FP type is computed in an SSE register, not on the X87 floating p...
const char * getTargetNodeName(unsigned Opcode) const override
This method returns the name of a target specific DAG node.
MVT getPreferredSwitchConditionType(LLVMContext &Context, EVT ConditionVT) const override
Returns preferred type for switch condition.
SDValue visitMaskedLoad(SelectionDAG &DAG, const SDLoc &DL, SDValue Chain, MachineMemOperand *MMO, SDValue &NewLoad, SDValue Ptr, SDValue PassThru, SDValue Mask) const override
bool isFMAFasterThanFMulAndFAdd(const MachineFunction &MF, EVT VT) const override
Return true if an FMA operation is faster than a pair of fmul and fadd instructions.
bool isExtractSubvectorCheap(EVT ResVT, EVT SrcVT, unsigned Index) const override
Return true if EXTRACT_SUBVECTOR is cheap for this result type with this index.
bool isVectorClearMaskLegal(ArrayRef< int > Mask, EVT VT) const override
Similar to isShuffleMaskLegal.
ConstraintWeight getSingleConstraintMatchWeight(AsmOperandInfo &Info, const char *Constraint) const override
Examine constraint string and operand type and determine a weight value.
bool isIntDivCheap(EVT VT, AttributeList Attr) const override
Return true if integer divide is usually cheaper than a sequence of several shifts,...
LegalizeTypeAction getPreferredVectorAction(MVT VT) const override
Customize the preferred legalization strategy for certain types.
bool shouldConvertPhiType(Type *From, Type *To) const override
Given a set of interconnected phis of type 'From' that are loaded/stored or bitcast to type 'To',...
bool hasStackProbeSymbol(const MachineFunction &MF) const override
Returns true if stack probing through a function call is requested.
bool isZExtFree(Type *Ty1, Type *Ty2) const override
Return true if any actual instruction that defines a value of type Ty1 implicitly zero-extends the valu...
bool allowsMemoryAccess(LLVMContext &Context, const DataLayout &DL, EVT VT, unsigned AddrSpace, Align Alignment, MachineMemOperand::Flags Flags=MachineMemOperand::MONone, unsigned *Fast=nullptr) const override
This function returns true if the memory access is aligned or if the target allows this specific unal...
bool isLegalAddressingMode(const DataLayout &DL, const AddrMode &AM, Type *Ty, unsigned AS, Instruction *I=nullptr) const override
Return true if the addressing mode represented by AM is legal for this target, for a load/store of th...
SDValue emitStackGuardXorFP(SelectionDAG &DAG, SDValue Val, const SDLoc &DL) const override
TargetLowering::AndOrSETCCFoldKind isDesirableToCombineLogicOpOfSETCC(const SDNode *LogicOp, const SDNode *SETCC0, const SDNode *SETCC1) const override
Return the preferred fold type: Abs if this is a vector, AddAnd if it's an integer, None otherwise.
bool shouldFoldMaskToVariableShiftPair(SDValue Y) const override
There are two ways to clear extreme bits (either low or high): Mask: x & (-1 << y) (the instcombine c...
bool shouldFoldSelectWithIdentityConstant(unsigned BinOpcode, EVT VT, unsigned SelectOpcode, SDValue X, SDValue Y) const override
Return true if pulling a binary operation into a select with an identity constant is profitable.
bool addressingModeSupportsTLS(const GlobalValue &GV) const override
Returns true if the target's addressing mode can target thread local storage (TLS).
SDValue getReturnAddressFrameIndex(SelectionDAG &DAG) const
bool targetShrinkDemandedConstant(SDValue Op, const APInt &DemandedBits, const APInt &DemandedElts, TargetLoweringOpt &TLO) const override
Register getExceptionPointerRegister(const Constant *PersonalityFn) const override
If a physical register, this returns the register that receives the exception address on entry to an ...
SDValue expandIndirectJTBranch(const SDLoc &dl, SDValue Value, SDValue Addr, int JTI, SelectionDAG &DAG) const override
Expands target specific indirect branch for the case of JumpTable expansion.
FastISel * createFastISel(FunctionLoweringInfo &funcInfo, const TargetLibraryInfo *libInfo) const override
This method returns a target specific FastISel object, or null if the target does not support "fast" ...
void computeKnownBitsForTargetNode(const SDValue Op, KnownBits &Known, const APInt &DemandedElts, const SelectionDAG &DAG, unsigned Depth=0) const override
Determine which of the bits specified in Mask are known to be either zero or one and return them in t...
bool isBinOp(unsigned Opcode) const override
Add x86-specific opcodes to the default list.
bool isGuaranteedNotToBeUndefOrPoisonForTargetNode(SDValue Op, const APInt &DemandedElts, const SelectionDAG &DAG, bool PoisonOnly, unsigned Depth) const override
Return true if this function can prove that Op is never poison and, if PoisonOnly is false,...
bool IsDesirableToPromoteOp(SDValue Op, EVT &PVT) const override
Return true if the target has native support for the specified value type and it is 'desirable' to us...
SDValue unwrapAddress(SDValue N) const override
CondMergingParams getJumpConditionMergingParams(Instruction::BinaryOps Opc, const Value *Lhs, const Value *Rhs) const override
EVT getSetCCResultType(const DataLayout &DL, LLVMContext &Context, EVT VT) const override
Return the value type to use for ISD::SETCC.
X86TargetLowering(const X86TargetMachine &TM, const X86Subtarget &STI)
bool isTargetCanonicalSelect(SDNode *N) const override
Return true if the given select/vselect should be considered canonical and not be transformed.
bool isVectorLoadExtDesirable(SDValue) const override
Return true if folding a vector load into ExtVal (a sign, zero, or any extend node) is profitable.
bool getTgtMemIntrinsic(IntrinsicInfo &Info, const CallInst &I, MachineFunction &MF, unsigned Intrinsic) const override
Given an intrinsic, checks if on the target the intrinsic will need to map to a MemIntrinsicNode (tou...
const Constant * getTargetConstantFromLoad(LoadSDNode *LD) const override
This method returns the constant pool value that will be loaded by LD.
EVT getTypeToTransformTo(LLVMContext &Context, EVT VT) const override
For types supported by the target, this is an identity function.
bool canCreateUndefOrPoisonForTargetNode(SDValue Op, const APInt &DemandedElts, const SelectionDAG &DAG, bool PoisonOnly, bool ConsiderFlags, unsigned Depth) const override
Return true if Op can create undef or poison from non-undef & non-poison operands.
unsigned getStackProbeSize(const MachineFunction &MF) const
bool ShouldShrinkFPConstant(EVT VT) const override
If true, then instruction selection should seek to shrink the FP constant of the specified type to a ...
void ReplaceNodeResults(SDNode *N, SmallVectorImpl< SDValue > &Results, SelectionDAG &DAG) const override
Replace the results of node with an illegal result type with new values built out of custom code.
bool isMaskAndCmp0FoldingBeneficial(const Instruction &AndI) const override
Return if the target supports combining a chain like:
void LowerAsmOperandForConstraint(SDValue Op, StringRef Constraint, std::vector< SDValue > &Ops, SelectionDAG &DAG) const override
Lower the specified operand into the Ops vector.
bool needsFixedCatchObjects() const override
std::pair< iterator, bool > insert(const ValueT &V)
Definition DenseSet.h:202
bool contains(const_arg_type_t< ValueT > V) const
Check if the set contains the given element.
Definition DenseSet.h:175
constexpr ScalarTy getFixedValue() const
Definition TypeSize.h:200
An efficient, type-erasing, non-owning reference to a callable.
const ParentTy * getParent() const
Definition ilist_node.h:34
self_iterator getIterator()
Definition ilist_node.h:123
CallInst * Call
#define INT64_MIN
Definition DataTypes.h:74
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
constexpr char Align[]
Key for Kernel::Arg::Metadata::mAlign.
constexpr char Args[]
Key for Kernel::Metadata::mArgs.
constexpr char Attrs[]
Key for Kernel::Metadata::mAttrs.
LLVM_ABI APInt ScaleBitMask(const APInt &A, unsigned NewBitWidth, bool MatchAllBits=false)
Splat/Merge neighboring bits to widen/narrow the bitmask represented by A to NewBitWidth bits.
Definition APInt.cpp:3009
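A small worked example of the widening case, under the assumption that the helper below is purely illustrative: each source bit is splat across NewBitWidth / OldBitWidth adjacent result bits.

#include "llvm/ADT/APInt.h"
using namespace llvm;

// Widening 0b1010 from 4 bits to 8 bits repeats each bit twice, giving
// 0b11001100.
static APInt widenMaskExample() {
  APInt Narrow(4, 0b1010);
  return APIntOps::ScaleBitMask(Narrow, /*NewBitWidth=*/8);
}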
@ COND_NE
Not equal.
constexpr std::underlying_type_t< E > Mask()
Get a bitmask with 1s in all places up to the high-order bit of E's largest value.
@ Entry
Definition COFF.h:862
unsigned ID
LLVM IR allows the use of arbitrary numbers as calling convention identifiers.
Definition CallingConv.h:24
@ X86_ThisCall
Similar to X86_StdCall.
@ X86_StdCall
stdcall is mostly used by the Win32 API.
Definition CallingConv.h:99
@ Fast
Attempts to make calls as fast as possible (e.g.
Definition CallingConv.h:41
@ Tail
Attempts to make calls as fast as possible while guaranteeing that tail call optimization can always b...
Definition CallingConv.h:76
@ SwiftTail
This follows the Swift calling convention in how arguments are passed but guarantees tail calls will ...
Definition CallingConv.h:87
@ C
The default llvm calling convention, compatible with C.
Definition CallingConv.h:34
@ X86_FastCall
'fast' analog of X86_StdCall.
LLVM_ABI bool isConstantSplatVectorAllOnes(const SDNode *N, bool BuildVectorOnly=false)
Return true if the specified node is a BUILD_VECTOR or SPLAT_VECTOR where all of the elements are ~0 ...
bool isNON_EXTLoad(const SDNode *N)
Returns true if the specified node is a non-extending load.
NodeType
ISD::NodeType enum - This enum defines the target-independent operators for a SelectionDAG.
Definition ISDOpcodes.h:41
@ SETCC
SetCC operator - This evaluates to a true value iff the condition is true.
Definition ISDOpcodes.h:801
@ MERGE_VALUES
MERGE_VALUES - This node takes multiple discrete operands and returns them all as its individual resu...
Definition ISDOpcodes.h:256
@ CTLZ_ZERO_UNDEF
Definition ISDOpcodes.h:774
@ STRICT_FSETCC
STRICT_FSETCC/STRICT_FSETCCS - Constrained versions of SETCC, used for floating-point operands only.
Definition ISDOpcodes.h:504
@ DELETED_NODE
DELETED_NODE - This is an illegal value that is used to catch errors.
Definition ISDOpcodes.h:45
@ EH_SJLJ_LONGJMP
OUTCHAIN = EH_SJLJ_LONGJMP(INCHAIN, buffer) This corresponds to the eh.sjlj.longjmp intrinsic.
Definition ISDOpcodes.h:163
@ FGETSIGN
INT = FGETSIGN(FP) - Return the sign bit of the specified floating point value as an integer 0/1 valu...
Definition ISDOpcodes.h:525
@ SMUL_LOHI
SMUL_LOHI/UMUL_LOHI - Multiply two integers of type iN, producing a signed/unsigned value of type i[2...
Definition ISDOpcodes.h:270
@ INSERT_SUBVECTOR
INSERT_SUBVECTOR(VECTOR1, VECTOR2, IDX) - Returns a vector with VECTOR2 inserted into VECTOR1.
Definition ISDOpcodes.h:587
@ BSWAP
Byte Swap and Counting operators.
Definition ISDOpcodes.h:765
@ FRAME_TO_ARGS_OFFSET
FRAME_TO_ARGS_OFFSET - This node represents offset from frame pointer to first (possible) on-stack ar...
Definition ISDOpcodes.h:140
@ FMAD
FMAD - Perform a * b + c, while getting the same result as the separately rounded operations.
Definition ISDOpcodes.h:515
@ ADD
Simple integer binary arithmetic operators.
Definition ISDOpcodes.h:259
@ ANY_EXTEND
ANY_EXTEND - Used for integer types. The high bits are undefined.
Definition ISDOpcodes.h:835
@ FMA
FMA - Perform a * b + c with no intermediate rounding step.
Definition ISDOpcodes.h:511
@ INTRINSIC_VOID
OUTCHAIN = INTRINSIC_VOID(INCHAIN, INTRINSICID, arg1, arg2, ...) This node represents a target intrin...
Definition ISDOpcodes.h:215
@ EH_SJLJ_SETUP_DISPATCH
OUTCHAIN = EH_SJLJ_SETUP_DISPATCH(INCHAIN) The target initializes the dispatch table here.
Definition ISDOpcodes.h:167
@ GlobalAddress
Definition ISDOpcodes.h:88
@ STRICT_FMINIMUM
Definition ISDOpcodes.h:464
@ SINT_TO_FP
[SU]INT_TO_FP - These operators convert integers (whose interpreted sign depends on the first letter)...
Definition ISDOpcodes.h:862
@ CONCAT_VECTORS
CONCAT_VECTORS(VECTOR0, VECTOR1, ...) - Given a number of values of vector type with the same length ...
Definition ISDOpcodes.h:571
@ FADD
Simple binary floating point operators.
Definition ISDOpcodes.h:410
@ ABS
ABS - Determine the unsigned absolute value of a signed integer value of the same bitwidth.
Definition ISDOpcodes.h:738
@ SIGN_EXTEND_VECTOR_INREG
SIGN_EXTEND_VECTOR_INREG(Vector) - This operator represents an in-register sign-extension of the low ...
Definition ISDOpcodes.h:892
@ SDIVREM
SDIVREM/UDIVREM - Divide two integers and produce both a quotient and remainder result.
Definition ISDOpcodes.h:275
@ BUILD_PAIR
BUILD_PAIR - This is the opposite of EXTRACT_ELEMENT in some ways.
Definition ISDOpcodes.h:249
@ STRICT_FSQRT
Constrained versions of libm-equivalent floating point intrinsics.
Definition ISDOpcodes.h:431
@ BUILTIN_OP_END
BUILTIN_OP_END - This must be the last enum value in this list.
@ GlobalTLSAddress
Definition ISDOpcodes.h:89
@ EH_RETURN
OUTCHAIN = EH_RETURN(INCHAIN, OFFSET, HANDLER) - This node represents 'eh_return' gcc dwarf builtin,...
Definition ISDOpcodes.h:151
@ SIGN_EXTEND
Conversion operators.
Definition ISDOpcodes.h:826
@ AVGCEILS
AVGCEILS/AVGCEILU - Rounding averaging add - Add two integers using an integer of type i[N+2],...
Definition ISDOpcodes.h:706
@ STRICT_UINT_TO_FP
Definition ISDOpcodes.h:478
@ SCALAR_TO_VECTOR
SCALAR_TO_VECTOR(VAL) - This represents the operation of loading a scalar value into element 0 of the...
Definition ISDOpcodes.h:656
@ ADDROFRETURNADDR
ADDROFRETURNADDR - Represents the llvm.addressofreturnaddress intrinsic.
Definition ISDOpcodes.h:117
@ CTTZ_ZERO_UNDEF
Bit counting operators with an undefined result for zero inputs.
Definition ISDOpcodes.h:773
@ SETCCCARRY
Like SetCC, ops #0 and #1 are the LHS and RHS operands to compare, but op #2 is a boolean indicating ...
Definition ISDOpcodes.h:809
@ SSUBO
Same for subtraction.
Definition ISDOpcodes.h:347
@ FCANONICALIZE
Returns platform specific canonical encoding of a floating point number.
Definition ISDOpcodes.h:528
@ SSUBSAT
RESULT = [US]SUBSAT(LHS, RHS) - Perform saturation subtraction on 2 integers with the same bit width ...
Definition ISDOpcodes.h:369
@ SELECT
Select(COND, TRUEVAL, FALSEVAL).
Definition ISDOpcodes.h:778
@ UNDEF
UNDEF - An undefined node.
Definition ISDOpcodes.h:228
@ SPLAT_VECTOR
SPLAT_VECTOR(VAL) - Returns a vector with the scalar value VAL duplicated in all lanes.
Definition ISDOpcodes.h:663
@ BasicBlock
Various leaf nodes.
Definition ISDOpcodes.h:81
@ CopyFromReg
CopyFromReg - This node indicates that the input value is a virtual or physical register that is defi...
Definition ISDOpcodes.h:225
@ SADDO
RESULT, BOOL = [SU]ADDO(LHS, RHS) - Overflow-aware nodes for addition.
Definition ISDOpcodes.h:343
@ GET_ROUNDING
Returns current rounding mode: -1 Undefined 0 Round to 0 1 Round to nearest, ties to even 2 Round to ...
Definition ISDOpcodes.h:952
@ MULHU
MULHU/MULHS - Multiply high - Multiply two integers of type iN, producing an unsigned/signed value of...
Definition ISDOpcodes.h:695
@ SHL
Shift and rotation operations.
Definition ISDOpcodes.h:756
@ VECTOR_SHUFFLE
VECTOR_SHUFFLE(VEC1, VEC2) - Returns a vector, of the same type as VEC1/VEC2.
Definition ISDOpcodes.h:636
@ EXTRACT_SUBVECTOR
EXTRACT_SUBVECTOR(VECTOR, IDX) - Returns a subvector from VECTOR.
Definition ISDOpcodes.h:601
@ STRICT_FMAXIMUM
Definition ISDOpcodes.h:463
@ EXTRACT_VECTOR_ELT
EXTRACT_VECTOR_ELT(VECTOR, IDX) - Returns a single element from VECTOR identified by the (potentially...
Definition ISDOpcodes.h:563
@ CopyToReg
CopyToReg - This node has three operands: a chain, a register number to set to this value,...
Definition ISDOpcodes.h:219
@ ZERO_EXTEND
ZERO_EXTEND - Used for integer types, zeroing the new bits.
Definition ISDOpcodes.h:832
@ SELECT_CC
Select with condition operator - This selects between a true value and a false value (ops #2 and #3) ...
Definition ISDOpcodes.h:793
@ LOCAL_RECOVER
LOCAL_RECOVER - Represents the llvm.localrecover intrinsic.
Definition ISDOpcodes.h:130
@ SMULO
Same for multiplication.
Definition ISDOpcodes.h:351
@ ANY_EXTEND_VECTOR_INREG
ANY_EXTEND_VECTOR_INREG(Vector) - This operator represents an in-register any-extension of the low la...
Definition ISDOpcodes.h:881
@ SIGN_EXTEND_INREG
SIGN_EXTEND_INREG - This operator atomically performs a SHL/SRA pair to sign extend a small value in ...
Definition ISDOpcodes.h:870
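As a hedged sketch of how this opcode is built (the wrapper function is illustrative, not from this file): the second operand is a value-type (VTSDNode) operand naming the narrow type being sign-extended from.

#include "llvm/CodeGen/SelectionDAG.h"
using namespace llvm;

// Re-sign-extend the low 8 bits of an integer value in place.
static SDValue signExtendFromI8(SelectionDAG &DAG, const SDLoc &DL, SDValue V) {
  return DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, V.getValueType(), V,
                     DAG.getValueType(MVT::i8));
}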
@ SMIN
[US]{MIN/MAX} - Binary minimum or maximum of signed or unsigned integers.
Definition ISDOpcodes.h:718
@ VSELECT
Select with a vector condition (op #0) and two vector operands (ops #1 and #2), returning a vector re...
Definition ISDOpcodes.h:787
@ UADDO_CARRY
Carry-using nodes for multiple precision addition and subtraction.
Definition ISDOpcodes.h:323
@ STRICT_SINT_TO_FP
STRICT_[US]INT_TO_FP - Convert a signed or unsigned integer to a floating point value.
Definition ISDOpcodes.h:477
@ STRICT_FROUNDEVEN
Definition ISDOpcodes.h:457
@ FRAMEADDR
FRAMEADDR, RETURNADDR - These nodes represent llvm.frameaddress and llvm.returnaddress on the DAG.
Definition ISDOpcodes.h:110
@ STRICT_FP_TO_UINT
Definition ISDOpcodes.h:471
@ STRICT_FP_ROUND
X = STRICT_FP_ROUND(Y, TRUNC) - Rounding 'Y' from a larger floating point type down to the precision ...
Definition ISDOpcodes.h:493
@ STRICT_FP_TO_SINT
STRICT_FP_TO_[US]INT - Convert a floating point value to a signed or unsigned integer.
Definition ISDOpcodes.h:470
@ FP_TO_SINT
FP_TO_[US]INT - Convert a floating point value to a signed or unsigned integer.
Definition ISDOpcodes.h:908
@ TargetConstant
TargetConstant* - Like Constant*, but the DAG does not do any folding, simplification,...
Definition ISDOpcodes.h:174
@ STRICT_FP_EXTEND
X = STRICT_FP_EXTEND(Y) - Extend a smaller FP type into a larger FP type.
Definition ISDOpcodes.h:498
@ AND
Bitwise operators - logical and, logical or, logical xor.
Definition ISDOpcodes.h:730
@ INTRINSIC_WO_CHAIN
RESULT = INTRINSIC_WO_CHAIN(INTRINSICID, arg1, arg2, ...) This node represents a target intrinsic fun...
Definition ISDOpcodes.h:200
@ AVGFLOORS
AVGFLOORS/AVGFLOORU - Averaging add - Add two integers using an integer of type i[N+1],...
Definition ISDOpcodes.h:701
@ STRICT_FADD
Constrained versions of the binary floating point operators.
Definition ISDOpcodes.h:420
@ FREEZE
FREEZE - FREEZE(VAL) returns an arbitrary value if VAL is UNDEF (or is evaluated to UNDEF),...
Definition ISDOpcodes.h:236
@ INSERT_VECTOR_ELT
INSERT_VECTOR_ELT(VECTOR, VAL, IDX) - Returns VECTOR with the element at IDX replaced with VAL.
Definition ISDOpcodes.h:552
@ TokenFactor
TokenFactor - This node takes multiple tokens as input and produces a single token result.
Definition ISDOpcodes.h:53
@ ExternalSymbol
Definition ISDOpcodes.h:93
@ FP_ROUND
X = FP_ROUND(Y, TRUNC) - Rounding 'Y' from a larger floating point type down to the precision of the ...
Definition ISDOpcodes.h:941
@ VECTOR_COMPRESS
VECTOR_COMPRESS(Vec, Mask, Passthru) consecutively place vector elements based on mask e....
Definition ISDOpcodes.h:690
@ ZERO_EXTEND_VECTOR_INREG
ZERO_EXTEND_VECTOR_INREG(Vector) - This operator represents an in-register zero-extension of the low ...
Definition ISDOpcodes.h:903
@ STRICT_FNEARBYINT
Definition ISDOpcodes.h:451
@ FP_TO_SINT_SAT
FP_TO_[US]INT_SAT - Convert floating point value in operand 0 to a signed or unsigned scalar integer ...
Definition ISDOpcodes.h:927
@ EH_SJLJ_SETJMP
RESULT, OUTCHAIN = EH_SJLJ_SETJMP(INCHAIN, buffer) This corresponds to the eh.sjlj....
Definition ISDOpcodes.h:157
@ TRUNCATE
TRUNCATE - Completely drop the high bits.
Definition ISDOpcodes.h:838
@ SHL_PARTS
SHL_PARTS/SRA_PARTS/SRL_PARTS - These operators are used for expanded integer shift operations.
Definition ISDOpcodes.h:815
@ AssertSext
AssertSext, AssertZext - These nodes record if a register contains a value that has already been zero...
Definition ISDOpcodes.h:62
@ FCOPYSIGN
FCOPYSIGN(X, Y) - Return the value of X with the sign of Y.
Definition ISDOpcodes.h:521
@ SADDSAT
RESULT = [US]ADDSAT(LHS, RHS) - Perform saturation addition on 2 integers with the same bit width (W)...
Definition ISDOpcodes.h:360
@ ABDS
ABDS/ABDU - Absolute difference - Return the absolute difference between two numbers interpreted as s...
Definition ISDOpcodes.h:713
@ SADDO_CARRY
Carry-using overflow-aware nodes for multiple precision addition and subtraction.
Definition ISDOpcodes.h:333
@ INTRINSIC_W_CHAIN
RESULT,OUTCHAIN = INTRINSIC_W_CHAIN(INCHAIN, INTRINSICID, arg1, ...) This node represents a target in...
Definition ISDOpcodes.h:208
@ BUILD_VECTOR
BUILD_VECTOR(ELT0, ELT1, ELT2, ELT3,...) - Return a fixed-width vector with the specified,...
Definition ISDOpcodes.h:543
bool isExtVecInRegOpcode(unsigned Opcode)
bool isOverflowIntrOpRes(SDValue Op)
Returns true if the specified value is the overflow result from one of the overflow intrinsic nodes.
LLVM_ABI bool isBuildVectorOfConstantSDNodes(const SDNode *N)
Return true if the specified node is a BUILD_VECTOR node of all ConstantSDNode or undef.
bool isNormalStore(const SDNode *N)
Returns true if the specified node is a non-truncating and unindexed store.
bool isExtOpcode(unsigned Opcode)
LLVM_ABI bool isConstantSplatVectorAllZeros(const SDNode *N, bool BuildVectorOnly=false)
Return true if the specified node is a BUILD_VECTOR or SPLAT_VECTOR where all of the elements are 0 o...
LLVM_ABI CondCode getSetCCInverse(CondCode Operation, EVT Type)
Return the operation corresponding to !(X op Y), where 'op' is a valid SetCC operation.
bool isBitwiseLogicOp(unsigned Opcode)
Whether this is bitwise logic opcode.
bool isTrueWhenEqual(CondCode Cond)
Return true if the specified condition returns true if the two operands to the condition are equal.
bool isUNINDEXEDLoad(const SDNode *N)
Returns true if the specified node is an unindexed load.
bool isEXTLoad(const SDNode *N)
Returns true if the specified node is a EXTLOAD.
LLVM_ABI bool isFreezeUndef(const SDNode *N)
Return true if the specified node is FREEZE(UNDEF).
LLVM_ABI CondCode getSetCCSwappedOperands(CondCode Operation)
Return the operation corresponding to (Y op X) when given the operation for (X op Y).
LLVM_ABI bool isBuildVectorAllZeros(const SDNode *N)
Return true if the specified node is a BUILD_VECTOR where all of the elements are 0 or undef.
bool isSignedIntSetCC(CondCode Code)
Return true if this is a setcc instruction that performs a signed comparison when used with integer o...
LLVM_ABI bool isConstantSplatVector(const SDNode *N, APInt &SplatValue)
Node predicates.
bool matchUnaryPredicate(SDValue Op, std::function< bool(ConstantSDNode *)> Match, bool AllowUndefs=false, bool AllowTruncation=false)
Hook for matching ConstantSDNode predicate.
LLVM_ABI bool isBuildVectorOfConstantFPSDNodes(const SDNode *N)
Return true if the specified node is a BUILD_VECTOR node of all ConstantFPSDNode or undef.
CondCode
ISD::CondCode enum - These are ordered carefully to make the bitfields below work out,...
LLVM_ABI bool isBuildVectorAllOnes(const SDNode *N)
Return true if the specified node is a BUILD_VECTOR where all of the elements are ~0 or undef.
LoadExtType
LoadExtType enum - This enum defines the three variants of LOADEXT (load with extension).
bool isUnsignedIntSetCC(CondCode Code)
Return true if this is a setcc instruction that performs an unsigned comparison when used with intege...
bool isNormalLoad(const SDNode *N)
Returns true if the specified node is a non-extending and unindexed load.
This namespace contains an enum with a value for every intrinsic/builtin function known by LLVM.
SpecificConstantMatch m_ZeroInt()
Convenience matchers for specific integer values.
BinaryOp_match< SpecificConstantMatch, SrcTy, TargetOpcode::G_SUB > m_Neg(const SrcTy &&Src)
Matches a register negated by a G_SUB.
BinaryOp_match< SrcTy, SpecificConstantMatch, TargetOpcode::G_XOR, true > m_Not(const SrcTy &&Src)
Matches a register not-ed by a G_XOR.
OneUse_match< SubPat > m_OneUse(const SubPat &SP)
cst_pred_ty< is_all_ones > m_AllOnes()
Match an integer or vector with all bits set.
BinaryOp_match< LHS, RHS, Instruction::And > m_And(const LHS &L, const RHS &R)
BinaryOp_match< LHS, RHS, Instruction::Add > m_Add(const LHS &L, const RHS &R)
class_match< BinaryOperator > m_BinOp()
Match an arbitrary binary operation and ignore it.
cst_pred_ty< is_sign_mask > m_SignMask()
Match an integer or vector with only the sign bit(s) set.
BinaryOp_match< LHS, RHS, Instruction::And, true > m_c_And(const LHS &L, const RHS &R)
Matches an And with LHS and RHS in either order.
BinaryOp_match< LHS, RHS, Instruction::Xor > m_Xor(const LHS &L, const RHS &R)
specific_intval< false > m_SpecificInt(const APInt &V)
Match a specific integer value or vector with all elements equal to the value.
bool match(Val *V, const Pattern &P)
specificval_ty m_Specific(const Value *V)
Match if we have a specific specified value.
CmpClass_match< LHS, RHS, ICmpInst, true > m_c_ICmp(CmpPredicate &Pred, const LHS &L, const RHS &R)
Matches an ICmp with a predicate over LHS and RHS in either order.
TwoOps_match< Val_t, Idx_t, Instruction::ExtractElement > m_ExtractElt(const Val_t &Val, const Idx_t &Idx)
Matches ExtractElementInst.
MaxMin_match< ICmpInst, LHS, RHS, smin_pred_ty > m_SMin(const LHS &L, const RHS &R)
BinaryOp_match< LHS, RHS, Instruction::Xor, true > m_c_Xor(const LHS &L, const RHS &R)
Matches an Xor with LHS and RHS in either order.
BinaryOp_match< LHS, RHS, Instruction::Mul > m_Mul(const LHS &L, const RHS &R)
deferredval_ty< Value > m_Deferred(Value *const &V)
Like m_Specific(), but works if the specific value to match is determined as part of the same match()...
SpecificCmpClass_match< LHS, RHS, ICmpInst > m_SpecificICmp(CmpPredicate MatchPred, const LHS &L, const RHS &R)
CastInst_match< OpTy, ZExtInst > m_ZExt(const OpTy &Op)
Matches ZExt.
MaxMin_match< ICmpInst, LHS, RHS, umax_pred_ty > m_UMax(const LHS &L, const RHS &R)
BinaryOp_match< LHS, RHS, Instruction::Add, true > m_c_Add(const LHS &L, const RHS &R)
Matches an Add with LHS and RHS in either order.
CastOperator_match< OpTy, Instruction::BitCast > m_BitCast(const OpTy &Op)
Matches BitCast.
MaxMin_match< ICmpInst, LHS, RHS, smax_pred_ty > m_SMax(const LHS &L, const RHS &R)
class_match< Value > m_Value()
Match an arbitrary value and ignore it.
AnyBinaryOp_match< LHS, RHS, true > m_c_BinOp(const LHS &L, const RHS &R)
Matches a BinaryOperator with LHS and RHS in either order.
CmpClass_match< LHS, RHS, ICmpInst > m_ICmp(CmpPredicate &Pred, const LHS &L, const RHS &R)
BinaryOp_match< LHS, RHS, Instruction::Shl > m_Shl(const LHS &L, const RHS &R)
auto m_Undef()
Match an arbitrary undef constant.
BinaryOp_match< LHS, RHS, Instruction::Or > m_Or(const LHS &L, const RHS &R)
CastInst_match< OpTy, SExtInst > m_SExt(const OpTy &Op)
Matches SExt.
is_zero m_Zero()
Match any null constant or a vector with all elements equal to 0.
BinaryOp_match< LHS, RHS, Instruction::Or, true > m_c_Or(const LHS &L, const RHS &R)
Matches an Or with LHS and RHS in either order.
BinOpPred_match< LHS, RHS, is_bitwiselogic_op > m_BitwiseLogic(const LHS &L, const RHS &R)
Matches bitwise logic operations.
BinaryOp_match< LHS, RHS, Instruction::Sub > m_Sub(const LHS &L, const RHS &R)
MaxMin_match< ICmpInst, LHS, RHS, umin_pred_ty > m_UMin(const LHS &L, const RHS &R)
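The IR-level matchers above compose into a single match() call. A minimal sketch, assuming an arbitrary Value pointer and an invented helper name:

#include "llvm/IR/PatternMatch.h"
using namespace llvm;
using namespace llvm::PatternMatch;

// Recognize (X & Y) == 0, accepting either operand order of the 'and'.
static bool isMaskTestAgainstZero(Value *V, Value *&X, Value *&Y) {
  return match(V, m_SpecificICmp(ICmpInst::ICMP_EQ,
                                 m_c_And(m_Value(X), m_Value(Y)), m_Zero()));
}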
LLVM_ABI Libcall getSINTTOFP(EVT OpVT, EVT RetVT)
getSINTTOFP - Return the SINTTOFP_*_* value for the given types, or UNKNOWN_LIBCALL if there is none.
LLVM_ABI Libcall getUINTTOFP(EVT OpVT, EVT RetVT)
getUINTTOFP - Return the UINTTOFP_*_* value for the given types, or UNKNOWN_LIBCALL if there is none.
LLVM_ABI Libcall getFPTOUINT(EVT OpVT, EVT RetVT)
getFPTOUINT - Return the FPTOUINT_*_* value for the given types, or UNKNOWN_LIBCALL if there is none.
LLVM_ABI Libcall getFPTOSINT(EVT OpVT, EVT RetVT)
getFPTOSINT - Return the FPTOSINT_*_* value for the given types, or UNKNOWN_LIBCALL if there is none.
LLVM_ABI Libcall getFPROUND(EVT OpVT, EVT RetVT)
getFPROUND - Return the FPROUND_*_* value for the given types, or UNKNOWN_LIBCALL if there is none.
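A hedged sketch of how these RTLIB getters are used; the header name is an assumption (recent trees declare the EVT-based helpers in RuntimeLibcallUtil.h) and the wrapper is illustrative only.

#include "llvm/CodeGen/RuntimeLibcallUtil.h" // assumed location of RTLIB::getFPTOSINT
#include "llvm/CodeGen/ValueTypes.h"
using namespace llvm;

// Map an fp -> signed-int conversion to its libcall, if one exists.
static RTLIB::Libcall fpToSIntLibcall(EVT SrcVT, EVT DstVT) {
  RTLIB::Libcall LC = RTLIB::getFPTOSINT(SrcVT, DstVT);
  // RTLIB::UNKNOWN_LIBCALL means no library routine covers this type pair.
  return LC;
}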
@ Implicit
Not emitted register (e.g. carry, or temporary result).
@ Dead
Unused definition.
@ Define
Register definition.
@ Kill
The last use of a register.
@ Undef
Value of the register doesn't matter.
Opcode_match m_Opc(unsigned Opcode)
BinaryOpc_match< LHS, RHS > m_Srl(const LHS &L, const RHS &R)
auto m_SpecificVT(EVT RefVT, const Pattern &P)
Match a specific ValueType.
TernaryOpc_match< LHS, RHS, IDX > m_InsertSubvector(const LHS &Base, const RHS &Sub, const IDX &Idx)
UnaryOpc_match< Opnd > m_Abs(const Opnd &Op)
Or< Preds... > m_AnyOf(const Preds &...preds)
And< Preds... > m_AllOf(const Preds &...preds)
TernaryOpc_match< T0_P, T1_P, T2_P > m_SetCC(const T0_P &LHS, const T1_P &RHS, const T2_P &CC)
UnaryOpc_match< Opnd > m_AnyExt(const Opnd &Op)
auto m_Node(unsigned Opcode, const OpndPreds &...preds)
TernaryOpc_match< T0_P, T1_P, T2_P > m_VSelect(const T0_P &Cond, const T1_P &T, const T2_P &F)
bool sd_match(SDNode *N, const SelectionDAG *DAG, Pattern &&P)
CondCode_match m_SpecificCondCode(ISD::CondCode CC)
Match a conditional code SDNode with a specific ISD::CondCode.
auto m_SpecificVectorElementVT(EVT RefVT, const Pattern &P)
Match a vector ValueType.
CondCode_match m_CondCode()
Match any conditional code SDNode.
ConstantInt_match m_ConstInt()
Match any integer constants or splat of an integer constant.
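The SelectionDAG matchers above are used through sd_match(). A minimal sketch with an invented helper name, assuming N is a node being visited by a combine:

#include "llvm/CodeGen/SDPatternMatch.h"
#include "llvm/CodeGen/SelectionDAG.h"
using namespace llvm;
using namespace llvm::SDPatternMatch;

// Match (setcc X, C, seteq) where C is any integer constant or constant splat.
static bool isEqualityCompareWithConstant(SDNode *N, const SelectionDAG &DAG) {
  return sd_match(N, &DAG,
                  m_SetCC(m_Value(), m_ConstInt(),
                          m_SpecificCondCode(ISD::SETEQ)));
}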
@ System
Synchronized with respect to all concurrently executing threads.
Definition LLVMContext.h:58
Invariant opcodes: All instruction sets have these as their low opcodes.
@ X86
Windows x64, Windows Itanium (IA-64)
Definition MCAsmInfo.h:50
@ PTR32_UPTR
Definition X86.h:217
@ PTR64
Definition X86.h:218
@ PTR32_SPTR
Definition X86.h:216
@ MO_TLSLD
MO_TLSLD - On a symbol operand this indicates that the immediate is the offset of the GOT entry with ...
@ MO_GOTPCREL_NORELAX
MO_GOTPCREL_NORELAX - Same as MO_GOTPCREL except that R_X86_64_GOTPCREL relocations are guaranteed to...
@ MO_COFFSTUB
MO_COFFSTUB - On a symbol operand "FOO", this indicates that the reference is actually to the "....
@ MO_NTPOFF
MO_NTPOFF - On a symbol operand this indicates that the immediate is the negative thread-pointer offs...
@ MO_INDNTPOFF
MO_INDNTPOFF - On a symbol operand this indicates that the immediate is the absolute address of the G...
@ MO_GOTNTPOFF
MO_GOTNTPOFF - On a symbol operand this indicates that the immediate is the offset of the GOT entry w...
@ MO_TPOFF
MO_TPOFF - On a symbol operand this indicates that the immediate is the thread-pointer offset for the...
@ MO_TLVP_PIC_BASE
MO_TLVP_PIC_BASE - On a symbol operand this indicates that the immediate is some TLS offset from the ...
@ MO_TLSGD
MO_TLSGD - On a symbol operand this indicates that the immediate is the offset of the GOT entry with ...
@ MO_NO_FLAG
MO_NO_FLAG - No flag for the operand.
@ MO_TLVP
MO_TLVP - On a symbol operand this indicates that the immediate is some TLS offset.
@ MO_DLLIMPORT
MO_DLLIMPORT - On a symbol operand "FOO", this indicates that the reference is actually to the "__imp...
@ MO_GOTTPOFF
MO_GOTTPOFF - On a symbol operand this indicates that the immediate is the offset of the GOT entry wi...
@ MO_SECREL
MO_SECREL - On a symbol operand this indicates that the immediate is the offset from beginning of sec...
@ MO_DTPOFF
MO_DTPOFF - On a symbol operand this indicates that the immediate is the offset of the GOT entry with...
@ MO_TLSLDM
MO_TLSLDM - On a symbol operand this indicates that the immediate is the offset of the GOT entry with...
@ MO_GOTPCREL
MO_GOTPCREL - On a symbol operand this indicates that the immediate is offset to the GOT entry for th...
@ FST
This instruction implements a truncating store from FP stack slots.
@ CMPM
Vector comparison generating mask bits for fp and integer signed and unsigned data types.
@ FMAX
Floating point max and min.
@ BT
X86 bit-test instructions.
@ HADD
Integer horizontal add/sub.
@ MOVQ2DQ
Copies a 64-bit value from an MMX vector to the low word of an XMM vector, with the high word zero fi...
@ BLENDI
Blend where the selector is an immediate.
@ CMP
X86 compare and logical compare instructions.
@ BLENDV
Dynamic (non-constant condition) vector blend where only the sign bits of the condition elements are ...
@ ADDSUB
Combined add and sub on an FP vector.
@ STRICT_FMAX
Floating point max and min.
@ STRICT_CMPM
Vector comparison generating mask bits for fp and integer signed and unsigned data types.
@ FHADD
Floating point horizontal add/sub.
@ BSR
Bit scan reverse.
@ SETCC
X86 SetCC.
@ NT_BRIND
BRIND node with NoTrack prefix.
@ SELECTS
X86 Select.
@ FSETCCM
X86 FP SETCC, similar to above, but with output as an i1 mask, and a version with SAE.
@ PEXTRB
Extract an 8-bit value from a vector and zero extend it to i32, corresponds to X86::PEXTRB.
@ FXOR
Bitwise logical XOR of floating point values.
@ BRCOND
X86 conditional branches.
@ FSETCC
X86 FP SETCC, implemented with CMP{cc}SS/CMP{cc}SD.
@ PINSRB
Insert the lower 8-bits of a 32-bit value to a vector, corresponds to X86::PINSRB.
@ INSERTPS
Insert any element of a 4 x float vector into any element of a destination 4 x float vector.
@ PSHUFB
Shuffle 16 8-bit values within a vector.
@ PEXTRW
Extract a 16-bit value from a vector and zero extend it to i32, corresponds to X86::PEXTRW.
@ AADD
RAO arithmetic instructions.
@ FANDN
Bitwise logical ANDNOT of floating point values.
@ GlobalBaseReg
On Darwin, this node represents the result of the popl at function entry, used for PIC code.
@ FMAXC
Commutative FMIN and FMAX.
@ EXTRQI
SSE4A Extraction and Insertion.
@ FLD
This instruction implements an extending load to FP stack slots.
@ PSADBW
Compute Sum of Absolute Differences.
@ FOR
Bitwise logical OR of floating point values.
@ FIST
This instruction implements a fp->int store from FP stack slots.
@ FP_TO_INT_IN_MEM
This instruction implements FP_TO_SINT with the integer destination in memory and a FP reg source.
@ LADD
LOCK-prefixed arithmetic read-modify-write instructions.
@ MMX_MOVW2D
Copies a GPR into the low 32-bit word of an MMX vector and zeroes out the high word.
@ Wrapper
A wrapper node for TargetConstantPool, TargetJumpTable, TargetExternalSymbol, TargetGlobalAddress,...
@ PINSRW
Insert the lower 16-bits of a 32-bit value to a vector, corresponds to X86::PINSRW.
@ CMPCCXADD
Compare and Add if Condition is Met.
@ MMX_MOVD2W
Copies a 32-bit value from the low word of an MMX vector to a GPR.
@ FILD
This instruction implements SINT_TO_FP with the integer source in memory and FP reg result.
@ MOVDQ2Q
Copies a 64-bit value from the low word of an XMM vector to an MMX vector.
@ ANDNP
Bitwise Logical AND NOT of Packed FP values.
@ BSF
Bit scan forward.
@ VAARG_64
These instructions grab the address of the next argument from a va_list.
@ FAND
Bitwise logical AND of floating point values.
@ CMOV
X86 conditional moves.
@ WrapperRIP
Special wrapper used under X86-64 PIC mode for RIP relative displacements.
@ FSHL
X86 funnel/double shift i16 instructions.
@ FRSQRT
Floating point reciprocal-sqrt and reciprocal approximation.
Define some predicates that are used for node matching.
@ AddrNumOperands
Definition X86BaseInfo.h:36
bool mayFoldLoadIntoBroadcastFromMem(SDValue Op, MVT EltVT, const X86Subtarget &Subtarget, bool AssumeSingleUse=false)
Check if Op is a load operation that could be folded into a vector splat instruction as a memory oper...
bool isZeroNode(SDValue Elt)
Returns true if Elt is a constant zero or floating point constant +0.0.
CondCode GetOppositeBranchCondition(CondCode CC)
GetOppositeBranchCondition - Return the inverse of the specified cond, e.g.
bool mayFoldIntoZeroExtend(SDValue Op)
Check if Op is an operation that could be folded into a zero extend x86 instruction.
bool mayFoldIntoStore(SDValue Op)
Check if Op is a value that could be used to fold a store into some other x86 instruction as a memory...
bool isExtendedSwiftAsyncFrameSupported(const X86Subtarget &Subtarget, const MachineFunction &MF)
True if the target supports the extended frame for async Swift functions.
int getRoundingModeX86(unsigned RM)
Convert LLVM rounding mode to X86 rounding mode.
int getCCMPCondFlagsFromCondCode(CondCode CC)
bool mayFoldLoad(SDValue Op, const X86Subtarget &Subtarget, bool AssumeSingleUse=false)
Check if Op is a load operation that could be folded into some other x86 instruction as a memory oper...
FastISel * createFastISel(FunctionLoweringInfo &funcInfo, const TargetLibraryInfo *libInfo)
bool isOffsetSuitableForCodeModel(int64_t Offset, CodeModel::Model M, bool hasSymbolicDisplacement)
Returns true if the given offset can fit into the displacement field of the instruction.
bool isConstantSplat(SDValue Op, APInt &SplatVal, bool AllowPartialUndefs)
If Op is a constant whose elements are all the same constant or undefined, return true and return the...
initializer< Ty > init(const Ty &Val)
constexpr double e
Definition MathExtras.h:47
@ User
could "use" a pointer
NodeAddr< NodeBase * > Node
Definition RDFGraph.h:381
NodeAddr< FuncNode * > Func
Definition RDFGraph.h:393
friend class Instruction
Iterator for Instructions in a BasicBlock.
Definition BasicBlock.h:73
This is an optimization pass for GlobalISel generic memory operations.
void DecodeZeroExtendMask(unsigned SrcScalarBits, unsigned DstScalarBits, unsigned NumDstElts, bool IsAnyExtend, SmallVectorImpl< int > &ShuffleMask)
Decode a zero extension instruction as a shuffle mask.
IterT next_nodbg(IterT It, IterT End, bool SkipPseudoOp=true)
Increment It, then continue incrementing it while it points to a debug instruction.
static bool isGlobalStubReference(unsigned char TargetFlag)
isGlobalStubReference - Return true if the specified TargetFlag operand is a reference to a stub for ...
auto drop_begin(T &&RangeOrContainer, size_t N=1)
Return a range covering RangeOrContainer with the first N elements excluded.
Definition STLExtras.h:318
@ Low
Lower the current thread's priority such that it does not affect foreground tasks significantly.
Definition Threading.h:262
@ Offset
Definition DWP.cpp:477
@ Length
Definition DWP.cpp:477
FunctionAddr VTableAddr Value
Definition InstrProf.h:137
void DecodeMOVHLPSMask(unsigned NElts, SmallVectorImpl< int > &ShuffleMask)
Decode a MOVHLPS instruction as a v2f64/v4f32 shuffle mask.
auto find(R &&Range, const T &Val)
Provide wrappers to std::find which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1731
static bool isGlobalRelativeToPICBase(unsigned char TargetFlag)
isGlobalRelativeToPICBase - Return true if the specified global value reference is relative to a 32-b...
bool all_of(R &&range, UnaryPredicate P)
Provide wrappers to std::all_of which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1705
auto size(R &&Range, std::enable_if_t< std::is_base_of< std::random_access_iterator_tag, typename std::iterator_traits< decltype(Range.begin())>::iterator_category >::value, void > *=nullptr)
Get the size of a range.
Definition STLExtras.h:1657
void DecodeZeroMoveLowMask(unsigned NumElts, SmallVectorImpl< int > &ShuffleMask)
Decode a move lower and zero upper instruction as a shuffle mask.
MachineInstrBuilder BuildMI(MachineFunction &MF, const MIMetadata &MIMD, const MCInstrDesc &MCID)
Builder interface. Specify how to create the initial instruction itself.
InstructionCost Cost
void DecodeVPERMILPMask(unsigned NumElts, unsigned ScalarBits, ArrayRef< uint64_t > RawMask, const APInt &UndefElts, SmallVectorImpl< int > &ShuffleMask)
Decode a VPERMILPD/VPERMILPS variable mask from a raw array of constants.
constexpr bool isInt(int64_t x)
Checks if an integer fits into the given bit width.
Definition MathExtras.h:174
LLVM_ABI bool isNullConstant(SDValue V)
Returns true if V is a constant integer zero.
LLVM_ABI bool isAllOnesOrAllOnesSplat(const MachineInstr &MI, const MachineRegisterInfo &MRI, bool AllowUndefs=false)
Return true if the value is a constant -1 integer or a splatted vector of a constant -1 integer (with...
Definition Utils.cpp:1607
void DecodePSHUFLWMask(unsigned NumElts, unsigned Imm, SmallVectorImpl< int > &ShuffleMask)
Decodes the shuffle masks for pshuflw.
static const IntrinsicData * getIntrinsicWithChain(unsigned IntNo)
LLVM_ABI SDValue peekThroughBitcasts(SDValue V)
Return the non-bitcasted source operand of V if it exists.
auto enumerate(FirstRange &&First, RestRanges &&...Rest)
Given two or more input ranges, returns a new range whose values are tuples (A, B,...
Definition STLExtras.h:2452
decltype(auto) dyn_cast(const From &Val)
dyn_cast<X> - Return the argument parameter cast to the specified type.
Definition Casting.h:649
unsigned Log2_64_Ceil(uint64_t Value)
Return the ceil log base 2 of the specified value, 64 if the value is zero.
Definition MathExtras.h:361
MCRegister getX86SubSuperRegister(MCRegister Reg, unsigned Size, bool High=false)
@ SjLj
setjmp/longjmp based exceptions
Definition CodeGen.h:56
bool isIntOrFPConstant(SDValue V)
Return true if V is either an integer or FP constant.
void DecodeVPERMV3Mask(ArrayRef< uint64_t > RawMask, const APInt &UndefElts, SmallVectorImpl< int > &ShuffleMask)
Decode a VPERMT2 W/D/Q/PS/PD mask from a raw array of constants.
iterator_range< T > make_range(T x, T y)
Convenience function for iterating over sub-ranges.
int bit_width(T Value)
Returns the number of bits needed to represent Value if Value is nonzero.
Definition bit.h:289
iterator_range< early_inc_iterator_impl< detail::IterOfRange< RangeT > > > make_early_inc_range(RangeT &&Range)
Make a range that does early increment to allow mutation of the underlying range without disrupting i...
Definition STLExtras.h:634
void DecodeBLENDMask(unsigned NumElts, unsigned Imm, SmallVectorImpl< int > &ShuffleMask)
Decode a BLEND immediate mask into a shuffle mask.
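A small worked sketch of the BLEND decode, assuming the declaration comes from the X86 shuffle-decode helper header (internal to the backend); the commented result follows from the semantics above, where a set immediate bit selects from the second source.

#include "llvm/ADT/SmallVector.h"
using namespace llvm;

static void decodeBlendExample() {
  SmallVector<int, 8> Mask;
  DecodeBLENDMask(/*NumElts=*/4, /*Imm=*/0b0101, Mask);
  // Mask == {4, 1, 6, 3}: lanes 0 and 2 read from the second vector.
}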
void decodeVSHUF64x2FamilyMask(unsigned NumElts, unsigned ScalarSize, unsigned Imm, SmallVectorImpl< int > &ShuffleMask)
Decode a shuffle packed values at 128-bit granularity (SHUFF32x4/SHUFF64x2/SHUFI32x4/SHUFI64x2) immed...
void DecodeVPERMMask(unsigned NumElts, unsigned Imm, SmallVectorImpl< int > &ShuffleMask)
Decodes the shuffle masks for VPERMQ/VPERMPD.
static const MachineInstrBuilder & addFrameReference(const MachineInstrBuilder &MIB, int FI, int Offset=0, bool mem=true)
addFrameReference - This function is used to add a reference to the base of an abstract object on the...
void DecodeEXTRQIMask(unsigned NumElts, unsigned EltSize, int Len, int Idx, SmallVectorImpl< int > &ShuffleMask)
Decode a SSE4A EXTRQ instruction as a shuffle mask.
static const MachineInstrBuilder & addFullAddress(const MachineInstrBuilder &MIB, const X86AddressMode &AM)
constexpr bool isPowerOf2_64(uint64_t Value)
Return true if the argument is a power of two > 0 (64-bit edition).
Definition MathExtras.h:293
static const IntrinsicData * getIntrinsicWithoutChain(unsigned IntNo)
auto unique(Range &&R, Predicate P)
Definition STLExtras.h:2056
LLVM_ABI bool isNullOrNullSplat(const MachineInstr &MI, const MachineRegisterInfo &MRI, bool AllowUndefs=false)
Return true if the value is a constant 0 integer or a splatted vector of a constant 0 integer (with n...
Definition Utils.cpp:1589
void DecodePSRLDQMask(unsigned NumElts, unsigned Imm, SmallVectorImpl< int > &ShuffleMask)
T bit_ceil(T Value)
Returns the smallest integral power of two no smaller than Value if Value is nonzero.
Definition bit.h:314
OutputIt copy_if(R &&Range, OutputIt Out, UnaryPredicate P)
Provide wrappers to std::copy_if which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1757
unsigned Log2_64(uint64_t Value)
Return the floor log base 2 of the specified value, -1 if the value is zero.
Definition MathExtras.h:348
LLVM_ABI bool isMinSignedConstant(SDValue V)
Returns true if V is a constant min signed integer value.
LLVM_ABI ConstantFPSDNode * isConstOrConstSplatFP(SDValue N, bool AllowUndefs=false)
Returns the SDNode if it is a constant splat BuildVector or constant float.
void DecodeINSERTPSMask(unsigned Imm, SmallVectorImpl< int > &ShuffleMask, bool SrcIsMem)
Decode a 128-bit INSERTPS instruction as a v4f32 shuffle mask.
void DecodeVPERM2X128Mask(unsigned NumElts, unsigned Imm, SmallVectorImpl< int > &ShuffleMask)
static void setDirectAddressInInstr(MachineInstr *MI, unsigned Operand, Register Reg)
Replace the address used in the instruction with the direct memory reference.
int countr_zero(T Val)
Count number of 0's from the least significant bit to the most stopping at the first 1.
Definition bit.h:186
unsigned M1(unsigned Val)
Definition VE.h:377
auto dyn_cast_or_null(const Y &Val)
Definition Casting.h:759
void DecodeVPERMIL2PMask(unsigned NumElts, unsigned ScalarBits, unsigned M2Z, ArrayRef< uint64_t > RawMask, const APInt &UndefElts, SmallVectorImpl< int > &ShuffleMask)
Decode a VPERMIL2PD/VPERMIL2PS variable mask from a raw array of constants.
constexpr bool has_single_bit(T Value) noexcept
Definition bit.h:147
bool any_of(R &&range, UnaryPredicate P)
Provide wrappers to std::any_of which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1712
void DecodeMOVLHPSMask(unsigned NElts, SmallVectorImpl< int > &ShuffleMask)
Decode a MOVLHPS instruction as a v2f64/v4f32 shuffle mask.
LLVM_ABI bool getShuffleDemandedElts(int SrcWidth, ArrayRef< int > Mask, const APInt &DemandedElts, APInt &DemandedLHS, APInt &DemandedRHS, bool AllowUndefElts=false)
Transform a shuffle mask's output demanded element mask into demanded element masks for the 2 operand...
unsigned Log2_32(uint32_t Value)
Return the floor log base 2 of the specified value, -1 if the value is zero.
Definition MathExtras.h:342
int countl_zero(T Val)
Count number of 0's from the most significant bit to the least stopping at the first 1.
Definition bit.h:222
bool isAlpha(char C)
Checks if character C is a valid letter as classified by the "C" locale.
LLVM_ABI bool isBitwiseNot(SDValue V, bool AllowUndefs=false)
Returns true if V is a bitwise not operation.
auto reverse(ContainerTy &&C)
Definition STLExtras.h:408
constexpr bool isPowerOf2_32(uint32_t Value)
Return true if the argument is a power of two > 0.
Definition MathExtras.h:288
LLVM_ABI void getHorizDemandedEltsForFirstOperand(unsigned VectorBitWidth, const APInt &DemandedElts, APInt &DemandedLHS, APInt &DemandedRHS)
Compute the demanded elements mask of horizontal binary operations.
decltype(auto) get(const PointerIntPair< PointerTy, IntBits, IntType, PtrTraits, Info > &Pair)
void createUnpackShuffleMask(EVT VT, SmallVectorImpl< int > &Mask, bool Lo, bool Unary)
Generate unpacklo/unpackhi shuffle mask.
constexpr uint32_t Hi_32(uint64_t Value)
Return the high 32 bits of a 64 bit value.
Definition MathExtras.h:159
LLVM_ABI raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
Definition Debug.cpp:207
LLVM_ABI SDValue peekThroughTruncates(SDValue V)
Return the non-truncated source operand of V if it exists.
bool none_of(R &&Range, UnaryPredicate P)
Provide wrappers to std::none_of which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1719
LLVM_ABI void report_fatal_error(Error Err, bool gen_crash_diag=true)
Definition Error.cpp:167
FunctionAddr VTableAddr Count
Definition InstrProf.h:139
void DecodeINSERTQIMask(unsigned NumElts, unsigned EltSize, int Len, int Idx, SmallVectorImpl< int > &ShuffleMask)
Decode a SSE4A INSERTQ instruction as a shuffle mask.
constexpr bool isUInt(uint64_t x)
Checks if an unsigned integer fits into the given bit width.
Definition MathExtras.h:198
LLVM_ABI SDValue peekThroughOneUseBitcasts(SDValue V)
Return the non-bitcasted and one-use source operand of V if it exists.
LLVM_ABI EHPersonality classifyEHPersonality(const Value *Pers)
See if the given exception handling personality function is one that we understand.
@ Default
-O2, -Os
Definition CodeGen.h:85
class LLVM_GSL_OWNER SmallVector
Forward declaration of SmallVector so that calculateSmallVectorDefaultInlinedElements can reference s...
constexpr uint32_t Lo_32(uint64_t Value)
Return the low 32 bits of a 64 bit value.
Definition MathExtras.h:164
bool isa(const From &Val)
isa<X> - Return true if the parameter to the template is an instance of one of the template type argu...
Definition Casting.h:548
@ Success
The lock was released successfully.
void DecodeVPERMVMask(ArrayRef< uint64_t > RawMask, const APInt &UndefElts, SmallVectorImpl< int > &ShuffleMask)
Decode a VPERM W/D/Q/PS/PD mask from a raw array of constants.
static void verifyIntrinsicTables()
AtomicOrdering
Atomic ordering for LLVM's memory model.
@ Mod
The access may modify the value stored in memory.
Definition ModRef.h:34
void createSplat2ShuffleMask(MVT VT, SmallVectorImpl< int > &Mask, bool Lo)
Similar to unpacklo/unpackhi, but without the 128-bit lane limitation imposed by AVX and specific to ...
@ First
Helpers to iterate all locations in the MemoryEffectsBase class.
Definition ModRef.h:71
bool isFuncletEHPersonality(EHPersonality Pers)
Returns true if this is a personality function that invokes handler funclets (which must return to it...
IRBuilder(LLVMContext &, FolderTy, InserterTy, MDNode *, ArrayRef< OperandBundleDef >) -> IRBuilder< FolderTy, InserterTy >
void DecodeVALIGNMask(unsigned NumElts, unsigned Imm, SmallVectorImpl< int > &ShuffleMask)
CombineLevel
Definition DAGCombine.h:15
auto lower_bound(R &&Range, T &&Value)
Provide wrappers to std::lower_bound which take ranges instead of having to pass begin/end explicitly...
Definition STLExtras.h:1974
LLVM_ABI void narrowShuffleMaskElts(int Scale, ArrayRef< int > Mask, SmallVectorImpl< int > &ScaledMask)
Replace each shuffle mask index with the scaled sequential indices for an equivalent mask of narrowed...
To bit_cast(const From &from) noexcept
Definition bit.h:90
void replace(R &&Range, const T &OldValue, const T &NewValue)
Provide wrappers to std::replace which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1840
@ UMin
Unsigned integer min implemented in terms of select(cmp()).
@ Xor
Bitwise or logical XOR of integers.
@ SMax
Signed integer max implemented in terms of select(cmp()).
@ And
Bitwise or logical AND of integers.
@ SMin
Signed integer min implemented in terms of select(cmp()).
@ Sub
Subtraction of integers.
@ Add
Sum of integers.
uint16_t MCPhysReg
An unsigned integer type large enough to represent all physical registers, but not necessarily virtua...
Definition MCRegister.h:21
void DecodeScalarMoveMask(unsigned NumElts, bool IsLoad, SmallVectorImpl< int > &ShuffleMask)
Decode a scalar float move instruction as a shuffle mask.
LLVM_ABI bool isNullConstantOrUndef(SDValue V)
Returns true if V is a constant integer zero or an UNDEF node.
auto count(R &&Range, const E &Element)
Wrapper function around std::count to count the number of times an element Element occurs in the give...
Definition STLExtras.h:1934
static X86AddressMode getAddressFromInstr(const MachineInstr *MI, unsigned Operand)
Compute the addressing mode from a machine instruction starting with the given operand.
void DecodeVPPERMMask(ArrayRef< uint64_t > RawMask, const APInt &UndefElts, SmallVectorImpl< int > &ShuffleMask)
Decode a VPPERM mask from a raw array of constants such as from BUILD_VECTOR.
DWARFExpression::Operation Op
void DecodePALIGNRMask(unsigned NumElts, unsigned Imm, SmallVectorImpl< int > &ShuffleMask)
bool isPhysRegUsedAfter(Register Reg, MachineBasicBlock::iterator MBI)
Check if physical register Reg is used after MBI.
void DecodeMOVSLDUPMask(unsigned NumElts, SmallVectorImpl< int > &ShuffleMask)
RoundingMode
Rounding mode.
unsigned M0(unsigned Val)
Definition VE.h:376
ArrayRef(const T &OneElt) -> ArrayRef< T >
bool isAsynchronousEHPersonality(EHPersonality Pers)
Returns true if this personality function catches asynchronous exceptions.
std::string toString(const APInt &I, unsigned Radix, bool Signed, bool formatAsCLiteral=false, bool UpperCase=true, bool InsertSeparators=false)
LLVM_ABI ConstantSDNode * isConstOrConstSplat(SDValue N, bool AllowUndefs=false, bool AllowTruncation=false)
Returns the SDNode if it is a constant splat BuildVector or constant int.
OutputIt copy(R &&Range, OutputIt Out)
Definition STLExtras.h:1815
constexpr unsigned BitWidth
void DecodeUNPCKLMask(unsigned NumElts, unsigned ScalarBits, SmallVectorImpl< int > &ShuffleMask)
Decodes the shuffle masks for unpcklps/unpcklpd and punpckl*.
void DecodePSLLDQMask(unsigned NumElts, unsigned Imm, SmallVectorImpl< int > &ShuffleMask)
auto count_if(R &&Range, UnaryPredicate P)
Wrapper function around std::count_if to count the number of times an element satisfying a given pred...
Definition STLExtras.h:1941
void DecodeUNPCKHMask(unsigned NumElts, unsigned ScalarBits, SmallVectorImpl< int > &ShuffleMask)
Decodes the shuffle masks for unpckhps/unpckhpd and punpckh*.
decltype(auto) cast(const From &Val)
cast<X> - Return the argument parameter cast to the specified type.
Definition Casting.h:565
auto find_if(R &&Range, UnaryPredicate P)
Provide wrappers to std::find_if which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1738
LLVM_ABI bool isOneConstant(SDValue V)
Returns true if V is a constant integer one.
static const MachineInstrBuilder & addDirectMem(const MachineInstrBuilder &MIB, Register Reg)
addDirectMem - This function is used to add a direct memory reference to the current instruction -- th...
static uint32_t extractBits(uint64_t Val, uint32_t Hi, uint32_t Lo)
bool is_contained(R &&Range, const E &Element)
Returns true if Element is found in Range.
Definition STLExtras.h:1877
Align commonAlignment(Align A, uint64_t Offset)
Returns the alignment that satisfies both alignments.
Definition Alignment.h:201
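A hedged worked example (illustrative names):
// Sketch: alignment still guaranteed 8 bytes past a 16-byte aligned base.
#include "llvm/Support/Alignment.h"
void exampleCommonAlignment() {
  llvm::Align Base(16);
  llvm::Align AtOffset = llvm::commonAlignment(Base, /*Offset=*/8);
  // AtOffset == llvm::Align(8).
  (void)AtOffset;
}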
LLVM_ABI bool isNullFPConstant(SDValue V)
Returns true if V is an FP constant with a value of positive zero.
bool all_equal(std::initializer_list< T > Values)
Returns true if all Values in the initializer list are equal or the list is empty.
Definition STLExtras.h:2088
@ TRUNCATE_TO_MEM_VI16
@ INTR_TYPE_SCALAR_MASK_SAE
@ INTR_TYPE_1OP_SAE
@ TRUNCATE_TO_MEM_VI32
@ INTR_TYPE_2OP_SAE
@ INTR_TYPE_3OP_SCALAR_MASK_SAE
@ INTR_TYPE_3OP_MASK_SAE
@ INTR_TYPE_2OP_MASK
@ TRUNCATE_TO_MEM_VI8
@ CVTNEPS2BF16_MASK
@ CMP_MASK_SCALAR_CC
@ INTR_TYPE_1OP_MASK_SAE
@ INTR_TYPE_SCALAR_MASK
@ INTR_TYPE_3OP_IMM8
@ INTR_TYPE_2OP_MASK_SAE
@ INTR_TYPE_SCALAR_MASK_RND
@ INTR_TYPE_1OP_MASK
@ COMPRESS_EXPAND_IN_REG
@ INTR_TYPE_CAST_MMX
@ INTR_TYPE_4OP_IMM8
void DecodePSHUFMask(unsigned NumElts, unsigned ScalarBits, unsigned Imm, SmallVectorImpl< int > &ShuffleMask)
Decodes the shuffle masks for pshufd/pshufw/vpermilpd/vpermilps.
auto seq(T Begin, T End)
Iterate over an integral type from Begin up to - but not including - End.
Definition Sequence.h:305
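A hedged usage sketch (illustrative names):
// Sketch: iterate 0, 1, 2, 3 (End is excluded).
#include "llvm/ADT/Sequence.h"
void exampleSeq() {
  int Sum = 0;
  for (int I : llvm::seq(0, 4))
    Sum += I;
  // Sum == 6
  (void)Sum;
}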
void DecodeMOVDDUPMask(unsigned NumElts, SmallVectorImpl< int > &ShuffleMask)
void array_pod_sort(IteratorTy Start, IteratorTy End)
array_pod_sort - This sorts an array with the specified start and end extent.
Definition STLExtras.h:1584
void DecodeVectorBroadcast(unsigned NumElts, SmallVectorImpl< int > &ShuffleMask)
Decodes a broadcast of the first element of a vector.
T bit_floor(T Value)
Returns the largest integral power of two no greater than Value if Value is nonzero.
Definition bit.h:299
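A hedged one-line example (illustrative names):
// Sketch: round down to a power of two.
#include "llvm/ADT/bit.h"
void exampleBitFloor() {
  unsigned Floor = llvm::bit_floor(10u); // Floor == 8
  (void)Floor;
}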
void DecodeSHUFPMask(unsigned NumElts, unsigned ScalarBits, unsigned Imm, SmallVectorImpl< int > &ShuffleMask)
Decodes the shuffle masks for shufp*.
void DecodePSHUFHWMask(unsigned NumElts, unsigned Imm, SmallVectorImpl< int > &ShuffleMask)
Decodes the shuffle masks for pshufhw.
static const MachineInstrBuilder & addRegOffset(const MachineInstrBuilder &MIB, Register Reg, bool isKill, int Offset)
addRegOffset - This function is used to add a memory reference of the form [Reg + Offset],...
void DecodeMOVSHDUPMask(unsigned NumElts, SmallVectorImpl< int > &ShuffleMask)
LLVM_ABI bool isAllOnesConstant(SDValue V)
Returns true if V is an integer constant with all bits set.
int popcount(T Value) noexcept
Count the number of set bits in a value.
Definition bit.h:154
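A hedged one-line example (illustrative names):
// Sketch: count set bits in a value.
#include "llvm/ADT/bit.h"
void examplePopcount() {
  int SetBits = llvm::popcount(0xF0u); // SetBits == 4
  (void)SetBits;
}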
@ SM_SentinelUndef
@ SM_SentinelZero
LLVM_ABI bool scaleShuffleMaskElts(unsigned NumDstElts, ArrayRef< int > Mask, SmallVectorImpl< int > &ScaledMask)
Attempt to narrow/widen the Mask shuffle mask to the NumDstElts target width.
void DecodePSHUFBMask(ArrayRef< uint64_t > RawMask, const APInt &UndefElts, SmallVectorImpl< int > &ShuffleMask)
Decode a PSHUFB mask from a raw array of constants such as from BUILD_VECTOR.
LLVM_ABI int getSplatIndex(ArrayRef< int > Mask)
If all non-negative Mask elements are the same value, return that value.
void swap(llvm::BitVector &LHS, llvm::BitVector &RHS)
Implement std::swap in terms of BitVector swap.
Definition BitVector.h:872
#define N
#define EQ(a, b)
Definition regexec.c:65
static LLVM_ABI const fltSemantics & IEEEsingle() LLVM_READNONE
Definition APFloat.cpp:266
static constexpr roundingMode rmNearestTiesToEven
Definition APFloat.h:304
static constexpr roundingMode rmTowardZero
Definition APFloat.h:308
static LLVM_ABI const fltSemantics & x87DoubleExtended() LLVM_READNONE
Definition APFloat.cpp:289
static LLVM_ABI const fltSemantics & IEEEquad() LLVM_READNONE
Definition APFloat.cpp:268
static LLVM_ABI unsigned int semanticsPrecision(const fltSemantics &)
Definition APFloat.cpp:324
static LLVM_ABI const fltSemantics & IEEEdouble() LLVM_READNONE
Definition APFloat.cpp:267
static LLVM_ABI const fltSemantics & IEEEhalf() LLVM_READNONE
Definition APFloat.cpp:264
static LLVM_ABI const fltSemantics & BFloat() LLVM_READNONE
Definition APFloat.cpp:265
opStatus
IEEE-754R 7: Default exception handling.
Definition APFloat.h:320
This struct is a compact representation of a valid (non-zero power of two) alignment.
Definition Alignment.h:39
constexpr uint64_t value() const
This is a hole in the type system and should not be abused.
Definition Alignment.h:77
Extended Value Type.
Definition ValueTypes.h:35
EVT changeVectorElementTypeToInteger() const
Return a vector with the same number of elements as this vector, but with the element type converted to an integer type with the same bitwidth.
Definition ValueTypes.h:94
TypeSize getStoreSize() const
Return the number of bytes overwritten by a store of the specified value type.
Definition ValueTypes.h:395
bool isSimple() const
Test if the given EVT is simple (as opposed to being extended).
Definition ValueTypes.h:137
static EVT getVectorVT(LLVMContext &Context, EVT VT, unsigned NumElements, bool IsScalable=false)
Returns the EVT that represents a vector NumElements in length, where each element is of type VT.
Definition ValueTypes.h:74
EVT changeTypeToInteger() const
Return the type converted to an equivalently sized integer or vector with integer element type.
Definition ValueTypes.h:121
bool bitsGT(EVT VT) const
Return true if this has more bits than VT.
Definition ValueTypes.h:284
bool bitsLT(EVT VT) const
Return true if this has less bits than VT.
Definition ValueTypes.h:300
bool isFloatingPoint() const
Return true if this is a FP or a vector FP type.
Definition ValueTypes.h:147
ElementCount getVectorElementCount() const
Definition ValueTypes.h:350
EVT getDoubleNumVectorElementsVT(LLVMContext &Context) const
Definition ValueTypes.h:463
TypeSize getSizeInBits() const
Return the size of the specified value type in bits.
Definition ValueTypes.h:373
bool isByteSized() const
Return true if the bit size is a multiple of 8.
Definition ValueTypes.h:243
unsigned getVectorMinNumElements() const
Given a vector type, return the minimum number of elements it contains.
Definition ValueTypes.h:359
uint64_t getScalarSizeInBits() const
Definition ValueTypes.h:385
static LLVM_ABI EVT getEVT(Type *Ty, bool HandleUnknown=false)
Return the value type corresponding to the specified type.
TypeSize getStoreSizeInBits() const
Return the number of bits overwritten by a store of the specified value type.
Definition ValueTypes.h:412
MVT getSimpleVT() const
Return the SimpleValueType held in the specified simple EVT.
Definition ValueTypes.h:316
bool is128BitVector() const
Return true if this is a 128-bit vector type.
Definition ValueTypes.h:207
static EVT getIntegerVT(LLVMContext &Context, unsigned BitWidth)
Returns the EVT that represents an integer with the given number of bits.
Definition ValueTypes.h:65
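A hedged sketch combining this with EVT::getVectorVT above (the context reference and names are illustrative):
// Sketch: build the EVT for a 4 x i32 vector and query it.
#include "llvm/CodeGen/ValueTypes.h"
#include "llvm/IR/LLVMContext.h"
void exampleEVT(llvm::LLVMContext &Ctx) {
  llvm::EVT I32 = llvm::EVT::getIntegerVT(Ctx, 32);
  llvm::EVT V4I32 = llvm::EVT::getVectorVT(Ctx, I32, 4);
  // V4I32.is128BitVector() is true and V4I32.getVectorNumElements() == 4.
  (void)V4I32;
}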
uint64_t getFixedSizeInBits() const
Return the size of the specified fixed width value type in bits.
Definition ValueTypes.h:381
bool is512BitVector() const
Return true if this is a 512-bit vector type.
Definition ValueTypes.h:217
static EVT getFloatingPointVT(unsigned BitWidth)
Returns the EVT that represents a floating-point type with the given number of bits.
Definition ValueTypes.h:59
bool isVector() const
Return true if this is a vector value type.
Definition ValueTypes.h:168
EVT getScalarType() const
If this is a vector type, return the element type, otherwise return this.
Definition ValueTypes.h:323
bool is256BitVector() const
Return true if this is a 256-bit vector type.
Definition ValueTypes.h:212
LLVM_ABI Type * getTypeForEVT(LLVMContext &Context) const
This method returns an LLVM type corresponding to the specified EVT.
EVT getVectorElementType() const
Given a vector type, return the type of each element.
Definition ValueTypes.h:328
bool isScalarInteger() const
Return true if this is an integer, but not a vector.
Definition ValueTypes.h:157
EVT changeVectorElementType(EVT EltVT) const
Return a VT for a vector type whose attributes match ourselves with the exception of the element type...
Definition ValueTypes.h:102
LLVM_ABI const fltSemantics & getFltSemantics() const
Returns an APFloat semantics tag appropriate for the value type.
unsigned getVectorNumElements() const
Given a vector type, return the number of elements it contains.
Definition ValueTypes.h:336
EVT getHalfNumVectorElementsVT(LLVMContext &Context) const
Definition ValueTypes.h:453
bool isInteger() const
Return true if this is an integer or a vector integer type.
Definition ValueTypes.h:152
bool is64BitVector() const
Return true if this is a 64-bit vector type.
Definition ValueTypes.h:202
static KnownBits makeConstant(const APInt &C)
Create known bits from a known constant.
Definition KnownBits.h:301
static LLVM_ABI KnownBits sadd_sat(const KnownBits &LHS, const KnownBits &RHS)
Compute knownbits resulting from llvm.sadd.sat(LHS, RHS)
static LLVM_ABI std::optional< bool > eq(const KnownBits &LHS, const KnownBits &RHS)
Determine if these known bits always give the same ICMP_EQ result.
KnownBits anyextOrTrunc(unsigned BitWidth) const
Return known bits for an "any" extension or truncation of the value we're tracking.
Definition KnownBits.h:186
static LLVM_ABI KnownBits mulhu(const KnownBits &LHS, const KnownBits &RHS)
Compute known bits from zero-extended multiply-hi.
bool isNonNegative() const
Returns true if this value is known to be non-negative.
Definition KnownBits.h:108
bool isZero() const
Returns true if value is all zero.
Definition KnownBits.h:80
unsigned countMinTrailingZeros() const
Returns the minimum number of trailing zero bits.
Definition KnownBits.h:242
bool isUnknown() const
Returns true if we don't know any bits.
Definition KnownBits.h:66
unsigned countMaxTrailingZeros() const
Returns the maximum number of trailing zero bits possible.
Definition KnownBits.h:274
KnownBits trunc(unsigned BitWidth) const
Return known bits for a truncation of the value we're tracking.
Definition KnownBits.h:161
unsigned countMaxPopulation() const
Returns the maximum number of bits that could be one.
Definition KnownBits.h:289
void setAllZero()
Make all bits known to be zero and discard any previous information.
Definition KnownBits.h:86
unsigned getBitWidth() const
Get the bit width of this value.
Definition KnownBits.h:44
KnownBits zext(unsigned BitWidth) const
Return known bits for a zero extension of the value we're tracking.
Definition KnownBits.h:172
bool isConstant() const
Returns true if we know the value of all bits.
Definition KnownBits.h:54
void resetAll()
Resets the known state of all bits.
Definition KnownBits.h:74
bool isNonZero() const
Returns true if this value is known to be non-zero.
Definition KnownBits.h:111
static LLVM_ABI KnownBits abdu(const KnownBits &LHS, const KnownBits &RHS)
Compute known bits for abdu(LHS, RHS).
KnownBits extractBits(unsigned NumBits, unsigned BitPosition) const
Return a subset of the known bits from [bitPosition,bitPosition+numBits).
Definition KnownBits.h:225
unsigned countMaxActiveBits() const
Returns the maximum number of bits needed to represent all possible unsigned values with these known bits.
Definition KnownBits.h:296
KnownBits intersectWith(const KnownBits &RHS) const
Returns KnownBits information that is known to be true for both this and RHS.
Definition KnownBits.h:311
KnownBits sext(unsigned BitWidth) const
Return known bits for a sign extension of the value we're tracking.
Definition KnownBits.h:180
static KnownBits add(const KnownBits &LHS, const KnownBits &RHS, bool NSW=false, bool NUW=false)
Compute knownbits resulting from addition of LHS and RHS.
Definition KnownBits.h:347
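A hedged sketch that also uses KnownBits::makeConstant from above (names are illustrative):
// Sketch: known bits of 4 + 1 when both operands are fully known.
#include "llvm/ADT/APInt.h"
#include "llvm/Support/KnownBits.h"
void exampleKnownBitsAdd() {
  llvm::KnownBits L = llvm::KnownBits::makeConstant(llvm::APInt(8, 4));
  llvm::KnownBits R = llvm::KnownBits::makeConstant(llvm::APInt(8, 1));
  llvm::KnownBits Sum = llvm::KnownBits::add(L, R);
  // Sum.isConstant() is true and Sum.getConstant() == 5.
  (void)Sum;
}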
KnownBits zextOrTrunc(unsigned BitWidth) const
Return known bits for a zero extension or truncation of the value we're tracking.
Definition KnownBits.h:196
unsigned countMinLeadingZeros() const
Returns the minimum number of leading zero bits.
Definition KnownBits.h:248
APInt getMaxValue() const
Return the maximal unsigned value possible given these KnownBits.
Definition KnownBits.h:145
static LLVM_ABI KnownBits computeForAddSub(bool Add, bool NSW, bool NUW, const KnownBits &LHS, const KnownBits &RHS)
Compute known bits resulting from adding LHS and RHS.
Definition KnownBits.cpp:60
bool isNegative() const
Returns true if this value is known to be negative.
Definition KnownBits.h:105
void setAllOnes()
Make all bits known to be one and discard any previous information.
Definition KnownBits.h:92
static LLVM_ABI KnownBits mul(const KnownBits &LHS, const KnownBits &RHS, bool NoUndefSelfMultiply=false)
Compute known bits resulting from multiplying LHS and RHS.
static LLVM_ABI std::optional< bool > sgt(const KnownBits &LHS, const KnownBits &RHS)
Determine if these known bits always give the same ICMP_SGT result.
bool isAllOnes() const
Returns true if value is all one bits.
Definition KnownBits.h:83
const APInt & getConstant() const
Returns the value when all bits have a known value.
Definition KnownBits.h:60
Matching combinators.
This class contains a discriminated union of information about pointers in memory operands,...
LLVM_ABI bool isDereferenceable(unsigned Size, LLVMContext &C, const DataLayout &DL) const
Return true if memory region [V, V+Offset+Size) is known to be dereferenceable.
static LLVM_ABI MachinePointerInfo getConstantPool(MachineFunction &MF)
Return a MachinePointerInfo record that refers to the constant pool.
MachinePointerInfo getWithOffset(int64_t O) const
static LLVM_ABI MachinePointerInfo getGOT(MachineFunction &MF)
Return a MachinePointerInfo record that refers to a GOT entry.
static LLVM_ABI MachinePointerInfo getFixedStack(MachineFunction &MF, int FI, int64_t Offset=0)
Return a MachinePointerInfo record that refers to the specified FrameIndex.
This struct is a compact representation of a valid (power of two) or undefined (0) alignment.
Definition Alignment.h:106
These are IR-level optimization flags that may be propagated to SDNodes.
bool hasAllowContract() const
bool hasNoSignedZeros() const
void setNoSignedWrap(bool b)
This represents a list of ValueType's that has been intern'd by a SelectionDAG.
unsigned int NumVTs
This represents an addressing mode of: BaseGV + BaseOffs + BaseReg + Scale*ScaleReg + ScalableOffset*vscale.
This contains information for each constraint that we are lowering.
This structure contains all information that is necessary for lowering calls.
CallLoweringInfo & setLibCallee(CallingConv::ID CC, Type *ResultType, SDValue Target, ArgListTy &&ArgsList)
CallLoweringInfo & setDebugLoc(const SDLoc &dl)
CallLoweringInfo & setChain(SDValue InChain)
LLVM_ABI void AddToWorklist(SDNode *N)
LLVM_ABI bool recursivelyDeleteUnusedNodes(SDNode *N)
LLVM_ABI SDValue CombineTo(SDNode *N, ArrayRef< SDValue > To, bool AddTo=true)
LLVM_ABI void CommitTargetLoweringOpt(const TargetLoweringOpt &TLO)
This structure is used to pass arguments to makeLibCall function.
A convenience struct that encapsulates a DAG, and two SDValues for returning information from TargetLowering to its clients that want to combine.
X86AddressMode - This struct holds a generalized full x86 address mode.